Fix restart/reload endpoint correctness and safety

- jail_service.restart(): replace invalid ["restart"] socket command with
  ["stop"], matching fail2ban transmitter protocol. The daemon is now
  stopped via socket; the caller starts it via subprocess.

- config_file_service: expose _start_daemon and _wait_for_fail2ban as
  public start_daemon / wait_for_fail2ban functions.

- restart_fail2ban router: orchestrate stop (socket) → start (subprocess)
  → probe (socket). Returns 204 on success, 503 when fail2ban does not
  come back within 10 s. Catches JailOperationError → 409.

- reload_fail2ban router: add JailOperationError catch → 409 Conflict,
  consistent with other jail control endpoints.

- Tests: add TestJailControls.test_restart_* (3 cases), TestReloadFail2ban
  502/409 cases, TestRestartFail2ban (5 cases), TestRollbackJail (6
  integration tests verifying file-write, subprocess invocation, socket-
  probe truthiness, active_jails count, and offline-at-call-time).
This commit is contained in:
2026-03-15 12:59:17 +01:00
parent 61daa8bbc0
commit 93dc699825
7 changed files with 487 additions and 135 deletions

View File

@@ -40,9 +40,12 @@ from __future__ import annotations
import datetime
from typing import Annotated
import structlog
from fastapi import APIRouter, HTTPException, Path, Query, Request, status
from app.dependencies import AuthDep
log: structlog.stdlib.BoundLogger = structlog.get_logger()
from app.models.config import (
ActionConfig,
ActionCreateRequest,
@@ -97,6 +100,7 @@ from app.services.config_service import (
ConfigValidationError,
JailNotFoundError,
)
from app.services.jail_service import JailOperationError
from app.tasks.health_check import _run_probe
from app.utils.fail2ban_client import Fail2BanConnectionError
@@ -357,11 +361,17 @@ async def reload_fail2ban(
_auth: Validated session.
Raises:
HTTPException: 409 when fail2ban reports the reload failed.
HTTPException: 502 when fail2ban is unreachable.
"""
socket_path: str = request.app.state.settings.fail2ban_socket
try:
await jail_service.reload_all(socket_path)
except JailOperationError as exc:
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail=f"fail2ban reload failed: {exc}",
) from exc
except Fail2BanConnectionError as exc:
raise _bad_gateway(exc) from exc
@@ -381,24 +391,57 @@ async def restart_fail2ban(
) -> None:
"""Trigger a full fail2ban service restart.
The fail2ban daemon is completely stopped and then started again,
re-reading all configuration files in the process.
Stops the fail2ban daemon via the Unix domain socket, then starts it
again using the configured ``fail2ban_start_command``. After starting,
probes the socket for up to 10 seconds to confirm the daemon came back
online.
Args:
request: Incoming request.
_auth: Validated session.
Raises:
HTTPException: 502 when fail2ban is unreachable.
HTTPException: 409 when fail2ban reports the stop command failed.
HTTPException: 502 when fail2ban is unreachable for the stop command.
HTTPException: 503 when fail2ban does not come back online within
10 seconds after being started. Check the fail2ban log for
initialisation errors. Use
``POST /api/config/jails/{name}/rollback`` if a specific jail
is suspect.
"""
socket_path: str = request.app.state.settings.fail2ban_socket
start_cmd: str = request.app.state.settings.fail2ban_start_command
start_cmd_parts: list[str] = start_cmd.split()
# Step 1: stop the daemon via socket.
try:
# Perform restart by sending the restart command via the fail2ban socket.
# If fail2ban is not running, this will raise an exception, and we return 502.
await jail_service.restart(socket_path)
except JailOperationError as exc:
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail=f"fail2ban stop command failed: {exc}",
) from exc
except Fail2BanConnectionError as exc:
raise _bad_gateway(exc) from exc
# Step 2: start the daemon via subprocess.
await config_file_service.start_daemon(start_cmd_parts)
# Step 3: probe the socket until fail2ban is responsive or the budget expires.
fail2ban_running: bool = await config_file_service.wait_for_fail2ban(
socket_path, max_wait_seconds=10.0
)
if not fail2ban_running:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail=(
"fail2ban was stopped but did not come back online within 10 seconds. "
"Check the fail2ban log for initialisation errors. "
"Use POST /api/config/jails/{name}/rollback if a specific jail is suspect."
),
)
log.info("fail2ban_restarted")
# ---------------------------------------------------------------------------
# Regex tester (stateless)

View File

@@ -740,7 +740,7 @@ async def _probe_fail2ban_running(socket_path: str) -> bool:
return False
async def _wait_for_fail2ban(
async def wait_for_fail2ban(
socket_path: str,
max_wait_seconds: float = 10.0,
poll_interval: float = 2.0,
@@ -764,7 +764,7 @@ async def _wait_for_fail2ban(
return False
async def _start_daemon(start_cmd_parts: list[str]) -> bool:
async def start_daemon(start_cmd_parts: list[str]) -> bool:
"""Start the fail2ban daemon using *start_cmd_parts*.
Uses :func:`asyncio.create_subprocess_exec` (no shell interpretation)
@@ -1541,11 +1541,11 @@ async def rollback_jail(
log.info("jail_rolled_back_disabled", jail=name)
# Attempt to start the daemon.
started = await _start_daemon(start_cmd_parts)
started = await start_daemon(start_cmd_parts)
log.info("jail_rollback_start_attempted", jail=name, start_ok=started)
# Wait for the socket to come back.
fail2ban_running = await _wait_for_fail2ban(
fail2ban_running = await wait_for_fail2ban(
socket_path, max_wait_seconds=10.0, poll_interval=2.0
)

View File

@@ -685,24 +685,29 @@ async def reload_all(
async def restart(socket_path: str) -> None:
"""Restart the fail2ban service (daemon).
"""Stop the fail2ban daemon via the Unix socket.
Sends the 'restart' command to the fail2ban daemon via the Unix socket.
All jails are stopped and the daemon is restarted, re-reading all
configuration from scratch.
Sends ``["stop"]`` to the fail2ban daemon, which calls ``server.quit()``
on the daemon side and tears down all jails. The caller is responsible
for starting the daemon again (e.g. via ``fail2ban-client start``).
Note:
``["restart"]`` is a *client-side* orchestration command that is not
handled by the fail2ban server transmitter — sending it to the socket
raises ``"Invalid command"`` in the daemon.
Args:
socket_path: Path to the fail2ban Unix domain socket.
Raises:
JailOperationError: If fail2ban reports the operation failed.
JailOperationError: If fail2ban reports the stop command failed.
~app.utils.fail2ban_client.Fail2BanConnectionError: If the socket
cannot be reached.
"""
client = Fail2BanClient(socket_path=socket_path, timeout=_SOCKET_TIMEOUT)
try:
_ok(await client.send(["restart"]))
log.info("fail2ban_restarted")
_ok(await client.send(["stop"]))
log.info("fail2ban_stopped_for_restart")
except ValueError as exc:
raise JailOperationError(str(exc)) from exc

View File

@@ -370,6 +370,124 @@ class TestReloadFail2ban:
assert resp.status_code == 204
async def test_502_when_fail2ban_unreachable(self, config_client: AsyncClient) -> None:
"""POST /api/config/reload returns 502 when fail2ban socket is unreachable."""
from app.utils.fail2ban_client import Fail2BanConnectionError
with patch(
"app.routers.config.jail_service.reload_all",
AsyncMock(side_effect=Fail2BanConnectionError("no socket", "/fake.sock")),
):
resp = await config_client.post("/api/config/reload")
assert resp.status_code == 502
async def test_409_when_reload_operation_fails(self, config_client: AsyncClient) -> None:
"""POST /api/config/reload returns 409 when fail2ban reports a reload error."""
from app.services.jail_service import JailOperationError
with patch(
"app.routers.config.jail_service.reload_all",
AsyncMock(side_effect=JailOperationError("reload rejected")),
):
resp = await config_client.post("/api/config/reload")
assert resp.status_code == 409
# ---------------------------------------------------------------------------
# POST /api/config/restart
# ---------------------------------------------------------------------------
class TestRestartFail2ban:
"""Tests for ``POST /api/config/restart``."""
async def test_204_on_success(self, config_client: AsyncClient) -> None:
"""POST /api/config/restart returns 204 when fail2ban restarts cleanly."""
with (
patch(
"app.routers.config.jail_service.restart",
AsyncMock(return_value=None),
),
patch(
"app.routers.config.config_file_service.start_daemon",
AsyncMock(return_value=True),
),
patch(
"app.routers.config.config_file_service.wait_for_fail2ban",
AsyncMock(return_value=True),
),
):
resp = await config_client.post("/api/config/restart")
assert resp.status_code == 204
async def test_503_when_fail2ban_does_not_come_back(self, config_client: AsyncClient) -> None:
"""POST /api/config/restart returns 503 when fail2ban does not come back online."""
with (
patch(
"app.routers.config.jail_service.restart",
AsyncMock(return_value=None),
),
patch(
"app.routers.config.config_file_service.start_daemon",
AsyncMock(return_value=True),
),
patch(
"app.routers.config.config_file_service.wait_for_fail2ban",
AsyncMock(return_value=False),
),
):
resp = await config_client.post("/api/config/restart")
assert resp.status_code == 503
async def test_409_when_stop_command_fails(self, config_client: AsyncClient) -> None:
"""POST /api/config/restart returns 409 when fail2ban rejects the stop command."""
from app.services.jail_service import JailOperationError
with patch(
"app.routers.config.jail_service.restart",
AsyncMock(side_effect=JailOperationError("stop failed")),
):
resp = await config_client.post("/api/config/restart")
assert resp.status_code == 409
async def test_502_when_fail2ban_unreachable(self, config_client: AsyncClient) -> None:
"""POST /api/config/restart returns 502 when fail2ban socket is unreachable."""
from app.utils.fail2ban_client import Fail2BanConnectionError
with patch(
"app.routers.config.jail_service.restart",
AsyncMock(side_effect=Fail2BanConnectionError("no socket", "/fake.sock")),
):
resp = await config_client.post("/api/config/restart")
assert resp.status_code == 502
async def test_start_daemon_called_after_stop(self, config_client: AsyncClient) -> None:
"""start_daemon is called after a successful stop."""
mock_start = AsyncMock(return_value=True)
with (
patch(
"app.routers.config.jail_service.restart",
AsyncMock(return_value=None),
),
patch(
"app.routers.config.config_file_service.start_daemon",
mock_start,
),
patch(
"app.routers.config.config_file_service.wait_for_fail2ban",
AsyncMock(return_value=True),
),
):
await config_client.post("/api/config/restart")
mock_start.assert_awaited_once()
# ---------------------------------------------------------------------------
# POST /api/config/regex-test

View File

@@ -21,6 +21,7 @@ from app.services.config_file_service import (
activate_jail,
deactivate_jail,
list_inactive_jails,
rollback_jail,
)
# ---------------------------------------------------------------------------
@@ -3174,4 +3175,150 @@ class TestActivateJailRollback:
assert "logpath" in result.message.lower() or "check that all logpath" in result.message.lower()
# ---------------------------------------------------------------------------
# rollback_jail
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
class TestRollbackJail:
"""Integration tests for :func:`~app.services.config_file_service.rollback_jail`."""
async def test_local_file_written_enabled_false(self, tmp_path: Path) -> None:
"""rollback_jail writes enabled=false to jail.d/{name}.local before any socket call."""
(tmp_path / "jail.d").mkdir()
with (
patch(
"app.services.config_file_service.start_daemon",
AsyncMock(return_value=True),
),
patch(
"app.services.config_file_service.wait_for_fail2ban",
AsyncMock(return_value=True),
),
patch(
"app.services.config_file_service._get_active_jail_names",
AsyncMock(return_value={"sshd"}),
),
):
await rollback_jail(str(tmp_path), "/fake.sock", "sshd", ["fail2ban-client", "start"])
local = tmp_path / "jail.d" / "sshd.local"
assert local.is_file(), "jail.d/sshd.local must be written"
content = local.read_text()
assert "enabled = false" in content
async def test_start_command_invoked_via_subprocess(self, tmp_path: Path) -> None:
"""rollback_jail invokes the daemon start command via start_daemon, not via socket."""
mock_start = AsyncMock(return_value=True)
with (
patch("app.services.config_file_service.start_daemon", mock_start),
patch(
"app.services.config_file_service.wait_for_fail2ban",
AsyncMock(return_value=True),
),
patch(
"app.services.config_file_service._get_active_jail_names",
AsyncMock(return_value={"other"}),
),
):
await rollback_jail(
str(tmp_path), "/fake.sock", "sshd", ["fail2ban-client", "start"]
)
mock_start.assert_awaited_once_with(["fail2ban-client", "start"])
async def test_fail2ban_running_reflects_socket_probe_not_subprocess_exit(
self, tmp_path: Path
) -> None:
"""fail2ban_running in the response reflects the socket probe result.
Even when start_daemon returns True (subprocess exit 0), if the socket
probe returns False the response must report fail2ban_running=False.
"""
with (
patch(
"app.services.config_file_service.start_daemon",
AsyncMock(return_value=True),
),
patch(
"app.services.config_file_service.wait_for_fail2ban",
AsyncMock(return_value=False), # socket still unresponsive
),
):
result = await rollback_jail(
str(tmp_path), "/fake.sock", "sshd", ["fail2ban-client", "start"]
)
assert result.fail2ban_running is False
async def test_active_jails_zero_when_fail2ban_not_running(
self, tmp_path: Path
) -> None:
"""active_jails is 0 in the response when fail2ban_running is False."""
with (
patch(
"app.services.config_file_service.start_daemon",
AsyncMock(return_value=False),
),
patch(
"app.services.config_file_service.wait_for_fail2ban",
AsyncMock(return_value=False),
),
):
result = await rollback_jail(
str(tmp_path), "/fake.sock", "sshd", ["fail2ban-client", "start"]
)
assert result.active_jails == 0
async def test_active_jails_count_from_socket_when_running(
self, tmp_path: Path
) -> None:
"""active_jails reflects the actual jail count from the socket when fail2ban is up."""
with (
patch(
"app.services.config_file_service.start_daemon",
AsyncMock(return_value=True),
),
patch(
"app.services.config_file_service.wait_for_fail2ban",
AsyncMock(return_value=True),
),
patch(
"app.services.config_file_service._get_active_jail_names",
AsyncMock(return_value={"sshd", "nginx", "apache-auth"}),
),
):
result = await rollback_jail(
str(tmp_path), "/fake.sock", "sshd", ["fail2ban-client", "start"]
)
assert result.active_jails == 3
async def test_fail2ban_down_at_start_still_succeeds_file_write(
self, tmp_path: Path
) -> None:
"""rollback_jail writes the local file even when fail2ban is down at call time."""
# fail2ban is down: start_daemon fails and wait_for_fail2ban returns False.
with (
patch(
"app.services.config_file_service.start_daemon",
AsyncMock(return_value=False),
),
patch(
"app.services.config_file_service.wait_for_fail2ban",
AsyncMock(return_value=False),
),
):
result = await rollback_jail(
str(tmp_path), "/fake.sock", "sshd", ["fail2ban-client", "start"]
)
local = tmp_path / "jail.d" / "sshd.local"
assert local.is_file(), "local file must be written even when fail2ban is down"
assert result.disabled is True
assert result.fail2ban_running is False

View File

@@ -441,6 +441,33 @@ class TestJailControls:
)
assert exc_info.value.name == "airsonic-auth"
async def test_restart_sends_stop_command(self) -> None:
"""restart() sends the ['stop'] command to the fail2ban socket."""
with _patch_client({"stop": (0, None)}):
await jail_service.restart(_SOCKET) # should not raise
async def test_restart_operation_error_raises(self) -> None:
"""restart() raises JailOperationError when fail2ban rejects the stop."""
with _patch_client({"stop": (1, Exception("cannot stop"))}), pytest.raises(
JailOperationError
):
await jail_service.restart(_SOCKET)
async def test_restart_connection_error_propagates(self) -> None:
"""restart() propagates Fail2BanConnectionError when socket is unreachable."""
class _FailClient:
def __init__(self, **_kw: Any) -> None:
self.send = AsyncMock(
side_effect=Fail2BanConnectionError("no socket", _SOCKET)
)
with (
patch("app.services.jail_service.Fail2BanClient", _FailClient),
pytest.raises(Fail2BanConnectionError),
):
await jail_service.restart(_SOCKET)
async def test_start_not_found_raises(self) -> None:
"""start_jail raises JailNotFoundError for unknown jail."""
with _patch_client({"start|ghost": (1, Exception("Unknown jail: 'ghost'"))}), pytest.raises(JailNotFoundError):