Fix restart/reload endpoint correctness and safety

- jail_service.restart(): replace invalid ["restart"] socket command with
  ["stop"], matching fail2ban transmitter protocol. The daemon is now
  stopped via socket; the caller starts it via subprocess.

- config_file_service: expose _start_daemon and _wait_for_fail2ban as
  public start_daemon / wait_for_fail2ban functions.

- restart_fail2ban router: orchestrate stop (socket) → start (subprocess)
  → probe (socket). Returns 204 on success, 503 when fail2ban does not
  come back within 10 s. Catches JailOperationError → 409.

- reload_fail2ban router: add JailOperationError catch → 409 Conflict,
  consistent with other jail control endpoints.

- Tests: add TestJailControls.test_restart_* (3 cases), TestReloadFail2ban
  502/409 cases, TestRestartFail2ban (5 cases), TestRollbackJail (6
  integration tests verifying file-write, subprocess invocation, socket-
  probe truthiness, active_jails count, and offline-at-call-time).
This commit is contained in:
2026-03-15 12:59:17 +01:00
parent 61daa8bbc0
commit 93dc699825
7 changed files with 487 additions and 135 deletions

View File

@@ -40,9 +40,12 @@ from __future__ import annotations
import datetime
from typing import Annotated
import structlog
from fastapi import APIRouter, HTTPException, Path, Query, Request, status
from app.dependencies import AuthDep
log: structlog.stdlib.BoundLogger = structlog.get_logger()
from app.models.config import (
ActionConfig,
ActionCreateRequest,
@@ -97,6 +100,7 @@ from app.services.config_service import (
ConfigValidationError,
JailNotFoundError,
)
from app.services.jail_service import JailOperationError
from app.tasks.health_check import _run_probe
from app.utils.fail2ban_client import Fail2BanConnectionError
@@ -357,11 +361,17 @@ async def reload_fail2ban(
_auth: Validated session.
Raises:
HTTPException: 409 when fail2ban reports the reload failed.
HTTPException: 502 when fail2ban is unreachable.
"""
socket_path: str = request.app.state.settings.fail2ban_socket
try:
await jail_service.reload_all(socket_path)
except JailOperationError as exc:
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail=f"fail2ban reload failed: {exc}",
) from exc
except Fail2BanConnectionError as exc:
raise _bad_gateway(exc) from exc
@@ -381,24 +391,57 @@ async def restart_fail2ban(
) -> None:
"""Trigger a full fail2ban service restart.
The fail2ban daemon is completely stopped and then started again,
re-reading all configuration files in the process.
Stops the fail2ban daemon via the Unix domain socket, then starts it
again using the configured ``fail2ban_start_command``. After starting,
probes the socket for up to 10 seconds to confirm the daemon came back
online.
Args:
request: Incoming request.
_auth: Validated session.
Raises:
HTTPException: 502 when fail2ban is unreachable.
HTTPException: 409 when fail2ban reports the stop command failed.
HTTPException: 502 when fail2ban is unreachable for the stop command.
HTTPException: 503 when fail2ban does not come back online within
10 seconds after being started. Check the fail2ban log for
initialisation errors. Use
``POST /api/config/jails/{name}/rollback`` if a specific jail
is suspect.
"""
socket_path: str = request.app.state.settings.fail2ban_socket
start_cmd: str = request.app.state.settings.fail2ban_start_command
start_cmd_parts: list[str] = start_cmd.split()
# Step 1: stop the daemon via socket.
try:
# Perform restart by sending the restart command via the fail2ban socket.
# If fail2ban is not running, this will raise an exception, and we return 502.
await jail_service.restart(socket_path)
except JailOperationError as exc:
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail=f"fail2ban stop command failed: {exc}",
) from exc
except Fail2BanConnectionError as exc:
raise _bad_gateway(exc) from exc
# Step 2: start the daemon via subprocess.
await config_file_service.start_daemon(start_cmd_parts)
# Step 3: probe the socket until fail2ban is responsive or the budget expires.
fail2ban_running: bool = await config_file_service.wait_for_fail2ban(
socket_path, max_wait_seconds=10.0
)
if not fail2ban_running:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail=(
"fail2ban was stopped but did not come back online within 10 seconds. "
"Check the fail2ban log for initialisation errors. "
"Use POST /api/config/jails/{name}/rollback if a specific jail is suspect."
),
)
log.info("fail2ban_restarted")
# ---------------------------------------------------------------------------
# Regex tester (stateless)

View File

@@ -740,7 +740,7 @@ async def _probe_fail2ban_running(socket_path: str) -> bool:
return False
async def _wait_for_fail2ban(
async def wait_for_fail2ban(
socket_path: str,
max_wait_seconds: float = 10.0,
poll_interval: float = 2.0,
@@ -764,7 +764,7 @@ async def _wait_for_fail2ban(
return False
async def _start_daemon(start_cmd_parts: list[str]) -> bool:
async def start_daemon(start_cmd_parts: list[str]) -> bool:
"""Start the fail2ban daemon using *start_cmd_parts*.
Uses :func:`asyncio.create_subprocess_exec` (no shell interpretation)
@@ -1541,11 +1541,11 @@ async def rollback_jail(
log.info("jail_rolled_back_disabled", jail=name)
# Attempt to start the daemon.
started = await _start_daemon(start_cmd_parts)
started = await start_daemon(start_cmd_parts)
log.info("jail_rollback_start_attempted", jail=name, start_ok=started)
# Wait for the socket to come back.
fail2ban_running = await _wait_for_fail2ban(
fail2ban_running = await wait_for_fail2ban(
socket_path, max_wait_seconds=10.0, poll_interval=2.0
)

View File

@@ -685,24 +685,29 @@ async def reload_all(
async def restart(socket_path: str) -> None:
"""Restart the fail2ban service (daemon).
"""Stop the fail2ban daemon via the Unix socket.
Sends the 'restart' command to the fail2ban daemon via the Unix socket.
All jails are stopped and the daemon is restarted, re-reading all
configuration from scratch.
Sends ``["stop"]`` to the fail2ban daemon, which calls ``server.quit()``
on the daemon side and tears down all jails. The caller is responsible
for starting the daemon again (e.g. via ``fail2ban-client start``).
Note:
``["restart"]`` is a *client-side* orchestration command that is not
handled by the fail2ban server transmitter — sending it to the socket
raises ``"Invalid command"`` in the daemon.
Args:
socket_path: Path to the fail2ban Unix domain socket.
Raises:
JailOperationError: If fail2ban reports the operation failed.
JailOperationError: If fail2ban reports the stop command failed.
~app.utils.fail2ban_client.Fail2BanConnectionError: If the socket
cannot be reached.
"""
client = Fail2BanClient(socket_path=socket_path, timeout=_SOCKET_TIMEOUT)
try:
_ok(await client.send(["restart"]))
log.info("fail2ban_restarted")
_ok(await client.send(["stop"]))
log.info("fail2ban_stopped_for_restart")
except ValueError as exc:
raise JailOperationError(str(exc)) from exc