Fix restart/reload endpoint correctness and safety

- jail_service.restart(): replace invalid ["restart"] socket command with
  ["stop"], matching fail2ban transmitter protocol. The daemon is now
  stopped via socket; the caller starts it via subprocess.

- config_file_service: expose _start_daemon and _wait_for_fail2ban as
  public start_daemon / wait_for_fail2ban functions.

- restart_fail2ban router: orchestrate stop (socket) → start (subprocess)
  → probe (socket). Returns 204 on success, 503 when fail2ban does not
  come back within 10 s. Catches JailOperationError → 409.

- reload_fail2ban router: add JailOperationError catch → 409 Conflict,
  consistent with other jail control endpoints.

- Tests: add TestJailControls.test_restart_* (3 cases), TestReloadFail2ban
  502/409 cases, TestRestartFail2ban (5 cases), TestRollbackJail (6
  integration tests verifying file-write, subprocess invocation, socket-
  probe truthiness, active_jails count, and offline-at-call-time).
This commit is contained in:
2026-03-15 12:59:17 +01:00
parent 61daa8bbc0
commit 93dc699825
7 changed files with 487 additions and 135 deletions

View File

@@ -40,9 +40,12 @@ from __future__ import annotations
import datetime
from typing import Annotated
import structlog
from fastapi import APIRouter, HTTPException, Path, Query, Request, status
from app.dependencies import AuthDep
log: structlog.stdlib.BoundLogger = structlog.get_logger()
from app.models.config import (
ActionConfig,
ActionCreateRequest,
@@ -97,6 +100,7 @@ from app.services.config_service import (
ConfigValidationError,
JailNotFoundError,
)
from app.services.jail_service import JailOperationError
from app.tasks.health_check import _run_probe
from app.utils.fail2ban_client import Fail2BanConnectionError
@@ -357,11 +361,17 @@ async def reload_fail2ban(
_auth: Validated session.
Raises:
HTTPException: 409 when fail2ban reports the reload failed.
HTTPException: 502 when fail2ban is unreachable.
"""
socket_path: str = request.app.state.settings.fail2ban_socket
try:
await jail_service.reload_all(socket_path)
except JailOperationError as exc:
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail=f"fail2ban reload failed: {exc}",
) from exc
except Fail2BanConnectionError as exc:
raise _bad_gateway(exc) from exc
@@ -381,24 +391,57 @@ async def restart_fail2ban(
) -> None:
"""Trigger a full fail2ban service restart.
The fail2ban daemon is completely stopped and then started again,
re-reading all configuration files in the process.
Stops the fail2ban daemon via the Unix domain socket, then starts it
again using the configured ``fail2ban_start_command``. After starting,
probes the socket for up to 10 seconds to confirm the daemon came back
online.
Args:
request: Incoming request.
_auth: Validated session.
Raises:
HTTPException: 502 when fail2ban is unreachable.
HTTPException: 409 when fail2ban reports the stop command failed.
HTTPException: 502 when fail2ban is unreachable for the stop command.
HTTPException: 503 when fail2ban does not come back online within
10 seconds after being started. Check the fail2ban log for
initialisation errors. Use
``POST /api/config/jails/{name}/rollback`` if a specific jail
is suspect.
"""
socket_path: str = request.app.state.settings.fail2ban_socket
start_cmd: str = request.app.state.settings.fail2ban_start_command
start_cmd_parts: list[str] = start_cmd.split()
# Step 1: stop the daemon via socket.
try:
# Stop the daemon by sending the stop command via the fail2ban socket.
# If fail2ban is not running, this raises Fail2BanConnectionError and we return 502.
await jail_service.restart(socket_path)
except JailOperationError as exc:
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail=f"fail2ban stop command failed: {exc}",
) from exc
except Fail2BanConnectionError as exc:
raise _bad_gateway(exc) from exc
# Step 2: start the daemon via subprocess.
await config_file_service.start_daemon(start_cmd_parts)
# Step 3: probe the socket until fail2ban is responsive or the budget expires.
fail2ban_running: bool = await config_file_service.wait_for_fail2ban(
socket_path, max_wait_seconds=10.0
)
if not fail2ban_running:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail=(
"fail2ban was stopped but did not come back online within 10 seconds. "
"Check the fail2ban log for initialisation errors. "
"Use POST /api/config/jails/{name}/rollback if a specific jail is suspect."
),
)
log.info("fail2ban_restarted")
# ---------------------------------------------------------------------------
# Regex tester (stateless)