Fix restart/reload endpoint correctness and safety
- jail_service.restart(): replace invalid ["restart"] socket command with ["stop"], matching fail2ban transmitter protocol. The daemon is now stopped via socket; the caller starts it via subprocess. - config_file_service: expose _start_daemon and _wait_for_fail2ban as public start_daemon / wait_for_fail2ban functions. - restart_fail2ban router: orchestrate stop (socket) → start (subprocess) → probe (socket). Returns 204 on success, 503 when fail2ban does not come back within 10 s. Catches JailOperationError → 409. - reload_fail2ban router: add JailOperationError catch → 409 Conflict, consistent with other jail control endpoints. - Tests: add TestJailControls.test_restart_* (3 cases), TestReloadFail2ban 502/409 cases, TestRestartFail2ban (5 cases), TestRollbackJail (6 integration tests verifying file-write, subprocess invocation, socket- probe truthiness, active_jails count, and offline-at-call-time).
This commit is contained in:
@@ -40,9 +40,12 @@ from __future__ import annotations
|
||||
import datetime
|
||||
from typing import Annotated
|
||||
|
||||
import structlog
|
||||
from fastapi import APIRouter, HTTPException, Path, Query, Request, status
|
||||
|
||||
from app.dependencies import AuthDep
|
||||
|
||||
log: structlog.stdlib.BoundLogger = structlog.get_logger()
|
||||
from app.models.config import (
|
||||
ActionConfig,
|
||||
ActionCreateRequest,
|
||||
@@ -97,6 +100,7 @@ from app.services.config_service import (
|
||||
ConfigValidationError,
|
||||
JailNotFoundError,
|
||||
)
|
||||
from app.services.jail_service import JailOperationError
|
||||
from app.tasks.health_check import _run_probe
|
||||
from app.utils.fail2ban_client import Fail2BanConnectionError
|
||||
|
||||
@@ -357,11 +361,17 @@ async def reload_fail2ban(
|
||||
_auth: Validated session.
|
||||
|
||||
Raises:
|
||||
HTTPException: 409 when fail2ban reports the reload failed.
|
||||
HTTPException: 502 when fail2ban is unreachable.
|
||||
"""
|
||||
socket_path: str = request.app.state.settings.fail2ban_socket
|
||||
try:
|
||||
await jail_service.reload_all(socket_path)
|
||||
except JailOperationError as exc:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_409_CONFLICT,
|
||||
detail=f"fail2ban reload failed: {exc}",
|
||||
) from exc
|
||||
except Fail2BanConnectionError as exc:
|
||||
raise _bad_gateway(exc) from exc
|
||||
|
||||
@@ -381,24 +391,57 @@ async def restart_fail2ban(
|
||||
) -> None:
|
||||
"""Trigger a full fail2ban service restart.
|
||||
|
||||
The fail2ban daemon is completely stopped and then started again,
|
||||
re-reading all configuration files in the process.
|
||||
Stops the fail2ban daemon via the Unix domain socket, then starts it
|
||||
again using the configured ``fail2ban_start_command``. After starting,
|
||||
probes the socket for up to 10 seconds to confirm the daemon came back
|
||||
online.
|
||||
|
||||
Args:
|
||||
request: Incoming request.
|
||||
_auth: Validated session.
|
||||
|
||||
Raises:
|
||||
HTTPException: 502 when fail2ban is unreachable.
|
||||
HTTPException: 409 when fail2ban reports the stop command failed.
|
||||
HTTPException: 502 when fail2ban is unreachable for the stop command.
|
||||
HTTPException: 503 when fail2ban does not come back online within
|
||||
10 seconds after being started. Check the fail2ban log for
|
||||
initialisation errors. Use
|
||||
``POST /api/config/jails/{name}/rollback`` if a specific jail
|
||||
is suspect.
|
||||
"""
|
||||
socket_path: str = request.app.state.settings.fail2ban_socket
|
||||
start_cmd: str = request.app.state.settings.fail2ban_start_command
|
||||
start_cmd_parts: list[str] = start_cmd.split()
|
||||
|
||||
# Step 1: stop the daemon via socket.
|
||||
try:
|
||||
# Perform restart by sending the restart command via the fail2ban socket.
|
||||
# If fail2ban is not running, this will raise an exception, and we return 502.
|
||||
await jail_service.restart(socket_path)
|
||||
except JailOperationError as exc:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_409_CONFLICT,
|
||||
detail=f"fail2ban stop command failed: {exc}",
|
||||
) from exc
|
||||
except Fail2BanConnectionError as exc:
|
||||
raise _bad_gateway(exc) from exc
|
||||
|
||||
# Step 2: start the daemon via subprocess.
|
||||
await config_file_service.start_daemon(start_cmd_parts)
|
||||
|
||||
# Step 3: probe the socket until fail2ban is responsive or the budget expires.
|
||||
fail2ban_running: bool = await config_file_service.wait_for_fail2ban(
|
||||
socket_path, max_wait_seconds=10.0
|
||||
)
|
||||
if not fail2ban_running:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
||||
detail=(
|
||||
"fail2ban was stopped but did not come back online within 10 seconds. "
|
||||
"Check the fail2ban log for initialisation errors. "
|
||||
"Use POST /api/config/jails/{name}/rollback if a specific jail is suspect."
|
||||
),
|
||||
)
|
||||
log.info("fail2ban_restarted")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regex tester (stateless)
|
||||
|
||||
@@ -740,7 +740,7 @@ async def _probe_fail2ban_running(socket_path: str) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
async def _wait_for_fail2ban(
|
||||
async def wait_for_fail2ban(
|
||||
socket_path: str,
|
||||
max_wait_seconds: float = 10.0,
|
||||
poll_interval: float = 2.0,
|
||||
@@ -764,7 +764,7 @@ async def _wait_for_fail2ban(
|
||||
return False
|
||||
|
||||
|
||||
async def _start_daemon(start_cmd_parts: list[str]) -> bool:
|
||||
async def start_daemon(start_cmd_parts: list[str]) -> bool:
|
||||
"""Start the fail2ban daemon using *start_cmd_parts*.
|
||||
|
||||
Uses :func:`asyncio.create_subprocess_exec` (no shell interpretation)
|
||||
@@ -1541,11 +1541,11 @@ async def rollback_jail(
|
||||
log.info("jail_rolled_back_disabled", jail=name)
|
||||
|
||||
# Attempt to start the daemon.
|
||||
started = await _start_daemon(start_cmd_parts)
|
||||
started = await start_daemon(start_cmd_parts)
|
||||
log.info("jail_rollback_start_attempted", jail=name, start_ok=started)
|
||||
|
||||
# Wait for the socket to come back.
|
||||
fail2ban_running = await _wait_for_fail2ban(
|
||||
fail2ban_running = await wait_for_fail2ban(
|
||||
socket_path, max_wait_seconds=10.0, poll_interval=2.0
|
||||
)
|
||||
|
||||
|
||||
@@ -685,24 +685,29 @@ async def reload_all(
|
||||
|
||||
|
||||
async def restart(socket_path: str) -> None:
|
||||
"""Restart the fail2ban service (daemon).
|
||||
"""Stop the fail2ban daemon via the Unix socket.
|
||||
|
||||
Sends the 'restart' command to the fail2ban daemon via the Unix socket.
|
||||
All jails are stopped and the daemon is restarted, re-reading all
|
||||
configuration from scratch.
|
||||
Sends ``["stop"]`` to the fail2ban daemon, which calls ``server.quit()``
|
||||
on the daemon side and tears down all jails. The caller is responsible
|
||||
for starting the daemon again (e.g. via ``fail2ban-client start``).
|
||||
|
||||
Note:
|
||||
``["restart"]`` is a *client-side* orchestration command that is not
|
||||
handled by the fail2ban server transmitter — sending it to the socket
|
||||
raises ``"Invalid command"`` in the daemon.
|
||||
|
||||
Args:
|
||||
socket_path: Path to the fail2ban Unix domain socket.
|
||||
|
||||
Raises:
|
||||
JailOperationError: If fail2ban reports the operation failed.
|
||||
JailOperationError: If fail2ban reports the stop command failed.
|
||||
~app.utils.fail2ban_client.Fail2BanConnectionError: If the socket
|
||||
cannot be reached.
|
||||
"""
|
||||
client = Fail2BanClient(socket_path=socket_path, timeout=_SOCKET_TIMEOUT)
|
||||
try:
|
||||
_ok(await client.send(["restart"]))
|
||||
log.info("fail2ban_restarted")
|
||||
_ok(await client.send(["stop"]))
|
||||
log.info("fail2ban_stopped_for_restart")
|
||||
except ValueError as exc:
|
||||
raise JailOperationError(str(exc)) from exc
|
||||
|
||||
|
||||
Reference in New Issue
Block a user