feat: Task 3 — invalid jail config recovery (pre-validation, crash detection, rollback)
- Backend: extend activate_jail() with pre-validation and 4-attempt post-reload
health probe; add validate_jail_config() and rollback_jail() service functions
- Backend: new endpoints POST /api/config/jails/{name}/validate,
GET /api/config/pending-recovery, POST /api/config/jails/{name}/rollback
- Backend: extend JailActivationResponse with fail2ban_running + validation_warnings;
add JailValidationIssue, JailValidationResult, PendingRecovery, RollbackResponse models
- Backend: health_check task tracks last_activation and creates PendingRecovery
record when fail2ban goes offline within 60 s of an activation
- Backend: add fail2ban_start_command setting (configurable start cmd for rollback)
- Frontend: ActivateJailDialog — pre-validation on open, crash-detected callback,
extended spinner text during activation+verify
- Frontend: JailsTab — Validate Config button for inactive jails, validation
result panels (blocking errors + advisory warnings)
- Frontend: RecoveryBanner component — polls pending-recovery, shows full-width
alert with Disable & Restart / View Logs buttons
- Frontend: MainLayout — mount RecoveryBanner at layout level
- Tests: 19 new backend service tests (validate, rollback, filter/action parsing)
+ 6 health_check crash-detection tests + 11 router tests; 5 RecoveryBanner
frontend tests; fix mock setup in existing activate_jail tests
This commit is contained in:
@@ -50,6 +50,9 @@ from app.models.config import (
|
||||
InactiveJail,
|
||||
InactiveJailListResponse,
|
||||
JailActivationResponse,
|
||||
JailValidationIssue,
|
||||
JailValidationResult,
|
||||
RollbackResponse,
|
||||
)
|
||||
from app.services import conffile_parser, jail_service
|
||||
from app.utils.fail2ban_client import Fail2BanClient, Fail2BanConnectionError
|
||||
@@ -560,6 +563,242 @@ async def _get_active_jail_names(socket_path: str) -> set[str]:
|
||||
return set()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Validation helpers (Task 3)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Delay, in seconds, slept before each post-reload liveness probe except the
# first one (the probe loop in activate_jail only sleeps when attempt > 0).
|
||||
_POST_RELOAD_PROBE_INTERVAL: float = 2.0
|
||||
|
||||
# Upper bound on post-reload probe attempts, counting the initial attempt;
# the probe loop stops early as soon as one probe succeeds.
|
||||
_POST_RELOAD_MAX_ATTEMPTS: int = 4
|
||||
|
||||
|
||||
def _extract_action_base_name(action_str: str) -> str | None:
|
||||
"""Return the base action name from an action assignment string.
|
||||
|
||||
Returns ``None`` for complex fail2ban expressions that cannot be resolved
|
||||
to a single filename (e.g. ``%(action_)s`` interpolations or multi-token
|
||||
composite actions).
|
||||
|
||||
Args:
|
||||
action_str: A single line from the jail's ``action`` setting.
|
||||
|
||||
Returns:
|
||||
Simple base name suitable for a filesystem lookup, or ``None``.
|
||||
"""
|
||||
if "%" in action_str or "$" in action_str:
|
||||
return None
|
||||
base = action_str.split("[")[0].strip()
|
||||
if _SAFE_ACTION_NAME_RE.match(base):
|
||||
return base
|
||||
return None
|
||||
|
||||
|
||||
def _validate_jail_config_sync(
    config_dir: Path,
    name: str,
) -> JailValidationResult:
    """Inspect a jail's parsed settings for problems before it is activated.

    The checks run in this order and every finding is appended to a single
    issue list:

    1. The resolved filter has a ``.conf`` or ``.local`` file in ``filter.d/``.
    2. Each resolvable action has a ``.conf`` or ``.local`` file in
       ``action.d/``.
    3. Every ``failregex`` pattern compiles with :func:`re.compile`.
    4. Every ``ignoreregex`` pattern compiles with :func:`re.compile`.
    5. Every literal ``logpath`` entry exists on disk (entries containing
       ``*``, ``?`` or ``%(`` are skipped).

    NOTE(review): a missing log path is recorded like any other issue, so it
    also makes ``valid`` False; confirm whether it was meant to be advisory.

    Args:
        config_dir: The fail2ban configuration root directory.
        name: Validated jail name.

    Returns:
        :class:`~app.models.config.JailValidationResult` whose ``valid`` flag
        is True only when no issue was recorded.  An unknown jail yields a
        single ``name`` issue.
    """
    parsed_jails, _ = _parse_jails_sync(config_dir)
    jail_settings = parsed_jails.get(name)

    if jail_settings is None:
        unknown_jail = JailValidationIssue(
            field="name",
            message=f"Jail {name!r} not found in config files.",
        )
        return JailValidationResult(
            jail_name=name,
            valid=False,
            issues=[unknown_jail],
        )

    found: list[JailValidationIssue] = []

    def _lacks_definition(directory: Path, stem: str) -> bool:
        # True when neither ``<stem>.conf`` nor ``<stem>.local`` is a file.
        return not any(
            (directory / f"{stem}{suffix}").is_file()
            for suffix in (".conf", ".local")
        )

    # 1. Filter definition must exist.
    filter_spec = jail_settings.get("filter", "")
    if filter_spec:
        filter_mode = jail_settings.get("mode", "normal")
        filter_stem = _extract_filter_base_name(
            _resolve_filter(filter_spec, name, filter_mode)
        )
        if filter_stem and _lacks_definition(config_dir / "filter.d", filter_stem):
            found.append(
                JailValidationIssue(
                    field="filter",
                    message=(
                        f"Filter file not found: filter.d/{filter_stem}.conf"
                        " (or .local)"
                    ),
                )
            )

    # 2. Every action that reduces to a plain name must have a definition.
    action_spec = jail_settings.get("action", "")
    if action_spec:
        action_dir = config_dir / "action.d"
        for entry in _parse_multiline(action_spec):
            action_stem = _extract_action_base_name(entry)
            if not action_stem:
                continue
            if _lacks_definition(action_dir, action_stem):
                found.append(
                    JailValidationIssue(
                        field="action",
                        message=(
                            f"Action file not found: action.d/{action_stem}.conf"
                            " (or .local)"
                        ),
                    )
                )

    # 3 + 4. failregex first, then ignoreregex: each pattern must compile.
    for regex_field in ("failregex", "ignoreregex"):
        for pattern in _parse_multiline(jail_settings.get(regex_field, "")):
            try:
                re.compile(pattern)
            except re.error as exc:
                found.append(
                    JailValidationIssue(
                        field=regex_field,
                        message=f"Invalid regex pattern: {exc}",
                    )
                )

    # 5. Literal log paths must exist on disk.
    logpath_spec = jail_settings.get("logpath", "")
    if logpath_spec:
        for candidate in _parse_multiline(logpath_spec):
            # Globs and fail2ban variable references cannot be checked here.
            if any(token in candidate for token in ("*", "?", "%(")):
                continue
            if Path(candidate).exists():
                continue
            found.append(
                JailValidationIssue(
                    field="logpath",
                    message=f"Log file not found on disk: {candidate}",
                )
            )

    is_valid = not found
    log.debug(
        "jail_validation_complete",
        jail=name,
        valid=is_valid,
        issue_count=len(found),
    )
    return JailValidationResult(jail_name=name, valid=is_valid, issues=found)
|
||||
|
||||
|
||||
async def _probe_fail2ban_running(socket_path: str) -> bool:
    """Ping fail2ban once and report whether it answered successfully.

    A fresh :class:`Fail2BanClient` (5 second timeout) sends ``["ping"]``.
    The probe succeeds only when the reply is a list or tuple whose first
    element equals ``0``.  Any exception raised along the way is swallowed
    and reported as "not running".

    Args:
        socket_path: Path to the fail2ban Unix domain socket.

    Returns:
        ``True`` when the ping was answered with status ``0``, ``False``
        otherwise.
    """
    try:
        pinger = Fail2BanClient(socket_path=socket_path, timeout=5.0)
        reply = await pinger.send(["ping"])
        if not isinstance(reply, (list, tuple)):
            return False
        # Indexing stays inside the try: an empty reply counts as a failure.
        return reply[0] == 0
    except Exception:  # noqa: BLE001
        return False
|
||||
|
||||
|
||||
async def _wait_for_fail2ban(
    socket_path: str,
    max_wait_seconds: float = 10.0,
    poll_interval: float = 2.0,
) -> bool:
    """Poll the fail2ban socket until it responds or the time budget expires.

    The budget is measured against the event loop's monotonic clock, so time
    spent inside slow probes counts towards it.  The final sleep is clipped
    to the remaining budget, which means the function never sleeps after its
    last probe and a non-positive *poll_interval* cannot loop forever.

    Args:
        socket_path: Path to the fail2ban Unix domain socket.
        max_wait_seconds: Total time budget in seconds.  When it is not
            positive no probe is made and ``False`` is returned.
        poll_interval: Delay between probe attempts in seconds.

    Returns:
        ``True`` if a probe succeeded within the budget, ``False`` otherwise.
    """
    # ``not x > 0`` (rather than ``x <= 0``) also rejects NaN.
    if not max_wait_seconds > 0:
        return False

    now = asyncio.get_running_loop().time
    deadline = now() + max_wait_seconds

    while True:
        if await _probe_fail2ban_running(socket_path):
            return True

        remaining = deadline - now()
        if not remaining > 0:
            return False

        # Never sleep past the deadline; one last probe follows the sleep.
        await asyncio.sleep(min(poll_interval, remaining))
|
||||
|
||||
|
||||
async def _start_daemon(start_cmd_parts: list[str]) -> bool:
    """Start the fail2ban daemon using *start_cmd_parts*.

    Uses :func:`asyncio.create_subprocess_exec` (no shell interpretation)
    to avoid command injection.  The command's output is not needed, so
    stdout and stderr are sent to ``DEVNULL``; with unread pipes a chatty
    command could fill the pipe buffer and block forever.  The command gets
    30 seconds to exit; if it does not, it is killed and reaped so no stray
    child process is left behind.

    Args:
        start_cmd_parts: Command and arguments, e.g.
            ``["fail2ban-client", "start"]``.

    Returns:
        ``True`` when the process exited with code 0.  ``False`` when the
        command list is empty, the process could not be spawned, it timed
        out, or it exited with a non-zero code.
    """
    if not start_cmd_parts:
        log.warning("fail2ban_start_cmd_empty")
        return False

    try:
        proc = await asyncio.create_subprocess_exec(
            *start_cmd_parts,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
    except OSError as exc:
        # Covers a missing executable, permission problems, etc.
        log.warning("fail2ban_start_cmd_error", cmd=start_cmd_parts, error=str(exc))
        return False

    try:
        await asyncio.wait_for(proc.wait(), timeout=30.0)
    except (asyncio.TimeoutError, OSError) as exc:
        # asyncio.TimeoutError is only an alias of the builtin TimeoutError
        # from Python 3.11 on, so it is named explicitly.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # The process exited between the timeout and the kill.
            else:
                await proc.wait()
        log.warning(
            "fail2ban_start_cmd_error",
            cmd=start_cmd_parts,
            # str() of a timeout is empty; fall back to the class name.
            error=str(exc) or type(exc).__name__,
        )
        return False

    success = proc.returncode == 0
    if not success:
        log.warning(
            "fail2ban_start_cmd_nonzero",
            cmd=start_cmd_parts,
            returncode=proc.returncode,
        )
    return success
|
||||
|
||||
|
||||
def _write_local_override_sync(
|
||||
config_dir: Path,
|
||||
jail_name: str,
|
||||
@@ -846,9 +1085,10 @@ async def activate_jail(
|
||||
) -> JailActivationResponse:
|
||||
"""Enable an inactive jail and reload fail2ban.
|
||||
|
||||
Writes ``enabled = true`` (plus any override values from *req*) to
|
||||
``jail.d/{name}.local`` and then triggers a full fail2ban reload so the
|
||||
jail starts immediately.
|
||||
Performs pre-activation validation, writes ``enabled = true`` (plus any
|
||||
override values from *req*) to ``jail.d/{name}.local``, and triggers a
|
||||
full fail2ban reload. After the reload a multi-attempt health probe
|
||||
determines whether fail2ban (and the specific jail) are still running.
|
||||
|
||||
Args:
|
||||
config_dir: Absolute path to the fail2ban configuration directory.
|
||||
@@ -857,7 +1097,8 @@ async def activate_jail(
|
||||
req: Optional override values to write alongside ``enabled = true``.
|
||||
|
||||
Returns:
|
||||
:class:`~app.models.config.JailActivationResponse`.
|
||||
:class:`~app.models.config.JailActivationResponse` including
|
||||
``fail2ban_running`` and ``validation_warnings`` fields.
|
||||
|
||||
Raises:
|
||||
JailNameError: If *name* contains invalid characters.
|
||||
@@ -881,6 +1122,20 @@ async def activate_jail(
|
||||
if name in active_names:
|
||||
raise JailAlreadyActiveError(name)
|
||||
|
||||
# ---------------------------------------------------------------------- #
|
||||
# Pre-activation validation — collect warnings but do not block #
|
||||
# ---------------------------------------------------------------------- #
|
||||
validation_result: JailValidationResult = await loop.run_in_executor(
|
||||
None, _validate_jail_config_sync, Path(config_dir), name
|
||||
)
|
||||
warnings: list[str] = [f"{i.field}: {i.message}" for i in validation_result.issues]
|
||||
if warnings:
|
||||
log.warning(
|
||||
"jail_activation_validation_warnings",
|
||||
jail=name,
|
||||
warnings=warnings,
|
||||
)
|
||||
|
||||
overrides: dict[str, Any] = {
|
||||
"bantime": req.bantime,
|
||||
"findtime": req.findtime,
|
||||
@@ -903,9 +1158,35 @@ async def activate_jail(
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning("reload_after_activate_failed", jail=name, error=str(exc))
|
||||
|
||||
# Verify the jail actually started after the reload. A config error
|
||||
# (bad regex, missing log file, etc.) may silently prevent fail2ban from
|
||||
# starting the jail even though the reload command succeeded.
|
||||
# ---------------------------------------------------------------------- #
|
||||
# Post-reload health probe with retries #
|
||||
# ---------------------------------------------------------------------- #
|
||||
fail2ban_running = False
|
||||
for attempt in range(_POST_RELOAD_MAX_ATTEMPTS):
|
||||
if attempt > 0:
|
||||
await asyncio.sleep(_POST_RELOAD_PROBE_INTERVAL)
|
||||
if await _probe_fail2ban_running(socket_path):
|
||||
fail2ban_running = True
|
||||
break
|
||||
|
||||
if not fail2ban_running:
|
||||
log.warning(
|
||||
"fail2ban_down_after_activate",
|
||||
jail=name,
|
||||
message="fail2ban socket unreachable after reload — daemon may have crashed.",
|
||||
)
|
||||
return JailActivationResponse(
|
||||
name=name,
|
||||
active=False,
|
||||
fail2ban_running=False,
|
||||
validation_warnings=warnings,
|
||||
message=(
|
||||
f"Jail {name!r} was written to config but fail2ban stopped "
|
||||
"responding after reload. The jail configuration may be invalid."
|
||||
),
|
||||
)
|
||||
|
||||
# Verify the jail actually started (config error may prevent it silently).
|
||||
post_reload_names = await _get_active_jail_names(socket_path)
|
||||
actually_running = name in post_reload_names
|
||||
if not actually_running:
|
||||
@@ -917,6 +1198,8 @@ async def activate_jail(
|
||||
return JailActivationResponse(
|
||||
name=name,
|
||||
active=False,
|
||||
fail2ban_running=True,
|
||||
validation_warnings=warnings,
|
||||
message=(
|
||||
f"Jail {name!r} was written to config but did not start after "
|
||||
"reload — check the jail configuration (filters, log paths, regex)."
|
||||
@@ -927,6 +1210,8 @@ async def activate_jail(
|
||||
return JailActivationResponse(
|
||||
name=name,
|
||||
active=True,
|
||||
fail2ban_running=True,
|
||||
validation_warnings=warnings,
|
||||
message=f"Jail {name!r} activated successfully.",
|
||||
)
|
||||
|
||||
@@ -994,6 +1279,117 @@ async def deactivate_jail(
|
||||
)
|
||||
|
||||
|
||||
async def validate_jail_config(
    config_dir: str,
    name: str,
) -> JailValidationResult:
    """Validate a jail's configuration without activating it.

    The name is first passed through ``_safe_jail_name``; the actual checks
    (``_validate_jail_config_sync``) then run in the event loop's default
    executor so their filesystem access does not block the loop.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        name: Name of the jail to validate.

    Returns:
        :class:`~app.models.config.JailValidationResult` with any issues found.

    Raises:
        JailNameError: If *name* contains invalid characters (raised by
            ``_safe_jail_name``, which is defined outside this section).
    """
    _safe_jail_name(name)

    event_loop = asyncio.get_event_loop()
    config_root = Path(config_dir)
    result: JailValidationResult = await event_loop.run_in_executor(
        None, _validate_jail_config_sync, config_root, name
    )
    return result
|
||||
|
||||
|
||||
async def rollback_jail(
    config_dir: str,
    socket_path: str,
    name: str,
    start_cmd_parts: list[str],
) -> RollbackResponse:
    """Switch a faulty jail off and try to bring fail2ban back up.

    Steps, in order:

    1. Validate *name* with ``_safe_jail_name``.
    2. Call ``_write_local_override_sync`` in the default executor with
       ``False`` and no overrides.  This is a file write only, so it does
       not need a running daemon.
    3. Run the start command through ``_start_daemon``.
    4. Poll the socket via ``_wait_for_fail2ban`` (10 second budget, 2 second
       interval), regardless of whether the start command reported success.
    5. When the daemon answers, count the active jails for the response.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.
        name: Name of the jail to disable.
        start_cmd_parts: Argument list for the daemon start command, e.g.
            ``["fail2ban-client", "start"]``.

    Returns:
        :class:`~app.models.config.RollbackResponse`; ``disabled`` is always
        True, ``fail2ban_running`` reflects the outcome of the polling.

    Raises:
        JailNameError: If *name* contains invalid characters.
        ConfigWriteError: If writing the ``.local`` file fails.
    """
    _safe_jail_name(name)

    event_loop = asyncio.get_event_loop()

    # Disable the jail on disk first; this must not depend on the daemon.
    await event_loop.run_in_executor(
        None,
        _write_local_override_sync,
        Path(config_dir),
        name,
        False,
        {},
    )
    log.info("jail_rolled_back_disabled", jail=name)

    # Fire the start command; its outcome is logged but not decisive.
    start_ok = await _start_daemon(start_cmd_parts)
    log.info("jail_rollback_start_attempted", jail=name, start_ok=start_ok)

    # The socket answering is what decides success.
    daemon_up = await _wait_for_fail2ban(
        socket_path, max_wait_seconds=10.0, poll_interval=2.0
    )

    if not daemon_up:
        log.warning("jail_rollback_fail2ban_still_down", jail=name)
        return RollbackResponse(
            jail_name=name,
            disabled=True,
            fail2ban_running=False,
            active_jails=0,
            message=(
                f"Jail {name!r} was disabled but fail2ban did not come back online. "
                "Check the fail2ban log for additional errors."
            ),
        )

    running_jails = await _get_active_jail_names(socket_path)
    jail_count = len(running_jails)
    log.info("jail_rollback_success", jail=name, active_jails=jail_count)
    return RollbackResponse(
        jail_name=name,
        disabled=True,
        fail2ban_running=True,
        active_jails=jail_count,
        message=(
            f"Jail {name!r} disabled and fail2ban restarted successfully "
            f"with {jail_count} active jail(s)."
        ),
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Filter discovery helpers (Task 2.1)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user