feat: Task 3 — invalid jail config recovery (pre-validation, crash detection, rollback)

- Backend: extend activate_jail() with pre-validation and 4-attempt post-reload health probe; add validate_jail_config() and rollback_jail() service functions
- Backend: new endpoints POST /api/config/jails/{name}/validate, GET /api/config/pending-recovery, POST /api/config/jails/{name}/rollback
- Backend: extend JailActivationResponse with fail2ban_running + validation_warnings; add JailValidationIssue, JailValidationResult, PendingRecovery, RollbackResponse models
- Backend: health_check task tracks last_activation and creates PendingRecovery record when fail2ban goes offline within 60 s of an activation
- Backend: add fail2ban_start_command setting (configurable start cmd for rollback)
- Frontend: ActivateJailDialog — pre-validation on open, crash-detected callback, extended spinner text during activation+verify
- Frontend: JailsTab — Validate Config button for inactive jails, validation result panels (blocking errors + advisory warnings)
- Frontend: RecoveryBanner component — polls pending-recovery, shows full-width alert with Disable & Restart / View Logs buttons
- Frontend: MainLayout — mount RecoveryBanner at layout level
- Tests: 19 new backend service tests (validate, rollback, filter/action parsing) + 6 health_check crash-detection tests + 11 router tests; 5 RecoveryBanner frontend tests; fix mock setup in existing activate_jail tests
2026-03-14 14:13:07 +01:00
parent ab11ece001
commit 0966f347c4
17 changed files with 1862 additions and 26 deletions
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -60,6 +60,15 @@ class Settings(BaseSettings):
            "Used for listing, viewing, and editing configuration files through the web UI."
        ),
    )
+    fail2ban_start_command: str = Field(
+        default="fail2ban-client start",
+        description=(
+            "Shell command used to start (not reload) the fail2ban daemon during "
+            "recovery rollback.  Split by whitespace to build the argument list — "
+            "no shell interpretation is performed.  "
+            "Example: 'systemctl start fail2ban' or 'fail2ban-client start'."
+        ),
+    )

    model_config = SettingsConfigDict(
        env_prefix="BANGUI_",
--- a/backend/app/models/config.py
+++ b/backend/app/models/config.py
@@ -3,6 +3,8 @@
 Request, response, and domain models for the config router and service.
 """

+import datetime
+
 from pydantic import BaseModel, ConfigDict, Field

 # ---------------------------------------------------------------------------
@@ -860,6 +862,102 @@ class JailActivationResponse(BaseModel):
        description="New activation state: ``True`` after activate, ``False`` after deactivate.",
    )
    message: str = Field(..., description="Human-readable result message.")
+    fail2ban_running: bool = Field(
+        default=True,
+        description=(
+            "Whether the fail2ban daemon is still running after the activation "
+            "and reload.  ``False`` signals that the daemon may have crashed."
+        ),
+    )
+    validation_warnings: list[str] = Field(
+        default_factory=list,
+        description="Non-fatal warnings from the pre-activation validation step.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Jail validation models (Task 3)
+# ---------------------------------------------------------------------------
+
+
+class JailValidationIssue(BaseModel):
+    """A single issue found during pre-activation validation of a jail config."""
+
+    model_config = ConfigDict(strict=True)
+
+    field: str = Field(
+        ...,
+        description="Config field associated with this issue, e.g. 'filter', 'failregex', 'logpath'.",
+    )
+    message: str = Field(..., description="Human-readable description of the issue.")
+
+
+class JailValidationResult(BaseModel):
+    """Result of pre-activation validation of a single jail configuration."""
+
+    model_config = ConfigDict(strict=True)
+
+    jail_name: str = Field(..., description="Name of the validated jail.")
+    valid: bool = Field(..., description="True when no issues were found.")
+    issues: list[JailValidationIssue] = Field(
+        default_factory=list,
+        description="Validation issues found.  Empty when valid=True.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Rollback response model (Task 3)
+# ---------------------------------------------------------------------------
+
+
+class RollbackResponse(BaseModel):
+    """Response for ``POST /api/config/jails/{name}/rollback``."""
+
+    model_config = ConfigDict(strict=True)
+
+    jail_name: str = Field(..., description="Name of the jail that was disabled.")
+    disabled: bool = Field(
+        ...,
+        description="Whether the jail's .local override was successfully written with enabled=false.",
+    )
+    fail2ban_running: bool = Field(
+        ...,
+        description="Whether fail2ban is online after the rollback attempt.",
+    )
+    active_jails: int = Field(
+        default=0,
+        ge=0,
+        description="Number of currently active jails after a successful restart.",
+    )
+    message: str = Field(..., description="Human-readable result message.")
+
+
+# ---------------------------------------------------------------------------
+# Pending recovery model (Task 3)
+# ---------------------------------------------------------------------------
+
+
+class PendingRecovery(BaseModel):
+    """Records a probable activation-caused fail2ban crash pending user action."""
+
+    model_config = ConfigDict(strict=True)
+
+    jail_name: str = Field(
+        ...,
+        description="Name of the jail whose activation likely caused the crash.",
+    )
+    activated_at: datetime.datetime = Field(
+        ...,
+        description="ISO-8601 UTC timestamp of when the jail was activated.",
+    )
+    detected_at: datetime.datetime = Field(
+        ...,
+        description="ISO-8601 UTC timestamp of when the crash was detected.",
+    )
+    recovered: bool = Field(
+        default=False,
+        description="Whether fail2ban has been successfully restarted.",
+    )


 # ---------------------------------------------------------------------------
--- a/backend/app/routers/config.py
+++ b/backend/app/routers/config.py
@@ -9,6 +9,9 @@ global settings, test regex patterns, add log paths, and preview log files.
 * ``GET  /api/config/jails/inactive``               — list all inactive jails
 * ``POST /api/config/jails/{name}/activate``        — activate an inactive jail
 * ``POST /api/config/jails/{name}/deactivate``      — deactivate an active jail
+* ``POST /api/config/jails/{name}/validate``        — validate jail config pre-activation (Task 3)
+* ``POST /api/config/jails/{name}/rollback``        — disable bad jail and restart fail2ban (Task 3)
+* ``GET  /api/config/pending-recovery``             — active crash-recovery record (Task 3)
 * ``POST /api/config/jails/{name}/filter``          — assign a filter to a jail
 * ``POST /api/config/jails/{name}/action``          — add an action to a jail
 * ``DELETE /api/config/jails/{name}/action/{action_name}`` — remove an action from a jail
@@ -34,6 +37,7 @@ global settings, test regex patterns, add log paths, and preview log files.

 from __future__ import annotations

+import datetime
 from typing import Annotated

 from fastapi import APIRouter, HTTPException, Path, Query, Request, status
@@ -60,12 +64,15 @@ from app.models.config import (
    JailConfigListResponse,
    JailConfigResponse,
    JailConfigUpdate,
+    JailValidationResult,
    LogPreviewRequest,
    LogPreviewResponse,
    MapColorThresholdsResponse,
    MapColorThresholdsUpdate,
+    PendingRecovery,
    RegexTestRequest,
    RegexTestResponse,
+    RollbackResponse,
    ServiceStatusResponse,
 )
 from app.services import config_file_service, config_service, jail_service
@@ -611,7 +618,7 @@ async def activate_jail(
    req = body if body is not None else ActivateJailRequest()

    try:
-        return await config_file_service.activate_jail(
+        result = await config_file_service.activate_jail(
            config_dir, socket_path, name, req
        )
    except JailNameError as exc:
@@ -631,6 +638,24 @@ async def activate_jail(
    except Fail2BanConnectionError as exc:
        raise _bad_gateway(exc) from exc

+    # Record this activation so the health-check task can attribute a
+    # subsequent fail2ban crash to it.
+    request.app.state.last_activation = {
+        "jail_name": name,
+        "at": datetime.datetime.now(tz=datetime.UTC),
+    }
+
+    # If fail2ban stopped responding after the reload, create a pending-recovery
+    # record immediately (before the background health task notices).
+    if not result.fail2ban_running:
+        request.app.state.pending_recovery = PendingRecovery(
+            jail_name=name,
+            activated_at=request.app.state.last_activation["at"],
+            detected_at=datetime.datetime.now(tz=datetime.UTC),
+        )
+
+    return result
+

@router.post(
    "/jails/{name}/deactivate",
@@ -684,6 +709,125 @@ async def deactivate_jail(
        raise _bad_gateway(exc) from exc


+# ---------------------------------------------------------------------------
+# Jail validation & rollback endpoints (Task 3)
+# ---------------------------------------------------------------------------
+
+
+@router.post(
+    "/jails/{name}/validate",
+    response_model=JailValidationResult,
+    summary="Validate jail configuration before activation",
+)
+async def validate_jail(
+    request: Request,
+    _auth: AuthDep,
+    name: _NamePath,
+) -> JailValidationResult:
+    """Run pre-activation validation checks on a jail configuration.
+
+    Validates filter and action file existence, regex pattern compilation, and
+    log path existence without modifying any files or reloading fail2ban.
+
+    Args:
+        request: FastAPI request object.
+        _auth: Validated session.
+        name: Jail name to validate.
+
+    Returns:
+        :class:`~app.models.config.JailValidationResult`; an unknown jail yields
+        ``valid=False`` with a ``name`` issue rather than an HTTP error.
+
+    Raises:
+        HTTPException: 400 if *name* contains invalid characters.
+    """
+    config_dir: str = request.app.state.settings.fail2ban_config_dir
+    try:
+        return await config_file_service.validate_jail_config(config_dir, name)
+    except JailNameError as exc:
+        raise _bad_request(str(exc)) from exc
+
+
+@router.get(
+    "/pending-recovery",
+    response_model=PendingRecovery | None,
+    summary="Return active crash-recovery record if one exists",
+)
+async def get_pending_recovery(
+    request: Request,
+    _auth: AuthDep,
+) -> PendingRecovery | None:
+    """Return the current :class:`~app.models.config.PendingRecovery` record.
+
+    A non-null response means fail2ban crashed shortly after a jail activation.
+    The stored record is returned as-is, including one the health check has
+    already flagged ``recovered=True``; ``null`` (HTTP 200) means none is stored.
+
+    Args:
+        request: FastAPI request object.
+        _auth: Validated session.
+
+    Returns:
+        :class:`~app.models.config.PendingRecovery` or ``None``.
+    """
+    return getattr(request.app.state, "pending_recovery", None)
+
+
+@router.post(
+    "/jails/{name}/rollback",
+    response_model=RollbackResponse,
+    summary="Disable a bad jail config and restart fail2ban",
+)
+async def rollback_jail(
+    request: Request,
+    _auth: AuthDep,
+    name: _NamePath,
+) -> RollbackResponse:
+    """Disable the specified jail and attempt to restart fail2ban.
+
+    Writes ``enabled = false`` to ``jail.d/{name}.local`` (works even when
+    fail2ban is down — no socket is needed), then runs the configured start
+    command and waits up to ten seconds for the daemon to come back online.
+
+    On success, clears the :class:`~app.models.config.PendingRecovery` record.
+
+    Args:
+        request: FastAPI request object.
+        _auth: Validated session.
+        name: Jail name to disable and roll back.
+
+    Returns:
+        :class:`~app.models.config.RollbackResponse`.
+
+    Raises:
+        HTTPException: 400 if *name* contains invalid characters.
+        HTTPException: 500 if writing the .local override file fails.
+    """
+    config_dir: str = request.app.state.settings.fail2ban_config_dir
+    socket_path: str = request.app.state.settings.fail2ban_socket
+    start_cmd: str = request.app.state.settings.fail2ban_start_command
+    start_cmd_parts: list[str] = start_cmd.split()
+
+    try:
+        result = await config_file_service.rollback_jail(
+            config_dir, socket_path, name, start_cmd_parts
+        )
+    except JailNameError as exc:
+        raise _bad_request(str(exc)) from exc
+    except ConfigWriteError as exc:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to write config override: {exc}",
+        ) from exc
+
+    # Clear pending recovery if fail2ban came back online.
+    if result.fail2ban_running:
+        request.app.state.pending_recovery = None
+        request.app.state.last_activation = None
+
+    return result
+
+
 # ---------------------------------------------------------------------------
 # Filter discovery endpoints (Task 2.1)
 # ---------------------------------------------------------------------------
--- a/backend/app/services/config_file_service.py
+++ b/backend/app/services/config_file_service.py
@@ -50,6 +50,9 @@ from app.models.config import (
    InactiveJail,
    InactiveJailListResponse,
    JailActivationResponse,
+    JailValidationIssue,
+    JailValidationResult,
+    RollbackResponse,
 )
 from app.services import conffile_parser, jail_service
 from app.utils.fail2ban_client import Fail2BanClient, Fail2BanConnectionError
@@ -560,6 +563,242 @@ async def _get_active_jail_names(socket_path: str) -> set[str]:
        return set()


+# ---------------------------------------------------------------------------
+# Validation helpers (Task 3)
+# ---------------------------------------------------------------------------
+
+# Seconds to wait between fail2ban liveness probes after a reload.
+_POST_RELOAD_PROBE_INTERVAL: float = 2.0
+
+# Maximum number of post-reload probe attempts (initial attempt + retries).
+_POST_RELOAD_MAX_ATTEMPTS: int = 4
+
+
+def _extract_action_base_name(action_str: str) -> str | None:
+    """Return the base action name from an action assignment string.
+
+    Returns ``None`` for complex fail2ban expressions that cannot be resolved
+    to a single filename (e.g. ``%(action_)s`` interpolations or multi-token
+    composite actions).
+
+    Args:
+        action_str: A single line from the jail's ``action`` setting.
+
+    Returns:
+        Simple base name suitable for a filesystem lookup, or ``None``.
+    """
+    if "%" in action_str or "$" in action_str:
+        return None
+    base = action_str.split("[")[0].strip()
+    if _SAFE_ACTION_NAME_RE.match(base):
+        return base
+    return None
+
+
+def _validate_jail_config_sync(
+    config_dir: Path,
+    name: str,
+) -> JailValidationResult:
+    """Run synchronous pre-activation checks on a jail configuration.
+
+    Validates:
+    1. Filter file existence in ``filter.d/``.
+    2. Action file existence in ``action.d/`` (for resolvable action names).
+    3. Regex compilation for every ``failregex`` and ``ignoreregex`` pattern.
+    4. Log path existence on disk (recorded as an issue, so ``valid`` is False).
+
+    Args:
+        config_dir: The fail2ban configuration root directory.
+        name: Validated jail name.
+
+    Returns:
+        :class:`~app.models.config.JailValidationResult` with any issues found.
+    """
+    issues: list[JailValidationIssue] = []
+
+    all_jails, _ = _parse_jails_sync(config_dir)
+    settings = all_jails.get(name)
+
+    if settings is None:
+        return JailValidationResult(
+            jail_name=name,
+            valid=False,
+            issues=[
+                JailValidationIssue(
+                    field="name",
+                    message=f"Jail {name!r} not found in config files.",
+                )
+            ],
+        )
+
+    filter_d = config_dir / "filter.d"
+    action_d = config_dir / "action.d"
+
+    # 1. Filter existence check.
+    raw_filter = settings.get("filter", "")
+    if raw_filter:
+        mode = settings.get("mode", "normal")
+        resolved = _resolve_filter(raw_filter, name, mode)
+        base_filter = _extract_filter_base_name(resolved)
+        if base_filter:
+            conf_ok = (filter_d / f"{base_filter}.conf").is_file()
+            local_ok = (filter_d / f"{base_filter}.local").is_file()
+            if not conf_ok and not local_ok:
+                issues.append(
+                    JailValidationIssue(
+                        field="filter",
+                        message=(
+                            f"Filter file not found: filter.d/{base_filter}.conf"
+                            " (or .local)"
+                        ),
+                    )
+                )
+
+    # 2. Action existence check.
+    raw_action = settings.get("action", "")
+    if raw_action:
+        for action_line in _parse_multiline(raw_action):
+            action_name = _extract_action_base_name(action_line)
+            if action_name:
+                conf_ok = (action_d / f"{action_name}.conf").is_file()
+                local_ok = (action_d / f"{action_name}.local").is_file()
+                if not conf_ok and not local_ok:
+                    issues.append(
+                        JailValidationIssue(
+                            field="action",
+                            message=(
+                                f"Action file not found: action.d/{action_name}.conf"
+                                " (or .local)"
+                            ),
+                        )
+                    )
+
+    # 3. failregex compilation.
+    for pattern in _parse_multiline(settings.get("failregex", "")):
+        try:
+            re.compile(pattern)
+        except re.error as exc:
+            issues.append(
+                JailValidationIssue(
+                    field="failregex",
+                    message=f"Invalid regex pattern: {exc}",
+                )
+            )
+
+    # 3 (cont.). ignoreregex compilation.
+    for pattern in _parse_multiline(settings.get("ignoreregex", "")):
+        try:
+            re.compile(pattern)
+        except re.error as exc:
+            issues.append(
+                JailValidationIssue(
+                    field="ignoreregex",
+                    message=f"Invalid regex pattern: {exc}",
+                )
+            )
+
+    # 4. Log path existence (paths may appear at runtime) — still clears ``valid``.
+    raw_logpath = settings.get("logpath", "")
+    if raw_logpath:
+        for log_path in _parse_multiline(raw_logpath):
+            # Skip glob patterns and fail2ban variable references.
+            if "*" in log_path or "?" in log_path or "%(" in log_path:
+                continue
+            if not Path(log_path).exists():
+                issues.append(
+                    JailValidationIssue(
+                        field="logpath",
+                        message=f"Log file not found on disk: {log_path}",
+                    )
+                )
+
+    valid = len(issues) == 0
+    log.debug(
+        "jail_validation_complete",
+        jail=name,
+        valid=valid,
+        issue_count=len(issues),
+    )
+    return JailValidationResult(jail_name=name, valid=valid, issues=issues)
+
+
+async def _probe_fail2ban_running(socket_path: str) -> bool:
+    """Return ``True`` if the fail2ban socket responds to a ping.
+
+    Args:
+        socket_path: Path to the fail2ban Unix domain socket.
+
+    Returns:
+        ``True`` when fail2ban is reachable, ``False`` otherwise.
+    """
+    try:
+        client = Fail2BanClient(socket_path=socket_path, timeout=5.0)
+        resp = await client.send(["ping"])
+        return isinstance(resp, (list, tuple)) and resp[0] == 0
+    except Exception:  # noqa: BLE001
+        return False
+
+
+async def _wait_for_fail2ban(
+    socket_path: str,
+    max_wait_seconds: float = 10.0,
+    poll_interval: float = 2.0,
+) -> bool:
+    """Poll the fail2ban socket until it responds or the time budget is spent.
+
+    Args:
+        socket_path: Path to the fail2ban Unix domain socket.
+        max_wait_seconds: Total budget in seconds; slow probes count against it.
+        poll_interval: Delay between probe attempts in seconds.
+
+    Returns:
+        ``True`` if fail2ban came online within the budget.
+    """
+    t0, elapsed = asyncio.get_running_loop().time(), 0.0
+    while elapsed < max_wait_seconds:
+        if await _probe_fail2ban_running(socket_path):
+            return True
+        await asyncio.sleep(poll_interval)
+        elapsed = max(asyncio.get_running_loop().time() - t0, elapsed + poll_interval)
+    return False
+
+
+async def _start_daemon(start_cmd_parts: list[str]) -> bool:
+    """Start the fail2ban daemon using *start_cmd_parts*.
+
+    Runs the command via :func:`asyncio.create_subprocess_exec` (no shell),
+    discards its output, and kills it if it has not exited after 30 seconds.
+
+    Args:
+        start_cmd_parts: Command and arguments, e.g.
+            ``["fail2ban-client", "start"]``.
+
+    Returns:
+        ``True`` when the process exited with code 0.
+    """
+    if not start_cmd_parts:
+        log.warning("fail2ban_start_cmd_empty")
+        return False
+    proc = None
+    try:
+        # Output is never read; DEVNULL avoids blocking on a full pipe buffer.
+        proc = await asyncio.create_subprocess_exec(
+            *start_cmd_parts,
+            stdout=asyncio.subprocess.DEVNULL,
+            stderr=asyncio.subprocess.DEVNULL,
+        )
+        await asyncio.wait_for(proc.wait(), timeout=30.0)
+    except (TimeoutError, OSError) as exc:
+        if proc is not None and proc.returncode is None:
+            proc.kill()  # timed out: do not leave the start command running
+        log.warning("fail2ban_start_cmd_error", cmd=start_cmd_parts, error=str(exc))
+        return False
+    rc = proc.returncode
+    if rc != 0:
+        log.warning("fail2ban_start_cmd_nonzero", cmd=start_cmd_parts, returncode=rc)
+    return rc == 0
+
+
 def _write_local_override_sync(
    config_dir: Path,
    jail_name: str,
@@ -846,9 +1085,10 @@ async def activate_jail(
 ) -> JailActivationResponse:
    """Enable an inactive jail and reload fail2ban.

-    Writes ``enabled = true`` (plus any override values from *req*) to
-    ``jail.d/{name}.local`` and then triggers a full fail2ban reload so the
-    jail starts immediately.
+    Performs pre-activation validation, writes ``enabled = true`` (plus any
+    override values from *req*) to ``jail.d/{name}.local``, and triggers a
+    full fail2ban reload.  After the reload a multi-attempt health probe
+    determines whether fail2ban (and the specific jail) are still running.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
@@ -857,7 +1097,8 @@ async def activate_jail(
        req: Optional override values to write alongside ``enabled = true``.

    Returns:
-        :class:`~app.models.config.JailActivationResponse`.
+        :class:`~app.models.config.JailActivationResponse` including
+        ``fail2ban_running`` and ``validation_warnings`` fields.

    Raises:
        JailNameError: If *name* contains invalid characters.
@@ -881,6 +1122,20 @@ async def activate_jail(
    if name in active_names:
        raise JailAlreadyActiveError(name)

+    # ---------------------------------------------------------------------- #
+    # Pre-activation validation — collect warnings but do not block           #
+    # ---------------------------------------------------------------------- #
+    validation_result: JailValidationResult = await loop.run_in_executor(
+        None, _validate_jail_config_sync, Path(config_dir), name
+    )
+    warnings: list[str] = [f"{i.field}: {i.message}" for i in validation_result.issues]
+    if warnings:
+        log.warning(
+            "jail_activation_validation_warnings",
+            jail=name,
+            warnings=warnings,
+        )
+
    overrides: dict[str, Any] = {
        "bantime": req.bantime,
        "findtime": req.findtime,
@@ -903,9 +1158,35 @@ async def activate_jail(
    except Exception as exc:  # noqa: BLE001
        log.warning("reload_after_activate_failed", jail=name, error=str(exc))

-    # Verify the jail actually started after the reload.  A config error
-    # (bad regex, missing log file, etc.) may silently prevent fail2ban from
-    # starting the jail even though the reload command succeeded.
+    # ---------------------------------------------------------------------- #
+    # Post-reload health probe with retries                                   #
+    # ---------------------------------------------------------------------- #
+    fail2ban_running = False
+    for attempt in range(_POST_RELOAD_MAX_ATTEMPTS):
+        if attempt > 0:
+            await asyncio.sleep(_POST_RELOAD_PROBE_INTERVAL)
+        if await _probe_fail2ban_running(socket_path):
+            fail2ban_running = True
+            break
+
+    if not fail2ban_running:
+        log.warning(
+            "fail2ban_down_after_activate",
+            jail=name,
+            message="fail2ban socket unreachable after reload — daemon may have crashed.",
+        )
+        return JailActivationResponse(
+            name=name,
+            active=False,
+            fail2ban_running=False,
+            validation_warnings=warnings,
+            message=(
+                f"Jail {name!r} was written to config but fail2ban stopped "
+                "responding after reload.  The jail configuration may be invalid."
+            ),
+        )
+
+    # Verify the jail actually started (config error may prevent it silently).
    post_reload_names = await _get_active_jail_names(socket_path)
    actually_running = name in post_reload_names
    if not actually_running:
@@ -917,6 +1198,8 @@ async def activate_jail(
        return JailActivationResponse(
            name=name,
            active=False,
+            fail2ban_running=True,
+            validation_warnings=warnings,
            message=(
                f"Jail {name!r} was written to config but did not start after "
                "reload — check the jail configuration (filters, log paths, regex)."
@@ -927,6 +1210,8 @@ async def activate_jail(
    return JailActivationResponse(
        name=name,
        active=True,
+        fail2ban_running=True,
+        validation_warnings=warnings,
        message=f"Jail {name!r} activated successfully.",
    )

@@ -994,6 +1279,117 @@ async def deactivate_jail(
    )


+async def validate_jail_config(
+    config_dir: str,
+    name: str,
+) -> JailValidationResult:
+    """Run pre-activation validation checks on a jail configuration.
+
+    Validates that referenced filter and action files exist in ``filter.d/``
+    and ``action.d/``, that all regex patterns compile, and that declared log
+    paths exist on disk.
+
+    Args:
+        config_dir: Absolute path to the fail2ban configuration directory.
+        name: Name of the jail to validate.
+
+    Returns:
+        :class:`~app.models.config.JailValidationResult` with any issues found.
+
+    Raises:
+        JailNameError: If *name* contains invalid characters.
+    """
+    _safe_jail_name(name)
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(
+        None,
+        _validate_jail_config_sync,
+        Path(config_dir),
+        name,
+    )
+
+
+async def rollback_jail(
+    config_dir: str,
+    socket_path: str,
+    name: str,
+    start_cmd_parts: list[str],
+) -> RollbackResponse:
+    """Disable a bad jail config and restart the fail2ban daemon.
+
+    Writes ``enabled = false`` to ``jail.d/{name}.local`` (works even when
+    fail2ban is down — only a file write), then attempts to start the daemon
+    with *start_cmd_parts*.  Waits up to 10 seconds for the socket to respond.
+
+    Args:
+        config_dir: Absolute path to the fail2ban configuration directory.
+        socket_path: Path to the fail2ban Unix domain socket.
+        name: Name of the jail to disable.
+        start_cmd_parts: Argument list for the daemon start command, e.g.
+            ``["fail2ban-client", "start"]``.
+
+    Returns:
+        :class:`~app.models.config.RollbackResponse`.
+
+    Raises:
+        JailNameError: If *name* contains invalid characters.
+        ConfigWriteError: If writing the ``.local`` file fails.
+    """
+    _safe_jail_name(name)
+
+    loop = asyncio.get_event_loop()
+
+    # Write enabled=false — this must succeed even when fail2ban is down.
+    await loop.run_in_executor(
+        None,
+        _write_local_override_sync,
+        Path(config_dir),
+        name,
+        False,
+        {},
+    )
+    log.info("jail_rolled_back_disabled", jail=name)
+
+    # Attempt to start the daemon.
+    started = await _start_daemon(start_cmd_parts)
+    log.info("jail_rollback_start_attempted", jail=name, start_ok=started)
+
+    # Wait for the socket to come back.
+    fail2ban_running = await _wait_for_fail2ban(
+        socket_path, max_wait_seconds=10.0, poll_interval=2.0
+    )
+
+    active_jails = 0
+    if fail2ban_running:
+        names = await _get_active_jail_names(socket_path)
+        active_jails = len(names)
+
+    if fail2ban_running:
+        log.info("jail_rollback_success", jail=name, active_jails=active_jails)
+        return RollbackResponse(
+            jail_name=name,
+            disabled=True,
+            fail2ban_running=True,
+            active_jails=active_jails,
+            message=(
+                f"Jail {name!r} disabled and fail2ban restarted successfully "
+                f"with {active_jails} active jail(s)."
+            ),
+        )
+
+    log.warning("jail_rollback_fail2ban_still_down", jail=name)
+    return RollbackResponse(
+        jail_name=name,
+        disabled=True,
+        fail2ban_running=False,
+        active_jails=0,
+        message=(
+            f"Jail {name!r} was disabled but fail2ban did not come back online. "
+            "Check the fail2ban log for additional errors."
+        ),
+    )
+
+
 # ---------------------------------------------------------------------------
 # Filter discovery helpers (Task 2.1)
 # ---------------------------------------------------------------------------
--- a/backend/app/tasks/health_check.py
+++ b/backend/app/tasks/health_check.py
@@ -4,14 +4,25 @@ Registers an APScheduler job that probes the fail2ban socket every 30 seconds
 and stores the result on ``app.state.server_status``.  The dashboard endpoint
 reads from this cache, keeping HTTP responses fast and the daemon connection
 decoupled from user-facing requests.
+
+Crash detection (Task 3)
+------------------------
+When a jail activation is performed, the router stores a timestamp on
+``app.state.last_activation`` (a ``dict`` with ``jail_name`` and ``at``
+keys).  If the health probe subsequently detects an online→offline transition
+within 60 seconds of that activation, a
+:class:`~app.models.config.PendingRecovery` record is written to
+``app.state.pending_recovery`` so the UI can offer a one-click rollback.
 """

 from __future__ import annotations

+import datetime
 from typing import TYPE_CHECKING, Any

 import structlog

+from app.models.config import PendingRecovery
 from app.models.server import ServerStatus
 from app.services import health_service

@@ -23,10 +34,19 @@ log: structlog.stdlib.BoundLogger = structlog.get_logger()
 #: How often the probe fires (seconds).
 HEALTH_CHECK_INTERVAL: int = 30

+#: Maximum seconds since an activation for a subsequent crash to be attributed
+#: to that activation.
+_ACTIVATION_CRASH_WINDOW: int = 60
+

 async def _run_probe(app: Any) -> None:
    """Probe fail2ban and cache the result on *app.state*.

+    Detects online/offline state transitions.  When fail2ban goes offline
+    within :data:`_ACTIVATION_CRASH_WINDOW` seconds of the last jail
+    activation, writes a :class:`~app.models.config.PendingRecovery` record to
+    ``app.state.pending_recovery``.
+
    This is the APScheduler job callback.  It reads ``fail2ban_socket`` from
    ``app.state.settings``, runs the health probe, and writes the result to
    ``app.state.server_status``.
@@ -42,11 +62,54 @@ async def _run_probe(app: Any) -> None:
    status: ServerStatus = await health_service.probe(socket_path)
    app.state.server_status = status

+    now = datetime.datetime.now(tz=datetime.UTC)
+
    # Log transitions between online and offline states.
    if status.online and not prev_status.online:
        log.info("fail2ban_came_online", version=status.version)
+        # Clear any pending recovery once fail2ban is back online.
+        existing: PendingRecovery | None = getattr(
+            app.state, "pending_recovery", None
+        )
+        if existing is not None and not existing.recovered:
+            app.state.pending_recovery = PendingRecovery(
+                jail_name=existing.jail_name,
+                activated_at=existing.activated_at,
+                detected_at=existing.detected_at,
+                recovered=True,
+            )
+            log.info(
+                "pending_recovery_resolved",
+                jail=existing.jail_name,
+            )
+
    elif not status.online and prev_status.online:
        log.warning("fail2ban_went_offline")
+        # Check whether this crash happened shortly after a jail activation.
+        last_activation: dict[str, Any] | None = getattr(
+            app.state, "last_activation", None
+        )
+        if last_activation is not None:
+            activated_at: datetime.datetime = last_activation["at"]
+            seconds_since = (now - activated_at).total_seconds()
+            if seconds_since <= _ACTIVATION_CRASH_WINDOW:
+                jail_name: str = last_activation["jail_name"]
+                # Only create a new record when there is not already an
+                # unresolved one for the same jail.
+                current: PendingRecovery | None = getattr(
+                    app.state, "pending_recovery", None
+                )
+                if current is None or current.recovered:
+                    app.state.pending_recovery = PendingRecovery(
+                        jail_name=jail_name,
+                        activated_at=activated_at,
+                        detected_at=now,
+                    )
+                    log.warning(
+                        "activation_crash_detected",
+                        jail=jail_name,
+                        seconds_since_activation=seconds_since,
+                    )

    log.debug(
        "health_check_complete",
@@ -71,6 +134,10 @@ def register(app: FastAPI) -> None:
    # first probe fires.
    app.state.server_status = ServerStatus(online=False)

+    # Initialise activation tracking state.
+    app.state.last_activation = None
+    app.state.pending_recovery = None
+
    app.state.scheduler.add_job(
        _run_probe,
        trigger="interval",