# BanGUI/backend/app/tasks/health_check.py

"""Health-check background task.

Registers an APScheduler job that probes the fail2ban socket every 30 seconds
and stores the result on ``app.state.server_status``.  The dashboard endpoint
reads from this cache, keeping HTTP responses fast and the daemon connection
decoupled from user-facing requests.

Crash detection (Task 3)
------------------------
When a jail activation is performed, the router stores a timestamp on
``app.state.last_activation`` (a ``dict`` with ``jail_name`` and ``at``
keys).  If the health probe subsequently detects an online→offline transition
within 60 seconds of that activation, a
:class:`~app.models.config.PendingRecovery` record is written to
``app.state.pending_recovery`` so the UI can offer a one-click rollback.
"""

from __future__ import annotations

import datetime
from typing import TYPE_CHECKING, Any

import structlog

from app.models.config import PendingRecovery
from app.models.server import ServerStatus
from app.services import health_service

if TYPE_CHECKING:  # pragma: no cover
    from fastapi import FastAPI

#: Module-level structured logger used by the probe callback and by
#: :func:`register`.
log: structlog.stdlib.BoundLogger = structlog.get_logger()

#: How often the probe fires (seconds).  Passed to the scheduler as the
#: ``seconds`` argument of the interval trigger in :func:`register`.
HEALTH_CHECK_INTERVAL: int = 30

#: Maximum seconds since an activation for a subsequent crash to be attributed
#: to that activation.  The comparison in the probe is inclusive (``<=``).
_ACTIVATION_CRASH_WINDOW: int = 60


def _resolve_pending_recovery(state: Any) -> None:
    """Mark an outstanding recovery record on *state* as recovered.

    Does nothing when ``state.pending_recovery`` is absent, ``None``, or
    already flagged as recovered.  Otherwise the record is replaced by a copy
    with ``recovered=True`` and the resolution is logged.

    Args:
        state: The ``app.state`` object holding ``pending_recovery``.
    """
    pending: PendingRecovery | None = getattr(state, "pending_recovery", None)
    if pending is None or pending.recovered:
        return

    state.pending_recovery = PendingRecovery(
        jail_name=pending.jail_name,
        activated_at=pending.activated_at,
        detected_at=pending.detected_at,
        recovered=True,
    )
    log.info("pending_recovery_resolved", jail=pending.jail_name)


def _flag_activation_crash(state: Any, now: datetime.datetime) -> None:
    """Record a pending recovery if the crash closely follows an activation.

    A new :class:`~app.models.config.PendingRecovery` is written only when
    ``state.last_activation`` is set, its ``at`` timestamp lies within
    :data:`_ACTIVATION_CRASH_WINDOW` seconds of *now*, and there is no
    unresolved record already stored on ``state.pending_recovery``.

    Args:
        state: The ``app.state`` object holding ``last_activation`` and
            ``pending_recovery``.
        now: The moment the offline transition was observed.
    """
    activation: dict[str, Any] | None = getattr(state, "last_activation", None)
    if activation is None:
        return

    activated_at: datetime.datetime = activation["at"]
    elapsed = (now - activated_at).total_seconds()
    within_window = elapsed <= _ACTIVATION_CRASH_WINDOW
    if not within_window:
        return

    jail: str = activation["jail_name"]
    # An unresolved record (for whichever jail) is left untouched; only a
    # missing or already-recovered record is replaced.
    outstanding: PendingRecovery | None = getattr(
        state, "pending_recovery", None
    )
    if outstanding is not None and not outstanding.recovered:
        return

    state.pending_recovery = PendingRecovery(
        jail_name=jail,
        activated_at=activated_at,
        detected_at=now,
    )
    log.warning(
        "activation_crash_detected",
        jail=jail,
        seconds_since_activation=elapsed,
    )


async def _run_probe(app: Any) -> None:
    """Run one health probe and cache its outcome on ``app.state``.

    APScheduler job callback.  The socket path is taken from
    ``app.state.settings.fail2ban_socket``; the probe result replaces
    ``app.state.server_status``.

    The previous and new status are compared to detect transitions:

    * offline → online: logs the event and marks any unresolved
      :class:`~app.models.config.PendingRecovery` as recovered.
    * online → offline: logs the event and, when the last jail activation
      happened no more than :data:`_ACTIVATION_CRASH_WINDOW` seconds ago,
      stores a new pending-recovery record.

    Args:
        app: The :class:`fastapi.FastAPI` application instance passed by the
            scheduler via the ``kwargs`` mechanism.
    """
    state = app.state
    socket_path: str = state.settings.fail2ban_socket
    # Fall back to an offline placeholder if no status has been cached yet.
    previous: ServerStatus = getattr(
        state, "server_status", ServerStatus(online=False)
    )
    current: ServerStatus = await health_service.probe(socket_path)
    state.server_status = current

    now = datetime.datetime.now(tz=datetime.UTC)

    is_online, was_online = current.online, previous.online
    if is_online and not was_online:
        log.info("fail2ban_came_online", version=current.version)
        _resolve_pending_recovery(state)
    elif was_online and not is_online:
        log.warning("fail2ban_went_offline")
        _flag_activation_crash(state, now)

    log.debug(
        "health_check_complete",
        online=current.online,
        version=current.version,
        active_jails=current.active_jails,
    )


def register(app: FastAPI) -> None:
    """Add the health-check job to the application scheduler.

    Must be called after the scheduler has been started (i.e., inside the
    lifespan handler, after ``scheduler.start()``).

    Besides scheduling the job, this resets three attributes on
    ``app.state``: ``server_status`` (to an offline placeholder),
    ``last_activation`` and ``pending_recovery`` (both to ``None``).  Calling
    it again therefore discards any previously cached status or recovery
    record; the job itself is replaced rather than duplicated because it is
    registered under a fixed id with ``replace_existing=True``.

    Args:
        app: The :class:`fastapi.FastAPI` application instance whose
            ``app.state.scheduler`` will receive the job.
    """
    # Initialise the cache with an offline placeholder so the dashboard
    # endpoint is always able to return a valid response even before the
    # first probe fires.
    app.state.server_status = ServerStatus(online=False)

    # Initialise activation tracking state.
    app.state.last_activation = None
    app.state.pending_recovery = None

    app.state.scheduler.add_job(
        _run_probe,
        trigger="interval",
        seconds=HEALTH_CHECK_INTERVAL,
        kwargs={"app": app},
        id="health_check",
        replace_existing=True,
        # Fire immediately on startup too, so the UI isn't dark for 30 s.
        # Uses the module-level ``datetime`` import, matching ``_run_probe``.
        next_run_time=datetime.datetime.now(tz=datetime.UTC),
    )
    log.info(
        "health_check_scheduled",
        interval_seconds=HEALTH_CHECK_INTERVAL,
    )