"""Health-check background task.

Registers an APScheduler job that probes the fail2ban socket every 30 seconds
and stores the result on ``app.state.server_status``.  The dashboard endpoint
reads from this cache, keeping HTTP responses fast and the daemon connection
decoupled from user-facing requests.

Crash detection (Task 3)
------------------------
When a jail activation is performed, the router stores a timestamp on
``app.state.last_activation`` (a ``dict`` with ``jail_name`` and ``at``
keys).  If the health probe subsequently detects an online→offline transition
within 60 seconds of that activation, a
:class:`~app.models.config.PendingRecovery` record is written to
``app.state.pending_recovery`` so the UI can offer a one-click rollback.
"""

from __future__ import annotations

import datetime
from typing import TYPE_CHECKING, TypedDict

import structlog

from app.models.config import PendingRecovery
from app.models.server import ServerStatus
from app.services import health_service
from app.utils.runtime_state import (
    RuntimeState,
    get_effective_settings,
    get_runtime_state,
)

if TYPE_CHECKING:  # pragma: no cover
    from fastapi import FastAPI

    from app.config import Settings

# Module-level structlog logger used by the probe and by ``register``.
log: structlog.stdlib.BoundLogger = structlog.get_logger()


class ActivationRecord(TypedDict):
    """Shape of the ``last_activation`` entry read by the health probe.

    The probe reads both keys when it sees an online→offline transition, to
    decide whether the crash should be attributed to a recent jail
    activation.
    """

    # Name of the jail that was activated; copied into the resulting
    # ``PendingRecovery`` record.
    jail_name: str
    # Moment of the activation.  The probe subtracts this from a
    # timezone-aware UTC "now", so it presumably must be timezone-aware as
    # well — TODO confirm against the code that writes this record.
    at: datetime.datetime


#: Seconds between scheduled health probes; used as the interval of the
#: ``health_check`` job added by :func:`register`.
HEALTH_CHECK_INTERVAL: int = 30

#: Maximum age, in seconds, of the most recent jail activation for an
#: online→offline transition to be attributed to that activation.  The
#: comparison is inclusive (``<=``).
_ACTIVATION_CRASH_WINDOW: int = 60


def _resolve_pending_recovery(runtime_state: RuntimeState) -> None:
    """Flag the stored pending-recovery record as recovered, if unresolved."""
    record: PendingRecovery | None = getattr(runtime_state, "pending_recovery", None)
    if record is None or record.recovered:
        # Nothing stored, or it has already been resolved.
        return

    # Store a fresh record that differs only in the ``recovered`` flag.
    runtime_state.pending_recovery = PendingRecovery(
        jail_name=record.jail_name,
        activated_at=record.activated_at,
        detected_at=record.detected_at,
        recovered=True,
    )
    log.info(
        "pending_recovery_resolved",
        jail=record.jail_name,
    )


def _record_activation_crash(
    runtime_state: RuntimeState,
    detected_at: datetime.datetime,
) -> None:
    """Store a pending-recovery record when a crash follows a recent activation."""
    activation: ActivationRecord | None = getattr(runtime_state, "last_activation", None)
    if activation is None:
        return

    started_at: datetime.datetime = activation["at"]
    elapsed = (detected_at - started_at).total_seconds()
    if elapsed > _ACTIVATION_CRASH_WINDOW:
        # The activation is too old to be blamed for this crash.
        return

    jail: str = activation["jail_name"]
    # An unresolved record is never overwritten, whichever jail it names.
    pending: PendingRecovery | None = getattr(runtime_state, "pending_recovery", None)
    if pending is not None and not pending.recovered:
        return

    runtime_state.pending_recovery = PendingRecovery(
        jail_name=jail,
        activated_at=started_at,
        detected_at=detected_at,
    )
    log.warning(
        "activation_crash_detected",
        jail=jail,
        seconds_since_activation=elapsed,
    )


async def _run_probe_with_resources(settings: Settings, runtime_state: RuntimeState) -> None:
    """Run one fail2ban health probe and refresh the cached server status.

    The probe result replaces ``runtime_state.server_status`` and is then
    compared with the previously cached status to detect transitions:

    * offline → online: an unresolved ``pending_recovery`` record, if any, is
      replaced by a copy flagged ``recovered=True``.
    * online → offline: when ``last_activation`` is at most
      ``_ACTIVATION_CRASH_WINDOW`` seconds old and no unresolved record is
      stored, a new :class:`PendingRecovery` is written.

    Args:
        settings: Settings whose ``fail2ban_socket`` is handed to the probe.
        runtime_state: Runtime state object carrying ``server_status``,
            ``last_activation`` and ``pending_recovery``.
    """
    socket_path: str = settings.fail2ban_socket
    # A state object without a cached status is treated as "offline".
    previous: ServerStatus = getattr(
        runtime_state,
        "server_status",
        ServerStatus(online=False),
    )
    latest: ServerStatus = await health_service.probe(socket_path)
    runtime_state.server_status = latest

    # Timestamp taken after the probe returned; used as the detection time.
    probed_at = datetime.datetime.now(tz=datetime.UTC)

    was_online = previous.online
    is_online = latest.online

    if is_online and not was_online:
        log.info("fail2ban_came_online", version=latest.version)
        _resolve_pending_recovery(runtime_state)
    elif was_online and not is_online:
        log.warning("fail2ban_went_offline")
        _record_activation_crash(runtime_state, probed_at)

    log.debug(
        "health_check_complete",
        online=latest.online,
        version=latest.version,
        active_jails=latest.active_jails,
    )


async def _run_probe(app: FastAPI) -> None:
    """Resolve settings and runtime state from *app*, then run one probe.

    Both resources are looked up at call time and forwarded to
    :func:`_run_probe_with_resources`.

    Args:
        app: The application instance the resources are resolved from.
    """
    effective_settings = get_effective_settings(app)
    state = get_runtime_state(app)
    await _run_probe_with_resources(effective_settings, state)


async def run_probe(app: FastAPI) -> None:
    """Run a single health probe outside the scheduled job context.

    Public entry point that simply awaits :func:`_run_probe`, which resolves
    the settings and runtime state from *app* before probing.

    Args:
        app: The application instance to probe on behalf of.
    """
    await _run_probe(app)


def register(app: FastAPI) -> None:
    """Schedule the recurring health probe on the application scheduler.

    Must be called after the scheduler has been started (i.e., inside the
    lifespan handler, after ``scheduler.start()``).

    Besides adding the job, this resets the cached server status to an
    offline placeholder and clears the activation-tracking fields.  The
    settings and runtime state are resolved once, here, and bound to the job
    as keyword arguments.

    Args:
        app: The :class:`fastapi.FastAPI` application instance whose
            ``app.state.scheduler`` will receive the job.
    """
    settings = get_effective_settings(app)
    state = get_runtime_state(app)

    # Seed the cache with an offline placeholder so readers always find a
    # valid status, even before the first probe has completed.
    state.server_status = ServerStatus(online=False)

    # Start with no recorded activation and no pending recovery.
    state.last_activation = None
    state.pending_recovery = None

    scheduler = app.state.scheduler
    job_kwargs = {"settings": settings, "runtime_state": state}
    # Schedule the first run for "now" so the UI isn't dark for a full
    # interval after startup.
    first_run = datetime.datetime.now(tz=datetime.UTC)

    scheduler.add_job(
        _run_probe_with_resources,
        trigger="interval",
        seconds=HEALTH_CHECK_INTERVAL,
        kwargs=job_kwargs,
        id="health_check",
        replace_existing=True,
        next_run_time=first_run,
    )
    log.info(
        "health_check_scheduled",
        interval_seconds=HEALTH_CHECK_INTERVAL,
    )