Extract health-check crash-detection logic into runtime state helper

This commit is contained in:
2026-04-17 16:58:24 +02:00
parent 1e2850a34e
commit 7a1cb0c46c
5 changed files with 122 additions and 69 deletions

View File

@@ -18,17 +18,17 @@ within 60 seconds of that activation, a
from __future__ import annotations
import datetime
from typing import TYPE_CHECKING, TypedDict
from typing import TYPE_CHECKING
import structlog
from app.models.config import PendingRecovery
from app.models.server import ServerStatus
from app.services import health_service
from app.utils.runtime_state import (
RuntimeState,
get_effective_settings,
get_runtime_state,
process_health_probe_result,
)
if TYPE_CHECKING: # pragma: no cover
@@ -39,20 +39,9 @@ if TYPE_CHECKING: # pragma: no cover
log: structlog.stdlib.BoundLogger = structlog.get_logger()
class ActivationRecord(TypedDict):
    """Stored timestamp data for a jail activation event.

    Keys:
        jail_name: Name of the jail that was activated.
        at: Moment at which the activation took place.
    """

    # Name of the jail that was activated.
    jail_name: str
    # Activation timestamp.
    # NOTE(review): the probe subtracts this from a UTC-aware "now", so it is
    # presumably expected to be timezone-aware -- confirm at the writer.
    at: datetime.datetime
#: How often the probe fires, expressed in seconds.
HEALTH_CHECK_INTERVAL: int = 30
#: Maximum number of seconds that may have elapsed since a jail activation for
#: a subsequent crash to be attributed to that activation.  The probe compares
#: the elapsed time with ``<=``, so a crash detected exactly this many seconds
#: after the activation still counts.
_ACTIVATION_CRASH_WINDOW: int = 60
async def _run_probe_with_resources(settings: Settings, runtime_state: RuntimeState) -> None:
"""Probe fail2ban and cache the result on the runtime state.
@@ -68,57 +57,7 @@ async def _run_probe_with_resources(settings: Settings, runtime_state: RuntimeSt
ServerStatus(online=False),
)
status: ServerStatus = await health_service.probe(socket_path)
runtime_state.server_status = status
now = datetime.datetime.now(tz=datetime.UTC)
# Log transitions between online and offline states.
if status.online and not prev_status.online:
log.info("fail2ban_came_online", version=status.version)
# Clear any pending recovery once fail2ban is back online.
existing: PendingRecovery | None = getattr(runtime_state, "pending_recovery", None)
if existing is not None and not existing.recovered:
runtime_state.pending_recovery = PendingRecovery(
jail_name=existing.jail_name,
activated_at=existing.activated_at,
detected_at=existing.detected_at,
recovered=True,
)
log.info(
"pending_recovery_resolved",
jail=existing.jail_name,
)
elif not status.online and prev_status.online:
log.warning("fail2ban_went_offline")
# Check whether this crash happened shortly after a jail activation.
last_activation: ActivationRecord | None = getattr(runtime_state, "last_activation", None)
if last_activation is not None:
activated_at: datetime.datetime = last_activation["at"]
seconds_since = (now - activated_at).total_seconds()
if seconds_since <= _ACTIVATION_CRASH_WINDOW:
jail_name: str = last_activation["jail_name"]
# Only create a new record when there is not already an
# unresolved one for the same jail.
current: PendingRecovery | None = getattr(runtime_state, "pending_recovery", None)
if current is None or current.recovered:
runtime_state.pending_recovery = PendingRecovery(
jail_name=jail_name,
activated_at=activated_at,
detected_at=now,
)
log.warning(
"activation_crash_detected",
jail=jail_name,
seconds_since_activation=seconds_since,
)
log.debug(
"health_check_complete",
online=status.online,
version=status.version,
active_jails=status.active_jails,
)
process_health_probe_result(runtime_state, status)
async def _run_probe(app: FastAPI) -> None: