"""Health check router. Two distinct probes following Kubernetes conventions: * ``GET /api/v1/health/live`` — **Liveness** — checks that the Python process is alive and the event loop is responsive. Always returns 200; a non-2xx answer tells Kubernetes to *restart* the container. * ``GET /api/v1/health/ready`` — **Readiness** — checks that all critical sub-systems (database, fail2ban socket, config directory, scheduler) are reachable. Returns 200 only when all pass; returns 503 with a JSON body listing every failed check otherwise. A non-2xx answer tells Kubernetes to *stop routing traffic* to the pod until it recovers. The combined ``GET /api/v1/health`` endpoint is retained for backward compatibility with existing Docker HEALTHCHECK definitions. """ from __future__ import annotations import asyncio import os from typing import TYPE_CHECKING, Literal import structlog from fastapi import APIRouter, status from fastapi.responses import JSONResponse from app.dependencies import AppStateDep, ServerStatusDep from app.models.response import ComponentHealth, HealthResponse, ReadyCheck, ReadyResponse if TYPE_CHECKING: from collections.abc import Coroutine router: APIRouter = APIRouter(prefix="/api/v1/health", tags=["Health"]) log: structlog.stdlib.BoundLogger = structlog.get_logger() @router.get( "", summary="Application health check", response_model=HealthResponse, responses={ 200: {"description": "All components healthy"}, 503: {"description": "fail2ban offline or component degraded"}, }, ) async def health_check( app_state: AppStateDep, server_status: ServerStatusDep, ) -> JSONResponse: """Return application and component status. Performs lightweight checks on key application components and returns HTTP 200 if all healthy, HTTP 503 if fail2ban is offline. Docker/orchestration health checks interpret 503 as unhealthy and restart the container if fail2ban remains unreachable. Args: app_state: Injected application state containing runtime components. server_status: Injected cached server status snapshot. Returns: HTTP 200 with :class:`~app.models.response.HealthResponse` when healthy, HTTP 503 with :class:`~app.models.response.HealthResponse` when fail2ban is offline. """ components: list[ComponentHealth] = [] # --- Database check --- db_healthy: bool = True try: from app.config import Settings from app.db import open_db effective_settings: Settings = ( app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings ) test_db = await open_db(effective_settings.database_path) await test_db.close() except Exception as exc: # pragma: no cover - defensive, all paths logged log.warning("health_check_db_failed", error=str(exc)) db_healthy = False components.append( ComponentHealth(name="database", healthy=False, message="Connection failed"), ) # --- Scheduler check --- scheduler_state: Literal["running", "stopped", "unknown"] = "unknown" try: scheduler = app_state.scheduler if scheduler is not None and getattr(scheduler, "running", False): scheduler_state = "running" elif scheduler is not None: scheduler_state = "stopped" else: scheduler_state = "unknown" components.append( ComponentHealth(name="scheduler", healthy=False, message="Not initialised"), ) except AttributeError: # pragma: no cover - defensive scheduler_state = "unknown" components.append( ComponentHealth(name="scheduler", healthy=False, message="Not accessible"), ) # --- Cache check --- cache_state: Literal["initialised", "uninitialised"] = "initialised" try: if app_state.session_cache is not None: cache_state = "initialised" else: cache_state = "uninitialised" components.append( ComponentHealth(name="cache", healthy=False, message="Not initialised"), ) except AttributeError: # pragma: no cover - defensive cache_state = "uninitialised" components.append( ComponentHealth(name="cache", healthy=False, message="Not accessible"), ) fail2ban_online: bool = server_status.online if not fail2ban_online: components.append( ComponentHealth(name="fail2ban", healthy=False, message="Socket not reachable"), ) # --- Overall status --- overall_status: Literal["ok", "degraded", "unavailable"] if not fail2ban_online: overall_status = "unavailable" http_status: int = status.HTTP_503_SERVICE_UNAVAILABLE elif components: overall_status = "degraded" http_status = status.HTTP_200_OK else: overall_status = "ok" http_status = status.HTTP_200_OK return JSONResponse( status_code=http_status, content=HealthResponse( status=overall_status, fail2ban="online" if fail2ban_online else "offline", database="ok" if db_healthy else "error", scheduler=scheduler_state, cache=cache_state, components=components, ).model_dump(), ) # --- Constants for subsystem checks ------------------------------------------ # SUBSYSTEM_TIMEOUT_SECONDS: float = 2.0 # --- Helper: run a blocking check in a thread pool to avoid event-loop delays -- # async def _run_check( name: str, coro: Coroutine[object, object, None], error_msg: str, ) -> ReadyCheck: """Run *coro* with a short timeout and return a ReadyCheck.""" try: await asyncio.wait_for(coro, timeout=SUBSYSTEM_TIMEOUT_SECONDS) return ReadyCheck(name=name, healthy=True) except (OSError, TimeoutError, Exception) as exc: # noqa: BLE001 log.warning("ready_check_failed", subsystem=name, error=str(exc)) return ReadyCheck(name=name, healthy=False, message=f"{error_msg}: {exc}") # --- Liveness probe ---------------------------------------------------------- # @router.get( "/live", summary="Process liveness probe", response_model=ReadyResponse, responses={ 200: {"description": "Process is alive"}, }, ) async def liveness_probe() -> JSONResponse: """Lightweight liveness check for Kubernetes. Returns 200 when the Python process and event loop are responsive. A non-2xx response tells Kubernetes to restart the container. No subsystem checks are performed — this endpoint must be fast. """ return JSONResponse( status_code=status.HTTP_200_OK, content=ReadyResponse( status="ok", checks=[ReadyCheck(name="process", healthy=True)], failed_count=0, ).model_dump(), ) # --- Readiness probe --------------------------------------------------------- # async def _check_database(app_state: AppStateDep) -> ReadyCheck: """Check database connectivity with a short timeout.""" from app.config import Settings from app.db import open_db effective_settings: Settings = ( app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings ) async def _probe() -> None: test_db = await open_db(effective_settings.database_path) await test_db.close() return await _run_check( "database", _probe(), "Connection failed", ) async def _check_fail2ban(app_state: AppStateDep, server_status: ServerStatusDep) -> ReadyCheck: """Check fail2ban socket reachability using the cached server status.""" if server_status.online: return ReadyCheck(name="fail2ban", healthy=True) return ReadyCheck(name="fail2ban", healthy=False, message="Socket not reachable") async def _check_config_dir(app_state: AppStateDep) -> ReadyCheck: """Check config directory read access.""" from app.config import Settings effective_settings: Settings = ( app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings ) async def _probe() -> None: config_path = effective_settings.fail2ban_config_dir # Quick read-test: list directory (checks both existence and readability) await asyncio.to_thread(os.access, config_path, os.R_OK) return await _run_check( "config_dir", _probe(), "Config directory not readable", ) async def _check_scheduler(app_state: AppStateDep) -> ReadyCheck: """Check scheduler liveness.""" try: scheduler = app_state.scheduler if scheduler is not None and getattr(scheduler, "running", False): return ReadyCheck(name="scheduler", healthy=True) elif scheduler is not None: return ReadyCheck(name="scheduler", healthy=False, message="Scheduler stopped") else: return ReadyCheck(name="scheduler", healthy=False, message="Not initialised") except AttributeError: return ReadyCheck(name="scheduler", healthy=False, message="Not accessible") @router.get( "/ready", summary="Subsystem readiness probe", response_model=ReadyResponse, responses={ 200: {"description": "All subsystems healthy"}, 503: {"description": "One or more subsystems unreachable"}, }, ) async def readiness_probe( app_state: AppStateDep, server_status: ServerStatusDep, ) -> JSONResponse: """Readiness check for Kubernetes. Verifies all critical sub-systems are reachable: - Database connectivity - fail2ban socket (via cached server status) - Config directory read access - Background scheduler liveness Returns HTTP 200 only when every check passes; returns HTTP 503 with a JSON body listing every failed subsystem otherwise. Each check has a short per-subsystem timeout to prevent the endpoint from overwhelming the system under load. """ db_check, f2b_check, config_check, sched_check = await asyncio.gather( _check_database(app_state), _check_fail2ban(app_state, server_status), _check_config_dir(app_state), _check_scheduler(app_state), ) checks: list[ReadyCheck] = [db_check, f2b_check, config_check, sched_check] failed_count = sum(1 for c in checks if not c.healthy) http_status = status.HTTP_200_OK if failed_count == 0 else status.HTTP_503_SERVICE_UNAVAILABLE return JSONResponse( status_code=http_status, content=ReadyResponse( status="ok" if failed_count == 0 else "error", checks=checks, failed_count=failed_count, ).model_dump(), )