Add Kubernetes liveness/readiness probes and middleware order validation

- Split /health into /health/live (liveness) and /health/ready (readiness) following Kubernetes conventions. The combined /health endpoint is retained for backward compatibility with existing Docker HEALTHCHECK definitions.
- Add ReadyCheck and ReadyResponse models for structured readiness output.
- Add an _assert_middleware_order() startup check enforcing the RateLimit → Csrf → CorrelationId middleware chain.
- Register CorrelationIdMiddleware, CsrfMiddleware and RateLimitMiddleware in create_app() with the documented required order (reverse of processing).
- Add the correlation.py, csrf.py and rate_limit.py middleware modules.
- Add health probe tests in test_health_probes.py.
- Update test_main.py with middleware order assertion tests.
- Update the frontend useFetchData hook tests.
- Docs: update Deployment.md with Kubernetes probe config examples.
2026-05-04 02:42:09 +02:00
parent 65fe747cba
commit eb339efcfd
13 changed files with 882 additions and 129 deletions
--- a/backend/app/routers/health.py
+++ b/backend/app/routers/health.py
@@ -1,27 +1,36 @@
 """Health check router.

-A lightweight ``GET /api/v1/health`` endpoint that verifies the application
-is running and can serve requests.  Also reports the cached fail2ban liveness
-state so monitoring tools and Docker health checks can observe daemon status
-without probing the socket directly.
+Two distinct probes following Kubernetes conventions:

-Comprehensive checks performed:
- Database connectivity
- fail2ban socket reachability (via cached server_status)
- Background scheduler health
- Session cache initialization
+* ``GET /api/v1/health/live`` — **Liveness** — checks that the Python process is
+  alive and the event loop is responsive.  Always returns 200; a non-2xx answer
+  tells Kubernetes to *restart* the container.
+
+* ``GET /api/v1/health/ready`` — **Readiness** — checks that all critical
+  sub-systems (database, fail2ban socket, config directory, scheduler) are
+  reachable.  Returns 200 only when all pass; returns 503 with a JSON body
+  listing every failed check otherwise.  A non-2xx answer tells Kubernetes to
+  *stop routing traffic* to the pod until it recovers.
+
+The combined ``GET /api/v1/health`` endpoint is retained for backward
+compatibility with existing Docker HEALTHCHECK definitions.
 """

 from __future__ import annotations

-from typing import Annotated, Literal
+import asyncio
+import os
+from typing import TYPE_CHECKING, Literal

 import structlog
 from fastapi import APIRouter, status
 from fastapi.responses import JSONResponse

 from app.dependencies import AppStateDep, ServerStatusDep
-from app.models.response import ComponentHealth, HealthResponse
+from app.models.response import ComponentHealth, HealthResponse, ReadyCheck, ReadyResponse
+
+if TYPE_CHECKING:
+    from collections.abc import Coroutine

 router: APIRouter = APIRouter(prefix="/api/v1/health", tags=["Health"])

@@ -142,3 +151,164 @@ async def health_check(
            components=components,
        ).model_dump(),
    )
+
+
+# --- Constants for subsystem checks ------------------------------------------ #
+
+# Per-probe ceiling in seconds; _run_check() passes it to asyncio.wait_for().
+SUBSYSTEM_TIMEOUT_SECONDS: float = 2.0
+
+
+# --- Helper: await a probe coroutine under a timeout and map it to a ReadyCheck -- #
+
+async def _run_check(
+    name: str,
+    coro: Coroutine[object, object, None],
+    error_msg: str,
+) -> ReadyCheck:
+    """Await *coro* under a timeout and convert the outcome into a ReadyCheck.
+
+    Args:
+        name: Subsystem name, copied into the result and the warning log.
+        coro: Probe coroutine; completing without raising means "healthy".
+        error_msg: Prefix for the failure message placed in the result.
+
+    Returns:
+        A healthy ReadyCheck when *coro* completes within
+        ``SUBSYSTEM_TIMEOUT_SECONDS``; otherwise an unhealthy one whose
+        message is ``"<error_msg>: <detail>"``.  Ordinary exceptions are
+        logged and reported, never propagated to the caller.
+    """
+    try:
+        await asyncio.wait_for(coro, timeout=SUBSYSTEM_TIMEOUT_SECONDS)
+    except Exception as exc:  # noqa: BLE001 - a probe reports failures, it must not raise
+        # wait_for() raises a TimeoutError without arguments, whose str() is
+        # empty; fall back to the class name so the message never ends in a
+        # bare colon.
+        detail = str(exc) or type(exc).__name__
+        log.warning("ready_check_failed", subsystem=name, error=detail)
+        return ReadyCheck(name=name, healthy=False, message=f"{error_msg}: {detail}")
+    return ReadyCheck(name=name, healthy=True)
+
+
+# --- Liveness probe ---------------------------------------------------------- #
+
+
+@router.get(
+    "/live",
+    summary="Process liveness probe",
+    response_model=ReadyResponse,
+    responses={200: {"description": "Process is alive"}},
+)
+async def liveness_probe() -> JSONResponse:
+    """Answer the Kubernetes liveness probe.
+
+    The handler inspects no subsystem: being able to run it at all is the
+    evidence that the process and its event loop are responsive.  It always
+    answers HTTP 200 with a single passing ``process`` check.
+    """
+    process_check = ReadyCheck(name="process", healthy=True)
+    payload = ReadyResponse(status="ok", checks=[process_check], failed_count=0)
+    return JSONResponse(status_code=status.HTTP_200_OK, content=payload.model_dump())
+
+
+# --- Readiness probe --------------------------------------------------------- #
+
+
+async def _check_database(app_state: AppStateDep) -> ReadyCheck:
+    """Probe the database by opening a connection and closing it again.
+
+    The runtime settings are used when present, otherwise the startup
+    settings.  The open/close round-trip is delegated to ``_run_check``, which
+    applies the timeout and turns any failure into an unhealthy result.
+    """
+    from app.config import Settings
+    from app.db import open_db
+
+    runtime_override = app_state.runtime_settings
+    active: Settings = app_state.settings if runtime_override is None else runtime_override
+
+    async def _open_and_close() -> None:
+        connection = await open_db(active.database_path)
+        await connection.close()
+
+    return await _run_check("database", _open_and_close(), "Connection failed")
+
+
+async def _check_fail2ban(app_state: AppStateDep, server_status: ServerStatusDep) -> ReadyCheck:
+    """Report fail2ban reachability from the cached server status.
+
+    Only ``server_status.online`` is consulted; the socket itself is not
+    touched here.  ``app_state`` is accepted but not used by this check.
+    """
+    if not server_status.online:
+        return ReadyCheck(name="fail2ban", healthy=False, message="Socket not reachable")
+    return ReadyCheck(name="fail2ban", healthy=True)
+
+
+async def _check_config_dir(app_state: AppStateDep) -> ReadyCheck:
+    """Check that the fail2ban config directory is readable by this process.
+
+    The runtime settings are used when present, otherwise the startup
+    settings.  ``os.access`` is executed in a worker thread via
+    ``asyncio.to_thread`` and awaited through ``_run_check``, which applies
+    the per-probe timeout.
+
+    Returns:
+        A ``config_dir`` ReadyCheck; unhealthy when ``os.access`` reports the
+        path as not readable (which includes a missing path) or when the
+        probe raises or times out.
+    """
+    from app.config import Settings
+
+    effective_settings: Settings = (
+        app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings
+    )
+
+    async def _probe() -> None:
+        config_path = effective_settings.fail2ban_config_dir
+        # os.access() signals failure through its return value rather than by
+        # raising, so the result has to be inspected; discarding it would make
+        # this probe pass unconditionally.
+        readable = await asyncio.to_thread(os.access, config_path, os.R_OK)
+        if not readable:
+            raise PermissionError("path is missing or access is denied")
+
+    return await _run_check(
+        "config_dir",
+        _probe(),
+        "Config directory not readable",
+    )
+
+
+async def _check_scheduler(app_state: AppStateDep) -> ReadyCheck:
+    """Report whether the background scheduler exists and is running.
+
+    Outcomes, in order of evaluation:
+    - ``app_state`` has no ``scheduler`` attribute -> "Not accessible"
+    - the scheduler is ``None``                    -> "Not initialised"
+    - its ``running`` attribute is falsy/missing   -> "Scheduler stopped"
+    - otherwise                                    -> healthy
+    """
+    try:
+        sched = app_state.scheduler
+        if sched is None:
+            return ReadyCheck(name="scheduler", healthy=False, message="Not initialised")
+        if not getattr(sched, "running", False):
+            return ReadyCheck(name="scheduler", healthy=False, message="Scheduler stopped")
+        return ReadyCheck(name="scheduler", healthy=True)
+    except AttributeError:
+        return ReadyCheck(name="scheduler", healthy=False, message="Not accessible")
+
+
+@router.get(
+    "/ready",
+    summary="Subsystem readiness probe",
+    response_model=ReadyResponse,
+    responses={
+        200: {"description": "All subsystems healthy"},
+        503: {"description": "One or more subsystems unreachable"},
+    },
+)
+async def readiness_probe(
+    app_state: AppStateDep,
+    server_status: ServerStatusDep,
+) -> JSONResponse:
+    """Answer the Kubernetes readiness probe.
+
+    Four checks are run concurrently and reported in this fixed order:
+    database, fail2ban, config directory, scheduler.  The database and
+    config-directory checks go through ``_run_check`` and are therefore
+    time-bounded; the fail2ban and scheduler checks only read state that is
+    already held in memory.
+
+    The response is HTTP 200 with ``status="ok"`` when every check is
+    healthy, and HTTP 503 with ``status="error"`` otherwise.  In both cases
+    the body lists all four checks together with the number that failed.
+    """
+    checks: list[ReadyCheck] = list(
+        await asyncio.gather(
+            _check_database(app_state),
+            _check_fail2ban(app_state, server_status),
+            _check_config_dir(app_state),
+            _check_scheduler(app_state),
+        )
+    )
+    failed_count = len([check for check in checks if not check.healthy])
+
+    if failed_count == 0:
+        http_status = status.HTTP_200_OK
+        overall = "ok"
+    else:
+        http_status = status.HTTP_503_SERVICE_UNAVAILABLE
+        overall = "error"
+
+    payload = ReadyResponse(status=overall, checks=checks, failed_count=failed_count)
+    return JSONResponse(status_code=http_status, content=payload.model_dump())