Add Kubernetes liveness/readiness probes and middleware order validation

- Split /health into /health/live (liveness) and /health/ready (readiness)
  following Kubernetes conventions. Combined /health retained for backward
  compatibility with existing Docker HEALTHCHECK definitions.
- Add ReadyCheck and ReadyResponse models for structured readiness output.
- Add _assert_middleware_order() startup check enforcing:
  RateLimit → Csrf → CorrelationId middleware chain.
- Register CorrelationIdMiddleware, CsrfMiddleware, RateLimitMiddleware
  in create_app() with documented required order (reverse of processing).
- Add correlation.py, csrf.py, rate_limit.py middleware modules.
- Add health probe tests in test_health_probes.py.
- Update test_main.py with middleware order assertion tests.
- Update frontend useFetchData hook tests.
- Docs: update Deployment.md with Kubernetes probe config examples.
This commit is contained in:
2026-05-04 02:42:09 +02:00
parent 65fe747cba
commit eb339efcfd
13 changed files with 882 additions and 129 deletions

View File

@@ -1,27 +1,36 @@
"""Health check router.
A lightweight ``GET /api/v1/health`` endpoint that verifies the application
is running and can serve requests. Also reports the cached fail2ban liveness
state so monitoring tools and Docker health checks can observe daemon status
without probing the socket directly.
Two distinct probes following Kubernetes conventions:
Comprehensive checks performed:
- Database connectivity
- fail2ban socket reachability (via cached server_status)
- Background scheduler health
- Session cache initialization
* ``GET /api/v1/health/live`` — **Liveness** — checks that the Python process is
alive and the event loop is responsive. Always returns 200; a non-2xx answer
tells Kubernetes to *restart* the container.
* ``GET /api/v1/health/ready`` — **Readiness** — checks that all critical
sub-systems (database, fail2ban socket, config directory, scheduler) are
reachable. Returns 200 only when all pass; returns 503 with a JSON body
listing every failed check otherwise. A non-2xx answer tells Kubernetes to
*stop routing traffic* to the pod until it recovers.
The combined ``GET /api/v1/health`` endpoint is retained for backward
compatibility with existing Docker HEALTHCHECK definitions.
"""
from __future__ import annotations
from typing import Annotated, Literal
import asyncio
import os
from typing import TYPE_CHECKING, Literal
import structlog
from fastapi import APIRouter, status
from fastapi.responses import JSONResponse
from app.dependencies import AppStateDep, ServerStatusDep
from app.models.response import ComponentHealth, HealthResponse
from app.models.response import ComponentHealth, HealthResponse, ReadyCheck, ReadyResponse
if TYPE_CHECKING:
from collections.abc import Coroutine
# Router for the health endpoints; routes registered on it are served under
# the /api/v1/health prefix and tagged "Health".
router: APIRouter = APIRouter(prefix="/api/v1/health", tags=["Health"])
@@ -142,3 +151,164 @@ async def health_check(
components=components,
).model_dump(),
)
# --- Constants for subsystem checks ------------------------------------------ #
# Timeout, in seconds, passed to asyncio.wait_for for each awaited subsystem
# probe in _run_check.
SUBSYSTEM_TIMEOUT_SECONDS: float = 2.0
# --- Helper: await a check coroutine under a per-subsystem timeout ----------- #
async def _run_check(
    name: str,
    coro: Coroutine[object, object, None],
    error_msg: str,
) -> ReadyCheck:
    """Await *coro* under a timeout and report the outcome as a ReadyCheck.

    Args:
        name: Subsystem label stored on the returned check and in the log event.
        coro: Coroutine performing the probe; it signals failure by raising.
        error_msg: Prefix used for the failure message.

    Returns:
        A healthy ``ReadyCheck`` when *coro* completes within
        ``SUBSYSTEM_TIMEOUT_SECONDS``. Otherwise an unhealthy one whose
        message is ``"<error_msg>: <detail>"``. Failures are logged as
        ``ready_check_failed`` and never propagated to the caller.
    """
    try:
        await asyncio.wait_for(coro, timeout=SUBSYSTEM_TIMEOUT_SECONDS)
    except asyncio.TimeoutError as exc:
        # The timeout raised by wait_for normally has an empty str(), which
        # would leave the message ending in a bare ": ". Spell the reason out.
        detail = str(exc) or f"timed out after {SUBSYSTEM_TIMEOUT_SECONDS}s"
    except Exception as exc:  # noqa: BLE001
        # A probe may fail in any way; a readiness check must report it rather
        # than crash the endpoint. Fall back to the class name when the
        # exception carries no text.
        detail = str(exc) or type(exc).__name__
    else:
        return ReadyCheck(name=name, healthy=True)

    log.warning("ready_check_failed", subsystem=name, error=detail)
    return ReadyCheck(name=name, healthy=False, message=f"{error_msg}: {detail}")
# --- Liveness probe ---------------------------------------------------------- #
@router.get(
    "/live",
    summary="Process liveness probe",
    response_model=ReadyResponse,
    responses={200: {"description": "Process is alive"}},
)
async def liveness_probe() -> JSONResponse:
    """Answer the Kubernetes liveness probe.

    Always responds with HTTP 200 and a single healthy ``process`` check.
    No subsystem is inspected, so the handler does no I/O and stays fast.
    Kubernetes treats a non-2xx answer as a signal to restart the container.
    """
    process_check = ReadyCheck(name="process", healthy=True)
    payload = ReadyResponse(status="ok", checks=[process_check], failed_count=0)
    return JSONResponse(
        status_code=status.HTTP_200_OK,
        content=payload.model_dump(),
    )
# --- Readiness probe --------------------------------------------------------- #
async def _check_database(app_state: AppStateDep) -> ReadyCheck:
    """Probe database connectivity by opening and closing a connection.

    The runtime settings are used when present, otherwise the base settings.
    The open/close round trip runs through ``_run_check``, so it is bounded
    by that helper's timeout and any failure becomes an unhealthy check.
    """
    from app.config import Settings
    from app.db import open_db

    active: Settings
    if app_state.runtime_settings is not None:
        active = app_state.runtime_settings
    else:
        active = app_state.settings

    async def _open_then_close() -> None:
        # The path is resolved inside the coroutine so that a lookup error is
        # reported as a failed check too.
        connection = await open_db(active.database_path)
        await connection.close()

    probe = _open_then_close()
    return await _run_check("database", probe, "Connection failed")
async def _check_fail2ban(app_state: AppStateDep, server_status: ServerStatusDep) -> ReadyCheck:
    """Report fail2ban socket reachability from the cached server status.

    Only ``server_status.online`` is consulted; ``app_state`` is not used by
    this check and no socket is probed here.
    """
    if not server_status.online:
        return ReadyCheck(name="fail2ban", healthy=False, message="Socket not reachable")
    return ReadyCheck(name="fail2ban", healthy=True)
async def _check_config_dir(app_state: AppStateDep) -> ReadyCheck:
    """Check that the fail2ban config directory is readable.

    The runtime settings are used when present, otherwise the base settings.
    The access test runs in a worker thread through ``_run_check``, so it is
    bounded by that helper's timeout.

    Returns:
        A healthy ``ReadyCheck`` when ``os.access`` grants read permission on
        the configured path; an unhealthy one when the path is missing, is
        not readable, or the test fails or times out.
    """
    from app.config import Settings

    effective_settings: Settings = (
        app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings
    )

    async def _probe() -> None:
        config_path = effective_settings.fail2ban_config_dir
        # os.access reports a missing or unreadable path by returning False,
        # not by raising. The result has to be turned into an exception,
        # otherwise _run_check would always see success.
        readable = await asyncio.to_thread(os.access, config_path, os.R_OK)
        if not readable:
            raise OSError(f"{config_path} is missing or not readable")

    return await _run_check(
        "config_dir",
        _probe(),
        "Config directory not readable",
    )
async def _check_scheduler(app_state: AppStateDep) -> ReadyCheck:
    """Report whether the background scheduler is running.

    Outcomes:
        * ``app_state.scheduler`` is ``None``: unhealthy, "Not initialised".
        * scheduler present with a truthy ``running``: healthy.
        * scheduler present otherwise: unhealthy, "Scheduler stopped".
        * an ``AttributeError`` is raised: unhealthy, "Not accessible".
    """
    try:
        scheduler = app_state.scheduler
        if scheduler is None:
            return ReadyCheck(name="scheduler", healthy=False, message="Not initialised")
        # A scheduler object without a ``running`` attribute counts as stopped.
        if getattr(scheduler, "running", False):
            return ReadyCheck(name="scheduler", healthy=True)
        return ReadyCheck(name="scheduler", healthy=False, message="Scheduler stopped")
    except AttributeError:
        return ReadyCheck(name="scheduler", healthy=False, message="Not accessible")
@router.get(
    "/ready",
    summary="Subsystem readiness probe",
    response_model=ReadyResponse,
    responses={
        200: {"description": "All subsystems healthy"},
        503: {"description": "One or more subsystems unreachable"},
    },
)
async def readiness_probe(
    app_state: AppStateDep,
    server_status: ServerStatusDep,
) -> JSONResponse:
    """Answer the Kubernetes readiness probe.

    Runs four subsystem checks concurrently:

    - Database connectivity
    - fail2ban socket (via cached server status)
    - Config directory read access
    - Background scheduler liveness

    Responds with HTTP 200 and status ``"ok"`` when every check is healthy.
    Otherwise responds with HTTP 503 and status ``"error"``. The body always
    lists all checks, in the order above, together with the failure count.
    """
    # gather returns results in the order the awaitables were passed.
    checks: list[ReadyCheck] = list(
        await asyncio.gather(
            _check_database(app_state),
            _check_fail2ban(app_state, server_status),
            _check_config_dir(app_state),
            _check_scheduler(app_state),
        )
    )
    failed_count = len([check for check in checks if not check.healthy])

    if failed_count == 0:
        http_status = status.HTTP_200_OK
        overall = "ok"
    else:
        http_status = status.HTTP_503_SERVICE_UNAVAILABLE
        overall = "error"

    payload = ReadyResponse(status=overall, checks=checks, failed_count=failed_count)
    return JSONResponse(status_code=http_status, content=payload.model_dump())