Add Kubernetes liveness/readiness probes and middleware order validation
- Split /health into /health/live (liveness) and /health/ready (readiness), following Kubernetes conventions. The combined /health endpoint is retained for backward compatibility with existing Docker HEALTHCHECK definitions.
- Add ReadyCheck and ReadyResponse models for structured readiness output.
- Add _assert_middleware_order() startup check enforcing the RateLimit → Csrf → CorrelationId middleware chain.
- Register CorrelationIdMiddleware, CsrfMiddleware and RateLimitMiddleware in create_app() with the documented required order (reverse of processing).
- Add correlation.py, csrf.py and rate_limit.py middleware modules.
- Add health probe tests in test_health_probes.py.
- Update test_main.py with middleware order assertion tests.
- Update frontend useFetchData hook tests.
- Docs: update Deployment.md with Kubernetes probe config examples.
This commit is contained in:
@@ -1,27 +1,36 @@
|
||||
"""Health check router.
|
||||
|
||||
A lightweight ``GET /api/v1/health`` endpoint that verifies the application
|
||||
is running and can serve requests. Also reports the cached fail2ban liveness
|
||||
state so monitoring tools and Docker health checks can observe daemon status
|
||||
without probing the socket directly.
|
||||
Two distinct probes following Kubernetes conventions:
|
||||
|
||||
Comprehensive checks performed:
|
||||
- Database connectivity
|
||||
- fail2ban socket reachability (via cached server_status)
|
||||
- Background scheduler health
|
||||
- Session cache initialization
|
||||
* ``GET /api/v1/health/live`` — **Liveness** — checks that the Python process is
|
||||
alive and the event loop is responsive. Always returns 200; a non-2xx answer
|
||||
tells Kubernetes to *restart* the container.
|
||||
|
||||
* ``GET /api/v1/health/ready`` — **Readiness** — checks that all critical
|
||||
sub-systems (database, fail2ban socket, config directory, scheduler) are
|
||||
reachable. Returns 200 only when all pass; returns 503 with a JSON body
|
||||
listing every failed check otherwise. A non-2xx answer tells Kubernetes to
|
||||
*stop routing traffic* to the pod until it recovers.
|
||||
|
||||
The combined ``GET /api/v1/health`` endpoint is retained for backward
|
||||
compatibility with existing Docker HEALTHCHECK definitions.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Annotated, Literal
|
||||
import asyncio
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Literal
|
||||
|
||||
import structlog
|
||||
from fastapi import APIRouter, status
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from app.dependencies import AppStateDep, ServerStatusDep
|
||||
from app.models.response import ComponentHealth, HealthResponse
|
||||
from app.models.response import ComponentHealth, HealthResponse, ReadyCheck, ReadyResponse
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Coroutine
|
||||
|
||||
# Router for the health endpoints; every route registered on it is served
# under the /api/v1/health prefix and tagged "Health".
router: APIRouter = APIRouter(prefix="/api/v1/health", tags=["Health"])
|
||||
|
||||
@@ -142,3 +151,164 @@ async def health_check(
|
||||
components=components,
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
|
||||
# --- Constants for subsystem checks ------------------------------------------ #
|
||||
|
||||
# Upper bound, in seconds, that _run_check() allows a single subsystem probe
# coroutine to run before it is reported as failed.
SUBSYSTEM_TIMEOUT_SECONDS: float = 2.0
|
||||
|
||||
|
||||
# --- Helper: await a check coroutine under the per-subsystem timeout --------- #
|
||||
|
||||
async def _run_check(
    name: str,
    coro: Coroutine[object, object, None],
    error_msg: str,
) -> ReadyCheck:
    """Await *coro* under the subsystem timeout and report the outcome.

    Args:
        name: Subsystem label stored on the returned check and in the log event.
        coro: Probe coroutine; finishing without raising means "healthy".
        error_msg: Human-readable prefix for the failure message.

    Returns:
        A healthy ``ReadyCheck`` when *coro* completes in time, otherwise an
        unhealthy one whose message is ``"<error_msg>: <detail>"``.
    """
    try:
        await asyncio.wait_for(coro, timeout=SUBSYSTEM_TIMEOUT_SECONDS)
    except (TimeoutError, asyncio.TimeoutError):
        # A timeout exception carries no text of its own, so spell out what
        # happened instead of emitting a message that ends in a bare ": ".
        # Both names are listed because they are distinct classes before 3.11.
        detail = f"timed out after {SUBSYSTEM_TIMEOUT_SECONDS}s"
    except Exception as exc:  # noqa: BLE001
        # A probe must never crash the endpoint: any failure is an unhealthy
        # result. Fall back to the class name for exceptions with no message.
        detail = str(exc) or type(exc).__name__
    else:
        return ReadyCheck(name=name, healthy=True)

    log.warning("ready_check_failed", subsystem=name, error=detail)
    return ReadyCheck(name=name, healthy=False, message=f"{error_msg}: {detail}")
|
||||
|
||||
|
||||
# --- Liveness probe ---------------------------------------------------------- #
|
||||
|
||||
|
||||
@router.get(
    "/live",
    summary="Process liveness probe",
    response_model=ReadyResponse,
    responses={
        200: {"description": "Process is alive"},
    },
)
async def liveness_probe() -> JSONResponse:
    """Answer the Kubernetes liveness probe.

    Always responds with HTTP 200 and a single passing ``process`` check:
    being able to answer at all is the proof that the process and its event
    loop are responsive. No subsystem is consulted, which keeps the endpoint
    cheap.
    """
    process_check = ReadyCheck(name="process", healthy=True)
    payload = ReadyResponse(status="ok", checks=[process_check], failed_count=0)
    return JSONResponse(content=payload.model_dump(), status_code=status.HTTP_200_OK)
|
||||
|
||||
|
||||
# --- Readiness probe --------------------------------------------------------- #
|
||||
|
||||
|
||||
async def _check_database(app_state: AppStateDep) -> ReadyCheck:
    """Probe the database by opening and immediately closing a connection.

    The runtime settings take precedence over the startup settings when they
    are set. The probe runs through ``_run_check``, which applies the timeout
    and converts any failure into an unhealthy ``ReadyCheck``.
    """
    from app.config import Settings
    from app.db import open_db

    runtime_override = app_state.runtime_settings
    active_settings: Settings = (
        app_state.settings if runtime_override is None else runtime_override
    )

    async def _open_and_close() -> None:
        connection = await open_db(active_settings.database_path)
        await connection.close()

    probe = _open_and_close()
    return await _run_check("database", probe, "Connection failed")
|
||||
|
||||
|
||||
async def _check_fail2ban(app_state: AppStateDep, server_status: ServerStatusDep) -> ReadyCheck:
    """Report fail2ban reachability from the ``online`` flag of *server_status*.

    No socket I/O happens here; only the supplied status object is read.
    *app_state* is accepted but not used.
    """
    if not server_status.online:
        return ReadyCheck(name="fail2ban", healthy=False, message="Socket not reachable")
    return ReadyCheck(name="fail2ban", healthy=True)
|
||||
|
||||
|
||||
async def _check_config_dir(app_state: AppStateDep) -> ReadyCheck:
    """Check that the fail2ban config directory exists and is readable.

    The runtime settings take precedence over the startup settings when they
    are set. The probe runs through ``_run_check``, which applies the timeout
    and converts a raised exception into an unhealthy ``ReadyCheck``.
    """
    from app.config import Settings

    effective_settings: Settings = (
        app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings
    )

    async def _probe() -> None:
        config_path = effective_settings.fail2ban_config_dir
        # os.access() signals failure by returning False, not by raising, so
        # the result must be turned into an exception; otherwise this probe
        # would succeed even for a missing or unreadable directory.
        readable = await asyncio.to_thread(os.access, config_path, os.R_OK)
        if not readable:
            raise PermissionError(f"{config_path} is missing or not readable")

    return await _run_check(
        "config_dir",
        _probe(),
        "Config directory not readable",
    )
|
||||
|
||||
|
||||
async def _check_scheduler(app_state: AppStateDep) -> ReadyCheck:
    """Report whether the background scheduler on *app_state* is running.

    Outcomes:
        * scheduler present with a truthy ``running`` attribute: healthy
        * scheduler present but not running: "Scheduler stopped"
        * scheduler is ``None``: "Not initialised"
        * attribute lookup raises ``AttributeError``: "Not accessible"
    """
    try:
        sched = app_state.scheduler
        if sched is None:
            return ReadyCheck(name="scheduler", healthy=False, message="Not initialised")
        if not getattr(sched, "running", False):
            return ReadyCheck(name="scheduler", healthy=False, message="Scheduler stopped")
        return ReadyCheck(name="scheduler", healthy=True)
    except AttributeError:
        return ReadyCheck(name="scheduler", healthy=False, message="Not accessible")
|
||||
|
||||
|
||||
@router.get(
    "/ready",
    summary="Subsystem readiness probe",
    response_model=ReadyResponse,
    responses={
        200: {"description": "All subsystems healthy"},
        503: {"description": "One or more subsystems unreachable"},
    },
)
async def readiness_probe(
    app_state: AppStateDep,
    server_status: ServerStatusDep,
) -> JSONResponse:
    """Answer the Kubernetes readiness probe.

    Runs the four subsystem checks concurrently and aggregates them, in this
    order: database, fail2ban, config directory, scheduler. The database and
    config-directory probes are bounded by the shared subsystem timeout; the
    fail2ban and scheduler checks only inspect already-available state.

    Responds with HTTP 200 and ``status="ok"`` when every check is healthy,
    otherwise HTTP 503 and ``status="error"``. In both cases the body carries
    every check result plus the number of failures.
    """
    results = await asyncio.gather(
        _check_database(app_state),
        _check_fail2ban(app_state, server_status),
        _check_config_dir(app_state),
        _check_scheduler(app_state),
    )
    checks: list[ReadyCheck] = list(results)

    failed_count = len([check for check in checks if not check.healthy])
    all_passed = failed_count == 0

    payload = ReadyResponse(
        status="ok" if all_passed else "error",
        checks=checks,
        failed_count=failed_count,
    )
    return JSONResponse(
        status_code=status.HTTP_200_OK if all_passed else status.HTTP_503_SERVICE_UNAVAILABLE,
        content=payload.model_dump(),
    )
|
||||
|
||||
Reference in New Issue
Block a user