Files
BanGUI/backend/app/routers/health.py
Lukas 0a3f9c6c16 refactor(backend): external logging metrics, required mode, health checks
- Add external_logging_init_failures counter
- Add external_log_required flag, raise if init fails and required
- Health endpoint: add external_logging status check
- Blocklist service: enrich with metadata fields, update import logic
- Health check task: add runtime_state dependency, fix return typing
- Metrics: add Histogram for request latencies
- Frontend: align BlocklistImportLogSection props
- Docs: update deployment guide, remove stale tasks

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-04 03:45:13 +02:00

336 lines
12 KiB
Python

"""Health check router.
Two distinct probes following Kubernetes conventions:
* ``GET /api/v1/health/live`` — **Liveness** — checks that the Python process is
alive and the event loop is responsive. Always returns 200; a non-2xx answer
tells Kubernetes to *restart* the container.
* ``GET /api/v1/health/ready`` — **Readiness** — checks that all critical
sub-systems (database, fail2ban socket, config directory, scheduler) are
reachable. Returns 200 only when all pass; returns 503 with a JSON body
listing every failed check otherwise. A non-2xx answer tells Kubernetes to
*stop routing traffic* to the pod until it recovers.
The combined ``GET /api/v1/health`` endpoint is retained for backward
compatibility with existing Docker HEALTHCHECK definitions.
"""
from __future__ import annotations
import asyncio
import os
from typing import TYPE_CHECKING, Literal
import structlog
from fastapi import APIRouter, status
from fastapi.responses import JSONResponse
from app.dependencies import AppStateDep, ServerStatusDep
from app.models.response import ComponentHealth, HealthResponse, ReadyCheck, ReadyResponse
if TYPE_CHECKING:
from collections.abc import Coroutine
# Every route declared in this module is registered under the
# ``/api/v1/health`` prefix and tagged "Health".
router: APIRouter = APIRouter(prefix="/api/v1/health", tags=["Health"])
# Module-level structlog logger shared by the health and readiness checks.
log: structlog.stdlib.BoundLogger = structlog.get_logger()
@router.get(
    "",
    summary="Application health check",
    response_model=HealthResponse,
    responses={
        200: {"description": "All components healthy"},
        503: {"description": "fail2ban offline or component degraded"},
    },
)
async def health_check(
    app_state: AppStateDep,
    server_status: ServerStatusDep,
) -> JSONResponse:
    """Return the combined application and component status.

    Runs a set of lightweight checks (database, scheduler, session cache,
    fail2ban, external logging) and collects every failing one in
    ``components``.

    The HTTP status depends only on fail2ban:

    * fail2ban offline -> HTTP 503, ``status="unavailable"``.
    * fail2ban online but at least one failing component -> HTTP 200,
      ``status="degraded"``.
    * nothing failing -> HTTP 200, ``status="ok"``.

    Args:
        app_state: Injected application state containing runtime components.
        server_status: Injected cached server status snapshot.

    Returns:
        A ``JSONResponse`` whose body is a serialised
        :class:`~app.models.response.HealthResponse`.
    """
    from app.config import Settings

    # Resolve the effective settings exactly once; both the database check and
    # the external-logging check read from the same object.
    effective_settings: Settings = (
        app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings
    )
    components: list[ComponentHealth] = []

    # --- Database check ---
    db_healthy: bool = True
    try:
        from app.db import open_db

        async def _probe_database() -> None:
            test_db = await open_db(effective_settings.database_path)
            await test_db.close()

        # Bound the probe so a hung database cannot stall the health endpoint;
        # the readiness probe applies the same timeout via ``_run_check``.
        await asyncio.wait_for(_probe_database(), timeout=SUBSYSTEM_TIMEOUT_SECONDS)
    except Exception as exc:  # pragma: no cover - defensive, all paths logged
        # A wait_for timeout stringifies to "", so fall back to the type name.
        log.warning("health_check_db_failed", error=str(exc) or type(exc).__name__)
        db_healthy = False
        components.append(
            ComponentHealth(name="database", healthy=False, message="Connection failed"),
        )

    # --- Scheduler check ---
    # NOTE(review): a scheduler that exists but is not running is reported as
    # "stopped" without adding a failing component, so it does not degrade the
    # overall status. The readiness probe treats the same state as unhealthy;
    # confirm whether this difference is intentional.
    scheduler_state: Literal["running", "stopped", "unknown"] = "unknown"
    try:
        scheduler = app_state.scheduler
        if scheduler is None:
            components.append(
                ComponentHealth(name="scheduler", healthy=False, message="Not initialised"),
            )
        elif getattr(scheduler, "running", False):
            scheduler_state = "running"
        else:
            scheduler_state = "stopped"
    except AttributeError:  # pragma: no cover - defensive
        scheduler_state = "unknown"
        components.append(
            ComponentHealth(name="scheduler", healthy=False, message="Not accessible"),
        )

    # --- Cache check ---
    cache_state: Literal["initialised", "uninitialised"] = "initialised"
    try:
        if app_state.session_cache is None:
            cache_state = "uninitialised"
            components.append(
                ComponentHealth(name="cache", healthy=False, message="Not initialised"),
            )
    except AttributeError:  # pragma: no cover - defensive
        cache_state = "uninitialised"
        components.append(
            ComponentHealth(name="cache", healthy=False, message="Not accessible"),
        )

    # --- fail2ban check (uses the cached status snapshot, no socket I/O here) ---
    fail2ban_online: bool = server_status.online
    if not fail2ban_online:
        components.append(
            ComponentHealth(name="fail2ban", healthy=False, message="Socket not reachable"),
        )

    # --- External logging check ---
    external_log_state: Literal["ok", "error", "disabled", "unknown"] = "unknown"
    try:
        ext_log_failed = getattr(app_state.runtime_state, "external_log_init_failed", False)
        if not (effective_settings.external_logging_enabled and effective_settings.external_logging_provider):
            external_log_state = "disabled"
        elif ext_log_failed:
            external_log_state = "error"
            components.append(
                ComponentHealth(name="external_logging", healthy=False, message="Handler initialization failed"),
            )
        else:
            external_log_state = "ok"
    except AttributeError:  # pragma: no cover - defensive
        external_log_state = "unknown"

    # --- Overall status ---
    # Only an offline fail2ban yields a non-2xx answer; any other failing
    # component is surfaced as "degraded" while still returning 200.
    overall_status: Literal["ok", "degraded", "unavailable"]
    http_status: int = status.HTTP_200_OK
    if not fail2ban_online:
        overall_status = "unavailable"
        http_status = status.HTTP_503_SERVICE_UNAVAILABLE
    elif components:
        overall_status = "degraded"
    else:
        overall_status = "ok"

    return JSONResponse(
        status_code=http_status,
        content=HealthResponse(
            status=overall_status,
            fail2ban="online" if fail2ban_online else "offline",
            database="ok" if db_healthy else "error",
            scheduler=scheduler_state,
            cache=cache_state,
            external_logging=external_log_state,
            components=components,
        ).model_dump(),
    )
# --- Constants for subsystem checks ------------------------------------------ #
# Upper bound, in seconds, that ``_run_check`` passes to ``asyncio.wait_for``
# for a single subsystem probe.
SUBSYSTEM_TIMEOUT_SECONDS: float = 2.0
# --- Helper: await a subsystem probe under a short timeout so one slow check cannot stall the endpoint -- #
async def _run_check(
    name: str,
    coro: Coroutine[object, object, None],
    error_msg: str,
) -> ReadyCheck:
    """Await *coro* under ``SUBSYSTEM_TIMEOUT_SECONDS`` and report the outcome.

    Args:
        name: Subsystem name placed in the resulting check and the log event.
        coro: Probe coroutine; it signals failure by raising.
        error_msg: Human-readable prefix for the failure message.

    Returns:
        A healthy ``ReadyCheck`` when *coro* completes in time, otherwise an
        unhealthy one whose message is ``"<error_msg>: <detail>"``. Failures
        are never propagated to the caller; they are logged as
        ``ready_check_failed``.
    """
    try:
        await asyncio.wait_for(coro, timeout=SUBSYSTEM_TIMEOUT_SECONDS)
    except asyncio.TimeoutError as exc:
        # The timeout raised by wait_for carries no text; synthesise a useful
        # detail instead of emitting "<error_msg>: " with nothing after it.
        detail = str(exc) or f"timed out after {SUBSYSTEM_TIMEOUT_SECONDS}s"
    except Exception as exc:  # noqa: BLE001 - a probe must never crash the endpoint
        detail = str(exc) or type(exc).__name__
    else:
        return ReadyCheck(name=name, healthy=True)

    log.warning("ready_check_failed", subsystem=name, error=detail)
    return ReadyCheck(name=name, healthy=False, message=f"{error_msg}: {detail}")
# --- Liveness probe ---------------------------------------------------------- #
@router.get(
    "/live",
    summary="Process liveness probe",
    response_model=ReadyResponse,
    responses={
        200: {"description": "Process is alive"},
    },
)
async def liveness_probe() -> JSONResponse:
    """Answer the liveness probe with an unconditional HTTP 200.

    No subsystem is inspected: being able to serve this response at all is
    the signal that the process and its event loop are responsive. The body
    is a ``ReadyResponse`` containing a single passing ``process`` check.
    """
    process_check = ReadyCheck(name="process", healthy=True)
    payload = ReadyResponse(status="ok", checks=[process_check], failed_count=0)
    return JSONResponse(status_code=status.HTTP_200_OK, content=payload.model_dump())
# --- Readiness probe --------------------------------------------------------- #
async def _check_database(app_state: AppStateDep) -> ReadyCheck:
    """Probe the database by opening and immediately closing a connection.

    The runtime settings take precedence over the startup settings when they
    are set. The probe is executed through ``_run_check``, which applies the
    timeout and converts any failure into an unhealthy ``ReadyCheck``.
    """
    from app.config import Settings
    from app.db import open_db

    runtime_settings = app_state.runtime_settings
    active_settings: Settings = runtime_settings if runtime_settings is not None else app_state.settings

    async def _open_and_close() -> None:
        connection = await open_db(active_settings.database_path)
        await connection.close()

    return await _run_check("database", _open_and_close(), "Connection failed")
async def _check_fail2ban(app_state: AppStateDep, server_status: ServerStatusDep) -> ReadyCheck:
    """Report fail2ban reachability from the cached server status snapshot.

    Only ``server_status.online`` is consulted; ``app_state`` is accepted but
    not used by this check.
    """
    if not server_status.online:
        return ReadyCheck(name="fail2ban", healthy=False, message="Socket not reachable")
    return ReadyCheck(name="fail2ban", healthy=True)
async def _check_config_dir(app_state: AppStateDep) -> ReadyCheck:
    """Check that the fail2ban config directory exists and is readable.

    The runtime settings take precedence over the startup settings when they
    are set. The filesystem call runs in a worker thread via
    ``asyncio.to_thread`` and the whole probe is bounded by ``_run_check``.

    Returns:
        A healthy ``ReadyCheck`` when ``os.access(path, os.R_OK)`` succeeds,
        otherwise an unhealthy one.
    """
    from app.config import Settings

    effective_settings: Settings = (
        app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings
    )

    async def _probe() -> None:
        config_path = effective_settings.fail2ban_config_dir
        readable = await asyncio.to_thread(os.access, config_path, os.R_OK)
        # os.access() reports failure through its return value (False for a
        # missing or unreadable path) and never raises for those cases, so the
        # result must be turned into an exception for _run_check to see it.
        if not readable:
            raise OSError(f"no read access to {config_path}")

    return await _run_check(
        "config_dir",
        _probe(),
        "Config directory not readable",
    )
async def _check_scheduler(app_state: AppStateDep) -> ReadyCheck:
    """Report whether the background scheduler exists and is running.

    Outcomes:
        * scheduler present with a truthy ``running`` attribute -> healthy
        * scheduler present but not running -> "Scheduler stopped"
        * scheduler is ``None`` -> "Not initialised"
        * ``AttributeError`` while inspecting the state -> "Not accessible"
    """
    try:
        scheduler = app_state.scheduler
        if scheduler is None:
            return ReadyCheck(name="scheduler", healthy=False, message="Not initialised")
        if getattr(scheduler, "running", False):
            return ReadyCheck(name="scheduler", healthy=True)
        return ReadyCheck(name="scheduler", healthy=False, message="Scheduler stopped")
    except AttributeError:
        return ReadyCheck(name="scheduler", healthy=False, message="Not accessible")
@router.get(
    "/ready",
    summary="Subsystem readiness probe",
    response_model=ReadyResponse,
    responses={
        200: {"description": "All subsystems healthy"},
        503: {"description": "One or more subsystems unreachable"},
    },
)
async def readiness_probe(
    app_state: AppStateDep,
    server_status: ServerStatusDep,
) -> JSONResponse:
    """Run every readiness check concurrently and summarise the results.

    The four checks (database, fail2ban, config directory, scheduler) are
    awaited together with ``asyncio.gather``. The response body always lists
    all four checks along with the number that failed.

    Returns:
        HTTP 200 with ``status="ok"`` when no check failed, otherwise
        HTTP 503 with ``status="error"``.
    """
    results = await asyncio.gather(
        _check_database(app_state),
        _check_fail2ban(app_state, server_status),
        _check_config_dir(app_state),
        _check_scheduler(app_state),
    )
    checks: list[ReadyCheck] = list(results)
    failed_count = len([check for check in checks if not check.healthy])

    if failed_count == 0:
        overall = "ok"
        http_status = status.HTTP_200_OK
    else:
        overall = "error"
        http_status = status.HTTP_503_SERVICE_UNAVAILABLE

    payload = ReadyResponse(status=overall, checks=checks, failed_count=failed_count)
    return JSONResponse(status_code=http_status, content=payload.model_dump())