feat: comprehensive health check with DB, scheduler, cache

- Add /api/v1/health endpoint with component-level checks
- Verify DB connectivity, fail2ban socket, scheduler, session cache
- Add SQLite WAL cleanup on startup (orphan crash files)
- Migration 8: import_log.timestamp → INTEGER UNIX epoch
- Align import_log timestamps with history_archive (already UNIX int)
- Add unit tests for DB cleanup and health router

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-02 23:03:57 +02:00
parent b631c1c546
commit 1285bc8571
12 changed files with 472 additions and 241 deletions

View File

@@ -1,43 +1,135 @@
"""Health check router.
A lightweight ``GET /api/v1/health`` endpoint that verifies the application
is running and can serve requests. It also reports the cached fail2ban
liveness state so monitoring tools and Docker health checks can observe
daemon status without probing the socket directly.

Comprehensive checks performed:

- Database connectivity
- fail2ban socket reachability (via cached server_status)
- Background scheduler health
- Session cache initialization
"""
from __future__ import annotations
from typing import Annotated, Literal
import structlog
from fastapi import APIRouter, status
from fastapi.responses import JSONResponse
from app.dependencies import ServerStatusDep
from app.models.response import HealthResponse
from app.dependencies import AppStateDep, ServerStatusDep
from app.models.response import ComponentHealth, HealthResponse
# Router for the health endpoint; routes registered on it are served under
# the ``/api/v1/health`` prefix and tagged "Health".
router: APIRouter = APIRouter(prefix="/api/v1/health", tags=["Health"])
# Module-level structlog logger used by the health check handler.
log: structlog.stdlib.BoundLogger = structlog.get_logger()
@router.get("", summary="Application health check", response_model=HealthResponse)
async def health_check(
    app_state: AppStateDep,
    server_status: ServerStatusDep,
) -> JSONResponse:
    """Return application and component status.

    Performs lightweight checks on the database, the background scheduler,
    the session cache and the cached fail2ban status. Every check that fails
    adds a :class:`~app.models.response.ComponentHealth` entry to the
    ``components`` list of the response.

    The HTTP status code depends on fail2ban only: 503 when the cached server
    status reports it offline, 200 otherwise. Any other failed component
    yields the overall status ``"degraded"`` while still answering HTTP 200.
    Docker/orchestration health checks interpret 503 as unhealthy and restart
    the container if fail2ban remains unreachable.

    Args:
        app_state: Injected application state containing runtime components.
        server_status: Injected cached server status snapshot.

    Returns:
        HTTP 200 with :class:`~app.models.response.HealthResponse` when
        fail2ban is online (status ``"ok"`` or ``"degraded"``),
        HTTP 503 with :class:`~app.models.response.HealthResponse` when
        fail2ban is offline (status ``"unavailable"``).
    """
    components: list[ComponentHealth] = []

    # --- Database check ---
    db_healthy: bool = True
    try:
        # NOTE(review): imported inside the handler as in the original,
        # presumably to avoid an import cycle -- confirm before hoisting.
        from app.config import Settings
        from app.db import open_db

        # Runtime settings take precedence over the startup settings.
        effective_settings: Settings = (
            app_state.runtime_settings
            if app_state.runtime_settings is not None
            else app_state.settings
        )
        # Opening and immediately closing a connection is the whole probe.
        test_db = await open_db(effective_settings.database_path)
        await test_db.close()
    except Exception as exc:  # pragma: no cover - defensive, all paths logged
        log.warning("health_check_db_failed", error=str(exc))
        db_healthy = False
        components.append(
            ComponentHealth(name="database", healthy=False, message="Connection failed"),
        )

    # --- Scheduler check ---
    scheduler_state: Literal["running", "stopped", "unknown"] = "unknown"
    try:
        scheduler = app_state.scheduler
        if scheduler is None:
            components.append(
                ComponentHealth(name="scheduler", healthy=False, message="Not initialised"),
            )
        elif getattr(scheduler, "running", False):
            scheduler_state = "running"
        else:
            scheduler_state = "stopped"
            # A stopped scheduler is unhealthy; without this entry the overall
            # status would still read "ok".
            components.append(
                ComponentHealth(name="scheduler", healthy=False, message="Not running"),
            )
    except Exception as exc:  # pragma: no cover - defensive
        log.warning("health_check_scheduler_failed", error=str(exc))
        scheduler_state = "unknown"
        components.append(
            ComponentHealth(name="scheduler", healthy=False, message="Not accessible"),
        )

    # --- Cache check ---
    cache_state: Literal["initialised", "uninitialised"] = "initialised"
    try:
        if app_state.session_cache is None:
            cache_state = "uninitialised"
            components.append(
                ComponentHealth(name="cache", healthy=False, message="Not initialised"),
            )
    except Exception as exc:  # pragma: no cover - defensive
        log.warning("health_check_cache_failed", error=str(exc))
        cache_state = "uninitialised"
        # Record the failure so the overall status cannot stay "ok".
        components.append(
            ComponentHealth(name="cache", healthy=False, message="Not accessible"),
        )

    # --- fail2ban ---
    fail2ban_online: bool = server_status.online
    if not fail2ban_online:
        components.append(
            ComponentHealth(name="fail2ban", healthy=False, message="Socket not reachable"),
        )

    # --- Overall status ---
    # Only a fail2ban outage changes the HTTP status code; any other failed
    # component downgrades the body to "degraded" but still answers 200.
    overall_status: Literal["ok", "degraded", "unavailable"]
    http_status: int = status.HTTP_200_OK
    if not fail2ban_online:
        overall_status = "unavailable"
        http_status = status.HTTP_503_SERVICE_UNAVAILABLE
    elif components:
        overall_status = "degraded"
    else:
        overall_status = "ok"

    return JSONResponse(
        status_code=http_status,
        content=HealthResponse(
            status=overall_status,
            fail2ban="online" if fail2ban_online else "offline",
            database="ok" if db_healthy else "error",
            scheduler=scheduler_state,
            cache=cache_state,
            components=components,
        ).model_dump(),
    )