feat: comprehensive health check with DB, scheduler, cache
- Add /api/v1/health endpoint with component-level checks
- Verify DB connectivity, fail2ban socket, scheduler, session cache
- Add SQLite WAL cleanup on startup (orphan crash files)
- Migration 8: import_log.timestamp → INTEGER UNIX epoch
- Align import_log timestamps with history_archive (already UNIX int)
- Add unit tests for DB cleanup and health router

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -1,43 +1,135 @@
|
||||
"""Health check router.
|
||||
|
||||
A lightweight ``GET /api/health`` endpoint that verifies the application
|
||||
A lightweight ``GET /api/v1/health`` endpoint that verifies the application
|
||||
is running and can serve requests. Also reports the cached fail2ban liveness
|
||||
state so monitoring tools and Docker health checks can observe daemon status
|
||||
without probing the socket directly.
|
||||
|
||||
Comprehensive checks performed:
|
||||
- Database connectivity
|
||||
- fail2ban socket reachability (via cached server_status)
|
||||
- Background scheduler health
|
||||
- Session cache initialization
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Annotated, Literal
|
||||
|
||||
import structlog
|
||||
from fastapi import APIRouter, status
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from app.dependencies import ServerStatusDep
|
||||
from app.models.response import HealthResponse
|
||||
from app.dependencies import AppStateDep, ServerStatusDep
|
||||
from app.models.response import ComponentHealth, HealthResponse
|
||||
|
||||
router: APIRouter = APIRouter(prefix="/api/v1/health", tags=["Health"])
|
||||
|
||||
log: structlog.stdlib.BoundLogger = structlog.get_logger()
|
||||
|
||||
|
||||
@router.get("", summary="Application health check", response_model=HealthResponse)
|
||||
async def health_check(server_status: ServerStatusDep) -> JSONResponse:
|
||||
"""Return application and fail2ban status.
|
||||
async def health_check(
|
||||
app_state: AppStateDep,
|
||||
server_status: ServerStatusDep,
|
||||
) -> JSONResponse:
|
||||
"""Return application and component status.
|
||||
|
||||
Returns HTTP 200 if fail2ban is online, HTTP 503 if offline.
|
||||
Docker health checks interpret 503 as unhealthy and restart the container
|
||||
if fail2ban remains unreachable, ensuring the backend only runs when
|
||||
fail2ban is available.
|
||||
Performs lightweight checks on key application components and returns
|
||||
HTTP 200 while fail2ban is online (overall status ``ok`` or ``degraded``), HTTP 503 if fail2ban is offline.
|
||||
|
||||
Docker/orchestration health checks interpret 503 as unhealthy and restart
|
||||
the container if fail2ban remains unreachable.
|
||||
|
||||
Args:
|
||||
app_state: Injected application state containing runtime components.
|
||||
server_status: Injected cached server status snapshot.
|
||||
|
||||
Returns:
|
||||
HTTP 200 with :class:`~app.models.response.HealthResponse` when fail2ban is online,
|
||||
HTTP 503 with :class:`~app.models.response.HealthResponse` when fail2ban is offline.
|
||||
HTTP 503 with :class:`~app.models.response.HealthResponse` when fail2ban
|
||||
is offline.
|
||||
"""
|
||||
if not server_status.online:
|
||||
return JSONResponse(
|
||||
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
||||
content=HealthResponse(status="unavailable", fail2ban="offline").model_dump(),
|
||||
components: list[ComponentHealth] = []
|
||||
|
||||
# --- Database check ---
|
||||
db_healthy: bool = True
|
||||
try:
|
||||
|
||||
from app.config import Settings
|
||||
from app.db import open_db
|
||||
|
||||
effective_settings: Settings = (
|
||||
app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings
|
||||
)
|
||||
test_db = await open_db(effective_settings.database_path)
|
||||
await test_db.close()
|
||||
except Exception as exc: # pragma: no cover - defensive, all paths logged
|
||||
log.warning("health_check_db_failed", error=str(exc))
|
||||
db_healthy = False
|
||||
components.append(
|
||||
ComponentHealth(name="database", healthy=False, message="Connection failed"),
|
||||
)
|
||||
|
||||
# --- Scheduler check ---
|
||||
scheduler_state: Literal["running", "stopped", "unknown"] = "unknown"
|
||||
try:
|
||||
scheduler = app_state.scheduler
|
||||
if scheduler is not None and getattr(scheduler, "running", False):
|
||||
scheduler_state = "running"
|
||||
elif scheduler is not None:
|
||||
scheduler_state = "stopped"
|
||||
else:
|
||||
scheduler_state = "unknown"
|
||||
components.append(
|
||||
ComponentHealth(name="scheduler", healthy=False, message="Not initialised"),
|
||||
)
|
||||
except Exception: # pragma: no cover - defensive
|
||||
scheduler_state = "unknown"
|
||||
components.append(
|
||||
ComponentHealth(name="scheduler", healthy=False, message="Not accessible"),
|
||||
)
|
||||
|
||||
# --- Cache check ---
|
||||
cache_state: Literal["initialised", "uninitialised"] = "initialised"
|
||||
try:
|
||||
if app_state.session_cache is not None:
|
||||
cache_state = "initialised"
|
||||
else:
|
||||
cache_state = "uninitialised"
|
||||
components.append(
|
||||
ComponentHealth(name="cache", healthy=False, message="Not initialised"),
|
||||
)
|
||||
except Exception: # pragma: no cover - defensive
|
||||
cache_state = "uninitialised"
|
||||
|
||||
# --- fail2ban ---
|
||||
fail2ban_online: bool = server_status.online
|
||||
if not fail2ban_online:
|
||||
components.append(
|
||||
ComponentHealth(name="fail2ban", healthy=False, message="Socket not reachable"),
|
||||
)
|
||||
|
||||
# --- Overall status ---
|
||||
overall_status: Literal["ok", "degraded", "unavailable"]
|
||||
if not fail2ban_online:
|
||||
overall_status = "unavailable"
|
||||
http_status: int = status.HTTP_503_SERVICE_UNAVAILABLE
|
||||
elif components:
|
||||
overall_status = "degraded"
|
||||
http_status = status.HTTP_200_OK
|
||||
else:
|
||||
overall_status = "ok"
|
||||
http_status = status.HTTP_200_OK
|
||||
|
||||
return JSONResponse(
|
||||
status_code=status.HTTP_200_OK,
|
||||
content=HealthResponse(status="ok", fail2ban="online").model_dump(),
|
||||
status_code=http_status,
|
||||
content=HealthResponse(
|
||||
status=overall_status,
|
||||
fail2ban="online" if fail2ban_online else "offline",
|
||||
database="ok" if db_healthy else "error",
|
||||
scheduler=scheduler_state,
|
||||
cache=cache_state,
|
||||
components=components,
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user