feat: comprehensive health check with DB, scheduler, cache

- Add /api/v1/health endpoint with component-level checks
- Verify DB connectivity, fail2ban socket, scheduler, session cache
- Add SQLite WAL cleanup on startup (orphan crash files)
- Migration 8: import_log.timestamp → INTEGER UNIX epoch
- Align import_log timestamps with history_archive (already UNIX int)
- Add unit tests for DB cleanup and health router

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-02 23:03:57 +02:00
parent b631c1c546
commit 1285bc8571
12 changed files with 472 additions and 241 deletions

View File

@@ -9,6 +9,10 @@ The fail2ban database is separate and is accessed read-only by the history
and ban services.
"""
from __future__ import annotations
from pathlib import Path
import aiosqlite
import structlog
@@ -107,7 +111,7 @@ _SCHEMA_STATEMENTS: list[str] = [
_CREATE_HISTORY_ARCHIVE,
]
_CURRENT_SCHEMA_VERSION: int = 7
_CURRENT_SCHEMA_VERSION: int = 8
_MIGRATIONS: dict[int, str] = {
1: "\n".join(_SCHEMA_STATEMENTS),
@@ -201,6 +205,17 @@ CREATE INDEX IF NOT EXISTS idx_import_log_id_desc
-- Composite index for source_id + id DESC ordering (filtered pagination)
CREATE INDEX IF NOT EXISTS idx_import_log_source_id_desc
ON import_log (source_id, id DESC);
""",
8: """
-- Migration 8: Migrate import_log.timestamp from TEXT ISO 8601 to INTEGER UNIX epoch.
-- Standardizes all BanGUI timestamps on INTEGER UNIX (seconds since epoch).
-- This aligns import_log with history_archive which already uses INTEGER timeofban.
-- TEXT ISO 8601: "2024-06-15T13:45:00.000Z"
-- INTEGER UNIX: 1718459100
ALTER TABLE import_log ADD COLUMN timestamp_unix INTEGER;
UPDATE import_log SET timestamp_unix = strftime('%s', timestamp);
ALTER TABLE import_log DROP COLUMN timestamp;
ALTER TABLE import_log RENAME COLUMN timestamp_unix TO timestamp;
""",
}
@@ -218,6 +233,31 @@ async def _configure_connection(db: aiosqlite.Connection) -> None:
await db.execute("PRAGMA busy_timeout=5000;")
async def _cleanup_wal_files(db_path: str) -> None:
    """Remove ``-wal`` / ``-shm`` side files that no longer belong to a database.

    The side files are only deleted when the main database file at *db_path*
    does not exist, i.e. when they are genuinely orphaned. When the database
    file is present the side files are left untouched: the ``-wal`` file may
    hold committed transactions that have not been checkpointed yet, and it
    may belong to a connection that is currently open. Deleting it in either
    case would lose data, so recovery is left to SQLite, which replays or
    resets the WAL itself when the database is opened.

    Failures to delete a file are logged and otherwise ignored; this cleanup
    is best-effort and must never prevent the database from being opened.

    Args:
        db_path: Path to the database file.
    """
    if Path(db_path).exists():
        # The WAL is part of the database's persistent state - never unlink it
        # while the database it belongs to is still there.
        return

    for suffix in ("-wal", "-shm"):
        side_file = Path(db_path + suffix)
        try:
            side_file.unlink()
        except FileNotFoundError:
            # Nothing to clean up for this suffix.
            continue
        except OSError as exc:
            # File in use or permission denied: report it, but carry on.
            log.warning(
                "orphaned_sqlite_file_remove_failed",
                path=str(side_file),
                error=str(exc),
            )
            continue
        log.warning("orphaned_sqlite_file_removed", path=str(side_file))
async def _get_current_schema_version(db: aiosqlite.Connection) -> int:
"""Return the highest applied schema version for the given database."""
await db.execute(_CREATE_SCHEMA_MIGRATIONS)
@@ -380,6 +420,7 @@ async def open_db(database_path: str) -> aiosqlite.Connection:
Returns:
A configured :class:`aiosqlite.Connection` instance.
"""
await _cleanup_wal_files(database_path)
db = await aiosqlite.connect(database_path)
db.row_factory = aiosqlite.Row
await _configure_connection(db)

View File

@@ -64,7 +64,7 @@ class ImportLogEntry(BanGuiBaseModel):
id: int
source_id: int | None
source_url: str
timestamp: str
timestamp: int
ips_imported: int
ips_skipped: int
errors: str | None

View File

@@ -328,37 +328,87 @@ class ErrorResponse(BanGuiBaseModel):
)
class ComponentHealth(BanGuiBaseModel):
    """Health report for one application component.

    Fields:
        name: Name identifying the component.
        healthy: Whether the component is currently operational.
        message: Optional free-text detail, such as an error description.
    """

    name: str = Field(
        ...,
        description="Component name.",
    )
    healthy: bool = Field(
        ...,
        description="True when the component is operational.",
    )
    message: str | None = Field(default=None, description="Optional detail message, e.g. error description.")
class HealthResponse(BanGuiBaseModel):
"""Standardized response for the health check endpoint.
Fields:
status: Application health status — 'ok' when healthy, 'unavailable' otherwise.
status: Application health status — 'ok' when all components are healthy,
'degraded' when some components are unhealthy but the service can still
handle requests, 'unavailable' when fail2ban is offline.
fail2ban: fail2ban daemon status — 'online' or 'offline'.
database: Database connectivity — 'ok' or 'error'.
scheduler: Background scheduler status — 'running', 'stopped', or 'unknown'.
cache: Cache initialization status — 'initialised' or 'uninitialised'.
components: Per-component health detail list (empty when all healthy).
Example:
```python
# Healthy (HTTP 200)
{
"status": "ok",
"fail2ban": "online"
"fail2ban": "online",
"database": "ok",
"scheduler": "running",
"cache": "initialised",
"components": []
}
# Unhealthy (HTTP 503)
{
"status": "unavailable",
"fail2ban": "offline"
"fail2ban": "offline",
"database": "ok",
"scheduler": "running",
"cache": "initialised",
"components": [{"name": "fail2ban", "healthy": false, "message": "Socket not reachable"}]
}
```
"""
status: Literal["ok", "unavailable"] = Field(
status: Literal["ok", "degraded", "unavailable"] = Field(
...,
description="Application health status: 'ok' when healthy, 'unavailable' otherwise.",
description=(
"Application health status: 'ok' when healthy, 'degraded' when some "
"components are unhealthy, 'unavailable' when fail2ban is offline."
),
)
fail2ban: Literal["online", "offline"] = Field(
...,
description="fail2ban daemon status: 'online' when reachable, 'offline' otherwise.",
)
database: Literal["ok", "error"] = Field(
...,
description="Database connectivity: 'ok' when accessible, 'error' when not.",
)
scheduler: Literal["running", "stopped", "unknown"] = Field(
...,
description="Background scheduler status: 'running', 'stopped', or 'unknown'.",
)
cache: Literal["initialised", "uninitialised"] = Field(
...,
description="Cache initialization status: 'initialised' when ready, 'uninitialised' when not.",
)
components: list[ComponentHealth] = Field(
default_factory=list,
description="Per-component health detail list. Empty when status is 'ok'.",
)
class FlushLogsResponse(BanGuiBaseModel):

View File

@@ -50,12 +50,15 @@ async def add_log(
Returns:
Primary key of the inserted row.
"""
import time
timestamp_unix: int = int(time.time())
cursor = await db.execute(
"""
INSERT INTO import_log (source_id, source_url, ips_imported, ips_skipped, errors)
VALUES (?, ?, ?, ?, ?)
INSERT INTO import_log (source_id, source_url, timestamp, ips_imported, ips_skipped, errors)
VALUES (?, ?, ?, ?, ?, ?)
""",
(source_id, source_url, ips_imported, ips_skipped, errors),
(source_id, source_url, timestamp_unix, ips_imported, ips_skipped, errors),
)
await db.commit()
return int(cursor.lastrowid) # type: ignore[arg-type]

View File

@@ -1,43 +1,135 @@
"""Health check router.
A lightweight ``GET /api/health`` endpoint that verifies the application
A lightweight ``GET /api/v1/health`` endpoint that verifies the application
is running and can serve requests. Also reports the cached fail2ban liveness
state so monitoring tools and Docker health checks can observe daemon status
without probing the socket directly.
Comprehensive checks performed:
- Database connectivity
- fail2ban socket reachability (via cached server_status)
- Background scheduler health
- Session cache initialization
"""
from __future__ import annotations
from typing import Annotated, Literal
import structlog
from fastapi import APIRouter, status
from fastapi.responses import JSONResponse
from app.dependencies import ServerStatusDep
from app.models.response import HealthResponse
from app.dependencies import AppStateDep, ServerStatusDep
from app.models.response import ComponentHealth, HealthResponse
router: APIRouter = APIRouter(prefix="/api/v1/health", tags=["Health"])
log: structlog.stdlib.BoundLogger = structlog.get_logger()
@router.get("", summary="Application health check", response_model=HealthResponse)
async def health_check(
    app_state: AppStateDep,
    server_status: ServerStatusDep,
) -> JSONResponse:
    """Return application and component status.

    Runs a lightweight check on each component (database, scheduler, session
    cache, fail2ban) and records every unhealthy one in ``components``.

    The overall status is derived as follows:

    * ``unavailable`` (HTTP 503) when fail2ban is offline.
    * ``degraded`` (HTTP 200) when fail2ban is online but at least one other
      component is unhealthy.
    * ``ok`` (HTTP 200) when no component reported a problem.

    Args:
        app_state: Injected application state containing runtime components.
        server_status: Injected cached server status snapshot.

    Returns:
        HTTP 200 with :class:`~app.models.response.HealthResponse` when the
        status is ``ok`` or ``degraded``, HTTP 503 with the same model when
        fail2ban is offline.
    """
    # Imported at call time, as in the original handler.
    from app.config import Settings
    from app.db import open_db

    components: list[ComponentHealth] = []

    # --- Database check ---
    db_healthy: bool = True
    try:
        effective_settings: Settings = (
            app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings
        )
        test_db = await open_db(effective_settings.database_path)
        await test_db.close()
    except Exception as exc:  # pragma: no cover - defensive, all paths logged
        log.warning("health_check_db_failed", error=str(exc))
        db_healthy = False
        components.append(
            ComponentHealth(name="database", healthy=False, message="Connection failed"),
        )

    # --- Scheduler check ---
    scheduler_state: Literal["running", "stopped", "unknown"] = "unknown"
    try:
        scheduler = app_state.scheduler
        if scheduler is None:
            components.append(
                ComponentHealth(name="scheduler", healthy=False, message="Not initialised"),
            )
        elif getattr(scheduler, "running", False):
            scheduler_state = "running"
        else:
            # A stopped scheduler is unhealthy too; without this entry the
            # overall status would stay "ok" while reporting "stopped".
            scheduler_state = "stopped"
            components.append(
                ComponentHealth(name="scheduler", healthy=False, message="Not running"),
            )
    except Exception:  # pragma: no cover - defensive
        scheduler_state = "unknown"
        components.append(
            ComponentHealth(name="scheduler", healthy=False, message="Not accessible"),
        )

    # --- Cache check ---
    cache_state: Literal["initialised", "uninitialised"] = "initialised"
    try:
        if app_state.session_cache is None:
            cache_state = "uninitialised"
            components.append(
                ComponentHealth(name="cache", healthy=False, message="Not initialised"),
            )
    except Exception:  # pragma: no cover - defensive
        # Record the failure so the overall status reflects it, matching the
        # scheduler branch above.
        cache_state = "uninitialised"
        components.append(
            ComponentHealth(name="cache", healthy=False, message="Not accessible"),
        )

    # --- fail2ban ---
    fail2ban_online: bool = server_status.online
    if not fail2ban_online:
        components.append(
            ComponentHealth(name="fail2ban", healthy=False, message="Socket not reachable"),
        )

    # --- Overall status ---
    overall_status: Literal["ok", "degraded", "unavailable"]
    http_status: int = status.HTTP_200_OK
    if not fail2ban_online:
        overall_status = "unavailable"
        http_status = status.HTTP_503_SERVICE_UNAVAILABLE
    elif components:
        overall_status = "degraded"
    else:
        overall_status = "ok"

    return JSONResponse(
        status_code=http_status,
        content=HealthResponse(
            status=overall_status,
            fail2ban="online" if fail2ban_online else "offline",
            database="ok" if db_healthy else "error",
            scheduler=scheduler_state,
            cache=cache_state,
            components=components,
        ).model_dump(),
    )

View File

@@ -62,12 +62,19 @@ async def client(test_settings: Settings) -> AsyncClient: # type: ignore[misc]
Yields:
An :class:`httpx.AsyncClient` with ``base_url="http://test"``.
"""
from unittest.mock import MagicMock
app = create_app(settings=test_settings)
# Ensure fail2ban is reported as online for tests (mock socket is not
# actually connected so we need to set the cached status manually).
app.state.server_status = ServerStatus(online=True)
# Mock scheduler for health check tests (lifespan not run in ASGITransport tests)
mock_scheduler = MagicMock()
mock_scheduler.running = True
app.state.scheduler = mock_scheduler
# Bootstrap the database schema before making requests. ASGITransport
# does not run the application lifespan, so we create the test SQLite file
# directly rather than relying on startup logic.

View File

@@ -7,6 +7,7 @@ import pytest
from app.db import (
_apply_migration,
_cleanup_wal_files,
_parse_migration_statements,
init_db,
open_db,
@@ -241,3 +242,32 @@ async def test_init_db_idempotent(tmp_path: Path) -> None:
finally:
await db.close()
async def test_cleanup_wal_files_removes_orphaned_files(tmp_path: Path) -> None:
    """Orphaned ``-wal`` and ``-shm`` side files are deleted by the cleanup."""
    db_path = str(tmp_path / "test_wal.db")
    side_files = [Path(db_path + suffix) for suffix in ("-wal", "-shm")]

    # Plant both side files; the database file itself is never created.
    for side_file in side_files:
        side_file.write_text("orphan")
    for side_file in side_files:
        assert side_file.exists()

    await _cleanup_wal_files(db_path)

    # Neither side file may survive the cleanup.
    for side_file in side_files:
        assert not side_file.exists()
async def test_cleanup_wal_files_handles_missing_files(tmp_path: Path) -> None:
    """Cleanup is a silent no-op when no side files exist on disk."""
    missing_db = tmp_path / "nonexistent.db"

    # The call completing without an exception is the whole assertion.
    await _cleanup_wal_files(str(missing_db))

View File

@@ -8,15 +8,14 @@ from app.models.server import ServerStatus
@pytest.mark.asyncio
async def test_health_check_returns_200_when_online(client: AsyncClient) -> None:
"""``GET /api/health`` must return HTTP 200 when fail2ban is online."""
client._transport.app.state.server_status = ServerStatus(online=True)
"""``GET /api/v1/health`` must return HTTP 200 when fail2ban is online."""
response = await client.get("/api/v1/health")
assert response.status_code == 200
@pytest.mark.asyncio
async def test_health_check_returns_503_when_offline(client: AsyncClient) -> None:
"""``GET /api/health`` must return HTTP 503 when fail2ban is offline."""
"""``GET /api/v1/health`` must return HTTP 503 when fail2ban is offline."""
client._transport.app.state.server_status = ServerStatus(online=False)
response = await client.get("/api/v1/health")
assert response.status_code == 503
@@ -24,27 +23,84 @@ async def test_health_check_returns_503_when_offline(client: AsyncClient) -> Non
@pytest.mark.asyncio
async def test_health_check_returns_ok_status_when_online(client: AsyncClient) -> None:
"""``GET /api/health`` must contain ``status: ok`` when fail2ban is online."""
client._transport.app.state.server_status = ServerStatus(online=True)
"""``GET /api/v1/health`` must contain ``status: ok`` when fail2ban is online."""
response = await client.get("/api/v1/health")
data: dict[str, str] = response.json()
data: dict[str, object] = response.json()
assert data["status"] == "ok"
assert data["fail2ban"] == "online"
@pytest.mark.asyncio
async def test_health_check_returns_unavailable_when_offline(client: AsyncClient) -> None:
"""``GET /api/health`` must contain ``status: unavailable`` when fail2ban is offline."""
"""``GET /api/v1/health`` must contain ``status: unavailable`` when fail2ban is offline."""
client._transport.app.state.server_status = ServerStatus(online=False)
response = await client.get("/api/v1/health")
data: dict[str, str] = response.json()
data: dict[str, object] = response.json()
assert data["status"] == "unavailable"
assert data["fail2ban"] == "offline"
@pytest.mark.asyncio
async def test_health_check_content_type_is_json(client: AsyncClient) -> None:
"""``GET /api/health`` must set the ``Content-Type`` header to JSON."""
"""``GET /api/v1/health`` must set the ``Content-Type`` header to JSON."""
response = await client.get("/api/v1/health")
assert "application/json" in response.headers.get("content-type", "")
@pytest.mark.asyncio
async def test_health_check_includes_database_status(client: AsyncClient) -> None:
    """The health payload exposes a ``database`` field with a known value."""
    payload: dict[str, object] = (await client.get("/api/v1/health")).json()

    assert "database" in payload
    allowed_values = ("ok", "error")
    assert payload["database"] in allowed_values
@pytest.mark.asyncio
async def test_health_check_includes_scheduler_status(client: AsyncClient) -> None:
    """The health payload exposes a ``scheduler`` field with a known value."""
    payload: dict[str, object] = (await client.get("/api/v1/health")).json()

    assert "scheduler" in payload
    allowed_values = ("running", "stopped", "unknown")
    assert payload["scheduler"] in allowed_values
@pytest.mark.asyncio
async def test_health_check_includes_cache_status(client: AsyncClient) -> None:
    """The health payload exposes a ``cache`` field with a known value."""
    payload: dict[str, object] = (await client.get("/api/v1/health")).json()

    assert "cache" in payload
    allowed_values = ("initialised", "uninitialised")
    assert payload["cache"] in allowed_values
@pytest.mark.asyncio
async def test_health_check_includes_components_list(client: AsyncClient) -> None:
    """The health payload carries a ``components`` entry that is a list."""
    payload: dict[str, object] = (await client.get("/api/v1/health")).json()

    assert "components" in payload
    component_entries = payload["components"]
    assert isinstance(component_entries, list)
@pytest.mark.asyncio
async def test_health_check_offline_adds_fail2ban_to_components(
    client: AsyncClient,
) -> None:
    """An offline fail2ban is listed as an unhealthy entry in ``components``."""
    # Flip the cached status the endpoint reads to "offline" before the request.
    app_under_test = client._transport.app
    app_under_test.state.server_status = ServerStatus(online=False)

    payload: dict[str, object] = (await client.get("/api/v1/health")).json()
    assert payload["status"] == "unavailable"

    entries: list[dict[str, object]] = payload["components"]  # type: ignore[assignment]
    unhealthy_fail2ban = [
        entry
        for entry in entries
        if entry.get("name") == "fail2ban" and entry.get("healthy") is False
    ]
    assert unhealthy_fail2ban
@pytest.mark.asyncio
async def test_health_check_online_returns_empty_components(client: AsyncClient) -> None:
    """With every component healthy the status is ``ok`` and ``components`` is empty."""
    payload: dict[str, object] = (await client.get("/api/v1/health")).json()

    overall, entries = payload["status"], payload["components"]
    assert overall == "ok"
    assert entries == []