feat: comprehensive health check with DB, scheduler, cache

- Add /api/v1/health endpoint with component-level checks
- Verify DB connectivity, fail2ban socket, scheduler, session cache
- Add SQLite WAL cleanup on startup (orphan crash files)
- Migration 8: import_log.timestamp → INTEGER UNIX epoch
- Align import_log timestamps with history_archive (already UNIX int)
- Add unit tests for DB cleanup and health router

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-02 23:03:57 +02:00
parent b631c1c546
commit 1285bc8571
12 changed files with 472 additions and 241 deletions

View File

@@ -64,7 +64,7 @@ class ImportLogEntry(BanGuiBaseModel):
id: int
source_id: int | None
source_url: str
timestamp: str
timestamp: int
ips_imported: int
ips_skipped: int
errors: str | None

View File

@@ -328,37 +328,87 @@ class ErrorResponse(BanGuiBaseModel):
)
class ComponentHealth(BanGuiBaseModel):
    """Health status of one application component.

    Fields:
        name: Human-readable name of the component.
        healthy: True while the component is operational.
        message: Optional detail text (e.g., an error description);
            defaults to None.
    """

    # Required: identifies which component this entry describes.
    name: str = Field(
        ...,
        description="Component name.",
    )
    # Required: operational flag for the component.
    healthy: bool = Field(
        ...,
        description="True when the component is operational.",
    )
    # Optional free-form detail; None when nothing was supplied.
    message: str | None = Field(default=None, description="Optional detail message, e.g. error description.")
class HealthResponse(BanGuiBaseModel):
    """Standardized response for the health check endpoint.

    Fields:
        status: Application health status — 'ok' when all components are healthy,
            'degraded' when some components are unhealthy but the service can still
            handle requests, 'unavailable' when fail2ban is offline.
        fail2ban: fail2ban daemon status — 'online' or 'offline'.
        database: Database connectivity — 'ok' or 'error'.
        scheduler: Background scheduler status — 'running', 'stopped', or 'unknown'.
        cache: Cache initialization status — 'initialised' or 'uninitialised'.
        components: Per-component health detail list (empty when all healthy).

    Example:
        ```python
        # Healthy (HTTP 200)
        {
            "status": "ok",
            "fail2ban": "online",
            "database": "ok",
            "scheduler": "running",
            "cache": "initialised",
            "components": []
        }

        # Unhealthy (HTTP 503)
        {
            "status": "unavailable",
            "fail2ban": "offline",
            "database": "ok",
            "scheduler": "running",
            "cache": "initialised",
            "components": [{"name": "fail2ban", "healthy": false, "message": "Socket not reachable"}]
        }
        ```
    """

    # Overall verdict; the three-valued Literal supersedes the earlier
    # two-valued ('ok' / 'unavailable') form by adding 'degraded'.
    status: Literal["ok", "degraded", "unavailable"] = Field(
        ...,
        description=(
            "Application health status: 'ok' when healthy, 'degraded' when some "
            "components are unhealthy, 'unavailable' when fail2ban is offline."
        ),
    )
    fail2ban: Literal["online", "offline"] = Field(
        ...,
        description="fail2ban daemon status: 'online' when reachable, 'offline' otherwise.",
    )
    database: Literal["ok", "error"] = Field(
        ...,
        description="Database connectivity: 'ok' when accessible, 'error' when not.",
    )
    scheduler: Literal["running", "stopped", "unknown"] = Field(
        ...,
        description="Background scheduler status: 'running', 'stopped', or 'unknown'.",
    )
    cache: Literal["initialised", "uninitialised"] = Field(
        ...,
        description=(
            "Cache initialization status: 'initialised' when ready, "
            "'uninitialised' when not."
        ),
    )
    # default_factory gives every instance its own fresh list.
    components: list[ComponentHealth] = Field(
        default_factory=list,
        description="Per-component health detail list. Empty when status is 'ok'.",
    )
class FlushLogsResponse(BanGuiBaseModel):