refactor(backend): external logging metrics, required mode, health checks

- Add external_logging_init_failures counter
- Add external_log_required flag, raise if init fails and required
- Health endpoint: add external_logging status check
- Blocklist service: enrich with metadata fields, update import logic
- Health check task: add runtime_state dependency, fix return typing
- Metrics: add Histogram for request latencies
- Frontend: align BlocklistImportLogSection props
- Docs: update deployment guide, remove stale tasks

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-04 03:45:13 +02:00
parent 42e177e6ea
commit 0a3f9c6c16
15 changed files with 172 additions and 131 deletions

View File

@@ -450,6 +450,16 @@ class Settings(BaseSettings):
"Logs are sent earlier if the batch size is reached."
),
)
external_log_required: bool = Field(
default=False,
description=(
"When enabled and external logging is configured, startup aborts if the "
"external log handler fails to initialize. When disabled (default), a failed "
"handler is treated as a warning and the application continues without external "
"logging. Set to true in production environments where logs must reach the "
"monitoring system."
),
)
datadog_api_key: str | None = Field(
default=None,
description=(

View File

@@ -503,6 +503,19 @@ class BlocklistSourceHasLogsError(ConflictError):
return {"source_id": self.source_id}
class BlocklistSourceAlreadyExistsError(ConflictError):
"""Raised when a blocklist source with the same URL already exists."""
error_code: str = "blocklist_source_already_exists"
def __init__(self, url: str) -> None:
self.url = url
super().__init__(f"Blocklist source with URL already exists: {url}")
def get_error_metadata(self) -> ErrorMetadata:
return {"url": self.url}
class HistoryNotFoundError(NotFoundError):
"""Raised when no history is found for the given IP."""

View File

@@ -182,6 +182,7 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
global _external_log_handler # noqa: PLW0603
settings: Settings = app.state.settings
runtime_state = app.state.runtime_state
http_session, scheduler, startup_db = await startup_shared_resources(app, settings)
app.state.http_session = http_session
@@ -210,10 +211,18 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
if _external_log_handler:
_external_log_handler.start_periodic_flush()
except ValueError as exc:
log.warning(
from app.utils import metrics as _metrics_mod
_metrics_mod.external_logging_init_failures.inc()
runtime_state.external_log_init_failed = True
log.error(
"external_logging_initialization_failed",
error=str(exc),
)
if settings.external_log_required:
msg = f"External logging is required but handler creation failed: {exc}"
log.critical("external_logging_required_but_unavailable", error=str(exc))
raise RuntimeError(msg) from exc
# Now configure logging with the handler in place
_configure_logging(settings.log_level, settings)

View File

@@ -183,7 +183,7 @@ class PaginationMetadata(BanGuiBaseModel):
description="Opaque cursor token for fetching the next page (cursor pagination only).",
)
pagination_mode: Literal["offset", "cursor"] = Field(
...,
default="offset",
description="Pagination mode used by the endpoint. 'offset' uses page/page_size; 'cursor' uses cursor tokens.",
)
@@ -353,6 +353,7 @@ class ErrorMetadata(TypedDict, total=False):
filter_name: Name of the filter involved in the error.
action_name: Name of the action involved in the error.
source_id: ID of a blocklist source involved in the error.
url: URL involved in a blocklist error.
ip: IP address involved in the error.
pattern: Regex pattern that caused an error.
error: Regex compilation error message.
@@ -371,6 +372,7 @@ class ErrorMetadata(TypedDict, total=False):
filter_name: str
action_name: str
source_id: int
url: str
ip: str
pattern: str
error: str
@@ -412,6 +414,7 @@ class HealthResponse(BanGuiBaseModel):
database: Database connectivity — 'ok' or 'error'.
scheduler: Background scheduler status — 'running', 'stopped', or 'unknown'.
cache: Cache initialization status — 'initialised' or 'uninitialised'.
external_logging: External logging handler status — 'ok', 'error', or 'disabled'.
components: Per-component health detail list (empty when all healthy).
Example:
@@ -423,6 +426,7 @@ class HealthResponse(BanGuiBaseModel):
"database": "ok",
"scheduler": "running",
"cache": "initialised",
"external_logging": "disabled",
"components": []
}
@@ -433,6 +437,7 @@ class HealthResponse(BanGuiBaseModel):
"database": "ok",
"scheduler": "running",
"cache": "initialised",
"external_logging": "ok",
"components": [{"name": "fail2ban", "healthy": false, "message": "Socket not reachable"}]
}
```
@@ -461,6 +466,13 @@ class HealthResponse(BanGuiBaseModel):
...,
description="Cache initialization status: 'initialised' when ready, 'uninitialised' when not.",
)
external_logging: Literal["ok", "error", "disabled"] = Field(
...,
description=(
"External logging handler status: 'ok' when operational, 'error' when "
"initialization failed, 'disabled' when external logging is not configured."
),
)
components: list[ComponentHealth] = Field(
default_factory=list,
description="Per-component health detail list. Empty when status is 'ok'.",

View File

@@ -13,6 +13,37 @@ if TYPE_CHECKING:
import aiosqlite
async def create_source_in_tx(
db: aiosqlite.Connection,
name: str,
url: str,
*,
enabled: bool = True,
) -> int:
"""Insert a new blocklist source without committing.
Caller is responsible for committing or rolling back the transaction.
Use this variant when validation must be atomic with insert.
Args:
db: Active aiosqlite connection with an open transaction.
name: Human-readable display name.
url: URL of the blocklist text file.
enabled: Whether the source is active. Defaults to ``True``.
Returns:
The ``ROWID`` / primary key of the new row.
"""
cursor = await db.execute(
"""
INSERT INTO blocklist_sources (name, url, enabled)
VALUES (?, ?, ?)
""",
(name, url, int(enabled)),
)
return int(cursor.lastrowid) # type: ignore[arg-type]
async def create_source(
db: aiosqlite.Connection,
name: str,

View File

@@ -22,6 +22,7 @@ registered *before* the ``/{id}`` routes so FastAPI resolves them correctly.
from __future__ import annotations
import structlog
from fastapi import APIRouter, Depends, Query, Request, status
from app.dependencies import (
@@ -34,7 +35,12 @@ from app.dependencies import (
SchedulerDep,
SettingsDep,
)
from app.exceptions import BadRequestError, BlocklistSourceNotFoundError
from app.exceptions import (
BadRequestError,
BlocklistSourceAlreadyExistsError,
BlocklistSourceNotFoundError,
RateLimitError,
)
from app.mappers import blocklist_mappers
from app.models.blocklist import (
BlocklistListResponse,
@@ -53,11 +59,13 @@ from app.utils.constants import DEFAULT_PAGE_SIZE, RATE_LIMIT_BLOCKLIST_IMPORT_R
router: APIRouter = APIRouter(prefix="/api/v1/blocklists", tags=["Blocklists"])
# Rate limit bucket constants
#: Rate limit bucket constants
_BLOCKLIST_IMPORT_BUCKET = "blocklist:import"
# 3600 seconds per hour
_HOUR = 3600
log: structlog.stdlib.BoundLogger = structlog.get_logger()
def _check_blocklist_import_rate_limit(
request: Request,
@@ -72,10 +80,6 @@ def _check_blocklist_import_rate_limit(
_BLOCKLIST_IMPORT_BUCKET, client_ip, RATE_LIMIT_BLOCKLIST_IMPORT_REQUESTS, _HOUR
)
if not is_allowed:
from app.exceptions import RateLimitError
import structlog
log = structlog.get_logger()
log.warning(
"blocklist_import_rate_limit_exceeded",
client_ip=client_ip,
@@ -128,6 +132,7 @@ async def list_blocklists(
201: {"description": "Blocklist source created", "model": BlocklistSource},
400: {"description": "URL validation failed"},
401: {"description": "Session missing, expired, or invalid"},
409: {"description": "A blocklist source with this URL already exists"},
},
)
async def create_blocklist(
@@ -154,6 +159,8 @@ async def create_blocklist(
)
except ValueError as exc:
raise BadRequestError(str(exc)) from exc
except BlocklistSourceAlreadyExistsError as exc:
raise exc
# ---------------------------------------------------------------------------

View File

@@ -128,6 +128,26 @@ async def health_check(
ComponentHealth(name="fail2ban", healthy=False, message="Socket not reachable"),
)
# --- External logging check ---
external_log_state: Literal["ok", "error", "disabled", "unknown"] = "unknown"
effective_settings: Settings = (
app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings
)
try:
ext_log_failed = getattr(app_state.runtime_state, "external_log_init_failed", False)
if effective_settings.external_logging_enabled and effective_settings.external_logging_provider:
if ext_log_failed:
external_log_state = "error"
components.append(
ComponentHealth(name="external_logging", healthy=False, message="Handler initialization failed"),
)
else:
external_log_state = "ok"
else:
external_log_state = "disabled"
except AttributeError: # pragma: no cover - defensive
external_log_state = "unknown"
# --- Overall status ---
overall_status: Literal["ok", "degraded", "unavailable"]
if not fail2ban_online:
@@ -148,6 +168,7 @@ async def health_check(
database="ok" if db_healthy else "error",
scheduler=scheduler_state,
cache=cache_state,
external_logging=external_log_state,
components=components,
).model_dump(),
)

View File

@@ -122,6 +122,10 @@ async def create_source(
at source creation time. The application's HTTP connector performs additional
runtime validation at connection time to prevent DNS-rebinding attacks.
Validation and insert run inside a single DB transaction. If validation
fails or a duplicate URL is detected, the transaction is rolled back and
no row is left behind.
Args:
db: Active application database connection.
name: Human-readable display name.
@@ -133,12 +137,30 @@ async def create_source(
Raises:
ValueError: If the URL fails SSRF validation.
BlocklistSourceAlreadyExistsError: If a source with the same URL already exists.
"""
from app.exceptions import BlocklistSourceAlreadyExistsError
from app.utils.ip_utils import validate_blocklist_url
await validate_blocklist_url(url)
try:
await db.execute("BEGIN IMMEDIATE")
await validate_blocklist_url(url)
try:
new_id = await blocklist_repo.create_source_in_tx(db, name, url, enabled=enabled)
except aiosqlite.IntegrityError as exc:
if "UNIQUE constraint failed" in str(exc):
await db.rollback()
raise BlocklistSourceAlreadyExistsError(url) from exc
raise
await db.commit()
except Exception:
await db.rollback()
raise
new_id = await blocklist_repo.create_source(db, name, url, enabled=enabled)
source = await get_source(db, new_id)
assert source is not None # noqa: S101
log.info("blocklist_source_created", id=new_id, name=name, url=url)

View File

@@ -20,8 +20,10 @@ so that task logs can be correlated across runs.
from __future__ import annotations
import asyncio
import datetime
import uuid
from contextvars import copy_context
from typing import TYPE_CHECKING
import structlog
@@ -69,20 +71,24 @@ async def _run_probe_with_resources(
token = set_correlation_id(correlation_id)
try:
await _do_probe_with_resources(settings, runtime_state)
# Use copy_context() so ContextVar values (e.g. correlation_id)
# propagate to any child asyncio tasks spawned inside the coroutine.
probe_task = asyncio.create_task(
_do_probe_with_resources(settings, runtime_state),
context=copy_context(),
)
await run_with_timeout("health_check", probe_task, HEALTH_PROBE_TIMEOUT_SECONDS)
finally:
# Reset AFTER run_with_timeout completes, so child tasks still
# have the correlation ID in their context while they log.
reset_correlation_id(token)
async def _do_probe_with_resources(settings: Settings, runtime_state: RuntimeState) -> None:
"""Inner probe logic that runs with correlation context set."""
async def _do_probe() -> None:
socket_path: str = settings.fail2ban_socket
status: ServerStatus = await health_service.probe(socket_path)
process_health_probe_result(runtime_state, status)
await run_with_timeout("health_check", _do_probe(), HEALTH_PROBE_TIMEOUT_SECONDS)
socket_path: str = settings.fail2ban_socket
status: ServerStatus = await health_service.probe(socket_path)
process_health_probe_result(runtime_state, status)
async def _run_probe(app: FastAPI) -> None:

View File

@@ -8,7 +8,15 @@ This module provides metrics collection for:
from __future__ import annotations
from prometheus_client import Counter, Gauge, Histogram, Summary, generate_latest, CollectorRegistry, CONTENT_TYPE_LATEST
from prometheus_client import (
CONTENT_TYPE_LATEST,
CollectorRegistry,
Counter,
Gauge,
Histogram,
Summary,
generate_latest,
)
__all__ = [
"get_metrics_registry",
@@ -19,6 +27,7 @@ __all__ = [
"bans_total",
"jails_total",
"fail2ban_connection_errors",
"external_logging_init_failures",
]
# Global registry
@@ -81,6 +90,12 @@ fail2ban_connection_errors = Counter(
registry=get_metrics_registry(),
)
external_logging_init_failures = Counter(
"bangui_external_logging_init_failures_total",
"Total number of external logging handler initialization failures",
registry=get_metrics_registry(),
)
# Application startup and health
app_uptime = Summary(

View File

@@ -125,6 +125,7 @@ class RuntimeState:
last_activation: ActivationRecord | None = None
runtime_settings: Settings | None = None
jail_service_state: JailServiceState = field(default_factory=JailServiceState)
external_log_init_failed: bool = False
class ApplicationState(State):