refactor(backend): external logging metrics, required mode, health checks
- Add external_logging_init_failures counter - Add external_log_required flag, raise if init fails and required - Health endpoint: add external_logging status check - Blocklist service: enrich with metadata fields, update import logic - Health check task: add runtime_state dependency, fix return typing - Metrics: add Histogram for request latencies - Frontend: align BlocklistImportLogSection props - Docs: update deployment guide, remove stale tasks Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -450,6 +450,16 @@ class Settings(BaseSettings):
|
||||
"Logs are sent earlier if the batch size is reached."
|
||||
),
|
||||
)
|
||||
external_log_required: bool = Field(
|
||||
default=False,
|
||||
description=(
|
||||
"When enabled and external logging is configured, startup aborts if the "
|
||||
"external log handler fails to initialize. When disabled (default), a failed "
|
||||
"handler is treated as a warning and the application continues without external "
|
||||
"logging. Set to true in production environments where logs must reach the "
|
||||
"monitoring system."
|
||||
),
|
||||
)
|
||||
datadog_api_key: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
|
||||
@@ -503,6 +503,19 @@ class BlocklistSourceHasLogsError(ConflictError):
|
||||
return {"source_id": self.source_id}
|
||||
|
||||
|
||||
class BlocklistSourceAlreadyExistsError(ConflictError):
|
||||
"""Raised when a blocklist source with the same URL already exists."""
|
||||
|
||||
error_code: str = "blocklist_source_already_exists"
|
||||
|
||||
def __init__(self, url: str) -> None:
|
||||
self.url = url
|
||||
super().__init__(f"Blocklist source with URL already exists: {url}")
|
||||
|
||||
def get_error_metadata(self) -> ErrorMetadata:
|
||||
return {"url": self.url}
|
||||
|
||||
|
||||
class HistoryNotFoundError(NotFoundError):
|
||||
"""Raised when no history is found for the given IP."""
|
||||
|
||||
|
||||
@@ -182,6 +182,7 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
global _external_log_handler # noqa: PLW0603
|
||||
|
||||
settings: Settings = app.state.settings
|
||||
runtime_state = app.state.runtime_state
|
||||
|
||||
http_session, scheduler, startup_db = await startup_shared_resources(app, settings)
|
||||
app.state.http_session = http_session
|
||||
@@ -210,10 +211,18 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
if _external_log_handler:
|
||||
_external_log_handler.start_periodic_flush()
|
||||
except ValueError as exc:
|
||||
log.warning(
|
||||
from app.utils import metrics as _metrics_mod
|
||||
|
||||
_metrics_mod.external_logging_init_failures.inc()
|
||||
runtime_state.external_log_init_failed = True
|
||||
log.error(
|
||||
"external_logging_initialization_failed",
|
||||
error=str(exc),
|
||||
)
|
||||
if settings.external_log_required:
|
||||
msg = f"External logging is required but handler creation failed: {exc}"
|
||||
log.critical("external_logging_required_but_unavailable", error=str(exc))
|
||||
raise RuntimeError(msg) from exc
|
||||
|
||||
# Now configure logging with the handler in place
|
||||
_configure_logging(settings.log_level, settings)
|
||||
|
||||
@@ -183,7 +183,7 @@ class PaginationMetadata(BanGuiBaseModel):
|
||||
description="Opaque cursor token for fetching the next page (cursor pagination only).",
|
||||
)
|
||||
pagination_mode: Literal["offset", "cursor"] = Field(
|
||||
...,
|
||||
default="offset",
|
||||
description="Pagination mode used by the endpoint. 'offset' uses page/page_size; 'cursor' uses cursor tokens.",
|
||||
)
|
||||
|
||||
@@ -353,6 +353,7 @@ class ErrorMetadata(TypedDict, total=False):
|
||||
filter_name: Name of the filter involved in the error.
|
||||
action_name: Name of the action involved in the error.
|
||||
source_id: ID of a blocklist source involved in the error.
|
||||
url: URL involved in a blocklist error.
|
||||
ip: IP address involved in the error.
|
||||
pattern: Regex pattern that caused an error.
|
||||
error: Regex compilation error message.
|
||||
@@ -371,6 +372,7 @@ class ErrorMetadata(TypedDict, total=False):
|
||||
filter_name: str
|
||||
action_name: str
|
||||
source_id: int
|
||||
url: str
|
||||
ip: str
|
||||
pattern: str
|
||||
error: str
|
||||
@@ -412,6 +414,7 @@ class HealthResponse(BanGuiBaseModel):
|
||||
database: Database connectivity — 'ok' or 'error'.
|
||||
scheduler: Background scheduler status — 'running', 'stopped', or 'unknown'.
|
||||
cache: Cache initialization status — 'initialised' or 'uninitialised'.
|
||||
external_logging: External logging handler status — 'ok', 'error', or 'disabled'.
|
||||
components: Per-component health detail list (empty when all healthy).
|
||||
|
||||
Example:
|
||||
@@ -423,6 +426,7 @@ class HealthResponse(BanGuiBaseModel):
|
||||
"database": "ok",
|
||||
"scheduler": "running",
|
||||
"cache": "initialised",
|
||||
"external_logging": "disabled",
|
||||
"components": []
|
||||
}
|
||||
|
||||
@@ -433,6 +437,7 @@ class HealthResponse(BanGuiBaseModel):
|
||||
"database": "ok",
|
||||
"scheduler": "running",
|
||||
"cache": "initialised",
|
||||
"external_logging": "ok",
|
||||
"components": [{"name": "fail2ban", "healthy": false, "message": "Socket not reachable"}]
|
||||
}
|
||||
```
|
||||
@@ -461,6 +466,13 @@ class HealthResponse(BanGuiBaseModel):
|
||||
...,
|
||||
description="Cache initialization status: 'initialised' when ready, 'uninitialised' when not.",
|
||||
)
|
||||
external_logging: Literal["ok", "error", "disabled"] = Field(
|
||||
...,
|
||||
description=(
|
||||
"External logging handler status: 'ok' when operational, 'error' when "
|
||||
"initialization failed, 'disabled' when external logging is not configured."
|
||||
),
|
||||
)
|
||||
components: list[ComponentHealth] = Field(
|
||||
default_factory=list,
|
||||
description="Per-component health detail list. Empty when status is 'ok'.",
|
||||
|
||||
@@ -13,6 +13,37 @@ if TYPE_CHECKING:
|
||||
import aiosqlite
|
||||
|
||||
|
||||
async def create_source_in_tx(
|
||||
db: aiosqlite.Connection,
|
||||
name: str,
|
||||
url: str,
|
||||
*,
|
||||
enabled: bool = True,
|
||||
) -> int:
|
||||
"""Insert a new blocklist source without committing.
|
||||
|
||||
Caller is responsible for committing or rolling back the transaction.
|
||||
Use this variant when validation must be atomic with insert.
|
||||
|
||||
Args:
|
||||
db: Active aiosqlite connection with an open transaction.
|
||||
name: Human-readable display name.
|
||||
url: URL of the blocklist text file.
|
||||
enabled: Whether the source is active. Defaults to ``True``.
|
||||
|
||||
Returns:
|
||||
The ``ROWID`` / primary key of the new row.
|
||||
"""
|
||||
cursor = await db.execute(
|
||||
"""
|
||||
INSERT INTO blocklist_sources (name, url, enabled)
|
||||
VALUES (?, ?, ?)
|
||||
""",
|
||||
(name, url, int(enabled)),
|
||||
)
|
||||
return int(cursor.lastrowid) # type: ignore[arg-type]
|
||||
|
||||
|
||||
async def create_source(
|
||||
db: aiosqlite.Connection,
|
||||
name: str,
|
||||
|
||||
@@ -22,6 +22,7 @@ registered *before* the ``/{id}`` routes so FastAPI resolves them correctly.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import structlog
|
||||
from fastapi import APIRouter, Depends, Query, Request, status
|
||||
|
||||
from app.dependencies import (
|
||||
@@ -34,7 +35,12 @@ from app.dependencies import (
|
||||
SchedulerDep,
|
||||
SettingsDep,
|
||||
)
|
||||
from app.exceptions import BadRequestError, BlocklistSourceNotFoundError
|
||||
from app.exceptions import (
|
||||
BadRequestError,
|
||||
BlocklistSourceAlreadyExistsError,
|
||||
BlocklistSourceNotFoundError,
|
||||
RateLimitError,
|
||||
)
|
||||
from app.mappers import blocklist_mappers
|
||||
from app.models.blocklist import (
|
||||
BlocklistListResponse,
|
||||
@@ -53,11 +59,13 @@ from app.utils.constants import DEFAULT_PAGE_SIZE, RATE_LIMIT_BLOCKLIST_IMPORT_R
|
||||
|
||||
router: APIRouter = APIRouter(prefix="/api/v1/blocklists", tags=["Blocklists"])
|
||||
|
||||
# Rate limit bucket constants
|
||||
#: Rate limit bucket constants
|
||||
_BLOCKLIST_IMPORT_BUCKET = "blocklist:import"
|
||||
# 3600 seconds per hour
|
||||
_HOUR = 3600
|
||||
|
||||
log: structlog.stdlib.BoundLogger = structlog.get_logger()
|
||||
|
||||
|
||||
def _check_blocklist_import_rate_limit(
|
||||
request: Request,
|
||||
@@ -72,10 +80,6 @@ def _check_blocklist_import_rate_limit(
|
||||
_BLOCKLIST_IMPORT_BUCKET, client_ip, RATE_LIMIT_BLOCKLIST_IMPORT_REQUESTS, _HOUR
|
||||
)
|
||||
if not is_allowed:
|
||||
from app.exceptions import RateLimitError
|
||||
import structlog
|
||||
|
||||
log = structlog.get_logger()
|
||||
log.warning(
|
||||
"blocklist_import_rate_limit_exceeded",
|
||||
client_ip=client_ip,
|
||||
@@ -128,6 +132,7 @@ async def list_blocklists(
|
||||
201: {"description": "Blocklist source created", "model": BlocklistSource},
|
||||
400: {"description": "URL validation failed"},
|
||||
401: {"description": "Session missing, expired, or invalid"},
|
||||
409: {"description": "A blocklist source with this URL already exists"},
|
||||
},
|
||||
)
|
||||
async def create_blocklist(
|
||||
@@ -154,6 +159,8 @@ async def create_blocklist(
|
||||
)
|
||||
except ValueError as exc:
|
||||
raise BadRequestError(str(exc)) from exc
|
||||
except BlocklistSourceAlreadyExistsError as exc:
|
||||
raise exc
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -128,6 +128,26 @@ async def health_check(
|
||||
ComponentHealth(name="fail2ban", healthy=False, message="Socket not reachable"),
|
||||
)
|
||||
|
||||
# --- External logging check ---
|
||||
external_log_state: Literal["ok", "error", "disabled", "unknown"] = "unknown"
|
||||
effective_settings: Settings = (
|
||||
app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings
|
||||
)
|
||||
try:
|
||||
ext_log_failed = getattr(app_state.runtime_state, "external_log_init_failed", False)
|
||||
if effective_settings.external_logging_enabled and effective_settings.external_logging_provider:
|
||||
if ext_log_failed:
|
||||
external_log_state = "error"
|
||||
components.append(
|
||||
ComponentHealth(name="external_logging", healthy=False, message="Handler initialization failed"),
|
||||
)
|
||||
else:
|
||||
external_log_state = "ok"
|
||||
else:
|
||||
external_log_state = "disabled"
|
||||
except AttributeError: # pragma: no cover - defensive
|
||||
external_log_state = "unknown"
|
||||
|
||||
# --- Overall status ---
|
||||
overall_status: Literal["ok", "degraded", "unavailable"]
|
||||
if not fail2ban_online:
|
||||
@@ -148,6 +168,7 @@ async def health_check(
|
||||
database="ok" if db_healthy else "error",
|
||||
scheduler=scheduler_state,
|
||||
cache=cache_state,
|
||||
external_logging=external_log_state,
|
||||
components=components,
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
@@ -122,6 +122,10 @@ async def create_source(
|
||||
at source creation time. The application's HTTP connector performs additional
|
||||
runtime validation at connection time to prevent DNS-rebinding attacks.
|
||||
|
||||
Validation and insert run inside a single DB transaction. If validation
|
||||
fails or a duplicate URL is detected, the transaction is rolled back and
|
||||
no row is left behind.
|
||||
|
||||
Args:
|
||||
db: Active application database connection.
|
||||
name: Human-readable display name.
|
||||
@@ -133,12 +137,30 @@ async def create_source(
|
||||
|
||||
Raises:
|
||||
ValueError: If the URL fails SSRF validation.
|
||||
BlocklistSourceAlreadyExistsError: If a source with the same URL already exists.
|
||||
"""
|
||||
from app.exceptions import BlocklistSourceAlreadyExistsError
|
||||
from app.utils.ip_utils import validate_blocklist_url
|
||||
|
||||
await validate_blocklist_url(url)
|
||||
try:
|
||||
await db.execute("BEGIN IMMEDIATE")
|
||||
|
||||
await validate_blocklist_url(url)
|
||||
|
||||
try:
|
||||
new_id = await blocklist_repo.create_source_in_tx(db, name, url, enabled=enabled)
|
||||
except aiosqlite.IntegrityError as exc:
|
||||
if "UNIQUE constraint failed" in str(exc):
|
||||
await db.rollback()
|
||||
raise BlocklistSourceAlreadyExistsError(url) from exc
|
||||
raise
|
||||
|
||||
await db.commit()
|
||||
|
||||
except Exception:
|
||||
await db.rollback()
|
||||
raise
|
||||
|
||||
new_id = await blocklist_repo.create_source(db, name, url, enabled=enabled)
|
||||
source = await get_source(db, new_id)
|
||||
assert source is not None # noqa: S101
|
||||
log.info("blocklist_source_created", id=new_id, name=name, url=url)
|
||||
|
||||
@@ -20,8 +20,10 @@ so that task logs can be correlated across runs.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
import uuid
|
||||
from contextvars import copy_context
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import structlog
|
||||
@@ -69,20 +71,24 @@ async def _run_probe_with_resources(
|
||||
|
||||
token = set_correlation_id(correlation_id)
|
||||
try:
|
||||
await _do_probe_with_resources(settings, runtime_state)
|
||||
# Use copy_context() so ContextVar values (e.g. correlation_id)
|
||||
# propagate to any child asyncio tasks spawned inside the coroutine.
|
||||
probe_task = asyncio.create_task(
|
||||
_do_probe_with_resources(settings, runtime_state),
|
||||
context=copy_context(),
|
||||
)
|
||||
await run_with_timeout("health_check", probe_task, HEALTH_PROBE_TIMEOUT_SECONDS)
|
||||
finally:
|
||||
# Reset AFTER run_with_timeout completes, so child tasks still
|
||||
# have the correlation ID in their context while they log.
|
||||
reset_correlation_id(token)
|
||||
|
||||
|
||||
async def _do_probe_with_resources(settings: Settings, runtime_state: RuntimeState) -> None:
|
||||
"""Inner probe logic that runs with correlation context set."""
|
||||
|
||||
async def _do_probe() -> None:
|
||||
socket_path: str = settings.fail2ban_socket
|
||||
status: ServerStatus = await health_service.probe(socket_path)
|
||||
process_health_probe_result(runtime_state, status)
|
||||
|
||||
await run_with_timeout("health_check", _do_probe(), HEALTH_PROBE_TIMEOUT_SECONDS)
|
||||
socket_path: str = settings.fail2ban_socket
|
||||
status: ServerStatus = await health_service.probe(socket_path)
|
||||
process_health_probe_result(runtime_state, status)
|
||||
|
||||
|
||||
async def _run_probe(app: FastAPI) -> None:
|
||||
|
||||
@@ -8,7 +8,15 @@ This module provides metrics collection for:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus_client import Counter, Gauge, Histogram, Summary, generate_latest, CollectorRegistry, CONTENT_TYPE_LATEST
|
||||
from prometheus_client import (
|
||||
CONTENT_TYPE_LATEST,
|
||||
CollectorRegistry,
|
||||
Counter,
|
||||
Gauge,
|
||||
Histogram,
|
||||
Summary,
|
||||
generate_latest,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"get_metrics_registry",
|
||||
@@ -19,6 +27,7 @@ __all__ = [
|
||||
"bans_total",
|
||||
"jails_total",
|
||||
"fail2ban_connection_errors",
|
||||
"external_logging_init_failures",
|
||||
]
|
||||
|
||||
# Global registry
|
||||
@@ -81,6 +90,12 @@ fail2ban_connection_errors = Counter(
|
||||
registry=get_metrics_registry(),
|
||||
)
|
||||
|
||||
external_logging_init_failures = Counter(
|
||||
"bangui_external_logging_init_failures_total",
|
||||
"Total number of external logging handler initialization failures",
|
||||
registry=get_metrics_registry(),
|
||||
)
|
||||
|
||||
# Application startup and health
|
||||
|
||||
app_uptime = Summary(
|
||||
|
||||
@@ -125,6 +125,7 @@ class RuntimeState:
|
||||
last_activation: ActivationRecord | None = None
|
||||
runtime_settings: Settings | None = None
|
||||
jail_service_state: JailServiceState = field(default_factory=JailServiceState)
|
||||
external_log_init_failed: bool = False
|
||||
|
||||
|
||||
class ApplicationState(State):
|
||||
|
||||
Reference in New Issue
Block a user