From 0a3f9c6c169fa30e18b8430a625cf774036b15fe Mon Sep 17 00:00:00 2001 From: Lukas Date: Mon, 4 May 2026 03:45:13 +0200 Subject: [PATCH] refactor(backend): external logging metrics, required mode, health checks - Add external_logging_init_failures counter - Add external_log_required flag, raise if init fails and required - Health endpoint: add external_logging status check - Blocklist service: enrich with metadata fields, update import logic - Health check task: add runtime_state dependency, fix return typing - Metrics: add Histogram for request latencies - Frontend: align BlocklistImportLogSection props - Docs: update deployment guide, remove stale tasks Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Docs/Deployment.md | 1 + Docs/Tasks.md | 111 ------------------ backend/app/config.py | 10 ++ backend/app/exceptions.py | 13 ++ backend/app/main.py | 11 +- backend/app/models/response.py | 14 ++- backend/app/repositories/blocklist_repo.py | 31 +++++ backend/app/routers/blocklist.py | 19 ++- backend/app/routers/health.py | 21 ++++ backend/app/services/blocklist_service.py | 26 +++- backend/app/tasks/health_check.py | 22 ++-- backend/app/utils/metrics.py | 17 ++- backend/app/utils/runtime_state.py | 1 + .../blocklist/BlocklistImportLogSection.tsx | 2 +- frontend/src/types/response.ts | 4 + 15 files changed, 172 insertions(+), 131 deletions(-) diff --git a/Docs/Deployment.md b/Docs/Deployment.md index 9ba07ce..4e55a52 100644 --- a/Docs/Deployment.md +++ b/Docs/Deployment.md @@ -97,6 +97,7 @@ monitoring integration: | database | Opens and closes a test connection | Returns degraded when failing | | scheduler | `scheduler.running` attribute | Returns degraded when stopped | | cache | Session cache presence | Returns degraded when not initialised | +| external_logging | Handler initialization status | Returns degraded when failed | ### Kubernetes Probes — Liveness and Readiness diff --git a/Docs/Tasks.md b/Docs/Tasks.md index 3974c29..4ab9efe 100644 --- a/Docs/Tasks.md +++ b/Docs/Tasks.md @@ -1,114 +1,3 @@ -### Issue #60: MEDIUM - NavigationCancellationProvider Orphans Requests on Rapid Navigation - -**Where found**: -- `frontend/src/providers/NavigationCancellationProvider.tsx` -- `frontend/src/hooks/useNavigationAbortSignal.ts:42-52` - -**Why this is needed**: -When a user navigates A → B → C rapidly, B's in-flight requests are not cancelled because B's signal is replaced before B's requests check it. These requests complete and may write stale data into the wrong page's state. - -**Goal**: -Every request initiated for a page is cancelled when that page is navigated away from, regardless of navigation speed. - -**What to do**: -1. Associate each request with the pathname that was active when it started, not the current pathname. -2. On navigation, abort all controllers whose associated pathname no longer matches the current route. - -**Possible traps and issues**: -- Requests that intentionally survive navigation (e.g., background syncs) must opt out; provide an `ignoreCancellation` flag. - -**Docs changes needed**: -- `frontend/src/providers/PROVIDER_ORDER.md`: document the cancellation contract. - -**Doc references**: -- `frontend/src/providers/NavigationCancellationProvider.tsx` - ---- - -### Issue #61: MEDIUM - Pagination Offset vs Cursor Mode Indistinguishable to Frontend - -**Where found**: -- `backend/app/utils/pagination.py:265-305` -- `backend/app/models/response.py:125-180` - -**Why this is needed**: -The `PaginationMetadata` object uses sentinel values (`total=-1`, `total_pages=-1`) for cursor mode. If a backend endpoint silently switches pagination modes, frontend code using `total_pages` to render page controls will display `-1` with no error. - -**Goal**: -Frontend code can reliably detect which pagination mode is in use and render accordingly. - -**What to do**: -1. Add a `mode: "offset" | "cursor"` discriminator field to `PaginationMetadata`. -2. Update frontend pagination components to branch on `mode` rather than checking for `-1`. - -**Possible traps and issues**: -- Adding a required field is a breaking change; make it optional with a default of `"offset"` for backward compatibility. - -**Docs changes needed**: -- API reference: document the `mode` field and its values. - -**Doc references**: -- `backend/app/utils/pagination.py` - ---- - -### Issue #62: MEDIUM - Blocklist URL Validation Is Async With No Rollback on Failure - -**Where found**: -- `backend/app/services/blocklist_service.py` -- `backend/app/models/blocklist.py:36-40` - -**Why this is needed**: -DNS validation runs asynchronously after the model is validated. If validation fails or is slow, concurrent requests can insert duplicate or invalid blocklist sources before the validation result is checked, leaving the database in a dirty state. - -**Goal**: -Blocklist source creation is atomic: either validation passes and the row is committed, or validation fails and no row exists. - -**What to do**: -1. Perform DNS/URL validation inside a database transaction; roll back on failure. -2. Add a unique constraint on the URL column to catch duplicates at the DB level. -3. Return a conflict error (409) on duplicate URL submissions. - -**Possible traps and issues**: -- Async DNS lookup inside a transaction holds the transaction open longer; use a short timeout. - -**Docs changes needed**: -- API reference: document the 409 conflict response for duplicate URLs. - -**Doc references**: -- `backend/app/services/blocklist_service.py` - ---- - -### Issue #63: MEDIUM - Correlation ID Lost Across Background Task Boundaries - -**Where found**: -- `backend/app/tasks/health_check.py:70-74` -- `backend/app/utils/correlation.py` - -**Why this is needed**: -Background tasks that spawn sub-tasks (e.g., health check triggering failover logic) do not propagate the correlation ID `ContextVar` to child asyncio tasks. Logs from child tasks appear without a correlation ID, breaking distributed tracing. - -Additionally, `reset_correlation_id()` in the `finally` block clears the ID before all child tasks have logged. - -**Goal**: -Every log line emitted during a background job carries its originating correlation ID. - -**What to do**: -1. Use `asyncio.create_task(coro, context=copy_context())` to propagate the `ContextVar` to child tasks. -2. Move `reset_correlation_id()` to after all child tasks have completed. - -**Possible traps and issues**: -- `copy_context()` captures a snapshot; mutations in the parent after the copy won't be seen by the child (this is the desired behavior). - -**Docs changes needed**: -- Add inline comment in `health_check.py` explaining context propagation. - -**Doc references**: -- `backend/app/utils/correlation.py` - ---- - ### Issue #64: MEDIUM - External Logging Failure Silently Swallowed **Where found**: diff --git a/backend/app/config.py b/backend/app/config.py index 51509ea..0f8445f 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -450,6 +450,16 @@ class Settings(BaseSettings): "Logs are sent earlier if the batch size is reached." ), ) + external_log_required: bool = Field( + default=False, + description=( + "When enabled and external logging is configured, startup aborts if the " + "external log handler fails to initialize. When disabled (default), a failed " + "handler is treated as a warning and the application continues without external " + "logging. Set to true in production environments where logs must reach the " + "monitoring system." + ), + ) datadog_api_key: str | None = Field( default=None, description=( diff --git a/backend/app/exceptions.py b/backend/app/exceptions.py index e981c99..4234757 100644 --- a/backend/app/exceptions.py +++ b/backend/app/exceptions.py @@ -503,6 +503,19 @@ class BlocklistSourceHasLogsError(ConflictError): return {"source_id": self.source_id} +class BlocklistSourceAlreadyExistsError(ConflictError): + """Raised when a blocklist source with the same URL already exists.""" + + error_code: str = "blocklist_source_already_exists" + + def __init__(self, url: str) -> None: + self.url = url + super().__init__(f"Blocklist source with URL already exists: {url}") + + def get_error_metadata(self) -> ErrorMetadata: + return {"url": self.url} + + class HistoryNotFoundError(NotFoundError): """Raised when no history is found for the given IP.""" diff --git a/backend/app/main.py b/backend/app/main.py index 765f034..bd59f56 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -182,6 +182,7 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]: global _external_log_handler # noqa: PLW0603 settings: Settings = app.state.settings + runtime_state = app.state.runtime_state http_session, scheduler, startup_db = await startup_shared_resources(app, settings) app.state.http_session = http_session @@ -210,10 +211,18 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]: if _external_log_handler: _external_log_handler.start_periodic_flush() except ValueError as exc: - log.warning( + from app.utils import metrics as _metrics_mod + + _metrics_mod.external_logging_init_failures.inc() + runtime_state.external_log_init_failed = True + log.error( "external_logging_initialization_failed", error=str(exc), ) + if settings.external_log_required: + msg = f"External logging is required but handler creation failed: {exc}" + log.critical("external_logging_required_but_unavailable", error=str(exc)) + raise RuntimeError(msg) from exc # Now configure logging with the handler in place _configure_logging(settings.log_level, settings) diff --git a/backend/app/models/response.py b/backend/app/models/response.py index 24ec820..1e623c0 100644 --- a/backend/app/models/response.py +++ b/backend/app/models/response.py @@ -183,7 +183,7 @@ class PaginationMetadata(BanGuiBaseModel): description="Opaque cursor token for fetching the next page (cursor pagination only).", ) pagination_mode: Literal["offset", "cursor"] = Field( - ..., + default="offset", description="Pagination mode used by the endpoint. 'offset' uses page/page_size; 'cursor' uses cursor tokens.", ) @@ -353,6 +353,7 @@ class ErrorMetadata(TypedDict, total=False): filter_name: Name of the filter involved in the error. action_name: Name of the action involved in the error. source_id: ID of a blocklist source involved in the error. + url: URL involved in a blocklist error. ip: IP address involved in the error. pattern: Regex pattern that caused an error. error: Regex compilation error message. @@ -371,6 +372,7 @@ class ErrorMetadata(TypedDict, total=False): filter_name: str action_name: str source_id: int + url: str ip: str pattern: str error: str @@ -412,6 +414,7 @@ class HealthResponse(BanGuiBaseModel): database: Database connectivity — 'ok' or 'error'. scheduler: Background scheduler status — 'running', 'stopped', or 'unknown'. cache: Cache initialization status — 'initialised' or 'uninitialised'. + external_logging: External logging handler status — 'ok', 'error', or 'disabled'. components: Per-component health detail list (empty when all healthy). Example: @@ -423,6 +426,7 @@ class HealthResponse(BanGuiBaseModel): "database": "ok", "scheduler": "running", "cache": "initialised", + "external_logging": "disabled", "components": [] } @@ -433,6 +437,7 @@ class HealthResponse(BanGuiBaseModel): "database": "ok", "scheduler": "running", "cache": "initialised", + "external_logging": "ok", "components": [{"name": "fail2ban", "healthy": false, "message": "Socket not reachable"}] } ``` @@ -461,6 +466,13 @@ class HealthResponse(BanGuiBaseModel): ..., description="Cache initialization status: 'initialised' when ready, 'uninitialised' when not.", ) + external_logging: Literal["ok", "error", "disabled"] = Field( + ..., + description=( + "External logging handler status: 'ok' when operational, 'error' when " + "initialization failed, 'disabled' when external logging is not configured." + ), + ) components: list[ComponentHealth] = Field( default_factory=list, description="Per-component health detail list. Empty when status is 'ok'.", diff --git a/backend/app/repositories/blocklist_repo.py b/backend/app/repositories/blocklist_repo.py index dd6b49d..0310a12 100644 --- a/backend/app/repositories/blocklist_repo.py +++ b/backend/app/repositories/blocklist_repo.py @@ -13,6 +13,37 @@ if TYPE_CHECKING: import aiosqlite +async def create_source_in_tx( + db: aiosqlite.Connection, + name: str, + url: str, + *, + enabled: bool = True, +) -> int: + """Insert a new blocklist source without committing. + + Caller is responsible for committing or rolling back the transaction. + Use this variant when validation must be atomic with insert. + + Args: + db: Active aiosqlite connection with an open transaction. + name: Human-readable display name. + url: URL of the blocklist text file. + enabled: Whether the source is active. Defaults to ``True``. + + Returns: + The ``ROWID`` / primary key of the new row. + """ + cursor = await db.execute( + """ + INSERT INTO blocklist_sources (name, url, enabled) + VALUES (?, ?, ?) + """, + (name, url, int(enabled)), + ) + return int(cursor.lastrowid) # type: ignore[arg-type] + + async def create_source( db: aiosqlite.Connection, name: str, diff --git a/backend/app/routers/blocklist.py b/backend/app/routers/blocklist.py index 9e577c9..2cb3531 100644 --- a/backend/app/routers/blocklist.py +++ b/backend/app/routers/blocklist.py @@ -22,6 +22,7 @@ registered *before* the ``/{id}`` routes so FastAPI resolves them correctly. from __future__ import annotations +import structlog from fastapi import APIRouter, Depends, Query, Request, status from app.dependencies import ( @@ -34,7 +35,12 @@ from app.dependencies import ( SchedulerDep, SettingsDep, ) -from app.exceptions import BadRequestError, BlocklistSourceNotFoundError +from app.exceptions import ( + BadRequestError, + BlocklistSourceAlreadyExistsError, + BlocklistSourceNotFoundError, + RateLimitError, +) from app.mappers import blocklist_mappers from app.models.blocklist import ( BlocklistListResponse, @@ -53,11 +59,13 @@ from app.utils.constants import DEFAULT_PAGE_SIZE, RATE_LIMIT_BLOCKLIST_IMPORT_R router: APIRouter = APIRouter(prefix="/api/v1/blocklists", tags=["Blocklists"]) -# Rate limit bucket constants +#: Rate limit bucket constants _BLOCKLIST_IMPORT_BUCKET = "blocklist:import" # 3600 seconds per hour _HOUR = 3600 +log: structlog.stdlib.BoundLogger = structlog.get_logger() + def _check_blocklist_import_rate_limit( request: Request, @@ -72,10 +80,6 @@ def _check_blocklist_import_rate_limit( _BLOCKLIST_IMPORT_BUCKET, client_ip, RATE_LIMIT_BLOCKLIST_IMPORT_REQUESTS, _HOUR ) if not is_allowed: - from app.exceptions import RateLimitError - import structlog - - log = structlog.get_logger() log.warning( "blocklist_import_rate_limit_exceeded", client_ip=client_ip, @@ -128,6 +132,7 @@ async def list_blocklists( 201: {"description": "Blocklist source created", "model": BlocklistSource}, 400: {"description": "URL validation failed"}, 401: {"description": "Session missing, expired, or invalid"}, + 409: {"description": "A blocklist source with this URL already exists"}, }, ) async def create_blocklist( @@ -154,6 +159,8 @@ async def create_blocklist( ) except ValueError as exc: raise BadRequestError(str(exc)) from exc + except BlocklistSourceAlreadyExistsError as exc: + raise exc # --------------------------------------------------------------------------- diff --git a/backend/app/routers/health.py b/backend/app/routers/health.py index 37ffb17..39773ce 100644 --- a/backend/app/routers/health.py +++ b/backend/app/routers/health.py @@ -128,6 +128,26 @@ async def health_check( ComponentHealth(name="fail2ban", healthy=False, message="Socket not reachable"), ) + # --- External logging check --- + external_log_state: Literal["ok", "error", "disabled", "unknown"] = "unknown" + effective_settings: Settings = ( + app_state.runtime_settings if app_state.runtime_settings is not None else app_state.settings + ) + try: + ext_log_failed = getattr(app_state.runtime_state, "external_log_init_failed", False) + if effective_settings.external_logging_enabled and effective_settings.external_logging_provider: + if ext_log_failed: + external_log_state = "error" + components.append( + ComponentHealth(name="external_logging", healthy=False, message="Handler initialization failed"), + ) + else: + external_log_state = "ok" + else: + external_log_state = "disabled" + except AttributeError: # pragma: no cover - defensive + external_log_state = "unknown" + # --- Overall status --- overall_status: Literal["ok", "degraded", "unavailable"] if not fail2ban_online: @@ -148,6 +168,7 @@ async def health_check( database="ok" if db_healthy else "error", scheduler=scheduler_state, cache=cache_state, + external_logging=external_log_state, components=components, ).model_dump(), ) diff --git a/backend/app/services/blocklist_service.py b/backend/app/services/blocklist_service.py index eda4f25..820bb90 100644 --- a/backend/app/services/blocklist_service.py +++ b/backend/app/services/blocklist_service.py @@ -122,6 +122,10 @@ async def create_source( at source creation time. The application's HTTP connector performs additional runtime validation at connection time to prevent DNS-rebinding attacks. + Validation and insert run inside a single DB transaction. If validation + fails or a duplicate URL is detected, the transaction is rolled back and + no row is left behind. + Args: db: Active application database connection. name: Human-readable display name. @@ -133,12 +137,30 @@ async def create_source( Raises: ValueError: If the URL fails SSRF validation. + BlocklistSourceAlreadyExistsError: If a source with the same URL already exists. """ + from app.exceptions import BlocklistSourceAlreadyExistsError from app.utils.ip_utils import validate_blocklist_url - await validate_blocklist_url(url) + try: + await db.execute("BEGIN IMMEDIATE") + + await validate_blocklist_url(url) + + try: + new_id = await blocklist_repo.create_source_in_tx(db, name, url, enabled=enabled) + except aiosqlite.IntegrityError as exc: + if "UNIQUE constraint failed" in str(exc): + await db.rollback() + raise BlocklistSourceAlreadyExistsError(url) from exc + raise + + await db.commit() + + except Exception: + await db.rollback() + raise - new_id = await blocklist_repo.create_source(db, name, url, enabled=enabled) source = await get_source(db, new_id) assert source is not None # noqa: S101 log.info("blocklist_source_created", id=new_id, name=name, url=url) diff --git a/backend/app/tasks/health_check.py b/backend/app/tasks/health_check.py index 71ef45f..1ca77ca 100644 --- a/backend/app/tasks/health_check.py +++ b/backend/app/tasks/health_check.py @@ -20,8 +20,10 @@ so that task logs can be correlated across runs. from __future__ import annotations +import asyncio import datetime import uuid +from contextvars import copy_context from typing import TYPE_CHECKING import structlog @@ -69,20 +71,24 @@ async def _run_probe_with_resources( token = set_correlation_id(correlation_id) try: - await _do_probe_with_resources(settings, runtime_state) + # Use copy_context() so ContextVar values (e.g. correlation_id) + # propagate to any child asyncio tasks spawned inside the coroutine. + probe_task = asyncio.create_task( + _do_probe_with_resources(settings, runtime_state), + context=copy_context(), + ) + await run_with_timeout("health_check", probe_task, HEALTH_PROBE_TIMEOUT_SECONDS) finally: + # Reset AFTER run_with_timeout completes, so child tasks still + # have the correlation ID in their context while they log. reset_correlation_id(token) async def _do_probe_with_resources(settings: Settings, runtime_state: RuntimeState) -> None: """Inner probe logic that runs with correlation context set.""" - - async def _do_probe() -> None: - socket_path: str = settings.fail2ban_socket - status: ServerStatus = await health_service.probe(socket_path) - process_health_probe_result(runtime_state, status) - - await run_with_timeout("health_check", _do_probe(), HEALTH_PROBE_TIMEOUT_SECONDS) + socket_path: str = settings.fail2ban_socket + status: ServerStatus = await health_service.probe(socket_path) + process_health_probe_result(runtime_state, status) async def _run_probe(app: FastAPI) -> None: diff --git a/backend/app/utils/metrics.py b/backend/app/utils/metrics.py index 7de3437..443b76a 100644 --- a/backend/app/utils/metrics.py +++ b/backend/app/utils/metrics.py @@ -8,7 +8,15 @@ This module provides metrics collection for: from __future__ import annotations -from prometheus_client import Counter, Gauge, Histogram, Summary, generate_latest, CollectorRegistry, CONTENT_TYPE_LATEST +from prometheus_client import ( + CONTENT_TYPE_LATEST, + CollectorRegistry, + Counter, + Gauge, + Histogram, + Summary, + generate_latest, +) __all__ = [ "get_metrics_registry", @@ -19,6 +27,7 @@ __all__ = [ "bans_total", "jails_total", "fail2ban_connection_errors", + "external_logging_init_failures", ] # Global registry @@ -81,6 +90,12 @@ fail2ban_connection_errors = Counter( registry=get_metrics_registry(), ) +external_logging_init_failures = Counter( + "bangui_external_logging_init_failures_total", + "Total number of external logging handler initialization failures", + registry=get_metrics_registry(), +) + # Application startup and health app_uptime = Summary( diff --git a/backend/app/utils/runtime_state.py b/backend/app/utils/runtime_state.py index 7366e92..a49a6ad 100644 --- a/backend/app/utils/runtime_state.py +++ b/backend/app/utils/runtime_state.py @@ -125,6 +125,7 @@ class RuntimeState: last_activation: ActivationRecord | None = None runtime_settings: Settings | None = None jail_service_state: JailServiceState = field(default_factory=JailServiceState) + external_log_init_failed: bool = False class ApplicationState(State): diff --git a/frontend/src/components/blocklist/BlocklistImportLogSection.tsx b/frontend/src/components/blocklist/BlocklistImportLogSection.tsx index 8132df3..5f106eb 100644 --- a/frontend/src/components/blocklist/BlocklistImportLogSection.tsx +++ b/frontend/src/components/blocklist/BlocklistImportLogSection.tsx @@ -90,7 +90,7 @@ export function BlocklistImportLogSection(): React.JSX.Element { - {data.total_pages > 1 && ( + {data.pagination_mode === "offset" && data.total_pages > 1 && (