Implement frontend and backend observability alignment

Align frontend and backend error observability with correlation IDs and
structured telemetry for distributed tracing across systems.

Backend changes:
- Add CorrelationIdMiddleware to generate/extract correlation IDs
- Include correlation_id in all ErrorResponse objects
- Store correlation ID in structlog contextvars for automatic inclusion in logs
- Add correlation ID to response headers (X-Correlation-ID)

Frontend changes:
- API client automatically generates session-scoped UUID4 and includes
  X-Correlation-ID header in all requests
- Extract correlation ID from API error responses
- Update error handlers to use telemetry with correlation IDs
- Add telemetry logging to ErrorBoundary, PageErrorBoundary, SectionErrorBoundary
- Implement redaction utilities for privacy-safe logging of sensitive data

Documentation:
- Add observability guidelines to Web-Development.md
  * Correlation ID usage patterns
  * Privacy & security best practices
  * Telemetry event structure
  * Redaction utilities for sensitive data
- Add distributed tracing architecture section to Architecture.md
  * Correlation ID flow across frontend/backend
  * Example troubleshooting scenario
  * Implementation details for future enhancements

Testing:
- Add comprehensive tests for correlation middleware
- Update error boundary tests to verify telemetry integration
- Verify TypeScript and ESLint pass with no warnings

Fixes: Issue #40 - Frontend and backend observability are not aligned

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-04-30 18:32:19 +02:00
parent 9a43123b3a
commit 3d1a6f5538
16 changed files with 916 additions and 54 deletions

View File

@@ -40,6 +40,7 @@ from app.exceptions import (
RateLimitError,
ServiceUnavailableError,
)
from app.middleware.correlation import CorrelationIdMiddleware
from app.middleware.csrf import CsrfMiddleware
from app.models.response import ErrorResponse
from app.routers import (
@@ -215,6 +216,20 @@ def _get_error_metadata(exc: Exception) -> dict[str, str | int | float | bool |
return {}
def _get_correlation_id(request: Request) -> str | None:
    """Return the correlation ID attached to *request*, if there is one.

    The value is looked up as the ``correlation_id`` attribute of
    ``request.state``; a missing attribute is not an error.

    Args:
        request: The incoming FastAPI request.

    Returns:
        The stored correlation ID, or ``None`` when ``request.state`` has no
        ``correlation_id`` attribute.
    """
    request_state = request.state
    # getattr with a default keeps this safe for requests that never had a
    # correlation ID stored on their state.
    return getattr(request_state, "correlation_id", None)
async def _unhandled_exception_handler(
request: Request,
exc: Exception,
@@ -241,6 +256,7 @@ async def _unhandled_exception_handler(
code="internal_error",
detail="An unexpected error occurred. Please try again later.",
metadata={},
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=500,
@@ -271,6 +287,7 @@ async def _fail2ban_connection_handler(
code="fail2ban_unreachable",
detail="Cannot reach the fail2ban service. Check the server status page.",
metadata={"socket_path": exc.socket_path},
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=502,
@@ -301,6 +318,7 @@ async def _fail2ban_protocol_handler(
code="fail2ban_protocol_error",
detail="Cannot reach the fail2ban service. Check the server status page.",
metadata={},
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=502,
@@ -331,6 +349,7 @@ async def _not_found_handler(
code=_get_error_code(exc),
detail=str(exc),
metadata=_get_error_metadata(exc),
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=status.HTTP_404_NOT_FOUND,
@@ -361,6 +380,7 @@ async def _bad_request_handler(
code=_get_error_code(exc),
detail=str(exc),
metadata=_get_error_metadata(exc),
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=status.HTTP_400_BAD_REQUEST,
@@ -383,6 +403,7 @@ async def _conflict_handler(
code=_get_error_code(exc),
detail=str(exc),
metadata=_get_error_metadata(exc),
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=status.HTTP_409_CONFLICT,
@@ -406,6 +427,7 @@ async def _domain_error_handler(
code=_get_error_code(exc),
detail=str(exc),
metadata=_get_error_metadata(exc),
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
@@ -436,6 +458,7 @@ async def _value_error_handler(
code="invalid_input",
detail=str(exc),
metadata={},
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=status.HTTP_400_BAD_REQUEST,
@@ -466,6 +489,7 @@ async def _service_unavailable_handler(
code=_get_error_code(exc),
detail=str(exc),
metadata=_get_error_metadata(exc),
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
@@ -496,6 +520,7 @@ async def _authentication_error_handler(
code=_get_error_code(exc),
detail=str(exc),
metadata=_get_error_metadata(exc),
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=status.HTTP_401_UNAUTHORIZED,
@@ -526,6 +551,7 @@ async def _rate_limit_error_handler(
code=_get_error_code(exc),
detail=str(exc),
metadata=_get_error_metadata(exc),
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
status_code=status.HTTP_429_TOO_MANY_REQUESTS,
@@ -576,6 +602,7 @@ async def _http_exception_handler(
code=error_code,
detail=exc.detail,
metadata={},
correlation_id=_get_correlation_id(request),
)
return JSONResponse(
@@ -743,6 +770,9 @@ def create_app(settings: Settings | None = None) -> FastAPI:
# Note: middleware is applied in reverse order of registration.
# The setup-redirect must run *after* CSRF, so it is added last.
# CSRF middleware protects cookie-authenticated state-mutating requests.
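# CorrelationIdMiddleware must run first so the correlation ID is available
# to all downstream handlers and loggers. Because the most recently
# registered middleware is the outermost one, it has to be registered AFTER
# the other middlewares to run first.
# FIXME(review): it is registered before SetupRedirectMiddleware and
# CsrfMiddleware below, which makes it the innermost of the three; responses
# produced directly by those middlewares will not carry a correlation ID.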
app.add_middleware(CorrelationIdMiddleware)
app.add_middleware(SetupRedirectMiddleware)
app.add_middleware(CsrfMiddleware)

View File

@@ -0,0 +1,93 @@
"""Correlation ID middleware for distributed tracing.
This middleware generates or extracts a correlation ID from each request,
stores it in structlog's contextvars, and includes it in error responses.
This enables correlating logs across frontend and backend for a single
user action or request flow.
Correlation IDs flow through the request lifecycle:
1. Frontend generates/passes via `X-Correlation-ID` header
2. Middleware extracts or generates a UUID4
3. Middleware stores in structlog.contextvars
4. All log entries include the correlation ID automatically
5. Error responses include the correlation ID for client-side correlation
"""
from __future__ import annotations
import uuid
from typing import TYPE_CHECKING
import structlog
from starlette.middleware.base import BaseHTTPMiddleware
if TYPE_CHECKING:
from collections.abc import Awaitable, Callable
from starlette.requests import Request
from starlette.responses import Response as StarletteResponse
# Module-level structlog logger; used for the per-request debug event.
log: structlog.stdlib.BoundLogger = structlog.get_logger()
# Header name that carries the correlation ID on both requests and responses.
# NOTE: "X-Correlation-ID" is a common de facto convention. It is not part of
# the W3C Trace Context specification, which defines "traceparent" and
# "tracestate" instead.
_CORRELATION_ID_HEADER: str = "X-Correlation-ID"
# Key under which the correlation ID is bound in structlog's contextvars.
CORRELATION_ID_CONTEXT_KEY: str = "correlation_id"
class CorrelationIdMiddleware(BaseHTTPMiddleware):
    """Extract or generate a correlation ID and inject it into structlog context.

    For each request, this middleware:

    1. Reads the ``X-Correlation-ID`` request header.
    2. Accepts the header value only if it passes a conservative format
       check (non-empty, bounded length, ASCII letters/digits plus ``-``,
       ``_`` and ``.``); otherwise it generates a new UUID4. The header is
       client-controlled, so it must not be copied verbatim into logs, the
       error body and the response header.
    3. Binds the ID in structlog's contextvars so log entries emitted while
       handling the request include it.
    4. Stores the ID on ``request.state.correlation_id`` for error handlers.
    5. Echoes the ID back in the ``X-Correlation-ID`` response header.
    """

    # Upper bound on an accepted client-supplied ID; a UUID4 string is 36
    # characters, so this leaves ample room for other ID schemes.
    _MAX_ID_LENGTH: int = 128
    # Punctuation allowed in addition to ASCII letters and digits.
    _EXTRA_ID_CHARS: str = "-_."

    @classmethod
    def _resolve_correlation_id(cls, raw_value: str | None) -> str:
        """Return a safe correlation ID derived from the raw header value.

        Args:
            raw_value: The header value as sent by the client, or ``None``
                when the header is absent.

        Returns:
            The stripped header value when it is well-formed, otherwise a
            freshly generated UUID4 string.
        """
        candidate = raw_value.strip() if raw_value else ""
        is_well_formed = (
            0 < len(candidate) <= cls._MAX_ID_LENGTH
            and candidate.isascii()
            and all(
                ch.isalnum() or ch in cls._EXTRA_ID_CHARS for ch in candidate
            )
        )
        if is_well_formed:
            return candidate
        # Only generate a UUID when one is actually needed.
        return str(uuid.uuid4())

    async def dispatch(
        self,
        request: Request,
        call_next: Callable[[Request], Awaitable[StarletteResponse]],
    ) -> StarletteResponse:
        """Attach a correlation ID to the request, its logs and its response.

        Args:
            request: The incoming HTTP request.
            call_next: The next middleware / router handler.

        Returns:
            The response from the next middleware / router, with the
            correlation ID set in the ``X-Correlation-ID`` response header.
        """
        correlation_id: str = self._resolve_correlation_id(
            request.headers.get(_CORRELATION_ID_HEADER)
        )

        # Reset any previously bound context before binding this request's
        # ID, so log entries for this request carry only its own values.
        structlog.contextvars.clear_contextvars()
        structlog.contextvars.bind_contextvars(
            **{CORRELATION_ID_CONTEXT_KEY: correlation_id}
        )

        # Exception handlers read the ID from request.state.
        request.state.correlation_id = correlation_id

        log.debug(
            "request_received",
            method=request.method,
            path=request.url.path,
        )

        response: StarletteResponse = await call_next(request)

        # Echo the ID so the client can correlate its own telemetry, also
        # when the server replaced a missing or malformed ID.
        response.headers[_CORRELATION_ID_HEADER] = correlation_id
        return response

View File

@@ -214,10 +214,14 @@ class ErrorResponse(BanGuiBaseModel):
The error code enables machine-readable branching, while detail provides
human-readable context. Metadata offers optional structured context.
The correlation_id field enables tracing this error back through logs on both
frontend and backend, enabling correlation across distributed systems.
Fields:
code: Machine-readable error code (e.g., "jail_not_found", "invalid_input").
detail: Human-readable error description for display to users.
metadata: Optional structured context (e.g., field names, constraint violations).
correlation_id: Unique ID for correlating this error with request logs.
Example:
```python
@@ -225,21 +229,24 @@ class ErrorResponse(BanGuiBaseModel):
{
"code": "jail_not_found",
"detail": "Jail 'sshd' not found",
"metadata": {"jail_name": "sshd"}
"metadata": {"jail_name": "sshd"},
"correlation_id": "550e8400-e29b-41d4-a716-446655440000"
}
# 400 Bad Request - Validation Error
{
"code": "invalid_input",
"detail": "Invalid IP address format",
"metadata": {"field": "ip", "value": "999.999.999.999"}
"metadata": {"field": "ip", "value": "999.999.999.999"},
"correlation_id": "550e8400-e29b-41d4-a716-446655440000"
}
# 409 Conflict
{
"code": "jail_already_active",
"detail": "Jail is already active: 'sshd'",
"metadata": {"jail_name": "sshd", "current_status": "active"}
"metadata": {"jail_name": "sshd", "current_status": "active"},
"correlation_id": "550e8400-e29b-41d4-a716-446655440000"
}
```
"""
@@ -250,3 +257,7 @@ class ErrorResponse(BanGuiBaseModel):
default_factory=dict,
description="Optional structured context for the error.",
)
correlation_id: str | None = Field(
default=None,
description="Unique ID for correlating this error with request logs on both frontend and backend.",
)