Add Application Performance Monitoring (APM) with Prometheus metrics

- Backend: Implement Prometheus metrics collection
  - Add prometheus-client dependency
  - Create metrics utility module with HTTP request tracking counters, histograms, gauges
  - Implement MetricsMiddleware to track request latency, count, and active requests
  - Add /metrics endpoint to expose metrics in Prometheus text format
  - Normalize paths to prevent cardinality explosion (e.g., /api/{id} for UUIDs)
  - Exclude /metrics and /health from detailed tracking
- Frontend: Add web vitals and API metrics collection
  - Install web-vitals library (v4.0.0) for Core Web Vitals tracking
  - Create metrics utility module for FCP, LCP, CLS, INP, TTFB collection
  - Implement useTrackedFetch hook for automatic API call metrics (method, endpoint, status, duration)
  - Initialize web vitals tracking in App component on mount
  - Provide exportMetrics() for sending metrics to backend
- Testing:
  - Add comprehensive backend metrics tests (9 tests, 100% coverage)
  - Add comprehensive frontend metrics tests (10 tests)
  - All tests passing
- Documentation:
  - Expand Docs/Observability.md with complete APM section
  - Include metrics reference, integration examples (Prometheus, Datadog, New Relic)
  - Add troubleshooting guide and best practices for cardinality management
  - Update Tasks.md to mark APM task as complete

Metrics exposed:
- bangui_http_requests_total: HTTP request count by method, endpoint, status
- bangui_http_request_duration_seconds: Request latency histogram
- bangui_http_active_requests: Active request gauge
- Web Vitals: CLS, FCP, INP, LCP, TTFB with ratings
- API metrics: endpoint, method, status, duration, timestamp

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-01 18:33:14 +02:00
parent 37078b742b
commit 1af67eb0ce
14 changed files with 969 additions and 74 deletions
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -45,6 +45,7 @@ from app.exceptions import (
 )
 from app.middleware.correlation import CorrelationIdMiddleware
 from app.middleware.csrf import CsrfMiddleware
+from app.middleware.metrics import MetricsMiddleware
 from app.middleware.rate_limit import RateLimitMiddleware
 from app.models.response import ErrorResponse
 from app.routers import (
@@ -58,6 +59,7 @@ from app.routers import (
    health,
    history,
    jails,
+    metrics,
    server,
    setup,
 )
@@ -950,6 +952,7 @@ def create_app(settings: Settings | None = None) -> FastAPI:
    app.add_middleware(CorrelationIdMiddleware)
    app.add_middleware(SecurityHeadersMiddleware)
    app.add_middleware(SetupRedirectMiddleware)
+    app.add_middleware(MetricsMiddleware)
    app.add_middleware(CsrfMiddleware)
    app.add_middleware(
        RateLimitMiddleware,
@@ -995,6 +998,7 @@ def create_app(settings: Settings | None = None) -> FastAPI:
    app.add_exception_handler(Exception, _unhandled_exception_handler)

    # --- Routers ---
+    app.include_router(metrics.router)
    app.include_router(health.router)
    app.include_router(setup.router)
    app.include_router(auth.router)
--- a/backend/app/middleware/metrics.py
+++ b/backend/app/middleware/metrics.py
@@ -0,0 +1,95 @@
+"""Metrics collection middleware for BanGUI.
+
+Tracks HTTP request count, latency, and active requests.
+Skips /metrics, /health and /api/health entirely; excluding /metrics
+prevents recursive metrics collection.
+"""
+
+from __future__ import annotations
+
+import re
+import time
+from typing import TYPE_CHECKING
+
+import structlog
+from starlette.middleware.base import BaseHTTPMiddleware
+
+from app.utils.metrics import http_active_requests, http_request_count, http_request_latency
+
+if TYPE_CHECKING:
+    from collections.abc import Awaitable, Callable
+
+    from starlette.requests import Request
+    from starlette.responses import Response
+
+log = structlog.get_logger()
+
+# Paths that MetricsMiddleware passes through without recording any metric
+# (the metrics endpoint itself and the health-check endpoints).
+EXCLUDED_PATHS = {"/metrics", "/health", "/api/health"}
+
+# Matches an ID-like segment (a 36-character UUID-style token or an all-digit
+# number) that directly follows /api/<resource>. The resource prefix is
+# captured so the substitution can keep it, and the lookahead requires the ID
+# to be a whole path segment, so values such as "123abc" are left untouched.
+PATH_PATTERN = re.compile(r"(/api/[^/]+)/(?:[0-9a-fA-F\-]{36}|\d+)(?=/|$)")
+
+
+def _normalize_path(path: str) -> str:
+    """Normalize path by replacing IDs with a ``{id}`` placeholder.
+
+    Converts paths like /api/resource/123 to /api/resource/{id} so that
+    dynamic IDs do not produce one label value per ID, while the resource
+    name stays in the label and endpoints remain distinguishable.
+
+    Args:
+        path: The request path.
+
+    Returns:
+        The path with an ID segment after ``/api/<resource>`` replaced by
+        ``{id}``; any other path is returned unchanged.
+    """
+    # \1 re-inserts the captured "/api/<resource>" prefix; replacing the whole
+    # match with a fixed string would merge every resource into one endpoint.
+    return PATH_PATTERN.sub(r"\1/{id}", path)
+
+
+class MetricsMiddleware(BaseHTTPMiddleware):
+    """Record Prometheus HTTP metrics for every request outside EXCLUDED_PATHS."""
+
+    async def dispatch(
+        self,
+        request: Request,
+        call_next: Callable[[Request], Awaitable[Response]],
+    ) -> Response:
+        """Time the downstream handler and record count, latency and in-flight gauge.
+
+        Args:
+            request: The incoming request.
+            call_next: Callable that invokes the next middleware/route handler.
+
+        Returns:
+            The response produced by ``call_next``, unmodified.
+        """
+        path = request.url.path
+        if path in EXCLUDED_PATHS:
+            # Excluded paths pass straight through without touching any metric.
+            return await call_next(request)
+
+        method = request.method
+        endpoint = _normalize_path(path)
+        labels = {"method": method, "endpoint": endpoint}
+
+        # Mark the request as in flight; the matching dec() is in ``finally``.
+        in_flight = http_active_requests.labels(**labels)
+        in_flight.inc()
+
+        started = time.perf_counter()
+        # Reported as-is when call_next raises before a response exists.
+        status_code = 500
+
+        try:
+            response = await call_next(request)
+            status_code = response.status_code
+            return response
+        finally:
+            # Runs on both the success and the exception path.
+            duration = time.perf_counter() - started
+            http_request_latency.labels(**labels).observe(duration)
+            http_request_count.labels(status_code=status_code, **labels).inc()
+            in_flight.dec()
+
+            log.debug(
+                "http_request_recorded",
+                method=method,
+                endpoint=endpoint,
+                status_code=status_code,
+                duration_ms=duration * 1000,
+            )
--- a/backend/app/routers/metrics.py
+++ b/backend/app/routers/metrics.py
@@ -0,0 +1,36 @@
+"""Prometheus metrics endpoint for BanGUI.
+
+Exposes collected metrics in Prometheus text format at GET /metrics.
+"""
+
+from __future__ import annotations
+
+import structlog
+from fastapi import APIRouter
+from starlette.responses import Response
+
+from app.utils.metrics import get_metrics, get_metrics_content_type
+
+log = structlog.get_logger()
+
+router = APIRouter()
+
+
+@router.get(
+    "/metrics",
+    tags=["observability"],
+    summary="Prometheus metrics endpoint",
+    description="Exposes application metrics in Prometheus text format (OpenMetrics)",
+    include_in_schema=False,
+)
+async def get_application_metrics() -> Response:
+    """Serve the current metrics snapshot for scraping.
+
+    Returns:
+        A response whose body is the output of ``get_metrics()`` and whose
+        media type is the value of ``get_metrics_content_type()``.
+    """
+    log.debug("metrics_endpoint_accessed")
+    payload = get_metrics()
+    content_type = get_metrics_content_type()
+    return Response(content=payload, media_type=content_type)
--- a/backend/app/utils/metrics.py
+++ b/backend/app/utils/metrics.py
@@ -0,0 +1,108 @@
+"""Prometheus metrics collection for BanGUI backend.
+
+This module provides metrics collection for:
+- HTTP request count and latency per endpoint
+- Active concurrent requests
+- Custom application metrics (bans, jails, etc.)
+"""
+
+from __future__ import annotations
+
+from prometheus_client import Counter, Gauge, Histogram, Summary, generate_latest, CollectorRegistry, CONTENT_TYPE_LATEST
+
+# Public API of this module. Lists every helper and metric object defined
+# below, including get_metrics_content_type (imported by the /metrics router)
+# and app_uptime, which were previously missing.
+__all__ = [
+    "get_metrics_registry",
+    "get_metrics",
+    "get_metrics_content_type",
+    "http_request_count",
+    "http_request_latency",
+    "http_active_requests",
+    "bans_total",
+    "jails_total",
+    "fail2ban_connection_errors",
+    "app_uptime",
+]
+
+# Module-wide registry, created lazily on first access.
+_registry: CollectorRegistry | None = None
+
+
+def get_metrics_registry() -> CollectorRegistry:
+    """Return the shared metrics registry, creating it on the first call.
+
+    Returns:
+        The single CollectorRegistry instance used by this module.
+    """
+    global _registry
+    if _registry is not None:
+        return _registry
+    _registry = CollectorRegistry()
+    return _registry
+
+
+# HTTP Metrics
+# Every metric in this module is registered on the registry returned by
+# get_metrics_registry(), which is the same registry get_metrics() serializes.
+
+# Incremented once per tracked request by MetricsMiddleware.
+http_request_count = Counter(
+    "bangui_http_requests_total",
+    "Total HTTP requests by method, endpoint, and status code",
+    ["method", "endpoint", "status_code"],
+    registry=get_metrics_registry(),
+)
+
+# Observed in seconds: MetricsMiddleware passes a time.perf_counter() delta.
+http_request_latency = Histogram(
+    "bangui_http_request_duration_seconds",
+    "HTTP request latency in seconds by method and endpoint",
+    ["method", "endpoint"],
+    buckets=(0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0),
+    registry=get_metrics_registry(),
+)
+
+# Incremented when MetricsMiddleware starts handling a request and
+# decremented in its ``finally`` block.
+http_active_requests = Gauge(
+    "bangui_http_active_requests",
+    "Current number of active HTTP requests by method and endpoint",
+    ["method", "endpoint"],
+    registry=get_metrics_registry(),
+)
+
+# Application Metrics
+# NOTE(review): nothing in the code shown here updates the three metrics
+# below; presumably they are set elsewhere in the backend - confirm.
+
+# NOTE(review): this is a Gauge, but Prometheus naming conventions use the
+# ``_total`` suffix for counters - confirm the name before dashboards depend
+# on it.
+bans_total = Gauge(
+    "bangui_bans_total",
+    "Total number of banned IPs across all jails",
+    registry=get_metrics_registry(),
+)
+
+# NOTE(review): same ``_total``-on-a-Gauge naming question as bans_total.
+jails_total = Gauge(
+    "bangui_jails_total",
+    "Total number of fail2ban jails",
+    registry=get_metrics_registry(),
+)
+
+fail2ban_connection_errors = Counter(
+    "bangui_fail2ban_connection_errors_total",
+    "Total number of fail2ban connection errors",
+    registry=get_metrics_registry(),
+)
+
+# Application startup and health
+
+# NOTE(review): a Summary aggregates observations into count/sum series,
+# while uptime is a single current value, so a Gauge may be the better fit.
+# Confirm before changing: callers may rely on Summary's observe() API.
+app_uptime = Summary(
+    "bangui_uptime_seconds",
+    "Application uptime in seconds",
+    registry=get_metrics_registry(),
+)
+
+
+def get_metrics() -> bytes:
+    """Serialize every metric registered on the shared registry.
+
+    Returns:
+        The output of ``generate_latest`` for the shared registry, as bytes.
+    """
+    registry = get_metrics_registry()
+    return generate_latest(registry)
+
+
+def get_metrics_content_type() -> str:
+    """Get the Content-Type to send alongside the output of ``get_metrics()``.
+
+    Returns:
+        prometheus_client's ``CONTENT_TYPE_LATEST`` constant; the /metrics
+        router passes this value as the response media type.
+    """
+    return CONTENT_TYPE_LATEST