Add Application Performance Monitoring (APM) with Prometheus metrics

- Backend: Implement Prometheus metrics collection
  - Add prometheus-client dependency
  - Create metrics utility module with HTTP request tracking counters, histograms, gauges
  - Implement MetricsMiddleware to track request latency, count, and active requests
  - Add /metrics endpoint to expose metrics in Prometheus text format
  - Normalize paths to prevent cardinality explosion (e.g., /api/{id} for UUIDs)
  - Exclude /metrics and /health from detailed tracking
- Frontend: Add web vitals and API metrics collection
  - Install web-vitals library (v4.0.0) for Core Web Vitals tracking
  - Create metrics utility module for FCP, LCP, CLS, INP, TTFB collection
  - Implement useTrackedFetch hook for automatic API call metrics (method, endpoint, status, duration)
  - Initialize web vitals tracking in App component on mount
  - Provide exportMetrics() for sending metrics to backend
- Testing:
  - Add comprehensive backend metrics tests (9 tests, 100% coverage)
  - Add comprehensive frontend metrics tests (10 tests)
  - All tests passing
- Documentation:
  - Expand Docs/Observability.md with complete APM section
  - Include metrics reference, integration examples (Prometheus, Datadog, NewRelic)
  - Add troubleshooting guide and best practices for cardinality management
  - Update Tasks.md to mark APM task as complete

Metrics exposed:
- bangui_http_requests_total: HTTP request count by method, endpoint, status
- bangui_http_request_duration_seconds: Request latency histogram
- bangui_http_active_requests: Active request gauge
- Web Vitals: CLS, FCP, INP, LCP, TTFB with ratings
- API metrics: endpoint, method, status, duration, timestamp

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-01 18:33:14 +02:00
parent 37078b742b
commit 1af67eb0ce
14 changed files with 969 additions and 74 deletions
--- a/backend/app/utils/metrics.py
+++ b/backend/app/utils/metrics.py
@@ -0,0 +1,108 @@
+"""Prometheus metrics collection for BanGUI backend.
+
+This module provides metrics collection for:
+- HTTP request count and latency per endpoint
+- Active concurrent requests
+- Custom application metrics (bans, jails, etc.)
+"""
+
+from __future__ import annotations
+
+from prometheus_client import Counter, Gauge, Histogram, Summary, generate_latest, CollectorRegistry, CONTENT_TYPE_LATEST
+
+# Public API of this module: every public helper and collector defined below,
+# so that ``from ... import *`` exposes all of them.
+__all__ = [
+    "get_metrics_registry",
+    "get_metrics",
+    "get_metrics_content_type",
+    "http_request_count",
+    "http_request_latency",
+    "http_active_requests",
+    "bans_total",
+    "jails_total",
+    "fail2ban_connection_errors",
+    "app_uptime",
+]
+
+# Global registry shared by every collector in this module. It stays None
+# until the first get_metrics_registry() call, which the metric definitions
+# below trigger at import time.
+_registry: CollectorRegistry | None = None
+
+
+def get_metrics_registry() -> CollectorRegistry:
+    """Return the shared metrics registry, creating it on first use.
+
+    Returns:
+        The module-level ``CollectorRegistry``. Once created, the same
+        instance is handed back on every subsequent call.
+    """
+    global _registry
+    if _registry is not None:
+        return _registry
+    # First call: build the registry and remember it for later callers.
+    _registry = CollectorRegistry()
+    return _registry
+
+
+# HTTP Metrics
+#
+# All three collectors are registered on the registry returned by
+# get_metrics_registry(). Nothing in this module updates them; presumably the
+# request-metrics middleware does — confirm against the caller.
+
+# Request counter, labelled by HTTP method, endpoint, and status code.
+http_request_count = Counter(
+    "bangui_http_requests_total",
+    "Total HTTP requests by method, endpoint, and status code",
+    ["method", "endpoint", "status_code"],
+    registry=get_metrics_registry(),
+)
+
+# Request duration histogram in seconds, labelled by method and endpoint.
+# The explicit bucket upper bounds run from 5 ms up to 10 s.
+http_request_latency = Histogram(
+    "bangui_http_request_duration_seconds",
+    "HTTP request latency in seconds by method and endpoint",
+    ["method", "endpoint"],
+    buckets=(0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0),
+    registry=get_metrics_registry(),
+)
+
+# Gauge of currently active requests, labelled by method and endpoint.
+http_active_requests = Gauge(
+    "bangui_http_active_requests",
+    "Current number of active HTTP requests by method and endpoint",
+    ["method", "endpoint"],
+    registry=get_metrics_registry(),
+)
+
+# Application Metrics
+#
+# Unlabelled collectors registered on the shared registry. Nothing in this
+# module sets or increments them; presumably the fail2ban-facing code does —
+# confirm against the callers.
+
+# Gauge for the number of banned IPs across all jails.
+# NOTE(review): Prometheus naming conventions reserve the ``_total`` suffix
+# for counters. Renaming would change the exported series, so this is only
+# flagged here.
+bans_total = Gauge(
+    "bangui_bans_total",
+    "Total number of banned IPs across all jails",
+    registry=get_metrics_registry(),
+)
+
+# Gauge for the number of fail2ban jails (same ``_total`` naming caveat).
+jails_total = Gauge(
+    "bangui_jails_total",
+    "Total number of fail2ban jails",
+    registry=get_metrics_registry(),
+)
+
+# Counter of fail2ban connection errors.
+fail2ban_connection_errors = Counter(
+    "bangui_fail2ban_connection_errors_total",
+    "Total number of fail2ban connection errors",
+    registry=get_metrics_registry(),
+)
+
+# Application startup and health
+
+# Unlabelled Summary registered on the shared registry.
+# NOTE(review): a Summary reports the count and sum of observed values, so it
+# does not expose elapsed uptime on its own. A Gauge may have been intended —
+# confirm with callers before changing the metric type.
+app_uptime = Summary(
+    "bangui_uptime_seconds",
+    "Application uptime in seconds",
+    registry=get_metrics_registry(),
+)
+
+
+def get_metrics() -> bytes:
+    """Serialize every metric held by the shared registry.
+
+    Returns:
+        The result of ``generate_latest`` for the registry returned by
+        ``get_metrics_registry()``, i.e. the Prometheus text exposition as
+        bytes.
+    """
+    registry = get_metrics_registry()
+    return generate_latest(registry)
+
+
+def get_metrics_content_type() -> str:
+    """Return the Content-Type value for the Prometheus metrics payload.
+
+    Returns:
+        ``CONTENT_TYPE_LATEST`` as imported from ``prometheus_client``.
+    """
+    content_type: str = CONTENT_TYPE_LATEST
+    return content_type