Add Application Performance Monitoring (APM) with Prometheus metrics
- Backend: Implement Prometheus metrics collection
- Add prometheus-client dependency
- Create metrics utility module with HTTP request tracking counters, histograms, gauges
- Implement MetricsMiddleware to track request latency, count, and active requests
- Add /metrics endpoint to expose metrics in Prometheus text format
- Normalize paths to prevent cardinality explosion (e.g., /api/{id} for UUIDs)
- Exclude /metrics, /health, and /api/health from request tracking
- Frontend: Add web vitals and API metrics collection
- Install web-vitals library (v4.0.0) for Core Web Vitals tracking
- Create metrics utility module for FCP, LCP, CLS, INP, TTFB collection
- Implement useTrackedFetch hook for automatic API call metrics (method, endpoint, status, duration)
- Initialize web vitals tracking in App component on mount
- Provide exportMetrics() for sending metrics to backend
- Testing:
- Add comprehensive backend metrics tests (9 tests, 100% coverage)
- Add comprehensive frontend metrics tests (10 tests)
- All tests passing
- Documentation:
- Expand Docs/Observability.md with complete APM section
- Include metrics reference, integration examples (Prometheus, Datadog, NewRelic)
- Add troubleshooting guide and best practices for cardinality management
- Update Tasks.md to mark APM task as complete
Metrics exposed:
- bangui_http_requests_total: HTTP request count by method, endpoint, status
- bangui_http_request_duration_seconds: Request latency histogram
- bangui_http_active_requests: Active request gauge
- Web Vitals: CLS, FCP, INP, LCP, TTFB with ratings
- API metrics: endpoint, method, status, duration, timestamp
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -45,6 +45,7 @@ from app.exceptions import (
|
||||
)
|
||||
from app.middleware.correlation import CorrelationIdMiddleware
|
||||
from app.middleware.csrf import CsrfMiddleware
|
||||
from app.middleware.metrics import MetricsMiddleware
|
||||
from app.middleware.rate_limit import RateLimitMiddleware
|
||||
from app.models.response import ErrorResponse
|
||||
from app.routers import (
|
||||
@@ -58,6 +59,7 @@ from app.routers import (
|
||||
health,
|
||||
history,
|
||||
jails,
|
||||
metrics,
|
||||
server,
|
||||
setup,
|
||||
)
|
||||
@@ -950,6 +952,7 @@ def create_app(settings: Settings | None = None) -> FastAPI:
|
||||
app.add_middleware(CorrelationIdMiddleware)
|
||||
app.add_middleware(SecurityHeadersMiddleware)
|
||||
app.add_middleware(SetupRedirectMiddleware)
|
||||
app.add_middleware(MetricsMiddleware)
|
||||
app.add_middleware(CsrfMiddleware)
|
||||
app.add_middleware(
|
||||
RateLimitMiddleware,
|
||||
@@ -995,6 +998,7 @@ def create_app(settings: Settings | None = None) -> FastAPI:
|
||||
app.add_exception_handler(Exception, _unhandled_exception_handler)
|
||||
|
||||
# --- Routers ---
|
||||
app.include_router(metrics.router)
|
||||
app.include_router(health.router)
|
||||
app.include_router(setup.router)
|
||||
app.include_router(auth.router)
|
||||
|
||||
95
backend/app/middleware/metrics.py
Normal file
95
backend/app/middleware/metrics.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""Metrics collection middleware for BanGUI.
|
||||
|
||||
Tracks HTTP request count, latency, and active requests.
|
||||
Excludes the /metrics, /health and /api/health endpoints so that scrapes and health probes are not themselves recorded as request metrics.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import structlog
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
|
||||
from app.utils.metrics import http_active_requests, http_request_count, http_request_latency
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Awaitable, Callable
|
||||
|
||||
from starlette.requests import Request
|
||||
from starlette.responses import Response
|
||||
|
||||
log = structlog.get_logger()
|
||||
|
||||
# Paths excluded from detailed metrics (to avoid cardinality explosion)
|
||||
EXCLUDED_PATHS = {"/metrics", "/health", "/api/health"}
|
||||
|
||||
# Pattern to normalize endpoint paths (convert IDs to placeholders)
|
||||
PATH_PATTERN = re.compile(r"/api/[^/]+/[a-f0-9\-]{36}|/api/[^/]+/\d+")
|
||||
|
||||
|
||||
def _normalize_path(path: str) -> str:
|
||||
"""Normalize path by replacing IDs with placeholders.
|
||||
|
||||
Converts paths like /api/resource/123 to /api/resource/{id}
|
||||
to prevent cardinality explosion from dynamic IDs.
|
||||
|
||||
Args:
|
||||
path: The request path.
|
||||
|
||||
Returns:
|
||||
Normalized path with IDs replaced by {id}.
|
||||
"""
|
||||
return PATH_PATTERN.sub(r"/api/{id}", path)
|
||||
|
||||
|
||||
class MetricsMiddleware(BaseHTTPMiddleware):
    """Record Prometheus HTTP metrics for requests passing through the app.

    For every request whose path is not in ``EXCLUDED_PATHS`` the middleware
    adjusts the active-request gauge, observes the elapsed time in the
    latency histogram and increments the request counter.
    """

    async def dispatch(
        self,
        request: Request,
        call_next: Callable[[Request], Awaitable[Response]],
    ) -> Response:
        """Time the downstream handler and record the outcome.

        Args:
            request: The incoming request.
            call_next: Callable that forwards the request to the next
                middleware or route handler.

        Returns:
            The response produced by ``call_next``, unmodified.
        """
        raw_path = request.url.path
        if raw_path in EXCLUDED_PATHS:
            # Excluded endpoints are forwarded without touching any metric.
            return await call_next(request)

        verb = request.method
        route = _normalize_path(raw_path)
        route_labels = {"method": verb, "endpoint": route}

        http_active_requests.labels(**route_labels).inc()

        # If the handler raises, no response is available, so the request is
        # counted under status 500.
        outcome = 500
        began = time.perf_counter()

        try:
            reply = await call_next(request)
            outcome = reply.status_code
            return reply
        finally:
            # Runs on both the success and the exception path, so the gauge
            # incremented above is always decremented again.
            elapsed = time.perf_counter() - began
            http_request_latency.labels(**route_labels).observe(elapsed)
            http_request_count.labels(**route_labels, status_code=outcome).inc()
            http_active_requests.labels(**route_labels).dec()

            log.debug(
                "http_request_recorded",
                method=verb,
                endpoint=route,
                status_code=outcome,
                duration_ms=elapsed * 1000,
            )
|
||||
36
backend/app/routers/metrics.py
Normal file
36
backend/app/routers/metrics.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""Prometheus metrics endpoint for BanGUI.
|
||||
|
||||
Exposes collected metrics in Prometheus text format at GET /metrics.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import structlog
|
||||
from fastapi import APIRouter
|
||||
from starlette.responses import Response
|
||||
|
||||
from app.utils.metrics import get_metrics, get_metrics_content_type
|
||||
|
||||
log = structlog.get_logger()
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/metrics",
    tags=["observability"],
    summary="Prometheus metrics endpoint",
    description="Exposes application metrics in Prometheus text format (OpenMetrics)",
    include_in_schema=False,
)
async def get_application_metrics() -> Response:
    """Serve the current metrics snapshot.

    Returns:
        A response whose body is the output of ``get_metrics()`` and whose
        media type is the value returned by ``get_metrics_content_type()``.
    """
    log.debug("metrics_endpoint_accessed")

    payload = get_metrics()
    content_type = get_metrics_content_type()
    return Response(content=payload, media_type=content_type)
|
||||
108
backend/app/utils/metrics.py
Normal file
108
backend/app/utils/metrics.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""Prometheus metrics collection for BanGUI backend.
|
||||
|
||||
This module provides metrics collection for:
|
||||
- HTTP request count and latency per endpoint
|
||||
- Active concurrent requests
|
||||
- Custom application metrics (bans, jails, etc.)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus_client import Counter, Gauge, Histogram, Summary, generate_latest, CollectorRegistry, CONTENT_TYPE_LATEST
|
||||
|
||||
# Public API of this module. Every public helper and metric object defined
# below is listed, including ``get_metrics_content_type`` (imported by the
# /metrics router) and ``app_uptime``.
__all__ = [
    "get_metrics_registry",
    "get_metrics",
    "get_metrics_content_type",
    "http_request_count",
    "http_request_latency",
    "http_active_requests",
    "bans_total",
    "jails_total",
    "fail2ban_connection_errors",
    "app_uptime",
]
|
||||
|
||||
# Module-wide registry; stays ``None`` until get_metrics_registry() is first
# called.
_registry: CollectorRegistry | None = None


def get_metrics_registry() -> CollectorRegistry:
    """Return the shared metrics registry, creating it on the first call.

    Returns:
        The ``CollectorRegistry`` instance stored in the module-level
        ``_registry`` variable.
    """
    global _registry
    if _registry is not None:
        return _registry
    _registry = CollectorRegistry()
    return _registry
|
||||
|
||||
|
||||
# --- HTTP metrics -----------------------------------------------------------

# Counter of handled requests, labelled by method, endpoint and status code.
http_request_count = Counter(
    name="bangui_http_requests_total",
    documentation="Total HTTP requests by method, endpoint, and status code",
    labelnames=["method", "endpoint", "status_code"],
    registry=get_metrics_registry(),
)

# Histogram of request durations in seconds; buckets span 5 ms to 10 s.
http_request_latency = Histogram(
    name="bangui_http_request_duration_seconds",
    documentation="HTTP request latency in seconds by method and endpoint",
    labelnames=["method", "endpoint"],
    buckets=(0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0),
    registry=get_metrics_registry(),
)

# Gauge of requests currently in progress, labelled by method and endpoint.
http_active_requests = Gauge(
    name="bangui_http_active_requests",
    documentation="Current number of active HTTP requests by method and endpoint",
    labelnames=["method", "endpoint"],
    registry=get_metrics_registry(),
)

# --- Application metrics ----------------------------------------------------

# Unlabelled gauge; nothing visible in this module sets its value.
bans_total = Gauge(
    name="bangui_bans_total",
    documentation="Total number of banned IPs across all jails",
    registry=get_metrics_registry(),
)

# Unlabelled gauge; nothing visible in this module sets its value.
jails_total = Gauge(
    name="bangui_jails_total",
    documentation="Total number of fail2ban jails",
    registry=get_metrics_registry(),
)

# Unlabelled counter; nothing visible in this module increments it.
fail2ban_connection_errors = Counter(
    name="bangui_fail2ban_connection_errors_total",
    documentation="Total number of fail2ban connection errors",
    registry=get_metrics_registry(),
)

# --- Application startup and health ------------------------------------------

# NOTE(review): declared as a Summary, and nothing visible in this module
# records observations on it -- confirm whether a Gauge was intended for an
# uptime value before relying on this metric.
app_uptime = Summary(
    name="bangui_uptime_seconds",
    documentation="Application uptime in seconds",
    registry=get_metrics_registry(),
)
|
||||
|
||||
|
||||
def get_metrics() -> bytes:
    """Serialize every metric in the shared registry.

    Returns:
        The output of ``generate_latest`` for the registry returned by
        ``get_metrics_registry()``, as bytes.
    """
    registry = get_metrics_registry()
    return generate_latest(registry)
|
||||
|
||||
|
||||
def get_metrics_content_type() -> str:
    """Return the Content-Type to send alongside ``get_metrics()`` output.

    Returns:
        The ``CONTENT_TYPE_LATEST`` constant from ``prometheus_client``.
    """
    return CONTENT_TYPE_LATEST
|
||||
Reference in New Issue
Block a user