backup
This commit is contained in:
266
src/server/api/health.py
Normal file
266
src/server/api/health.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""Health check endpoints for system monitoring and status verification."""
|
||||
|
||||
import asyncio
import logging
import os
import time
from datetime import datetime
from typing import Any, Dict, Optional

import psutil
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import PlainTextResponse
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from src.server.utils.dependencies import get_database_session
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Every route registered on this router is served under the "/health" prefix
# and carries the "health" tag.
router = APIRouter(prefix="/health", tags=["health"])
|
||||
|
||||
|
||||
class HealthStatus(BaseModel):
    """Response body of the lightweight health probe.

    Attributes:
        status: Health label; the basic endpoint in this module always
            reports ``"healthy"``.
        timestamp: Moment the response was built, as an ISO 8601 string.
        version: Version string included in the response; defaults to
            ``"1.0.0"``.
    """

    status: str
    timestamp: str
    version: str = "1.0.0"
|
||||
|
||||
|
||||
class DatabaseHealth(BaseModel):
    """Outcome of the database connectivity probe.

    Attributes:
        status: ``"healthy"`` or ``"unhealthy"`` as set by
            ``check_database_health``.
        connection_time_ms: Duration of the probe query in milliseconds;
            ``check_database_health`` reports 0 when the probe fails.
        message: Optional human-readable detail about the outcome.
    """

    status: str
    connection_time_ms: float
    message: Optional[str] = None
|
||||
|
||||
|
||||
class SystemMetrics(BaseModel):
    """Snapshot of host resource usage.

    Attributes:
        cpu_percent: CPU utilisation as a percentage.
        memory_percent: Share of memory in use, as a percentage.
        memory_available_mb: Available memory; ``get_system_metrics``
            fills it with bytes divided by 1024 * 1024.
        disk_percent: Share of disk space in use, as a percentage.
        disk_free_mb: Free disk space; ``get_system_metrics`` fills it
            with bytes divided by 1024 * 1024.
        uptime_seconds: Seconds elapsed since the time reported by
            ``psutil.boot_time()`` (as populated by ``get_system_metrics``),
            not since this application started.
    """

    cpu_percent: float
    memory_percent: float
    memory_available_mb: float
    disk_percent: float
    disk_free_mb: float
    uptime_seconds: float
|
||||
|
||||
|
||||
class DependencyHealth(BaseModel):
    """Aggregated results of the individual dependency checks.

    Attributes:
        database: Result of the database connectivity probe.
        filesystem: Free-form mapping describing the filesystem check
            (the check in this module supplies ``status`` and ``message``
            keys, plus writability flags on success).
        system: Host resource metrics.
    """

    database: DatabaseHealth
    filesystem: Dict[str, Any]
    system: SystemMetrics
|
||||
|
||||
|
||||
class DetailedHealthStatus(BaseModel):
    """Response body of the comprehensive health endpoint.

    Attributes:
        status: Overall health label; the detailed endpoint in this module
            reports ``"healthy"`` or ``"degraded"``.
        timestamp: Moment the response was built, as an ISO 8601 string.
        version: Version string included in the response; defaults to
            ``"1.0.0"``.
        dependencies: Per-dependency check results.
        startup_time: Start time reported for the service; the detailed
            endpoint passes the module-level ``startup_time`` value.
    """

    status: str
    timestamp: str
    version: str = "1.0.0"
    dependencies: DependencyHealth
    startup_time: datetime
|
||||
|
||||
|
||||
# Global startup time
|
||||
# Evaluated once, when this module is first imported; datetime.now() without a
# tz argument yields a naive local-time value (no tzinfo).
startup_time = datetime.now()
|
||||
|
||||
|
||||
async def check_database_health(db: AsyncSession) -> DatabaseHealth:
    """Probe the database with a trivial query and time the round trip.

    Args:
        db: Async SQLAlchemy session used to run the probe query.

    Returns:
        DatabaseHealth: ``"healthy"`` with the measured latency in
        milliseconds when ``SELECT 1`` succeeds; otherwise ``"unhealthy"``
        with ``connection_time_ms`` set to 0 and the error text in
        ``message``. Failures are logged and reported, never raised.
    """
    # perf_counter() is monotonic and high resolution, so the measured latency
    # cannot be skewed (or turn negative) by wall-clock adjustments.
    started = time.perf_counter()
    try:
        await db.execute(text("SELECT 1"))
    except Exception as e:
        # Deliberately broad: a health probe must report any failure as data
        # instead of propagating it to the caller.
        logger.error("Database health check failed: %s", e)
        return DatabaseHealth(
            status="unhealthy",
            connection_time_ms=0,
            message=f"Database connection failed: {e}",
        )

    elapsed_ms = (time.perf_counter() - started) * 1000  # seconds -> ms
    return DatabaseHealth(
        status="healthy",
        connection_time_ms=elapsed_ms,
        message="Database connection successful",
    )
|
||||
|
||||
|
||||
async def check_filesystem_health() -> Dict[str, Any]:
    """Report whether the ``data`` and ``logs`` directories are writable.

    Both paths are relative, so they are resolved against the process's
    current working directory. A path only counts as writable when it is an
    existing directory; a regular file with the same name does not qualify.

    Returns:
        dict: On a completed check, ``status`` (``"healthy"`` when both
        directories are writable, ``"degraded"`` otherwise), the
        ``data_dir_writable`` and ``logs_dir_writable`` flags, and a
        ``message``. If the check itself raises, only ``status``
        (``"unhealthy"``) and ``message`` are returned.
    """
    try:
        # isdir() rather than exists(): a plain file named "data" or "logs"
        # must not be reported as a usable directory.
        writable = {
            name: os.path.isdir(name) and os.access(name, os.W_OK)
            for name in ("data", "logs")
        }
    except Exception as e:
        logger.error("Filesystem health check failed: %s", e)
        return {
            "status": "unhealthy",
            "message": f"Filesystem check failed: {e}",
        }

    return {
        "status": "healthy" if all(writable.values()) else "degraded",
        "data_dir_writable": writable["data"],
        "logs_dir_writable": writable["logs"],
        "message": "Filesystem check completed",
    }
|
||||
|
||||
|
||||
def get_system_metrics() -> SystemMetrics:
    """Collect a snapshot of host CPU, memory, disk and uptime figures.

    Note:
        ``psutil.cpu_percent(interval=1)`` blocks the calling thread for
        about one second while it samples CPU usage, so async callers
        should run this function in a worker thread.

    Returns:
        SystemMetrics: CPU and memory percentages, available memory and
        free disk space (bytes divided by 1024 * 1024), disk usage
        percentage for ``"/"``, and seconds since system boot.

    Raises:
        HTTPException: With status 500 if any metric cannot be collected.
    """
    bytes_per_mb = 1024 * 1024
    try:
        cpu_percent = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage("/")
        # boot_time() is an epoch timestamp, so wall-clock time.time() is the
        # matching clock for this subtraction.
        uptime_seconds = time.time() - psutil.boot_time()

        return SystemMetrics(
            cpu_percent=cpu_percent,
            memory_percent=memory.percent,
            memory_available_mb=memory.available / bytes_per_mb,
            disk_percent=disk.percent,
            disk_free_mb=disk.free / bytes_per_mb,
            uptime_seconds=uptime_seconds,
        )
    except Exception as e:
        logger.error("System metrics collection failed: %s", e)
        raise HTTPException(
            status_code=500, detail=f"Failed to collect system metrics: {e}"
        ) from e
|
||||
|
||||
|
||||
@router.get("", response_model=HealthStatus)
async def basic_health_check() -> HealthStatus:
    """Answer the lightweight health probe.

    Performs no dependency checks.

    Returns:
        HealthStatus: Status ``"healthy"`` stamped with the current time in
        ISO 8601 form.
    """
    logger.debug("Basic health check requested")
    now_iso = datetime.now().isoformat()
    return HealthStatus(status="healthy", timestamp=now_iso)
|
||||
|
||||
|
||||
@router.get("/detailed", response_model=DetailedHealthStatus)
async def detailed_health_check(
    db: AsyncSession = Depends(get_database_session),
) -> DetailedHealthStatus:
    """Run every dependency check and report an aggregated status.

    Checks the database, the filesystem and the host resource metrics.

    Args:
        db: Database session dependency.

    Returns:
        DetailedHealthStatus: Overall status (``"healthy"`` only when both
        the database and filesystem checks are healthy, ``"degraded"``
        otherwise), the individual check results, and the startup time.

    Raises:
        HTTPException: With status 500 and detail ``"Health check failed"``
            if any step raises, including a failure while collecting
            system metrics.
    """
    logger.debug("Detailed health check requested")

    try:
        database_health = await check_database_health(db)
        filesystem_health = await check_filesystem_health()

        # get_system_metrics() blocks for ~1 s while sampling CPU usage; run
        # it in the default executor so the event loop keeps serving requests.
        loop = asyncio.get_running_loop()
        system_metrics = await loop.run_in_executor(None, get_system_metrics)

        all_healthy = (
            database_health.status == "healthy"
            and filesystem_health.get("status") == "healthy"
        )

        return DetailedHealthStatus(
            status="healthy" if all_healthy else "degraded",
            timestamp=datetime.now().isoformat(),
            dependencies=DependencyHealth(
                database=database_health,
                filesystem=filesystem_health,
                system=system_metrics,
            ),
            startup_time=startup_time,
        )
    except Exception as e:
        # Keep the traceback in the log; the client only gets a generic detail.
        logger.exception("Detailed health check failed: %s", e)
        raise HTTPException(status_code=500, detail="Health check failed") from e
|
||||
|
||||
|
||||
@router.get("/metrics", response_model=SystemMetrics)
async def get_metrics() -> SystemMetrics:
    """Return the current system resource metrics.

    Returns:
        SystemMetrics: Current CPU, memory, disk, and uptime metrics.

    Raises:
        HTTPException: Propagated from ``get_system_metrics`` (status 500)
            when the metrics cannot be collected.
    """
    logger.debug("System metrics requested")
    # get_system_metrics() blocks for ~1 s while sampling CPU usage; run it in
    # the default executor so the event loop is not stalled for that time.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, get_system_metrics)
|
||||
|
||||
|
||||
# response_class=PlainTextResponse: without it FastAPI JSON-encodes the
# returned string (wrapped in quotes, newlines escaped, application/json),
# which a Prometheus scraper cannot parse.
@router.get("/metrics/prometheus", response_class=PlainTextResponse)
async def get_prometheus_metrics() -> str:
    """Return the collected metrics as Prometheus exposition text.

    Returns:
        str: Output of the metrics collector's
        ``export_prometheus_format()``, sent to the client as plain text.
    """
    # Imported at call time, as in the original code.
    # NOTE(review): presumably this avoids an import cycle — confirm.
    from src.server.utils.metrics import get_metrics_collector

    logger.debug("Prometheus metrics requested")
    collector = get_metrics_collector()
    return collector.export_prometheus_format()
|
||||
|
||||
|
||||
@router.get("/metrics/json")
async def get_metrics_json() -> Dict[str, Any]:
    """Return the collected metrics as a JSON-serialisable mapping.

    Returns:
        dict: Output of the metrics collector's ``export_json()``.
    """
    from src.server.utils.metrics import get_metrics_collector

    logger.debug("JSON metrics requested")
    return get_metrics_collector().export_json()
|
||||
Reference in New Issue
Block a user