Aniworld/src/server/api/health.py
Lukas 27108aacda Fix architecture issues from todolist
- Add documentation warnings for in-memory rate limiting and failed login attempts
- Consolidate duplicate health endpoints into api/health.py
- Fix CLI to use correct async rescan method names
- Update download.py and anime.py to use custom exception classes
- Add WebSocket room validation and rate limiting
2025-12-15 14:23:41 +01:00

280 lines
7.8 KiB
Python

"""Health check endpoints for system monitoring and status verification."""
import logging
from datetime import datetime
from typing import Any, Dict, Optional
import psutil
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from src.server.utils.dependencies import get_database_session
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/health", tags=["health"])
class HealthStatus(BaseModel):
"""Basic health status response."""
status: str
timestamp: str
version: str = "1.0.0"
service: str = "aniworld-api"
series_app_initialized: bool = False
anime_directory_configured: bool = False
class DatabaseHealth(BaseModel):
"""Database health status."""
status: str
connection_time_ms: float
message: Optional[str] = None
class SystemMetrics(BaseModel):
"""System resource metrics."""
cpu_percent: float
memory_percent: float
memory_available_mb: float
disk_percent: float
disk_free_mb: float
uptime_seconds: float
class DependencyHealth(BaseModel):
"""Health status of external dependencies."""
database: DatabaseHealth
filesystem: Dict[str, Any]
system: SystemMetrics
class DetailedHealthStatus(BaseModel):
"""Comprehensive health check response."""
status: str
timestamp: str
version: str = "1.0.0"
dependencies: DependencyHealth
startup_time: datetime
# Global startup time
startup_time = datetime.now()
async def check_database_health(db: AsyncSession) -> DatabaseHealth:
"""Check database connection and performance.
Args:
db: Database session dependency.
Returns:
DatabaseHealth: Database status and connection time.
"""
try:
import time
start_time = time.time()
await db.execute(text("SELECT 1"))
connection_time = (time.time() - start_time) * 1000 # Convert to milliseconds
return DatabaseHealth(
status="healthy",
connection_time_ms=connection_time,
message="Database connection successful",
)
except Exception as e:
logger.error(f"Database health check failed: {e}")
return DatabaseHealth(
status="unhealthy",
connection_time_ms=0,
message=f"Database connection failed: {str(e)}",
)
async def check_filesystem_health() -> Dict[str, Any]:
"""Check filesystem availability and permissions.
Returns:
dict: Filesystem status and available space.
"""
try:
import os
data_dir = "data"
logs_dir = "logs"
data_accessible = os.path.exists(data_dir) and os.access(data_dir, os.W_OK)
logs_accessible = os.path.exists(logs_dir) and os.access(logs_dir, os.W_OK)
return {
"status": "healthy" if (data_accessible and logs_accessible) else "degraded",
"data_dir_writable": data_accessible,
"logs_dir_writable": logs_accessible,
"message": "Filesystem check completed",
}
except Exception as e:
logger.error(f"Filesystem health check failed: {e}")
return {
"status": "unhealthy",
"message": f"Filesystem check failed: {str(e)}",
}
def get_system_metrics() -> SystemMetrics:
"""Get system resource metrics.
Returns:
SystemMetrics: CPU, memory, disk, and uptime information.
"""
try:
import os
import time
# CPU usage
cpu_percent = psutil.cpu_percent(interval=1)
# Memory usage
memory_info = psutil.virtual_memory()
memory_percent = memory_info.percent
memory_available_mb = memory_info.available / (1024 * 1024)
# Disk usage
disk_info = psutil.disk_usage("/")
disk_percent = disk_info.percent
disk_free_mb = disk_info.free / (1024 * 1024)
# Uptime
boot_time = psutil.boot_time()
uptime_seconds = time.time() - boot_time
return SystemMetrics(
cpu_percent=cpu_percent,
memory_percent=memory_percent,
memory_available_mb=memory_available_mb,
disk_percent=disk_percent,
disk_free_mb=disk_free_mb,
uptime_seconds=uptime_seconds,
)
except Exception as e:
logger.error(f"System metrics collection failed: {e}")
raise HTTPException(
status_code=500, detail=f"Failed to collect system metrics: {str(e)}"
)
@router.get("", response_model=HealthStatus)
async def basic_health_check() -> HealthStatus:
"""Basic health check endpoint.
This endpoint does not depend on anime_directory configuration
and should always return 200 OK for basic health monitoring.
Includes service information for identification.
Returns:
HealthStatus: Simple health status with timestamp and service info.
"""
from src.config.settings import settings
from src.server.utils.dependencies import _series_app
logger.debug("Basic health check requested")
return HealthStatus(
status="healthy",
timestamp=datetime.now().isoformat(),
service="aniworld-api",
series_app_initialized=_series_app is not None,
anime_directory_configured=bool(settings.anime_directory),
)
@router.get("/detailed", response_model=DetailedHealthStatus)
async def detailed_health_check(
db: AsyncSession = Depends(get_database_session),
) -> DetailedHealthStatus:
"""Comprehensive health check endpoint.
Checks database, filesystem, and system metrics.
Args:
db: Database session dependency.
Returns:
DetailedHealthStatus: Comprehensive health information.
"""
logger.debug("Detailed health check requested")
try:
# Check dependencies
database_health = await check_database_health(db)
filesystem_health = await check_filesystem_health()
system_metrics = get_system_metrics()
# Determine overall status
overall_status = "healthy"
if database_health.status != "healthy":
overall_status = "degraded"
if filesystem_health.get("status") != "healthy":
overall_status = "degraded"
dependencies = DependencyHealth(
database=database_health,
filesystem=filesystem_health,
system=system_metrics,
)
return DetailedHealthStatus(
status=overall_status,
timestamp=datetime.now().isoformat(),
dependencies=dependencies,
startup_time=startup_time,
)
except Exception as e:
logger.error(f"Detailed health check failed: {e}")
raise HTTPException(status_code=500, detail="Health check failed")
@router.get("/metrics", response_model=SystemMetrics)
async def get_metrics() -> SystemMetrics:
"""Get system resource metrics.
Returns:
SystemMetrics: Current CPU, memory, disk, and uptime metrics.
"""
logger.debug("System metrics requested")
return get_system_metrics()
@router.get("/metrics/prometheus")
async def get_prometheus_metrics() -> str:
"""Get metrics in Prometheus format.
Returns:
str: Prometheus formatted metrics.
"""
from src.server.utils.metrics import get_metrics_collector
logger.debug("Prometheus metrics requested")
collector = get_metrics_collector()
return collector.export_prometheus_format()
@router.get("/metrics/json")
async def get_metrics_json() -> Dict[str, Any]:
"""Get metrics as JSON.
Returns:
dict: Metrics in JSON format.
"""
from src.server.utils.metrics import get_metrics_collector
logger.debug("JSON metrics requested")
collector = get_metrics_collector()
return collector.export_json()