- Add _run_startup_health_checks() function in fastapi_app.py - Check ffmpeg availability (warning) - Check DNS resolution for aniworld.to and api.themoviedb.org (warning) - Check anime_directory configuration and writability (error) - Store startup checks in app.state for health endpoint access - Add /health/ready endpoint for container orchestrators - Returns not_ready with 503 when critical failures present - Includes critical_failures list for debugging - Update /health endpoint to include startup check results - Status reflects worst check (error > warning > ok) - Document health check endpoints in DEVELOPMENT.md - Add unit tests for startup health checks - Add unit tests for /health/ready endpoint Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
344 lines
10 KiB
Python
344 lines
10 KiB
Python
"""Health check endpoints for system monitoring and status verification."""
|
|
|
|
import logging
from datetime import datetime
from typing import Any, Dict, Optional

import psutil
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import PlainTextResponse
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from src.server.utils.dependencies import get_database_session
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Every route declared on this router is served under the "/health" prefix
# and tagged "health".
router = APIRouter(prefix="/health", tags=["health"])
|
|
|
|
|
|
class HealthStatus(BaseModel):
    """Response body of the basic health endpoint (``GET /health``).

    Combines a coarse overall status with service identification, scheduler
    run times and the startup check results stored on the application state.
    """

    # Overall status; basic_health_check sets "healthy", "degraded" or
    # "unhealthy" depending on the startup check results.
    status: str
    # Time the response was built, as an ISO-8601 string
    # (datetime.now().isoformat()).
    timestamp: str
    # Hard-coded service version reported to clients.
    version: str = "1.0.1"
    # Service identifier reported to clients.
    service: str = "aniworld-api"
    # True when the `_series_app` dependency object is not None.
    series_app_initialized: bool = False
    # True when `settings.anime_directory` is truthy.
    anime_directory_configured: bool = False
    # "next_run" entry of the scheduler status, when available.
    scheduler_next_run: Optional[str] = None
    # "last_run" entry of the scheduler status, when available.
    scheduler_last_run: Optional[str] = None
    # Startup check results read from `app.state.startup_checks`; None when
    # that attribute has not been set.
    checks: Optional[Dict[str, Any]] = None
|
|
|
|
|
|
class DatabaseHealth(BaseModel):
    """Result of a single database connectivity probe."""

    # "healthy" or "unhealthy", as set by check_database_health.
    status: str
    # Duration of the probe query in milliseconds; check_database_health
    # reports 0 when the probe failed.
    connection_time_ms: float
    # Human-readable outcome, including the error text on failure.
    message: Optional[str] = None
|
|
|
|
|
|
class SystemMetrics(BaseModel):
    """Snapshot of host resource usage as collected by get_system_metrics."""

    # CPU utilisation percentage (psutil.cpu_percent, sampled over 1 second).
    cpu_percent: float
    # Percentage of virtual memory in use (psutil.virtual_memory().percent).
    memory_percent: float
    # Available virtual memory, converted from bytes to MiB.
    memory_available_mb: float
    # Usage percentage of the filesystem mounted at "/".
    disk_percent: float
    # Free space on the filesystem mounted at "/", converted to MiB.
    disk_free_mb: float
    # Seconds elapsed since system boot (time.time() - psutil.boot_time()),
    # i.e. host uptime rather than process uptime.
    uptime_seconds: float
|
|
|
|
|
|
class DependencyHealth(BaseModel):
    """Aggregated health of the dependencies probed by the detailed check."""

    # Result of the database connectivity probe.
    database: DatabaseHealth
    # Free-form dict produced by check_filesystem_health ("status",
    # "message" and, on success, the per-directory writability flags).
    filesystem: Dict[str, Any]
    # Host resource metrics.
    system: SystemMetrics
|
|
|
|
|
|
class DetailedHealthStatus(BaseModel):
    """Response body of the detailed health endpoint (``GET /health/detailed``)."""

    # Overall status; detailed_health_check sets "healthy" or "degraded".
    status: str
    # Time the response was built, as an ISO-8601 string.
    timestamp: str
    # Hard-coded service version reported to clients.
    version: str = "1.0.1"
    # Per-dependency health details.
    dependencies: DependencyHealth
    # Value of the module-level `startup_time` passed in by the endpoint.
    startup_time: datetime
|
|
|
|
|
|
# Global startup time: captured once, when this module is first imported.
# It is a naive local-time datetime (no tzinfo) and is reported verbatim in
# DetailedHealthStatus.startup_time.
startup_time = datetime.now()
|
|
|
|
|
|
async def check_database_health(db: AsyncSession) -> DatabaseHealth:
    """Check database connectivity and measure the probe's round-trip time.

    Runs a trivial ``SELECT 1`` through the supplied session. Any exception
    raised by the query is logged and reported as an "unhealthy" result
    instead of being propagated, so callers always receive a DatabaseHealth.

    Args:
        db: Database session dependency.

    Returns:
        DatabaseHealth: "healthy" with the elapsed time in milliseconds on
        success; "unhealthy" with ``connection_time_ms=0`` and the error
        text on failure.
    """
    import time

    try:
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed (or go negative) when the wall clock is adjusted mid-query.
        started = time.perf_counter()
        await db.execute(text("SELECT 1"))
        elapsed_ms = (time.perf_counter() - started) * 1000  # seconds -> ms
    except Exception as e:
        logger.error("Database health check failed: %s", e)
        return DatabaseHealth(
            status="unhealthy",
            connection_time_ms=0,
            message=f"Database connection failed: {str(e)}",
        )

    return DatabaseHealth(
        status="healthy",
        connection_time_ms=elapsed_ms,
        message="Database connection successful",
    )
|
|
|
|
|
|
async def check_filesystem_health() -> Dict[str, Any]:
    """Report whether the ``data`` and ``logs`` directories are writable.

    Both paths are relative, so they are resolved against the process's
    current working directory.

    Returns:
        dict: ``status`` is "healthy" when both paths exist and are
        writable, otherwise "degraded"; the per-directory flags and a
        message are included. If the probe itself raises, ``status`` is
        "unhealthy" and only the error message is returned.
    """
    try:
        import os

        def _exists_and_writable(path: str) -> bool:
            # A path counts as accessible only if it exists AND is writable.
            return os.path.exists(path) and os.access(path, os.W_OK)

        data_ok = _exists_and_writable("data")
        logs_ok = _exists_and_writable("logs")

        if data_ok and logs_ok:
            state = "healthy"
        else:
            state = "degraded"

        report: Dict[str, Any] = {
            "status": state,
            "data_dir_writable": data_ok,
            "logs_dir_writable": logs_ok,
            "message": "Filesystem check completed",
        }
        return report
    except Exception as e:
        logger.error("Filesystem health check failed: %s", e)
        failure: Dict[str, Any] = {
            "status": "unhealthy",
            "message": f"Filesystem check failed: {str(e)}",
        }
        return failure
|
|
|
|
|
|
def get_system_metrics() -> SystemMetrics:
    """Collect host CPU, memory, disk and uptime metrics via psutil.

    NOTE: the CPU sample uses ``interval=1``, so this call blocks the
    calling thread for about one second.

    Returns:
        SystemMetrics: CPU, memory, disk, and uptime information.

    Raises:
        HTTPException: With status 500 when any metric cannot be collected;
        the original error is attached as the exception's cause.
    """
    import time

    bytes_per_mb = 1024 * 1024

    try:
        # CPU usage, averaged over a blocking 1-second sampling window.
        cpu_percent = psutil.cpu_percent(interval=1)

        # Memory usage
        memory_info = psutil.virtual_memory()

        # Disk usage of the root filesystem
        disk_info = psutil.disk_usage("/")

        # Uptime of the host (seconds since boot), not of this process.
        uptime_seconds = time.time() - psutil.boot_time()

        return SystemMetrics(
            cpu_percent=cpu_percent,
            memory_percent=memory_info.percent,
            memory_available_mb=memory_info.available / bytes_per_mb,
            disk_percent=disk_info.percent,
            disk_free_mb=disk_info.free / bytes_per_mb,
            uptime_seconds=uptime_seconds,
        )
    except Exception as e:
        logger.error("System metrics collection failed: %s", e)
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(
            status_code=500, detail=f"Failed to collect system metrics: {str(e)}"
        ) from e
|
|
|
|
|
|
@router.get("", response_model=HealthStatus)
async def basic_health_check(request: Request) -> HealthStatus:
    """Basic health check endpoint.

    This endpoint does not depend on anime_directory configuration
    and should always return 200 OK for basic health monitoring.
    Includes service information for identification.
    Includes scheduler next/last run times for monitoring tools.
    Includes startup health check results.

    The reported status reflects the worst startup check: any "error"
    yields "unhealthy", otherwise any "warning" yields "degraded",
    otherwise "healthy".

    Args:
        request: Incoming request; used to reach ``app.state.startup_checks``.

    Returns:
        HealthStatus: Simple health status with timestamp and service info.
    """
    from src.config.settings import settings
    from src.server.utils.dependencies import _series_app

    # Get scheduler status for health monitoring. This is best-effort: a
    # missing or failing scheduler must never break the health endpoint.
    scheduler_status: dict = {}
    try:
        from src.server.services.scheduler_service import get_scheduler_service

        # Guard against a scheduler that reports no status at all.
        scheduler_status = get_scheduler_service().get_status() or {}
    except Exception as exc:
        # Logged at debug level (not swallowed silently) because this
        # endpoint is polled frequently and the condition may be expected.
        logger.debug("Scheduler status unavailable for health check: %s", exc)

    # Get startup checks from app state
    checks = getattr(request.app.state, "startup_checks", None)

    # Determine overall status based on the worst check result. Entries
    # that are not dicts are ignored rather than crashing the endpoint.
    overall_status = "healthy"
    if checks:
        reported = {
            check_data.get("status")
            for check_data in checks.values()
            if isinstance(check_data, dict)
        }
        if "error" in reported:
            overall_status = "unhealthy"
        elif "warning" in reported:
            overall_status = "degraded"

    logger.debug("Basic health check requested")
    return HealthStatus(
        status=overall_status,
        timestamp=datetime.now().isoformat(),
        service="aniworld-api",
        series_app_initialized=_series_app is not None,
        anime_directory_configured=bool(settings.anime_directory),
        scheduler_next_run=scheduler_status.get("next_run"),
        scheduler_last_run=scheduler_status.get("last_run"),
        checks=checks,
    )
|
|
|
|
|
|
@router.get("/ready")
async def ready_check(request: Request) -> Dict[str, Any]:
    """Readiness check endpoint for container orchestrators.

    Returns 503 if critical dependencies are not available.
    This endpoint is used by Kubernetes, Docker Swarm, etc. to determine
    if the container should receive traffic.

    A startup check counts as a critical failure when its ``status`` is
    "error". Warnings do not affect readiness.

    Args:
        request: Incoming request; used to reach ``app.state.startup_checks``.

    Returns:
        dict: Readiness status with checks details (HTTP 200) when no
        critical failure is recorded. When at least one is recorded, a
        JSON response with the same shape plus ``critical_failures`` is
        returned with HTTP 503.
    """
    from fastapi.encoders import jsonable_encoder
    from fastapi.responses import JSONResponse

    # Treat both a missing attribute and an explicit None as "no checks".
    checks = getattr(request.app.state, "startup_checks", None) or {}

    critical_failures = [
        f"{check_name}: {check_data.get('message')}"
        for check_name, check_data in checks.items()
        if check_data.get("status") == "error"
    ]

    if critical_failures:
        not_ready_body = {
            "status": "not_ready",
            "ready": False,
            "timestamp": datetime.now().isoformat(),
            "critical_failures": critical_failures,
            "checks": checks,
        }
        # A plain dict return would be sent with HTTP 200, which tells the
        # orchestrator the container is ready. An explicit response object
        # is required to signal 503; the body shape is unchanged.
        return JSONResponse(
            status_code=503, content=jsonable_encoder(not_ready_body)
        )

    return {
        "status": "ready",
        "ready": True,
        "timestamp": datetime.now().isoformat(),
        "checks": checks,
    }
|
|
|
|
|
|
@router.get("/detailed", response_model=DetailedHealthStatus)
async def detailed_health_check(
    db: AsyncSession = Depends(get_database_session),
) -> DetailedHealthStatus:
    """Comprehensive health check endpoint.

    Checks database, filesystem, and system metrics. The overall status is
    "healthy" only when both the database and the filesystem report
    "healthy"; otherwise it is "degraded".

    Args:
        db: Database session dependency.

    Returns:
        DetailedHealthStatus: Comprehensive health information.

    Raises:
        HTTPException: With status 500 when any part of the check raises.
    """
    import asyncio

    logger.debug("Detailed health check requested")

    try:
        # Check dependencies
        database_health = await check_database_health(db)
        filesystem_health = await check_filesystem_health()

        # get_system_metrics() is synchronous and samples CPU usage over a
        # blocking 1-second window; run it in the default executor so the
        # event loop keeps serving other requests meanwhile.
        loop = asyncio.get_running_loop()
        system_metrics = await loop.run_in_executor(None, get_system_metrics)

        # Determine overall status
        dependencies_healthy = (
            database_health.status == "healthy"
            and filesystem_health.get("status") == "healthy"
        )
        overall_status = "healthy" if dependencies_healthy else "degraded"

        dependencies = DependencyHealth(
            database=database_health,
            filesystem=filesystem_health,
            system=system_metrics,
        )

        return DetailedHealthStatus(
            status=overall_status,
            timestamp=datetime.now().isoformat(),
            dependencies=dependencies,
            startup_time=startup_time,
        )
    except Exception as e:
        logger.error("Detailed health check failed: %s", e)
        # Generic detail on purpose; the cause is chained for server logs.
        raise HTTPException(status_code=500, detail="Health check failed") from e
|
|
|
|
|
|
@router.get("/metrics", response_model=SystemMetrics)
async def get_metrics() -> SystemMetrics:
    """Get system resource metrics.

    Returns:
        SystemMetrics: Current CPU, memory, disk, and uptime metrics.

    Raises:
        HTTPException: Propagated from get_system_metrics (status 500) when
        metric collection fails.
    """
    import asyncio

    logger.debug("System metrics requested")

    # get_system_metrics() is synchronous and blocks for about one second
    # while sampling CPU usage; run it in the default executor so the event
    # loop is not stalled for the duration of the sample.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, get_system_metrics)
|
|
|
|
|
|
@router.get("/metrics/prometheus", response_class=PlainTextResponse)
async def get_prometheus_metrics() -> str:
    """Get metrics in Prometheus format.

    The route declares ``PlainTextResponse`` as its response class so the
    exported text is sent verbatim. With the default JSON response class
    the returned string would be JSON-encoded (wrapped in quotes, newlines
    escaped), which is not the Prometheus text format.

    Returns:
        str: Prometheus formatted metrics.
    """
    from src.server.utils.metrics import get_metrics_collector

    logger.debug("Prometheus metrics requested")
    collector = get_metrics_collector()
    return collector.export_prometheus_format()
|
|
|
|
|
|
@router.get("/metrics/json")
async def get_metrics_json() -> Dict[str, Any]:
    """Expose the metrics collector's data as a JSON document.

    Returns:
        dict: Whatever the metrics collector's ``export_json()`` produces.
    """
    from src.server.utils.metrics import get_metrics_collector

    logger.debug("JSON metrics requested")
    return get_metrics_collector().export_json()
|