Add startup health checks and /health/ready endpoint

- Add _run_startup_health_checks() function in fastapi_app.py
  - Check ffmpeg availability (warning)
  - Check DNS resolution for aniworld.to and api.themoviedb.org (warning)
  - Check anime_directory configuration and writability (error)
- Store startup checks in app.state for health endpoint access
- Add /health/ready endpoint for container orchestrators
  - Returns not_ready with 503 when critical failures present
  - Includes critical_failures list for debugging
- Update /health endpoint to include startup check results
  - Status reflects worst check (error > warning > ok)
- Document health check endpoints in DEVELOPMENT.md
- Add unit tests for startup health checks
- Add unit tests for /health/ready endpoint

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-23 22:12:03 +02:00
parent 9a20541598
commit 3551838887
5 changed files with 458 additions and 7 deletions

View File

@@ -5,7 +5,7 @@ from datetime import datetime
from typing import Any, Dict, Optional
import psutil
from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
@@ -28,6 +28,7 @@ class HealthStatus(BaseModel):
anime_directory_configured: bool = False
scheduler_next_run: Optional[str] = None
scheduler_last_run: Optional[str] = None
checks: Optional[Dict[str, Any]] = None
class DatabaseHealth(BaseModel):
@@ -173,13 +174,14 @@ def get_system_metrics() -> SystemMetrics:
@router.get("", response_model=HealthStatus)
async def basic_health_check() -> HealthStatus:
async def basic_health_check(request: Request) -> HealthStatus:
"""Basic health check endpoint.
This endpoint does not depend on anime_directory configuration
and should always return 200 OK for basic health monitoring.
Includes service information for identification.
Includes scheduler next/last run times for monitoring tools.
Includes startup health check results.
Returns:
HealthStatus: Simple health status with timestamp and service info.
@@ -195,18 +197,67 @@ async def basic_health_check() -> HealthStatus:
except Exception:
pass
# Get startup checks from app state
checks = getattr(request.app.state, "startup_checks", None)
# Determine overall status based on checks
overall_status = "healthy"
if checks:
for check_name, check_data in checks.items():
if check_data.get("status") == "error":
overall_status = "unhealthy"
break
elif check_data.get("status") == "warning":
overall_status = "degraded"
logger.debug("Basic health check requested")
return HealthStatus(
status="healthy",
status=overall_status,
timestamp=datetime.now().isoformat(),
service="aniworld-api",
series_app_initialized=_series_app is not None,
anime_directory_configured=bool(settings.anime_directory),
scheduler_next_run=scheduler_status.get("next_run"),
scheduler_last_run=scheduler_status.get("last_run"),
checks=checks,
)
@router.get("/ready")
async def ready_check(request: Request) -> Dict[str, Any]:
"""Readiness check endpoint for container orchestrators.
Returns 503 if critical dependencies are not available.
This endpoint is used by Kubernetes, Docker Swarm, etc. to determine
if the container should receive traffic.
Returns:
dict: Readiness status with checks details.
"""
checks = getattr(request.app.state, "startup_checks", {})
critical_failures = []
for check_name, check_data in checks.items():
if check_data.get("status") == "error":
critical_failures.append(f"{check_name}: {check_data.get('message')}")
if critical_failures:
return {
"status": "not_ready",
"ready": False,
"timestamp": datetime.now().isoformat(),
"critical_failures": critical_failures,
"checks": checks,
}
return {
"status": "ready",
"ready": True,
"timestamp": datetime.now().isoformat(),
"checks": checks,
}
@router.get("/detailed", response_model=DetailedHealthStatus)
async def detailed_health_check(
db: AsyncSession = Depends(get_database_session),

View File

@@ -104,6 +104,107 @@ async def _check_incomplete_series_on_startup(background_loader) -> None:
logger.exception("Failed to check incomplete series on startup")
async def _run_startup_health_checks(logger) -> dict:
"""Run startup health checks for critical dependencies.
Checks:
- ffmpeg availability
- DNS resolution for aniworld.to and api.themoviedb.org
- anime_directory configuration and writability
Args:
logger: Logger instance for recording check results.
Returns:
dict: Health check results with status and details for each check.
"""
import asyncio
import shutil
import socket
from typing import Dict, Any
checks: Dict[str, Any] = {
"ffmpeg": {"status": "unknown", "message": None},
"dns_aniworld": {"status": "unknown", "message": None},
"dns_tmdb": {"status": "unknown", "message": None},
"anime_directory": {"status": "unknown", "message": None, "path": None},
}
# Check ffmpeg availability
try:
ffmpeg_path = shutil.which("ffmpeg")
if ffmpeg_path:
checks["ffmpeg"]["status"] = "ok"
checks["ffmpeg"]["message"] = f"Found at {ffmpeg_path}"
logger.debug("ffmpeg health check passed: %s", ffmpeg_path)
else:
checks["ffmpeg"]["status"] = "warning"
checks["ffmpeg"]["message"] = "ffmpeg not found in PATH"
logger.warning("ffmpeg health check failed: not in PATH")
except Exception as e:
checks["ffmpeg"]["status"] = "error"
checks["ffmpeg"]["message"] = str(e)
logger.warning("Could not check ffmpeg: %s", e)
# Check DNS resolution for aniworld.to
try:
socket.gethostbyname("aniworld.to")
checks["dns_aniworld"]["status"] = "ok"
checks["dns_aniworld"]["message"] = "Resolved successfully"
logger.debug("DNS health check passed for aniworld.to")
except socket.gaierror as e:
checks["dns_aniworld"]["status"] = "warning"
checks["dns_aniworld"]["message"] = f"DNS resolution failed: {e}"
logger.warning("DNS health check failed for aniworld.to: %s", e)
except Exception as e:
checks["dns_aniworld"]["status"] = "warning"
checks["dns_aniworld"]["message"] = f"Unexpected error: {e}"
logger.warning("Unexpected DNS error for aniworld.to: %s", e)
# Check DNS resolution for api.themoviedb.org
try:
socket.gethostbyname("api.themoviedb.org")
checks["dns_tmdb"]["status"] = "ok"
checks["dns_tmdb"]["message"] = "Resolved successfully"
logger.debug("DNS health check passed for api.themoviedb.org")
except socket.gaierror as e:
checks["dns_tmdb"]["status"] = "warning"
checks["dns_tmdb"]["message"] = f"DNS resolution failed: {e}"
logger.warning("DNS health check failed for api.themoviedb.org: %s", e)
except Exception as e:
checks["dns_tmdb"]["status"] = "warning"
checks["dns_tmdb"]["message"] = f"Unexpected error: {e}"
logger.warning("Unexpected DNS error for api.themoviedb.org: %s", e)
# Check anime_directory configuration and writability
from src.config.settings import settings
anime_dir = settings.anime_directory
if not anime_dir:
checks["anime_directory"]["status"] = "error"
checks["anime_directory"]["message"] = "anime_directory not configured"
checks["anime_directory"]["path"] = None
logger.error("anime_directory health check failed: not configured")
else:
import os
checks["anime_directory"]["path"] = anime_dir
if not os.path.isdir(anime_dir):
checks["anime_directory"]["status"] = "error"
checks["anime_directory"]["message"] = f"Directory does not exist: {anime_dir}"
logger.error("anime_directory health check failed: %s does not exist", anime_dir)
elif not os.access(anime_dir, os.W_OK):
checks["anime_directory"]["status"] = "error"
checks["anime_directory"]["message"] = f"Directory not writable: {anime_dir}"
logger.error("anime_directory health check failed: %s not writable", anime_dir)
else:
checks["anime_directory"]["status"] = "ok"
checks["anime_directory"]["message"] = f"Directory exists and is writable: {anime_dir}"
logger.debug("anime_directory health check passed: %s", anime_dir)
return checks
@asynccontextmanager
async def lifespan(_application: FastAPI):
"""Manage application lifespan (startup and shutdown).
@@ -342,6 +443,14 @@ async def lifespan(_application: FastAPI):
logger.debug("ffmpeg found at: %s", _shutil.which("ffmpeg"))
except Exception as _exc:
logger.warning("Could not check for ffmpeg: %s", _exc)
# Run startup health checks and store results for /health endpoint
try:
startup_checks = await _run_startup_health_checks(logger)
app.state.startup_checks = startup_checks
except Exception as _exc:
logger.warning("Could not run startup health checks: %s", _exc)
app.state.startup_checks = {}
except Exception as e:
logger.error("Error during startup: %s", e, exc_info=True)
startup_error = e