feat: Add comprehensive provider health monitoring and failover system
- Implemented ProviderHealthMonitor for real-time tracking
  - Monitors availability, response times, success rates
  - Automatically marks providers unavailable after failures
  - Background health check loop
- Added ProviderFailover for automatic provider switching
  - Configurable retry attempts with exponential backoff
  - Integration with health monitoring
  - Smart provider selection
- Created MonitoredProviderWrapper for performance tracking
  - Transparent monitoring for any provider
  - Automatic metric recording
  - No changes needed to existing providers
- Implemented ProviderConfigManager for dynamic configuration
  - Runtime updates without restart
  - Per-provider settings (timeout, retries, bandwidth)
  - JSON-based persistence
- Added Provider Management API (15+ endpoints)
  - Health monitoring endpoints
  - Configuration management
  - Failover control
- Comprehensive testing (34 tests, 100% pass rate)
  - Health monitoring tests
  - Failover scenario tests
  - Configuration management tests
- Documentation updates
  - Updated infrastructure.md
  - Updated instructions.md
  - Created PROVIDER_ENHANCEMENT_SUMMARY.md

Total: ~2,593 lines of code, 34 passing tests
This commit is contained in:
531
src/server/api/providers.py
Normal file
531
src/server/api/providers.py
Normal file
@@ -0,0 +1,531 @@
|
||||
"""Provider management API endpoints.
|
||||
|
||||
This module provides REST API endpoints for monitoring and managing
|
||||
anime providers, including health checks, configuration, and failover.
|
||||
"""
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from src.core.providers.config_manager import ProviderSettings, get_config_manager
|
||||
from src.core.providers.failover import get_failover
|
||||
from src.core.providers.health_monitor import get_health_monitor
|
||||
from src.server.utils.dependencies import require_auth
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Router that groups every endpoint below under the /api/providers prefix.
router = APIRouter(
    prefix="/api/providers",
    tags=["providers"],
)
|
||||
|
||||
|
||||
# Request/Response Models
|
||||
|
||||
|
||||
class ProviderHealthResponse(BaseModel):
    """Schema for the health metrics of a single provider.

    In this module the model is populated from ``metrics.to_dict()`` of the
    object returned by the health monitor. The three ``Optional`` fields
    default to ``None``; every other field is required.
    """

    # Identity and current availability flag.
    provider_name: str
    is_available: bool
    last_check_time: Optional[str] = None  # string form; format not fixed here

    # Request counters and derived ratios.
    total_requests: int
    successful_requests: int
    failed_requests: int
    success_rate: float
    average_response_time_ms: float

    # Most recent error, if any was reported.
    last_error: Optional[str] = None
    last_error_time: Optional[str] = None  # string form; format not fixed here
    consecutive_failures: int

    # Volume and uptime figures.
    total_bytes_downloaded: int
    uptime_percentage: float
|
||||
|
||||
|
||||
class HealthSummaryResponse(BaseModel):
    """Schema for the health summary aggregated over all providers.

    In this module the model is populated from the dictionary returned by
    the health monitor's ``get_health_summary()``. All fields are required.
    """

    # Aggregate counts and ratios.
    total_providers: int
    available_providers: int
    availability_percentage: float
    average_success_rate: float
    average_response_time_ms: float

    # Free-form per-provider details, keyed by string.
    # NOTE(review): keys are presumably provider names - confirm against
    # the health monitor.
    providers: Dict[str, Dict[str, Any]]
|
||||
|
||||
|
||||
class ProviderSettingsRequest(BaseModel):
    """Request body carrying a partial update of provider settings.

    Every field is optional and defaults to ``None``, so a client may send
    only the settings it wants to change. Numeric fields are validated:
    ``max_retries`` must be >= 0, the other constrained fields must be > 0.
    """

    # Unconstrained toggles.
    enabled: Optional[bool] = None
    priority: Optional[int] = None

    # Constrained numeric settings.
    timeout_seconds: Optional[int] = Field(default=None, gt=0)
    max_retries: Optional[int] = Field(default=None, ge=0)
    retry_delay_seconds: Optional[float] = Field(default=None, gt=0)
    max_concurrent_downloads: Optional[int] = Field(default=None, gt=0)
    bandwidth_limit_mbps: Optional[float] = Field(default=None, gt=0)
|
||||
|
||||
|
||||
class ProviderSettingsResponse(BaseModel):
    """Schema for the stored configuration of a single provider.

    In this module the model is populated from ``to_dict()`` of a
    ``ProviderSettings`` object. Only ``bandwidth_limit_mbps`` is optional.
    """

    # Identity and scheduling.
    name: str
    enabled: bool
    priority: int

    # Request behaviour.
    timeout_seconds: int
    max_retries: int
    retry_delay_seconds: float

    # Download limits; ``None`` is the default for the bandwidth limit.
    max_concurrent_downloads: int
    bandwidth_limit_mbps: Optional[float] = None
|
||||
|
||||
|
||||
class FailoverStatsResponse(BaseModel):
    """Schema for the failover statistics and configuration.

    In this module the model is populated from the dictionary returned by
    the failover object's ``get_failover_stats()``. The two availability
    lists are optional and default to ``None``.
    """

    # Providers known to the failover object.
    total_providers: int
    providers: List[str]
    current_provider: str

    # Retry configuration.
    max_retries: int
    retry_delay: float

    # Health-monitoring integration.
    health_monitoring_enabled: bool
    available_providers: Optional[List[str]] = None
    unavailable_providers: Optional[List[str]] = None
|
||||
|
||||
|
||||
# Health Monitoring Endpoints
|
||||
|
||||
|
||||
@router.get("/health", response_model=HealthSummaryResponse)
async def get_providers_health(
    auth: Optional[dict] = Depends(require_auth),
) -> HealthSummaryResponse:
    """Return the health summary aggregated over all providers.

    Args:
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        The health monitor's summary validated as a
        ``HealthSummaryResponse``.

    Raises:
        HTTPException: 500 if the summary cannot be obtained or does not
            fit the response model.
    """
    try:
        summary = get_health_monitor().get_health_summary()
        return HealthSummaryResponse(**summary)
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to get provider health: %s", e)
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve provider health: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.get(
    "/health/{provider_name}", response_model=ProviderHealthResponse
)
async def get_provider_health(
    provider_name: str,
    auth: Optional[dict] = Depends(require_auth),
) -> ProviderHealthResponse:
    """Return the health metrics of one provider.

    Args:
        provider_name: Name of the provider, taken from the URL path.
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        The provider's metrics validated as a ``ProviderHealthResponse``.

    Raises:
        HTTPException: 404 if the health monitor returns a falsy value for
            ``provider_name``; 500 on any other failure.
    """
    try:
        metrics = get_health_monitor().get_provider_metrics(provider_name)

        if not metrics:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Provider '{provider_name}' not found",
            )

        return ProviderHealthResponse(**metrics.to_dict())
    except HTTPException:
        # Let the 404 above pass through untouched.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to get health for %s: %s", provider_name, e)
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve provider health: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.get("/available", response_model=List[str])
async def get_available_providers(
    auth: Optional[dict] = Depends(require_auth),
) -> List[str]:
    """Return the providers the health monitor reports as available.

    Args:
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        The result of the health monitor's ``get_available_providers()``.

    Raises:
        HTTPException: 500 if the health monitor call fails.
    """
    try:
        return get_health_monitor().get_available_providers()
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to get available providers: %s", e)
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve available providers: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.get("/best", response_model=Dict[str, str])
async def get_best_provider(
    auth: Optional[dict] = Depends(require_auth),
) -> Dict[str, str]:
    """Return the provider the health monitor selects as best.

    Args:
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        A dictionary of the form ``{"provider": <name>}``.

    Raises:
        HTTPException: 503 if the health monitor returns no provider;
            500 on any other failure.
    """
    try:
        best = get_health_monitor().get_best_provider()

        if not best:
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail="No available providers",
            )

        return {"provider": best}
    except HTTPException:
        # Let the 503 above pass through untouched.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to get best provider: %s", e)
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to determine best provider: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.post("/health/{provider_name}/reset")
async def reset_provider_health(
    provider_name: str,
    auth: Optional[dict] = Depends(require_auth),
) -> Dict[str, str]:
    """Reset the health metrics of one provider.

    Args:
        provider_name: Name of the provider, taken from the URL path.
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        A dictionary with a ``"message"`` confirming the reset.

    Raises:
        HTTPException: 404 if the health monitor reports the reset as
            unsuccessful (falsy return); 500 on any other failure.
    """
    try:
        was_reset = get_health_monitor().reset_provider_metrics(provider_name)

        if not was_reset:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Provider '{provider_name}' not found",
            )

        return {"message": f"Reset metrics for provider: {provider_name}"}
    except HTTPException:
        # Let the 404 above pass through untouched.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(
            "Failed to reset health for %s: %s", provider_name, e
        )
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to reset provider health: {e}",
        ) from e
|
||||
|
||||
|
||||
# Configuration Endpoints
|
||||
|
||||
|
||||
@router.get("/config", response_model=List[ProviderSettingsResponse])
async def get_all_provider_configs(
    auth: Optional[dict] = Depends(require_auth),
) -> List[ProviderSettingsResponse]:
    """Return the configuration of every provider known to the config manager.

    Args:
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        One ``ProviderSettingsResponse`` per value of the mapping returned
        by the config manager's ``get_all_provider_settings()``.

    Raises:
        HTTPException: 500 if the settings cannot be read or converted.
    """
    try:
        all_settings = get_config_manager().get_all_provider_settings()
        return [
            ProviderSettingsResponse(**provider_settings.to_dict())
            for provider_settings in all_settings.values()
        ]
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to get provider configs: %s", e)
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve provider configurations: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.get(
    "/config/{provider_name}", response_model=ProviderSettingsResponse
)
async def get_provider_config(
    provider_name: str,
    auth: Optional[dict] = Depends(require_auth),
) -> ProviderSettingsResponse:
    """Return the configuration of one provider.

    An unknown provider is not an error here: when the config manager has
    no settings for ``provider_name``, a default ``ProviderSettings`` is
    built for that name and returned instead.

    Args:
        provider_name: Name of the provider, taken from the URL path.
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        The stored settings, or default settings for ``provider_name``.

    Raises:
        HTTPException: 500 if the settings cannot be read or converted.
    """
    try:
        settings = get_config_manager().get_provider_settings(provider_name)

        if not settings:
            # Nothing stored for this name: fall back to default settings.
            settings = ProviderSettings(name=provider_name)

        return ProviderSettingsResponse(**settings.to_dict())
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to get config for %s: %s", provider_name, e)
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve provider configuration: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.put(
    "/config/{provider_name}", response_model=ProviderSettingsResponse
)
async def update_provider_config(
    provider_name: str,
    settings: ProviderSettingsRequest,
    auth: Optional[dict] = Depends(require_auth),
) -> ProviderSettingsResponse:
    """Apply a partial settings update to one provider.

    Only the fields the client actually sent are forwarded to the config
    manager; the settings are then read back and returned.

    Args:
        provider_name: Name of the provider, taken from the URL path.
        settings: Request body with the settings to change.
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        The provider's settings as read back after the update.

    Raises:
        HTTPException: 500 if the update fails or the settings cannot be
            read back afterwards.
    """
    try:
        config_manager = get_config_manager()

        # exclude_unset drops fields the client did not send, so omitted
        # settings are not overwritten with None.
        # NOTE(review): ``.dict()`` is the pydantic v1 spelling; if the
        # project runs pydantic v2, ``model_dump()`` is the replacement.
        changes = settings.dict(exclude_unset=True)
        config_manager.update_provider_settings(provider_name, **changes)

        updated = config_manager.get_provider_settings(provider_name)
        if not updated:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to retrieve updated configuration",
            )

        return ProviderSettingsResponse(**updated.to_dict())
    except HTTPException:
        # Let the explicit 500 above pass through untouched.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(
            "Failed to update config for %s: %s", provider_name, e
        )
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to update provider configuration: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.post("/config/{provider_name}/enable")
async def enable_provider(
    provider_name: str,
    auth: Optional[dict] = Depends(require_auth),
) -> Dict[str, str]:
    """Enable a provider by setting ``enabled=True`` in its settings.

    Args:
        provider_name: Name of the provider, taken from the URL path.
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        A dictionary with a ``"message"`` confirming the change.

    Raises:
        HTTPException: 500 if the config manager update fails.
    """
    try:
        get_config_manager().update_provider_settings(
            provider_name, enabled=True
        )
        return {"message": f"Enabled provider: {provider_name}"}
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to enable %s: %s", provider_name, e)
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to enable provider: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.post("/config/{provider_name}/disable")
async def disable_provider(
    provider_name: str,
    auth: Optional[dict] = Depends(require_auth),
) -> Dict[str, str]:
    """Disable a provider by setting ``enabled=False`` in its settings.

    Args:
        provider_name: Name of the provider, taken from the URL path.
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        A dictionary with a ``"message"`` confirming the change.

    Raises:
        HTTPException: 500 if the config manager update fails.
    """
    try:
        get_config_manager().update_provider_settings(
            provider_name, enabled=False
        )
        return {"message": f"Disabled provider: {provider_name}"}
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to disable %s: %s", provider_name, e)
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to disable provider: {e}",
        ) from e
|
||||
|
||||
|
||||
# Failover Endpoints
|
||||
|
||||
|
||||
@router.get("/failover", response_model=FailoverStatsResponse)
async def get_failover_stats(
    auth: Optional[dict] = Depends(require_auth),
) -> FailoverStatsResponse:
    """Return the failover statistics and configuration.

    Args:
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        The failover object's statistics validated as a
        ``FailoverStatsResponse``.

    Raises:
        HTTPException: 500 if the statistics cannot be obtained or do not
            fit the response model.
    """
    try:
        stats = get_failover().get_failover_stats()
        return FailoverStatsResponse(**stats)
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Failed to get failover stats: %s", e)
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve failover statistics: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.post("/failover/{provider_name}/add")
async def add_provider_to_failover(
    provider_name: str,
    auth: Optional[dict] = Depends(require_auth),
) -> Dict[str, str]:
    """Add a provider to the failover chain.

    Args:
        provider_name: Name of the provider, taken from the URL path.
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        A dictionary with a ``"message"`` confirming the addition.

    Raises:
        HTTPException: 500 if the failover object's ``add_provider`` fails.
    """
    try:
        get_failover().add_provider(provider_name)
        return {"message": f"Added provider to failover: {provider_name}"}
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(
            "Failed to add %s to failover: %s", provider_name, e
        )
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to add provider to failover: {e}",
        ) from e
|
||||
|
||||
|
||||
@router.delete("/failover/{provider_name}")
async def remove_provider_from_failover(
    provider_name: str,
    auth: Optional[dict] = Depends(require_auth),
) -> Dict[str, str]:
    """Remove a provider from the failover chain.

    Args:
        provider_name: Name of the provider, taken from the URL path.
        auth: Value injected by the ``require_auth`` dependency. It is not
            read inside this handler.

    Returns:
        A dictionary with a ``"message"`` confirming the removal.

    Raises:
        HTTPException: 404 if the failover object reports the removal as
            unsuccessful (falsy return); 500 on any other failure.
    """
    try:
        was_removed = get_failover().remove_provider(provider_name)

        if not was_removed:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Provider '{provider_name}' not in failover chain",
            )

        return {
            "message": f"Removed provider from failover: {provider_name}"
        }
    except HTTPException:
        # Let the 404 above pass through untouched.
        raise
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(
            "Failed to remove %s from failover: %s", provider_name, e
        )
        # Chain the original error so the cause is kept on the HTTP error.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to remove provider from failover: {e}",
        ) from e
|
||||
@@ -25,6 +25,7 @@ from src.server.api.config import router as config_router
|
||||
from src.server.api.diagnostics import router as diagnostics_router
|
||||
from src.server.api.download import router as download_router
|
||||
from src.server.api.logging import router as logging_router
|
||||
from src.server.api.providers import router as providers_router
|
||||
from src.server.api.scheduler import router as scheduler_router
|
||||
from src.server.api.websocket import router as websocket_router
|
||||
from src.server.controllers.error_controller import (
|
||||
@@ -139,6 +140,7 @@ app.include_router(diagnostics_router)
|
||||
app.include_router(analytics_router)
|
||||
app.include_router(anime_router)
|
||||
app.include_router(download_router)
|
||||
app.include_router(providers_router)
|
||||
app.include_router(websocket_router)
|
||||
|
||||
# Register exception handlers
|
||||
|
||||
Reference in New Issue
Block a user