# Aniworld/src/server/web/middleware/error_handler.py
"""
Error Handling & Recovery System for AniWorld App
This module provides comprehensive error handling for network failures,
download errors, and system recovery mechanisms.
"""
import logging
import time
import functools
import threading
from typing import Callable, Any, Dict, Optional, List
from datetime import datetime, timedelta
import requests
import socket
import ssl
from urllib3.exceptions import ConnectionError, TimeoutError, ReadTimeoutError
from requests.exceptions import RequestException, ConnectionError as ReqConnectionError
from flask import jsonify
import os
import hashlib
class NetworkError(Exception):
    """Raised for network-related failures (connectivity, timeouts, DNS)."""
class DownloadError(Exception):
    """Raised when a download fails or produces a corrupted file."""
class RetryableError(Exception):
    """Marker exception: operations failing with this type are safe to retry."""
class NonRetryableError(Exception):
    """Marker exception: operations failing with this type must not be retried."""
class ErrorRecoveryManager:
    """Manages error recovery strategies and retry mechanisms.

    Central store for retry policy, a rolling history of recent errors and a
    temporary URL blacklist, shared by the retry helpers in this module.
    """

    def __init__(self, max_retries: int = 3, base_delay: float = 1.0, max_delay: float = 60.0):
        """
        Args:
            max_retries: Default retry budget used by RetryMechanism.
            base_delay: First backoff delay in seconds.
            max_delay: Upper bound for any single backoff delay, in seconds.
        """
        self.max_retries = max_retries
        self.base_delay = base_delay
        self.max_delay = max_delay
        # Rolling log of recent errors (capped at the last 1000 entries).
        self.error_history: List[Dict] = []
        # URL -> expiry timestamp; entries are ignored/pruned once expired.
        self.blacklisted_urls: Dict[str, datetime] = {}
        self.retry_counts: Dict[str, int] = {}
        self.logger = logging.getLogger(__name__)

    def is_network_error(self, error: Exception) -> bool:
        """Check if error is network-related."""
        network_errors = (
            ConnectionError, TimeoutError, ReadTimeoutError,
            ReqConnectionError, socket.timeout, socket.gaierror,
            ssl.SSLError, requests.exceptions.Timeout,
            requests.exceptions.ConnectionError
        )
        return isinstance(error, network_errors)

    def is_retryable_error(self, error: Exception) -> bool:
        """Determine if an error should be retried.

        Explicit Non/RetryableError markers win; otherwise network errors and
        a small set of transient HTTP status codes are considered retryable.
        """
        if isinstance(error, NonRetryableError):
            return False
        if isinstance(error, RetryableError):
            return True
        # Network errors are generally retryable.
        if self.is_network_error(error):
            return True
        # HTTP status codes that are retryable.  Compare the attached response
        # against None explicitly: requests.Response.__bool__ is False for
        # 4xx/5xx responses, so a bare truthiness test (`and error.response`)
        # would skip exactly the error responses we need to inspect here.
        response = getattr(error, 'response', None)
        if response is not None:
            retryable_codes = {408, 429, 500, 502, 503, 504}
            return response.status_code in retryable_codes
        return False

    def calculate_delay(self, attempt: int) -> float:
        """Calculate exponential backoff delay for a 1-based attempt number.

        Doubles per attempt starting at base_delay, capped at max_delay.
        """
        delay = self.base_delay * (2 ** (attempt - 1))
        return min(delay, self.max_delay)

    def log_error(self, error: Exception, context: str, attempt: Optional[int] = None):
        """Record the error in history and log it with context information."""
        # Computed once: used both in the history entry and the log level.
        retryable = self.is_retryable_error(error)
        error_info = {
            'timestamp': datetime.now().isoformat(),
            'error_type': type(error).__name__,
            'error_message': str(error),
            'context': context,
            'attempt': attempt,
            'retryable': retryable
        }
        self.error_history.append(error_info)
        # Keep only last 1000 errors
        if len(self.error_history) > 1000:
            self.error_history = self.error_history[-1000:]
        log_level = logging.WARNING if retryable else logging.ERROR
        self.logger.log(log_level, f"Error in {context}: {error}", exc_info=True)

    def add_to_blacklist(self, url: str, duration_minutes: int = 30):
        """Add URL to temporary blacklist."""
        self.blacklisted_urls[url] = datetime.now() + timedelta(minutes=duration_minutes)

    def is_blacklisted(self, url: str) -> bool:
        """Check if URL is currently blacklisted (expired entries are pruned)."""
        expiry = self.blacklisted_urls.get(url)
        if expiry is not None:
            if datetime.now() < expiry:
                return True
            # Lazily drop the expired entry.
            del self.blacklisted_urls[url]
        return False

    def cleanup_blacklist(self):
        """Remove expired entries from blacklist."""
        now = datetime.now()
        expired_keys = [url for url, expiry in self.blacklisted_urls.items() if now >= expiry]
        for key in expired_keys:
            del self.blacklisted_urls[key]
class RetryMechanism:
    """Advanced retry mechanism with exponential backoff and jitter."""

    def __init__(self, recovery_manager: ErrorRecoveryManager):
        self.recovery_manager = recovery_manager
        self.logger = logging.getLogger(__name__)

    def retry_with_backoff(
        self,
        func: Callable,
        *args,
        max_retries: Optional[int] = None,
        backoff_factor: float = 1.0,
        jitter: bool = True,
        retry_on: Optional[tuple] = None,
        context: Optional[str] = None,
        **kwargs
    ) -> Any:
        """
        Retry function with exponential backoff and jitter.

        Args:
            func: Function to retry
            max_retries: Maximum number of retries (uses recovery manager default if None)
            backoff_factor: Multiplier for backoff delay
            jitter: Add random jitter to prevent thundering herd
            retry_on: Tuple of exception types to retry on (overrides the
                recovery manager's retryability classification when given)
            context: Context string for logging

        Returns:
            Function result

        Raises:
            Last exception if all retries fail
        """
        # Hoisted out of the retry loop (was re-imported on every attempt).
        import random

        if max_retries is None:
            max_retries = self.recovery_manager.max_retries
        if context is None:
            context = func.__name__
        last_exception = None
        # max_retries retries plus one initial attempt.
        for attempt in range(1, max_retries + 2):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                last_exception = e
                # Explicit retry_on filter wins; otherwise defer to the manager.
                if retry_on is not None:
                    should_retry = isinstance(e, retry_on)
                else:
                    should_retry = self.recovery_manager.is_retryable_error(e)
                if attempt > max_retries or not should_retry:
                    self.recovery_manager.log_error(e, context, attempt)
                    # Bare raise preserves the original traceback.
                    raise
                # Calculate delay with optional jitter (50-100% of nominal,
                # to avoid thundering-herd retries).
                delay = self.recovery_manager.calculate_delay(attempt) * backoff_factor
                if jitter:
                    delay *= 0.5 + random.random() * 0.5
                self.recovery_manager.log_error(e, context, attempt)
                self.logger.info(
                    "Retrying %s in %.2fs (attempt %d/%d)",
                    context, delay, attempt, max_retries
                )
                time.sleep(delay)
        # Defensive: the loop always returns or raises, but keep a final raise
        # so a logic change can never fall through silently.
        raise last_exception
class NetworkHealthChecker:
    """Monitor network connectivity and health."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # "host:port" -> (probe timestamp, reachable) pairs.
        self.connectivity_cache = {}
        self.cache_timeout = 60  # seconds a cached probe result stays valid

    def check_connectivity(self, host: str = "8.8.8.8", port: int = 53, timeout: float = 3.0) -> bool:
        """Check basic network connectivity by opening a TCP connection.

        Results are cached per host:port for ``cache_timeout`` seconds.
        """
        cache_key = f"{host}:{port}"
        now = time.time()
        # Serve a recent cached probe if available.
        cached = self.connectivity_cache.get(cache_key)
        if cached is not None:
            timestamp, result = cached
            if now - timestamp < self.cache_timeout:
                return result
        try:
            # create_connection applies the timeout to this socket only and
            # the context manager closes it.  (The previous implementation
            # mutated the process-wide default via socket.setdefaulttimeout
            # and never closed the probe socket.)
            with socket.create_connection((host, port), timeout=timeout):
                result = True
        except OSError:
            # Covers refusals, timeouts and DNS failures (gaierror).
            result = False
        self.connectivity_cache[cache_key] = (now, result)
        return result

    def check_url_reachability(self, url: str, timeout: float = 10.0) -> bool:
        """Check if a specific URL is reachable (HEAD request, status < 400)."""
        try:
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            return response.status_code < 400
        except Exception as e:
            self.logger.debug(f"URL {url} not reachable: {e}")
            return False

    def get_network_status(self) -> Dict[str, Any]:
        """Get comprehensive network status."""
        return {
            'basic_connectivity': self.check_connectivity(),
            # NOTE(review): this probes TCP to 1.1.1.1:53, not an actual DNS
            # lookup -- key name kept for backward compatibility.
            'dns_resolution': self.check_connectivity("1.1.1.1", 53),
            'timestamp': datetime.now().isoformat()
        }
class FileCorruptionDetector:
    """Detect and handle file corruption."""

    # Magic numbers expected at byte 0 of the file.
    _HEADER_SIGNATURES = (
        b'\x1a\x45\xdf\xa3',  # MKV (Matroska / EBML)
        b'RIFF',              # AVI
    )

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def calculate_checksum(self, file_path: str, algorithm: str = 'md5') -> str:
        """Calculate file checksum.

        Args:
            file_path: Path of the file to hash.
            algorithm: Any algorithm name accepted by hashlib.new().

        Raises:
            OSError: If the file cannot be read.
        """
        # hashlib.new validates the algorithm name (getattr would accept any
        # hashlib attribute and fail obscurely).
        hash_func = hashlib.new(algorithm)
        try:
            with open(file_path, 'rb') as f:
                # Stream in chunks so large files don't load into memory.
                for chunk in iter(lambda: f.read(65536), b""):
                    hash_func.update(chunk)
            return hash_func.hexdigest()
        except OSError as e:
            self.logger.error(f"Failed to calculate checksum for {file_path}: {e}")
            raise

    def verify_file_size(self, file_path: str, expected_size: Optional[int] = None, min_size: int = 1024) -> bool:
        """Verify file has reasonable size.

        Returns False when the file is below min_size or deviates more than
        10% from expected_size (when provided), or cannot be stat'ed.
        """
        try:
            actual_size = os.path.getsize(file_path)
            # Check minimum size
            if actual_size < min_size:
                self.logger.warning(f"File {file_path} too small: {actual_size} bytes")
                return False
            # Check expected size if provided (10% tolerance).
            if expected_size and abs(actual_size - expected_size) > expected_size * 0.1:
                self.logger.warning(f"File {file_path} size mismatch: expected {expected_size}, got {actual_size}")
                return False
            return True
        except OSError as e:
            self.logger.error(f"Failed to verify file size for {file_path}: {e}")
            return False

    def is_valid_video_file(self, file_path: str) -> bool:
        """Basic validation for video files.

        Checks existence, size, extension (warn-only) and known container
        headers.  Unknown headers are accepted: the signature list is not
        exhaustive, so only unreadable/undersized files are rejected.
        """
        if not os.path.exists(file_path):
            return False
        # Check file size
        if not self.verify_file_size(file_path):
            return False
        # Check file extension (warn but do not reject).
        video_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm'}
        ext = os.path.splitext(file_path)[1].lower()
        if ext not in video_extensions:
            self.logger.warning(f"File {file_path} has unexpected extension: {ext}")
        # Try to read first few bytes to check for valid headers
        try:
            with open(file_path, 'rb') as f:
                header = f.read(32)
        except OSError as e:
            self.logger.error(f"Failed to read file header for {file_path}: {e}")
            return False
        # ISO BMFF (MP4/MOV): the 'ftyp' box type sits at offset 4; the
        # preceding 4-byte box size varies, so a fixed full-prefix signature
        # (the previous b'\x00\x00\x00\x18ftypmp4') misses most real files.
        if header[4:8] == b'ftyp':
            return True
        if header.startswith(self._HEADER_SIGNATURES):
            return True
        # If no specific signature matches, assume it's valid if size is reasonable
        return True
class RecoveryStrategies:
    """Implement various recovery strategies for different error types."""

    def __init__(self, recovery_manager: ErrorRecoveryManager):
        self.recovery_manager = recovery_manager
        self.retry_mechanism = RetryMechanism(recovery_manager)
        self.health_checker = NetworkHealthChecker()
        self.corruption_detector = FileCorruptionDetector()
        self.logger = logging.getLogger(__name__)

    def handle_network_failure(self, func: Callable, *args, **kwargs) -> Any:
        """Run ``func`` with a connectivity pre-check and network-error retries."""

        def guarded_call():
            # Fail fast when there is no connectivity at all.
            if not self.health_checker.check_connectivity():
                raise NetworkError("No internet connectivity")
            return func(*args, **kwargs)

        return self.retry_mechanism.retry_with_backoff(
            guarded_call,
            max_retries=5,
            backoff_factor=1.5,
            context=f"network_operation_{func.__name__}",
            retry_on=(NetworkError, ConnectionError, TimeoutError),
        )

    def handle_download_failure(
        self,
        download_func: Callable,
        file_path: str,
        *args,
        **kwargs
    ) -> Any:
        """Run ``download_func`` with retries; corrupted results are deleted
        and the download re-attempted."""

        def verified_download():
            outcome = download_func(*args, **kwargs)
            # Nothing to verify if the download produced no file.
            if not os.path.exists(file_path):
                return outcome
            if self.corruption_detector.is_valid_video_file(file_path):
                return outcome
            self.logger.warning(f"Downloaded file appears corrupted: {file_path}")
            # Remove the corrupted file so the retry starts from scratch.
            try:
                os.remove(file_path)
            except Exception as e:
                self.logger.error(f"Failed to remove corrupted file {file_path}: {e}")
            raise DownloadError("Downloaded file is corrupted")

        return self.retry_mechanism.retry_with_backoff(
            verified_download,
            max_retries=3,
            backoff_factor=2.0,
            context=f"download_{os.path.basename(file_path)}",
            retry_on=(DownloadError, NetworkError, ConnectionError),
        )
# Singleton instances
# Module-level shared instances: the decorators below and importing modules
# all reuse one error history / blacklist / retry configuration.
error_recovery_manager = ErrorRecoveryManager()
recovery_strategies = RecoveryStrategies(error_recovery_manager)
network_health_checker = NetworkHealthChecker()
file_corruption_detector = FileCorruptionDetector()
def with_error_recovery(max_retries: Optional[int] = None, context: Optional[str] = None):
    """Decorator for adding error recovery to functions.

    Args:
        max_retries: Maximum retries; None falls back to the shared
            recovery manager's default.
        context: Logging context; defaults to the wrapped function's name.

    Returns:
        A decorator that routes calls through the shared retry mechanism.
    """
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Positional and keyword arguments are forwarded untouched to
            # the wrapped function by the retry mechanism.
            return recovery_strategies.retry_mechanism.retry_with_backoff(
                func,
                *args,
                max_retries=max_retries,
                context=context or func.__name__,
                **kwargs
            )
        return wrapper
    return decorator
def handle_api_errors(func: Callable) -> Callable:
    """Decorator for consistent API error handling.

    Maps module exceptions to uniform JSON error responses:
    NonRetryableError -> 400, RetryableError -> 503, anything else -> 500.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        log_context = f"api_{func.__name__}"
        try:
            return func(*args, **kwargs)
        except NonRetryableError as exc:
            error_recovery_manager.log_error(exc, log_context)
            payload = {
                'status': 'error',
                'message': 'Operation failed',
                'error_type': 'non_retryable',
                'retry_suggested': False
            }
            return jsonify(payload), 400
        except RetryableError as exc:
            error_recovery_manager.log_error(exc, log_context)
            payload = {
                'status': 'error',
                'message': 'Temporary failure, please try again',
                'error_type': 'retryable',
                'retry_suggested': True
            }
            return jsonify(payload), 503
        except Exception as exc:
            error_recovery_manager.log_error(exc, log_context)
            payload = {
                'status': 'error',
                'message': 'An unexpected error occurred',
                'error_type': 'unknown',
                'retry_suggested': error_recovery_manager.is_retryable_error(exc)
            }
            return jsonify(payload), 500
    return wrapper
# Export main components
# Explicit public API of this module (governs `from ... import *`).
__all__ = [
    'ErrorRecoveryManager',
    'RetryMechanism',
    'NetworkHealthChecker',
    'FileCorruptionDetector',
    'RecoveryStrategies',
    'NetworkError',
    'DownloadError',
    'RetryableError',
    'NonRetryableError',
    'with_error_recovery',
    'handle_api_errors',
    'error_recovery_manager',
    'recovery_strategies',
    'network_health_checker',
    'file_corruption_detector'
]