Aniworld/src/core/providers/enhanced_provider.py
2025-10-05 22:22:04 +02:00

671 lines
27 KiB
Python

"""
Enhanced AniWorld Loader with Error Handling and Recovery
This module extends the original AniWorldLoader with comprehensive
error handling, retry mechanisms, and recovery strategies.
"""
import os
import re
import logging
import json
import requests
import html
from urllib.parse import quote
import time
import hashlib
from typing import Optional, Dict, Any, Callable
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from yt_dlp import YoutubeDL
import shutil
from .base_provider import Loader
from ..interfaces.providers import Providers
from error_handler import (
with_error_recovery,
recovery_strategies,
NetworkError,
DownloadError,
RetryableError,
NonRetryableError,
file_corruption_detector
)
class EnhancedAniWorldLoader(Loader):
"""Enhanced AniWorld loader with comprehensive error handling."""
def __init__(self):
    """Set up the HTTP session, site headers, caches, statistics and loggers."""
    super().__init__()
    self.logger = logging.getLogger(__name__)
    # Hoster names iterated (in this order) by _download_with_recovery.
    self.SUPPORTED_PROVIDERS = ["VOE", "Doodstream", "Vidmoly", "Vidoza", "SpeedFiles", "Streamtape", "Luluvdo"]
    # Static browser-like headers (Edge on Windows) sent with every aniworld.to request.
    self.AniworldHeaders = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br, zstd",
        "accept-language": "de,de-DE;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "cache-control": "max-age=0",
        "priority": "u=0, i",
        "sec-ch-ua": '"Chromium";v="136", "Microsoft Edge";v="136", "Not.A/Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0"
    }
    # Characters stripped from series titles when building filesystem paths.
    self.INVALID_PATH_CHARS = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '&']
    self.RANDOM_USER_AGENT = UserAgent().random
    # Dedicated mobile UA for Luluvdo — presumably it rejects desktop agents; confirm.
    self.LULUVDO_USER_AGENT = "Mozilla/5.0 (Android 15; Mobile; rv:132.0) Gecko/132.0 Firefox/132.0"
    # Extra per-provider header lines (yt-dlp style "Name: value" strings).
    self.PROVIDER_HEADERS = {
        "Vidmoly": ['Referer: "https://vidmoly.to"'],
        "Doodstream": ['Referer: "https://dood.li/"'],
        "VOE": [f'User-Agent: {self.RANDOM_USER_AGENT}'],
        "Luluvdo": [
            f'User-Agent: {self.LULUVDO_USER_AGENT}',
            'Accept-Language: de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
            'Origin: "https://luluvdo.com"',
            'Referer: "https://luluvdo.com/"'
        ]
    }
    self.ANIWORLD_TO = "https://aniworld.to"
    self.DEFAULT_REQUEST_TIMEOUT = 30
    # Initialize session with enhanced retry configuration
    self.session = self._create_robust_session()
    # Cache dictionaries: series-page responses and per-episode-page responses.
    self._KeyHTMLDict = {}
    self._EpisodeHTMLDict = {}
    # Provider manager
    self.Providers = Providers()
    # Download statistics
    self.download_stats = {
        'total_downloads': 0,
        'successful_downloads': 0,
        'failed_downloads': 0,
        'retried_downloads': 0
    }
    # Read timeout from environment variable (seconds; defaults to 600).
    self.download_timeout = int(os.getenv("DOWNLOAD_TIMEOUT", 600))
    # Setup logging
    self._setup_logging()
def _create_robust_session(self) -> requests.Session:
    """Build a requests.Session tuned for resilient scraping.

    Transient server errors and rate limits are retried with exponential
    backoff; status errors are not raised so callers can inspect them.
    The session carries the default aniworld.to browser headers.
    """
    robust_session = requests.Session()
    retry_policy = Retry(
        total=5,
        backoff_factor=2,  # More aggressive backoff
        status_forcelist=[408, 429, 500, 502, 503, 504, 520, 521, 522, 523, 524],
        allowed_methods=["GET", "POST", "HEAD"],
        raise_on_status=False  # Handle status errors manually
    )
    transport = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=10,
        pool_maxsize=20,
        pool_block=True
    )
    # Same retrying transport for both schemes.
    for scheme in ("https://", "http://"):
        robust_session.mount(scheme, transport)
    # Default browser-like headers on every request.
    robust_session.headers.update(self.AniworldHeaders)
    return robust_session
def _setup_logging(self):
    """Attach ERROR-level file loggers for download failures and missing keys.

    Fix: the original constructed a logging.FileHandler (which opens the
    log file immediately) on every call and only afterwards checked whether
    the logger already had handlers — leaking one open file descriptor per
    instantiation after the first. Handlers are now created only when the
    named logger has none.
    """
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    # Download error logger
    self.download_error_logger = self._make_error_file_logger(
        "DownloadErrors", "../../download_errors.log", formatter
    )
    # No key found logger
    self.nokey_logger = self._make_error_file_logger(
        "NoKeyFound", "../../NoKeyFound.log", formatter
    )

def _make_error_file_logger(self, name, path, formatter):
    """Return the named logger, adding a single ERROR FileHandler on *path* if absent."""
    log = logging.getLogger(name)
    if not log.handlers:
        handler = logging.FileHandler(path)
        handler.setLevel(logging.ERROR)
        handler.setFormatter(formatter)
        log.addHandler(handler)
    log.setLevel(logging.ERROR)
    return log
def ClearCache(self):
    """Drop every cached HTTP response (series pages and episode pages)."""
    for cache in (self._KeyHTMLDict, self._EpisodeHTMLDict):
        cache.clear()
    self.logger.debug("Cache cleared")
def RemoveFromCache(self):
    """Forget cached episode pages; cached series pages are kept."""
    self._EpisodeHTMLDict.clear()
    self.logger.debug("Episode cache cleared")
@with_error_recovery(max_retries=3, context="anime_search")
def Search(self, word: str) -> list:
    """Query aniworld.to's series search for *word*.

    Raises ValueError for a blank query and RetryableError when the
    request or response parsing fails.
    """
    if not word or not word.strip():
        raise ValueError("Search term cannot be empty")
    endpoint = f"{self.ANIWORLD_TO}/ajax/seriesSearch?keyword={quote(word)}"
    try:
        return self._fetch_anime_list_with_recovery(endpoint)
    except Exception as e:
        self.logger.error(f"Search failed for term '{word}': {e}")
        raise RetryableError(f"Search failed: {e}") from e
def _fetch_anime_list_with_recovery(self, url: str) -> list:
    """GET *url* through the recovery layer and return the parsed anime list.

    Status mapping: 404/403 -> NonRetryableError, 5xx and everything else
    non-OK -> RetryableError; transport failures -> NetworkError.
    """
    try:
        response = recovery_strategies.handle_network_failure(
            self.session.get,
            url,
            timeout=self.DEFAULT_REQUEST_TIMEOUT
        )
    except (requests.RequestException, ConnectionError) as e:
        raise NetworkError(f"Network error during anime search: {e}") from e
    if response.ok:
        return self._parse_anime_response(response.text)
    status = response.status_code
    if status == 404:
        raise NonRetryableError(f"URL not found: {url}")
    if status == 403:
        raise NonRetryableError(f"Access forbidden: {url}")
    if status >= 500:
        raise RetryableError(f"Server error {status}")
    raise RetryableError(f"HTTP error {status}")
def _parse_anime_response(self, response_text: str) -> list:
    """Decode the JSON search payload into a list.

    Tries, in order: HTML-unescaped text, BOM-stripped text, and text with
    control characters removed. Raises ValueError when nothing yields a list.
    """
    if not response_text or not response_text.strip():
        raise ValueError("Empty response from server")
    payload = response_text.strip()
    decoders = (
        lambda raw: json.loads(html.unescape(raw)),
        lambda raw: json.loads(raw.encode('utf-8').decode('utf-8-sig')),
        lambda raw: json.loads(re.sub(r'[\x00-\x1F\x7F-\x9F]', '', raw)),
    )
    for index, decode in enumerate(decoders, start=1):
        try:
            candidate = decode(payload)
        except json.JSONDecodeError as e:
            self.logger.debug(f"Parsing strategy {index} failed: {e}")
            continue
        if isinstance(candidate, list):
            self.logger.debug(f"Successfully parsed anime response with strategy {index}")
            return candidate
        self.logger.warning(f"Strategy {index} returned non-list data: {type(candidate)}")
    raise ValueError("Could not parse anime search response with any strategy")
def _GetLanguageKey(self, language: str) -> int:
"""Get numeric language code."""
language_map = {
"German Dub": 1,
"English Sub": 2,
"German Sub": 3
}
return language_map.get(language, 0)
@with_error_recovery(max_retries=2, context="language_check")
def IsLanguage(self, season: int, episode: int, key: str, language: str = "German Dub") -> bool:
    """Return True when the episode's page advertises *language*.

    Reads the numeric codes from the page's changeLanguageBox <img> tags.
    Raises RetryableError on any failure (including unknown labels).
    """
    try:
        wanted = self._GetLanguageKey(language)
        if wanted == 0:
            raise ValueError(f"Unknown language: {language}")
        page = self._GetEpisodeHTML(season, episode, key)
        soup = BeautifulSoup(page.content, 'html.parser')
        language_box = soup.find('div', class_='changeLanguageBox')
        if not language_box:
            self.logger.debug(f"No language box found for {key} S{season}E{episode}")
            return False
        offered = [
            int(img.get('data-lang-key'))
            for img in language_box.find_all('img')
            if img.get('data-lang-key') and img.get('data-lang-key').isdigit()
        ]
        found = wanted in offered
        self.logger.debug(f"Language check for {key} S{season}E{episode} - "
                          f"Requested: {wanted}, Available: {offered}, "
                          f"Result: {found}")
        return found
    except Exception as e:
        self.logger.error(f"Language check failed for {key} S{season}E{episode}: {e}")
        raise RetryableError(f"Language check failed: {e}") from e
def Download(self, baseDirectory: str, serieFolder: str, season: int, episode: int,
             key: str, language: str = "German Dub", progress_callback: Callable = None) -> bool:
    """Download one episode with error handling and recovery.

    Args:
        baseDirectory: root of the media library on disk.
        serieFolder: series directory name under baseDirectory.
        season: season number; 0 marks the episode as a movie.
        episode: episode number within the season.
        key: aniworld.to series slug.
        language: human-readable language label (see _GetLanguageKey).
        progress_callback: optional yt-dlp progress hook.

    Returns:
        True when a valid file is in place, False when all providers failed.

    Raises:
        ValueError: on missing/negative parameters (wrapped below).
        DownloadError: wraps any unexpected failure during the attempt.
    """
    self.download_stats['total_downloads'] += 1
    try:
        # Validate inputs
        if not all([baseDirectory, serieFolder, key]):
            raise ValueError("Missing required parameters for download")
        if season < 0 or episode < 0:
            raise ValueError("Season and episode must be non-negative")
        # Prepare file paths: strip characters invalid on common filesystems.
        sanitized_anime_title = ''.join(
            char for char in self.GetTitle(key) if char not in self.INVALID_PATH_CHARS
        )
        if not sanitized_anime_title:
            sanitized_anime_title = f"Unknown_{key}"
        # Generate output filename (season 0 uses the movie naming scheme).
        if season == 0:
            output_file = f"{sanitized_anime_title} - Movie {episode:02} - ({language}).mp4"
        else:
            output_file = f"{sanitized_anime_title} - S{season:02}E{episode:03} - ({language}).mp4"
        # Create directory structure
        folder_path = os.path.join(baseDirectory, serieFolder, f"Season {season}")
        output_path = os.path.join(folder_path, output_file)
        # Check if file already exists and is valid; remove corrupted leftovers
        # so they can be re-downloaded below.
        if os.path.exists(output_path):
            if file_corruption_detector.is_valid_video_file(output_path):
                self.logger.info(f"File already exists and is valid: {output_file}")
                self.download_stats['successful_downloads'] += 1
                return True
            else:
                self.logger.warning(f"Existing file appears corrupted, removing: {output_path}")
                try:
                    os.remove(output_path)
                except Exception as e:
                    self.logger.error(f"Failed to remove corrupted file: {e}")
        os.makedirs(folder_path, exist_ok=True)
        # Create temp directory (relative to the current working directory).
        temp_dir = "./Temp/"
        os.makedirs(temp_dir, exist_ok=True)
        temp_path = os.path.join(temp_dir, output_file)
        # Attempt download with recovery strategies (tries each provider in turn).
        success = self._download_with_recovery(
            season, episode, key, language, temp_path, output_path, progress_callback
        )
        if success:
            self.download_stats['successful_downloads'] += 1
            self.logger.info(f"Successfully downloaded: {output_file}")
        else:
            self.download_stats['failed_downloads'] += 1
            self.download_error_logger.error(
                f"Download failed for {key} S{season}E{episode} ({language})"
            )
        return success
    except Exception as e:
        self.download_stats['failed_downloads'] += 1
        self.download_error_logger.error(
            f"Download error for {key} S{season}E{episode}: {e}", exc_info=True
        )
        raise DownloadError(f"Download failed: {e}") from e
    finally:
        # Cached pages are attempt-scoped; always drop them afterwards.
        self.ClearCache()
def _download_with_recovery(self, season: int, episode: int, key: str, language: str,
                            temp_path: str, output_path: str, progress_callback: Callable) -> bool:
    """Attempt download with multiple providers and recovery strategies.

    Iterates self.SUPPORTED_PROVIDERS, downloading to *temp_path* and only
    publishing to *output_path* after the file passes corruption validation.
    Returns True on the first successful attempt, False when all fail.

    NOTE(review): provider_name is never forwarded to
    _get_direct_link_from_provider (which resolves via the VOE provider),
    so every loop iteration appears to retry the same source — confirm intent.
    """
    for provider_name in self.SUPPORTED_PROVIDERS:
        try:
            self.logger.info(f"Attempting download with provider: {provider_name}")
            # Get download link and headers for provider
            link, headers = recovery_strategies.handle_network_failure(
                self._get_direct_link_from_provider,
                season, episode, key, language
            )
            if not link:
                self.logger.warning(f"No download link found for provider: {provider_name}")
                continue
            # Configure yt-dlp options
            ydl_opts = {
                'fragment_retries': float('inf'),
                'outtmpl': temp_path,
                'quiet': True,
                'no_warnings': True,
                'progress_with_newline': False,
                'nocheckcertificate': True,
                'socket_timeout': self.download_timeout,
                'http_chunk_size': 1024 * 1024,  # 1MB chunks
            }
            if headers:
                ydl_opts['http_headers'] = headers
            if progress_callback:
                ydl_opts['progress_hooks'] = [progress_callback]
            # Perform download with recovery.
            # NOTE(review): temp_path is passed here although
            # _perform_ytdl_download only accepts (ydl_opts, link) —
            # presumably handle_download_failure consumes it; verify.
            success = recovery_strategies.handle_download_failure(
                self._perform_ytdl_download,
                temp_path,
                ydl_opts,
                link
            )
            if success and os.path.exists(temp_path):
                # Verify downloaded file before publishing it
                if file_corruption_detector.is_valid_video_file(temp_path):
                    # Move to final location (copy, then best-effort delete of the temp file)
                    shutil.copy2(temp_path, output_path)
                    # Clean up temp file
                    try:
                        os.remove(temp_path)
                    except Exception as e:
                        self.logger.warning(f"Failed to remove temp file: {e}")
                    return True
                else:
                    self.logger.warning(f"Downloaded file failed validation: {temp_path}")
                    try:
                        os.remove(temp_path)
                    except Exception:
                        pass
        except Exception as e:
            # Any provider failure falls through to the next provider.
            self.logger.warning(f"Provider {provider_name} failed: {e}")
            self.download_stats['retried_downloads'] += 1
            continue
    return False
def _perform_ytdl_download(self, ydl_opts: Dict[str, Any], link: str) -> bool:
    """Run yt-dlp on *link* with *ydl_opts*; True on success, DownloadError otherwise."""
    try:
        with YoutubeDL(ydl_opts) as downloader:
            downloader.download([link])
    except Exception as e:
        self.logger.error(f"yt-dlp download failed: {e}")
        raise DownloadError(f"Download failed: {e}") from e
    return True
@with_error_recovery(max_retries=2, context="get_title")
def GetTitle(self, key: str) -> str:
    """Extract the series title from the cached series page.

    Falls back to a placeholder when the expected markup is missing;
    wraps unexpected failures in RetryableError.
    """
    try:
        page = self._GetKeyHTML(key)
        soup = BeautifulSoup(page.content, 'html.parser')
        container = soup.find('div', class_='series-title')
        heading = container.find('h1') if container else None
        span = heading.find('span') if heading else None
        if span:
            return span.text.strip()
        self.logger.warning(f"Could not extract title for key: {key}")
        return f"Unknown_Title_{key}"
    except Exception as e:
        self.logger.error(f"Failed to get title for key {key}: {e}")
        raise RetryableError(f"Title extraction failed: {e}") from e
def GetSiteKey(self) -> str:
    """Identify the streaming site this loader handles."""
    return "aniworld.to"
@with_error_recovery(max_retries=2, context="get_key_html")
def _GetKeyHTML(self, key: str):
    """Fetch (and cache) the main series page for *key*.

    404 is logged to the NoKeyFound logger and raised as NonRetryableError;
    other HTTP failures become RetryableError. Only OK responses are cached.
    """
    cached = self._KeyHTMLDict.get(key)
    if cached is not None:
        return cached
    try:
        url = f"{self.ANIWORLD_TO}/anime/stream/{key}"
        response = recovery_strategies.handle_network_failure(
            self.session.get,
            url,
            timeout=self.DEFAULT_REQUEST_TIMEOUT
        )
        if not response.ok:
            if response.status_code == 404:
                self.nokey_logger.error(f"Anime key not found: {key}")
                raise NonRetryableError(f"Anime key not found: {key}")
            raise RetryableError(f"HTTP error {response.status_code} for key {key}")
        self._KeyHTMLDict[key] = response
        return response
    except Exception as e:
        self.logger.error(f"Failed to get HTML for key {key}: {e}")
        raise
@with_error_recovery(max_retries=2, context="get_episode_html")
def _GetEpisodeHTML(self, season: int, episode: int, key: str):
    """Fetch (and cache) the stream page for one episode.

    The cache is keyed by (key, season, episode); only OK responses are stored.
    """
    cache_key = (key, season, episode)
    cached = self._EpisodeHTMLDict.get(cache_key)
    if cached is not None:
        return cached
    try:
        url = f"{self.ANIWORLD_TO}/anime/stream/{key}/staffel-{season}/episode-{episode}"
        response = recovery_strategies.handle_network_failure(
            self.session.get,
            url,
            timeout=self.DEFAULT_REQUEST_TIMEOUT
        )
        if not response.ok:
            if response.status_code == 404:
                raise NonRetryableError(f"Episode not found: {key} S{season}E{episode}")
            raise RetryableError(f"HTTP error {response.status_code} for episode")
        self._EpisodeHTMLDict[cache_key] = response
        return response
    except Exception as e:
        self.logger.error(f"Failed to get episode HTML for {key} S{season}E{episode}: {e}")
        raise
def _get_provider_from_html(self, season: int, episode: int, key: str) -> dict:
    """Parse the episode page into {provider name: {language code: redirect URL}}.

    Entries missing a name, link, or numeric language key are skipped.
    """
    try:
        page = self._GetEpisodeHTML(season, episode, key)
        soup = BeautifulSoup(page.content, 'html.parser')
        found = {}
        entries = soup.find_all(
            'li', class_=lambda x: x and x.startswith('episodeLink')
        )
        if not entries:
            self.logger.warning(f"No episode links found for {key} S{season}E{episode}")
            return found
        for entry in entries:
            heading = entry.find('h4')
            name = heading.text.strip() if heading else None
            anchor = entry.find('a', class_='watchEpisode')
            href = anchor['href'] if anchor else None
            raw_lang = entry.get('data-lang-key')
            lang = int(raw_lang) if raw_lang and raw_lang.isdigit() else None
            if name and href and lang:
                found.setdefault(name, {})[lang] = f"{self.ANIWORLD_TO}{href}"
        self.logger.debug(f"Found {len(found)} providers for {key} S{season}E{episode}")
        return found
    except Exception as e:
        self.logger.error(f"Failed to parse providers from HTML: {e}")
        raise RetryableError(f"Provider parsing failed: {e}") from e
def _get_redirect_link(self, season: int, episode: int, key: str, language: str = "German Dub"):
    """Return (redirect URL, provider name) for the first provider offering *language*.

    Raises NonRetryableError when the language is unavailable or no
    provider carries it.
    """
    wanted = self._GetLanguageKey(language)
    if not self.IsLanguage(season, episode, key, language):
        raise NonRetryableError(f"Language {language} not available for {key} S{season}E{episode}")
    for name, languages in self._get_provider_from_html(season, episode, key).items():
        redirect = languages.get(wanted)
        if redirect is not None:
            return redirect, name
    raise NonRetryableError(f"No provider found for {language} in {key} S{season}E{episode}")
def _get_embeded_link(self, season: int, episode: int, key: str, language: str = "German Dub"):
    """Follow the provider redirect and return the final embedded player URL."""
    try:
        target, _provider = self._get_redirect_link(season, episode, key, language)
        response = recovery_strategies.handle_network_failure(
            self.session.get,
            target,
            timeout=self.DEFAULT_REQUEST_TIMEOUT,
            headers={'User-Agent': self.RANDOM_USER_AGENT}
        )
        # The resolved URL after redirects is the embed page.
        return response.url
    except Exception as e:
        self.logger.error(f"Failed to get embedded link: {e}")
        raise
def _get_direct_link_from_provider(self, season: int, episode: int, key: str,
                                   language: str = "German Dub",
                                   provider_name: str = "VOE"):
    """Resolve the embedded player page into a direct download link.

    Generalization: the extractor used was hard-coded to "VOE"; it is now
    the *provider_name* keyword parameter, defaulting to "VOE" so all
    existing callers keep their previous behaviour.

    Args:
        season, episode, key: episode coordinates on aniworld.to.
        language: human-readable language label.
        provider_name: name of the extractor registered in self.Providers.

    Returns:
        Whatever the provider's GetLink yields for the embedded page.

    Raises:
        NonRetryableError: when no embed link is found or the provider
            is not registered.
    """
    try:
        embedded_link = self._get_embeded_link(season, episode, key, language)
        if not embedded_link:
            raise NonRetryableError("No embedded link found")
        provider = self.Providers.GetProvider(provider_name)
        if not provider:
            raise NonRetryableError(f"{provider_name} provider not available")
        return provider.GetLink(embedded_link, self.DEFAULT_REQUEST_TIMEOUT)
    except Exception as e:
        self.logger.error(f"Failed to get direct link from provider: {e}")
        raise
@with_error_recovery(max_retries=2, context="get_season_episode_count")
def get_season_episode_count(self, slug: str) -> dict:
    """Count episodes per season for *slug*.

    Consistency fix: the original used bare requests.get here, bypassing
    the retrying session (and its browser headers) that every other
    request in this class goes through; it now uses self.session.get.

    Returns:
        {season number: episode count}, empty when no numberOfSeasons
        meta tag is found.

    Raises:
        RetryableError: wrapping any failure during fetching or parsing.
    """
    try:
        base_url = f"{self.ANIWORLD_TO}/anime/stream/{slug}/"
        response = recovery_strategies.handle_network_failure(
            self.session.get,
            base_url,
            timeout=self.DEFAULT_REQUEST_TIMEOUT
        )
        soup = BeautifulSoup(response.content, 'html.parser')
        season_meta = soup.find('meta', itemprop='numberOfSeasons')
        number_of_seasons = int(season_meta['content']) if season_meta else 0
        episode_counts = {}
        for season in range(1, number_of_seasons + 1):
            season_url = f"{base_url}staffel-{season}"
            season_response = recovery_strategies.handle_network_failure(
                self.session.get,
                season_url,
                timeout=self.DEFAULT_REQUEST_TIMEOUT
            )
            season_soup = BeautifulSoup(season_response.content, 'html.parser')
            # Deduplicate: each episode is linked multiple times on the page.
            unique_links = {
                link['href']
                for link in season_soup.find_all('a', href=True)
                if f"staffel-{season}/episode-" in link['href']
            }
            episode_counts[season] = len(unique_links)
        return episode_counts
    except Exception as e:
        self.logger.error(f"Failed to get episode counts for {slug}: {e}")
        raise RetryableError(f"Episode count retrieval failed: {e}") from e
def get_download_statistics(self) -> Dict[str, Any]:
    """Return a snapshot of the download counters plus a success_rate percentage."""
    snapshot = dict(self.download_stats)
    total = snapshot['total_downloads']
    snapshot['success_rate'] = (
        snapshot['successful_downloads'] / total * 100 if total > 0 else 0
    )
    return snapshot
def reset_statistics(self):
    """Zero every download counter."""
    self.download_stats = dict.fromkeys(
        ('total_downloads', 'successful_downloads',
         'failed_downloads', 'retried_downloads'),
        0,
    )
# For backward compatibility, create wrapper that uses enhanced loader
class AniworldLoader(EnhancedAniWorldLoader):
    """Backward compatibility wrapper for the enhanced loader.

    Kept so existing imports of ``AniworldLoader`` keep working; all
    behaviour lives in :class:`EnhancedAniWorldLoader`.
    """
    pass