From a1152154166e71bf2f458cf044693c2489857b71 Mon Sep 17 00:00:00 2001 From: Lukas Date: Mon, 25 May 2026 14:32:10 +0200 Subject: [PATCH] fix(providers): rotate, probe and fall back on 404 Iterate providers actually advertised on the episode page (ordered by SUPPORTED_PROVIDERS preference) instead of always re-resolving VOE. Each candidate is HEAD-probed before yt-dlp runs, so dead links are skipped immediately; direct video URLs use a streaming fast path that bypasses yt-dlp; total failure now logs the exhausted provider list. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/DEVELOPMENT.md | 61 +++++ src/core/providers/aniworld_provider.py | 343 ++++++++++++++++++++++-- tests/unit/test_aniworld_provider.py | 249 +++++++++++++++++ 3 files changed, 629 insertions(+), 24 deletions(-) diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 92463f0..bc9aa5a 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -308,3 +308,64 @@ If `/health` returns `unhealthy` status: - Check network connectivity - DNS failures are transient — warnings don't block startup - Retry later to verify: `GET /health` + +### Provider Failure Handling + +Download providers (VOE, Doodstream, Vidmoly, Vidoza, SpeedFiles, Streamtape, +Luluvdo) regularly break: URLs expire, sites change their player markup, geo +blocks appear, and `yt-dlp` extractors lag behind upstream changes. The +`AniworldLoader.download()` flow is designed to fail fast and rotate. + +**Rotation order** + +1. The episode page is scraped for the providers AniWorld actually advertises. +2. Results are ordered by the preference in `DEFAULT_PROVIDERS` + (`provider_config.py`); providers not listed run last. +3. For each candidate the loader: + 1. Calls `_check_url_alive()` — HEAD probe with GET fallback. Any 4xx + response or connection error skips the provider immediately. + 2. Resolves the redirect via `_resolve_direct_link()` to obtain a direct + stream URL plus headers. 
Provider-specific extractors (e.g. `VOE`) are + preferred; unknown providers fall back to the embed URL so `yt-dlp` can + attempt extraction. + 3. Tries `_try_direct_stream()` — straight `requests.get(stream=True)` when + `Content-Type` is `video/*` or `application/octet-stream`. This avoids + `yt-dlp` entirely for direct MP4 links. + 4. Falls back to `yt-dlp` with the ffmpeg downloader for HLS streams. +4. On any failure, temp files are cleaned and the loop moves to the next + provider. When the chain is exhausted, the loader logs + `All download providers failed for S{season}E{episode} ... Tried: ...` to the + application log and `All providers failed for ...; tried=[...]` to `logs/download_errors.log`. + +**Do not hardcode provider URLs.** Provider domains shift constantly (e.g. +Doodstream alternates between `dood.li`, `dood.so`, `dood.la`). Only the +referer hints in `PROVIDER_HEADERS` are persisted — discovery still happens +at runtime through AniWorld's redirect endpoint. + +### Updating yt-dlp + +When extractors break (typical symptoms: every provider HEAD probe succeeds +but `yt-dlp` raises `Unable to extract` or `HTTP Error 404`): + +1. Check the upstream tracker first: https://github.com/yt-dlp/yt-dlp/issues +2. Upgrade in the conda environment: + ```bash + conda run -n AniWorld pip install --upgrade yt-dlp + ``` +3. Smoke-test against a known-good episode before pinning a new floor in + `requirements.txt` (`yt-dlp>=YYYY.MM.DD`). +4. Re-run the provider test suite: + ```bash + conda run -n AniWorld python -m pytest tests/unit/test_aniworld_provider.py -v + ``` +5. If a specific extractor is removed upstream, drop the provider from + `DEFAULT_PROVIDERS` rather than patching `yt-dlp` in tree. + +### User Notification on Total Failure + +`SeriesApp.download_episode()` already emits a `download_status="failed"` +WebSocket event when `loader.download()` returns `False`. Operators should +forward this to `notification_service.notify_download_failed()` so users see +a HIGH-priority alert. 
The loader keeps the failure detail in +`logs/download_errors.log` for post-mortem. + diff --git a/src/core/providers/aniworld_provider.py b/src/core/providers/aniworld_provider.py index 1cc959d..e12d12b 100644 --- a/src/core/providers/aniworld_provider.py +++ b/src/core/providers/aniworld_provider.py @@ -249,6 +249,118 @@ class AniworldLoader(Loader): logger.debug("Available languages for S%02dE%03d: %s, requested: %s, available: %s", season, episode, languages, language_code, is_available) return is_available + def _check_url_alive( + self, + url: str, + headers: dict | None = None, + timeout: int = 10, + ) -> bool: + """Probe a provider URL with HEAD before committing to yt-dlp. + + Skips dead providers quickly so the failover loop never blocks + waiting for yt-dlp to fail on a 404. Falls back to a streaming + GET when HEAD is not allowed by the upstream server. + + Args: + url: URL to probe. + headers: Optional headers to forward with the probe. + timeout: Per-request timeout (seconds). + + Returns: + True when the URL responds with a non-4xx status, else False. + """ + try: + response = self.session.head( + url, + headers=headers, + timeout=timeout, + allow_redirects=True, + ) + if response.status_code == 405: + response = self.session.get( + url, + headers=headers, + timeout=timeout, + stream=True, + allow_redirects=True, + ) + response.close() + if 400 <= response.status_code < 500: + logger.warning( + "Provider URL returned HTTP %s: %s", + response.status_code, url + ) + return False + return True + except requests.RequestException as exc: + logger.warning("Provider URL unreachable %s: %s", url, exc) + return False + + def _try_direct_stream( + self, + link: str, + output_path: str, + headers: dict | None, + timeout: int, + ) -> bool: + """Stream a direct video URL to disk without yt-dlp. + + Used as a fast-path when the resolved provider link already points + at a downloadable video file (``Content-Type: video/*`` or + ``application/octet-stream``). 
HLS and other non-video payloads + are rejected so the caller can fall back to yt-dlp. + + Args: + link: Direct download URL. + output_path: Destination file path. + headers: Optional HTTP headers. + timeout: Per-request timeout (seconds). + + Returns: + True on a successful save, False when the link is not a + direct video or the download fails. + """ + try: + with self.session.get( + link, + headers=headers, + timeout=timeout, + stream=True, + ) as response: + if not response.ok: + logger.debug( + "Direct stream HEAD returned %s for %s", + response.status_code, link[:80] + ) + return False + content_type = response.headers.get("Content-Type", "") + if not ( + content_type.startswith("video/") + or content_type == "application/octet-stream" + ): + logger.debug( + "Direct stream skipped, Content-Type=%s", + content_type + ) + return False + logger.info( + "Direct stream download starting (type=%s)", + content_type + ) + with open(output_path, "wb") as fh: + for chunk in response.iter_content(chunk_size=1024 * 1024): + if self._cancel_flag.is_set(): + logger.info( + "Cancellation detected during direct stream" + ) + return False + if chunk: + fh.write(chunk) + return True + except requests.RequestException as exc: + logger.warning("Direct stream download failed: %s", exc) + return False + def download( self, base_directory: str, @@ -259,7 +371,12 @@ class AniworldLoader(Loader): language: str = "German Dub" ) -> bool: """Download episode to specified directory. - + + Iterates the providers actually advertised on the episode page + (ordered by SUPPORTED_PROVIDERS preference), probing each URL + before attempting an extraction so dead providers are skipped + immediately instead of stalling yt-dlp on a 404. 
+ Args: base_directory: Base download directory path serie_folder: Filesystem folder name (metadata only, used for @@ -308,12 +425,78 @@ class AniworldLoader(Loader): temp_path = os.path.join(temp_dir, output_file) logger.debug("Temporary path: %s", temp_path) - for provider in self.SUPPORTED_PROVIDERS: - logger.debug("Attempting download with provider: %s", provider) - link, header = self._get_direct_link_from_provider( + candidate_providers = self._select_providers_for_episode( + season, episode, key, language + ) + if not candidate_providers: + logger.error( + "No providers advertised for S%02dE%03d (%s) in %s", season, episode, key, language ) - logger.debug("Direct link obtained from provider") + self.clear_cache() + return False + + tried: list[str] = [] + for provider_name, redirect_url in candidate_providers: + tried.append(provider_name) + logger.debug("Attempting download with provider: %s", provider_name) + + probe_headers = {"User-Agent": self.RANDOM_USER_AGENT} + if not self._check_url_alive( + redirect_url, + headers=probe_headers, + timeout=self.DEFAULT_REQUEST_TIMEOUT, + ): + logger.info( + "Skipping provider %s, redirect URL not reachable", + provider_name + ) + continue + + try: + resolved = self._resolve_direct_link( + redirect_url, provider_name + ) + except Exception as exc: + logger.warning( + "Provider %s link resolution failed: %s: %s", + provider_name, type(exc).__name__, exc + ) + continue + + if resolved is None: + logger.info( + "Provider %s returned no direct link", provider_name + ) + continue + + link, header = resolved + + if self._cancel_flag.is_set(): + logger.info("Cancellation requested before download start") + _cleanup_temp_file(temp_path) + self.clear_cache() + return False + + if self._try_direct_stream( + link, + temp_path, + header, + self.DEFAULT_REQUEST_TIMEOUT, + ) and os.path.exists(temp_path): + logger.debug( + "Direct stream succeeded with provider %s", provider_name + ) + shutil.copyfile(temp_path, output_path) + 
os.remove(temp_path) + logger.info( + "Download completed successfully (direct): %s", + output_file + ) + self.clear_cache() + return True + + _cleanup_temp_file(temp_path) cancel_flag = self._cancel_flag @@ -321,7 +504,6 @@ class AniworldLoader(Loader): if cancel_flag.is_set(): logger.info("Cancellation detected in progress hook") raise DownloadCancelled("Download cancelled by user") - # Fire the event for progress self.events.download_progress(d) ydl_opts = { @@ -333,7 +515,6 @@ class AniworldLoader(Loader): 'nocheckcertificate': True, 'logger': logger, 'progress_hooks': [events_progress_hook], - # Use ffmpeg for HLS streams and transport stream format 'downloader': 'ffmpeg', 'hls_use_mpegts': True, } @@ -343,9 +524,11 @@ class AniworldLoader(Loader): logger.debug("Using custom headers for download") try: - logger.info("Starting download: %s", output_file) + logger.info( + "Starting yt-dlp download with %s: %s", + provider_name, output_file + ) logger.debug("Download link: %s...", link[:100]) - logger.debug("YDL options: %s", ydl_opts) with YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(link, download=True) @@ -356,39 +539,151 @@ class AniworldLoader(Loader): if os.path.exists(temp_path): logger.debug("Moving file from temp to final destination") - # Use copyfile instead of copy to avoid metadata permission issues shutil.copyfile(temp_path, output_path) os.remove(temp_path) - logger.info("Download completed successfully: %s", output_file) + logger.info( + "Download completed successfully: %s", output_file + ) self.clear_cache() return True - else: - logger.error("Download failed: temp file not found at %s", temp_path) - self.clear_cache() - return False - except BrokenPipeError as e: logger.error( - "Broken pipe error with provider %s: %s. 
" - "This usually means the stream connection was closed.", - provider, e + "Download failed: temp file not found at %s", temp_path + ) + except DownloadCancelled: + logger.info("Download cancelled by user") + _cleanup_temp_file(temp_path) + self.clear_cache() + return False + except BrokenPipeError as exc: + logger.error( + "Broken pipe error with provider %s: %s", + provider_name, exc ) _cleanup_temp_file(temp_path) continue - except Exception as e: + except Exception as exc: logger.error( "YoutubeDL download failed with provider %s: %s: %s", - provider, type(e).__name__, e + provider_name, type(exc).__name__, exc ) _cleanup_temp_file(temp_path) continue - break - # If we get here, all providers failed - logger.error("All download providers failed") + logger.error( + "All download providers failed for S%02dE%03d (%s) in %s. " + "Tried: %s. Episode may be unavailable on the source site.", + season, episode, key, language, ", ".join(tried) or "none" + ) + download_error_logger.error( + "All providers failed for %s S%02dE%03d (%s); tried=%s", + key, season, episode, language, tried + ) _cleanup_temp_file(temp_path) self.clear_cache() return False + def _select_providers_for_episode( + self, + season: int, + episode: int, + key: str, + language: str, + ) -> list[tuple[str, str]]: + """Return ``[(provider_name, redirect_url), ...]`` for an episode. + + Filters by requested language and orders results by + ``SUPPORTED_PROVIDERS`` preference so the failover chain matches + operator expectations. Returns an empty list when nothing is + advertised on the page. 
+ """ + if not self.is_language(season, episode, key, language): + logger.warning( + "Language %s not advertised for S%02dE%03d (%s)", + language, season, episode, key + ) + return [] + language_code = self._get_language_key(language) + providers = self._get_provider_from_html(season, episode, key) + ordered: list[tuple[str, str]] = [] + preferred = list(self.SUPPORTED_PROVIDERS) + for name in preferred: + lang_map = providers.get(name) + if lang_map and language_code in lang_map: + ordered.append((name, lang_map[language_code])) + for name, lang_map in providers.items(): + if name in preferred: + continue + if language_code in lang_map: + ordered.append((name, lang_map[language_code])) + return ordered + + def _resolve_direct_link( + self, + redirect_url: str, + provider_name: str, + ) -> tuple[str, dict] | None: + """Resolve a provider redirect URL into a direct stream link. + + Follows the redirect to the embedded player, then delegates to a + provider-specific extractor (when registered) or returns the + embed URL itself so yt-dlp can attempt extraction. + + Args: + redirect_url: AniWorld redirect URL. + provider_name: Provider key (e.g. ``"VOE"``). + + Returns: + ``(direct_link, headers)`` tuple or None when extraction fails. 
+ """ + try: + embedded = self.session.get( + redirect_url, + timeout=self.DEFAULT_REQUEST_TIMEOUT, + headers={"User-Agent": self.RANDOM_USER_AGENT}, + allow_redirects=True, + ).url + except requests.RequestException as exc: + logger.warning( + "Failed resolving redirect for %s: %s", provider_name, exc + ) + return None + + try: + extractor = self.Providers.GetProvider(provider_name) + except (KeyError, AttributeError): + extractor = None + + if extractor is not None: + try: + return extractor.get_link( + embedded, self.DEFAULT_REQUEST_TIMEOUT + ) + except Exception as exc: + logger.warning( + "Custom extractor %s failed: %s", + provider_name, exc + ) + return None + + header_list = self.PROVIDER_HEADERS.get(provider_name) + header_dict = self._parse_provider_headers(header_list) + return embedded, header_dict + + @staticmethod + def _parse_provider_headers( + header_list: list | None, + ) -> dict[str, str]: + """Convert legacy ``"Name: value"`` header strings to a dict.""" + if not header_list: + return {} + parsed: dict[str, str] = {} + for entry in header_list: + if not isinstance(entry, str) or ":" not in entry: + continue + name, _, value = entry.partition(":") + parsed[name.strip()] = value.strip().strip('"') + return parsed + def get_site_key(self) -> str: """Get the site key for this provider.""" return "aniworld.to" diff --git a/tests/unit/test_aniworld_provider.py b/tests/unit/test_aniworld_provider.py index 65b3269..a60eb04 100644 --- a/tests/unit/test_aniworld_provider.py +++ b/tests/unit/test_aniworld_provider.py @@ -1,9 +1,11 @@ """Unit tests for aniworld_provider.py - Anime catalog scraping, episode listing, streaming link extraction.""" import json +import os from unittest.mock import MagicMock, Mock, patch import pytest +import requests from src.core.providers.aniworld_provider import AniworldLoader @@ -472,3 +474,250 @@ class TestAniworldEvents: # Fire event - handler should NOT be called loader.events.download_progress({"status": "downloading"}) 
handler.assert_not_called() + + +class TestAniworldHealthCheck: + """Tests for the _check_url_alive HEAD probe.""" + + def test_returns_true_on_200(self, loader): + loader.session.head.return_value = MagicMock(status_code=200) + assert loader._check_url_alive("https://provider/x") is True + + def test_returns_false_on_404(self, loader): + loader.session.head.return_value = MagicMock(status_code=404) + assert loader._check_url_alive("https://provider/x") is False + + def test_returns_false_on_403(self, loader): + loader.session.head.return_value = MagicMock(status_code=403) + assert loader._check_url_alive("https://provider/x") is False + + def test_falls_back_to_get_when_head_disallowed(self, loader): + loader.session.head.return_value = MagicMock(status_code=405) + get_resp = MagicMock(status_code=200) + get_resp.close = MagicMock() + loader.session.get.return_value = get_resp + assert loader._check_url_alive("https://provider/x") is True + loader.session.get.assert_called_once() + + def test_returns_false_on_connection_error(self, loader): + loader.session.head.side_effect = requests.ConnectionError("boom") + assert loader._check_url_alive("https://provider/x") is False + + +class TestAniworldDirectStream: + """Tests for the _try_direct_stream fast-path.""" + + def _build_response(self, status, content_type, body=b""): + resp = MagicMock() + resp.ok = status < 400 + resp.status_code = status + resp.headers = {"Content-Type": content_type} + resp.iter_content = MagicMock(return_value=[body]) + resp.__enter__ = MagicMock(return_value=resp) + resp.__exit__ = MagicMock(return_value=False) + return resp + + def test_skips_non_video_content(self, loader, tmp_path): + target = tmp_path / "out.mp4" + loader.session.get.return_value = self._build_response( + 200, "text/html" + ) + assert loader._try_direct_stream( + "https://x", str(target), None, 10 + ) is False + assert not target.exists() + + def test_writes_video_content(self, loader, tmp_path): + target = tmp_path / 
"out.mp4" + loader.session.get.return_value = self._build_response( + 200, "video/mp4", body=b"abc123" + ) + assert loader._try_direct_stream( + "https://x", str(target), None, 10 + ) is True + assert target.read_bytes() == b"abc123" + + def test_returns_false_on_http_error(self, loader, tmp_path): + target = tmp_path / "out.mp4" + loader.session.get.return_value = self._build_response( + 404, "video/mp4" + ) + assert loader._try_direct_stream( + "https://x", str(target), None, 10 + ) is False + + def test_returns_false_on_request_exception(self, loader, tmp_path): + loader.session.get.side_effect = requests.RequestException("nope") + assert loader._try_direct_stream( + "https://x", str(tmp_path / "out.mp4"), None, 10 + ) is False + + +class TestAniworldProviderSelection: + """Tests for _select_providers_for_episode ordering and filtering.""" + + def test_orders_by_supported_preference(self, loader): + loader.is_language = MagicMock(return_value=True) + loader._get_provider_from_html = MagicMock(return_value={ + "Vidoza": {1: "https://aniworld.to/redirect/2"}, + "VOE": {1: "https://aniworld.to/redirect/1"}, + }) + result = loader._select_providers_for_episode(1, 1, "k", "German Dub") + assert [name for name, _ in result] == ["VOE", "Vidoza"] + + def test_filters_by_language(self, loader): + loader.is_language = MagicMock(return_value=True) + loader._get_provider_from_html = MagicMock(return_value={ + "VOE": {2: "https://aniworld.to/redirect/1"}, # English only + }) + result = loader._select_providers_for_episode(1, 1, "k", "German Dub") + assert result == [] + + def test_returns_empty_when_language_unavailable(self, loader): + loader.is_language = MagicMock(return_value=False) + loader._get_provider_from_html = MagicMock() + result = loader._select_providers_for_episode(1, 1, "k", "German Dub") + assert result == [] + loader._get_provider_from_html.assert_not_called() + + +class TestAniworldDownloadFailover: + """Tests for the failover rotation in download().""" + 
+ @pytest.fixture + def patched_loader(self, loader, tmp_path): + """Loader with side-effect heavy methods stubbed.""" + loader.get_title = MagicMock(return_value="Anime") + loader._select_providers_for_episode = MagicMock(return_value=[ + ("VOE", "https://aniworld.to/redirect/1"), + ("Doodstream", "https://aniworld.to/redirect/2"), + ]) + loader._check_url_alive = MagicMock(return_value=True) + loader._try_direct_stream = MagicMock(return_value=False) + loader.clear_cache = MagicMock() + loader._resolve_direct_link = MagicMock( + return_value=("https://cdn/video.m3u8", {"Referer": "https://x"}) + ) + return loader + + def test_skips_provider_when_url_dead(self, patched_loader, tmp_path): + # First provider URL fails health check, second succeeds and downloads + patched_loader._check_url_alive.side_effect = [False, True] + + def fake_ytdl(opts): + outpath = opts["outtmpl"] + os.makedirs(os.path.dirname(outpath), exist_ok=True) + with open(outpath, "wb") as fh: + fh.write(b"data") + ydl = MagicMock() + ydl.__enter__ = MagicMock(return_value=ydl) + ydl.__exit__ = MagicMock(return_value=False) + ydl.extract_info = MagicMock(return_value={"title": "t"}) + return ydl + + with patch( + "src.core.providers.aniworld_provider.YoutubeDL", + side_effect=fake_ytdl, + ): + result = patched_loader.download( + str(tmp_path), "Anime", 1, 1, "k", "German Dub" + ) + assert result is True + assert patched_loader._check_url_alive.call_count == 2 + # Only second provider (Doodstream) attempted resolve + patched_loader._resolve_direct_link.assert_called_once_with( + "https://aniworld.to/redirect/2", "Doodstream" + ) + + def test_falls_back_to_next_provider_on_ytdl_error( + self, patched_loader, tmp_path + ): + calls = {"n": 0} + + def fake_ytdl(opts): + calls["n"] += 1 + if calls["n"] == 1: + raise Exception("HTTP 404 from VOE") + outpath = opts["outtmpl"] + os.makedirs(os.path.dirname(outpath), exist_ok=True) + with open(outpath, "wb") as fh: + fh.write(b"ok") + ydl = MagicMock() + 
ydl.__enter__ = MagicMock(return_value=ydl) + ydl.__exit__ = MagicMock(return_value=False) + ydl.extract_info = MagicMock(return_value={"title": "t"}) + return ydl + + with patch( + "src.core.providers.aniworld_provider.YoutubeDL", + side_effect=fake_ytdl, + ): + result = patched_loader.download( + str(tmp_path), "Anime", 1, 1, "k", "German Dub" + ) + assert result is True + assert calls["n"] == 2 + + def test_uses_direct_stream_when_available( + self, patched_loader, tmp_path + ): + def write_direct(link, output, headers, timeout): + os.makedirs(os.path.dirname(output), exist_ok=True) + with open(output, "wb") as fh: + fh.write(b"vid") + return True + + patched_loader._try_direct_stream.side_effect = write_direct + + with patch( + "src.core.providers.aniworld_provider.YoutubeDL" + ) as mock_ydl: + result = patched_loader.download( + str(tmp_path), "Anime", 1, 1, "k", "German Dub" + ) + assert result is True + mock_ydl.assert_not_called() + + def test_returns_false_when_all_providers_fail( + self, patched_loader, tmp_path, caplog + ): + with patch( + "src.core.providers.aniworld_provider.YoutubeDL", + side_effect=Exception("HTTP 404"), + ): + result = patched_loader.download( + str(tmp_path), "Anime", 1, 1, "k", "German Dub" + ) + assert result is False + assert "All download providers failed" in caplog.text + # Both providers attempted + assert patched_loader._resolve_direct_link.call_count == 2 + + def test_returns_false_when_no_providers_advertised( + self, patched_loader, tmp_path, caplog + ): + patched_loader._select_providers_for_episode.return_value = [] + result = patched_loader.download( + str(tmp_path), "Anime", 1, 1, "k", "German Dub" + ) + assert result is False + assert "No providers advertised" in caplog.text + + +class TestAniworldHeaderParsing: + """_parse_provider_headers normalizes legacy strings to dict.""" + + def test_parses_referer(self): + result = AniworldLoader._parse_provider_headers( + ['Referer: "https://vidmoly.to"'] + ) + assert result 
== {"Referer": "https://vidmoly.to"} + + def test_handles_none(self): + assert AniworldLoader._parse_provider_headers(None) == {} + + def test_skips_malformed_entries(self): + result = AniworldLoader._parse_provider_headers( + ["not-a-header", "Key: value"] + ) + assert result == {"Key": "value"}