From ca93bb740a18e03c416e5c99ce66eb8c6dc79440 Mon Sep 17 00:00:00 2001
From: Lukas
Date: Mon, 25 May 2026 16:30:36 +0200
Subject: [PATCH] feat(providers): detect HTML encoding before parsing

Add chardet-based _decode_html_content() to aniworld_provider. Apply to
all BeautifulSoup parsing calls to prevent decoding warnings on pages
with mismatched encoding declarations. Falls back to utf-8 with
errors='replace' when confidence < 0.7.

Also fix test_enhanced_provider HLS test signature and add HLS pattern
unit tests.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 docs/CHANGELOG.md                       |  6 +++
 docs/DEVELOPMENT.md                     | 29 ++++++++++++++
 requirements.txt                        |  1 +
 src/core/providers/aniworld_provider.py | 44 ++++++++++++++++++---
 tests/unit/test_aniworld_provider.py    | 34 ++++++++++++++++
 tests/unit/test_enhanced_provider.py    | 52 ++++++++++++++++++++++++-
 6 files changed, 159 insertions(+), 7 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 8ae8ac5..bfd7259 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -41,6 +41,12 @@ This changelog follows [Keep a Changelog](https://keepachangelog.com/) principle
 
 ### Added
 
+- **Encoding detection for HTML parsing** (`src/core/providers/aniworld_provider.py`):
+  Added `_decode_html_content()` function that uses `chardet` to detect the actual
+  encoding of HTML content before parsing. Falls back to UTF-8 with `errors='replace'`
+  to handle pages with mismatched encoding declarations. Applied to all BeautifulSoup
+  parsing calls to prevent "Some characters could not be decoded" warnings.
+- **chardet dependency**: Added `chardet>=5.2.0` to `requirements.txt` for encoding detection.
 - **Temp file cleanup after every download** (`src/core/providers/aniworld_provider.py`,
   `src/core/providers/enhanced_provider.py`):
   Module-level helper `_cleanup_temp_file()` removes the working temp file and any yt-dlp `.part`
diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md
index bc9aa5a..5561396 100644
--- a/docs/DEVELOPMENT.md
+++ b/docs/DEVELOPMENT.md
@@ -342,6 +342,35 @@ Doodstream alternates between `dood.li`, `dood.so`, `dood.la`). Only the
 referer hints in `PROVIDER_HEADERS` are persisted — discovery still happens at
 runtime through AniWorld's redirect endpoint.
 
+### HLS Stream Handling
+
+HLS (HTTP Live Streaming) manifests (`.m3u8`) require yt-dlp to use the
+`ffmpeg` downloader with `--hls-use-mpegts`. Both providers configure this
+automatically:
+
+```python
+ydl_opts = {
+    "downloader": "ffmpeg",  # Use ffmpeg instead of native
+    "hls_use_mpegts": True,  # Write transport stream (.ts) segments
+}
+```
+
+**Why this matters**: Without ffmpeg, yt-dlp logs:
+`"Live HLS streams are not supported by the native downloader"`
+
+**Requirements**:
+- ffmpeg must be installed and in PATH (`which ffmpeg`)
+- Install: `apt install ffmpeg` (Debian/Ubuntu) or `brew install ffmpeg` (macOS)
+- Startup health check (see Health Check Endpoints) verifies ffmpeg presence
+
+**Trade-offs**:
+- HLS downloads are slower than direct MP4 (reassembly of .ts segments)
+- Requires more disk space during download
+- May need post-processing if .ts format is not desired
+
+**Detection**: VOE provider extracts HLS URLs via `HLS_PATTERN` regex. Other
+providers let yt-dlp auto-detect from URL/content-type.
+
 ### Updating yt-dlp
 
 When extractors break (typical symptoms: every provider HEAD probe succeeds
diff --git a/requirements.txt b/requirements.txt
index 6d7cc24..00ec19b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,6 +22,7 @@ APScheduler>=3.10.4
 Events>=0.5
 requests>=2.31.0
 beautifulsoup4>=4.12.0
+chardet>=5.2.0
 fake-useragent>=1.4.0
 yt-dlp>=2024.1.0
 urllib3>=2.0.0
\ No newline at end of file
diff --git a/src/core/providers/aniworld_provider.py b/src/core/providers/aniworld_provider.py
index e12d12b..779854c 100644
--- a/src/core/providers/aniworld_provider.py
+++ b/src/core/providers/aniworld_provider.py
@@ -9,6 +9,7 @@ import threading
 from pathlib import Path
 from urllib.parse import quote
 
+import chardet
 import requests
 from bs4 import BeautifulSoup
 from events import Events
@@ -80,6 +81,37 @@ if not download_error_logger.handlers:
 noKeyFound_logger = logging.getLogger()
 
 
+def _decode_html_content(content: bytes) -> str:
+    """Decode raw HTML bytes, detecting the encoding when needed.
+
+    UTF-8 (BOM-aware) is tried strictly first; otherwise chardet's guess is
+    used when its confidence is >= 0.7, else UTF-8 with errors='replace'.
+
+    Args:
+        content: Raw HTML bytes from the response
+
+    Returns:
+        Decoded string content
+    """
+    try:
+        return content.decode('utf-8-sig')
+    except UnicodeDecodeError:
+        detected = chardet.detect(content)
+    encoding = detected.get('encoding') or 'utf-8'  # None when undetectable
+    confidence = detected.get('confidence') or 0.0
+    if confidence < 0.7:
+        logger.debug(
+            "Low encoding confidence (%.2f) for detected encoding '%s', using utf-8",
+            confidence, encoding,
+        )
+        encoding = 'utf-8'
+    try:
+        return content.decode(encoding, errors='replace')
+    except (LookupError, UnicodeError) as exc:
+        logger.warning("Failed to decode content with %s: %s, using utf-8 replace", encoding, exc)
+        return content.decode('utf-8', errors='replace')
+
+
 class AniworldLoader(Loader):
     def __init__(self) -> None:
         self.SUPPORTED_PROVIDERS = DEFAULT_PROVIDERS
@@ -231,7 +263,7 @@ class AniworldLoader(Loader):
         language_code = self._get_language_key(language)
 
         episode_soup = BeautifulSoup(
-            self._get_episode_html(season, episode, key).content,
+            _decode_html_content(self._get_episode_html(season, episode, key).content),
             'html.parser'
         )
         change_language_box_div = episode_soup.find(
@@ -692,7 +724,7 @@ class AniworldLoader(Loader):
         """Get anime title from series key."""
         logger.debug("Getting title for key: %s", key)
         soup = BeautifulSoup(
-            self._get_key_html(key).content,
+            _decode_html_content(self._get_key_html(key).content),
             'html.parser'
         )
         title_div = soup.find('div', class_='series-title')
@@ -723,7 +755,7 @@ class AniworldLoader(Loader):
         logger.debug("Getting year for key: %s", key)
         try:
             soup = BeautifulSoup(
-                self._get_key_html(key).content,
+                _decode_html_content(self._get_key_html(key).content),
                 'html.parser'
             )
 
@@ -837,7 +869,7 @@ class AniworldLoader(Loader):
         """
         logger.debug("Extracting providers from HTML for S%02dE%03d (%s)", season, episode, key)
         soup = BeautifulSoup(
-            self._get_episode_html(season, episode, key).content,
+            _decode_html_content(self._get_episode_html(season, episode, key).content),
             'html.parser'
         )
         providers: dict[str, dict[int, str]] = {}
@@ -960,7 +992,7 @@ class
AniworldLoader(Loader):
         base_url = f"{self.ANIWORLD_TO}/anime/stream/{safe_slug}/"
         logger.debug("Base URL: %s", base_url)
         response = requests.get(base_url, timeout=self.DEFAULT_REQUEST_TIMEOUT)
-        soup = BeautifulSoup(response.content, 'html.parser')
+        soup = BeautifulSoup(_decode_html_content(response.content), 'html.parser')
 
         season_meta = soup.find('meta', itemprop='numberOfSeasons')
         number_of_seasons = int(season_meta['content']) if season_meta else 0
@@ -975,7 +1007,7 @@ class AniworldLoader(Loader):
                 season_url,
                 timeout=self.DEFAULT_REQUEST_TIMEOUT,
             )
-            soup = BeautifulSoup(response.content, 'html.parser')
+            soup = BeautifulSoup(_decode_html_content(response.content), 'html.parser')
 
             episode_links = soup.find_all('a', href=True)
             unique_links = set(
diff --git a/tests/unit/test_aniworld_provider.py b/tests/unit/test_aniworld_provider.py
index a60eb04..cdf1747 100644
--- a/tests/unit/test_aniworld_provider.py
+++ b/tests/unit/test_aniworld_provider.py
@@ -721,3 +721,37 @@ class TestAniworldHeaderParsing:
             ["not-a-header", "Key: value"]
         )
         assert result == {"Key": "value"}
+
+
+class TestDecodeHtmlContent:
+    """Test _decode_html_content function."""
+
+    def test_decodes_utf8_content(self):
+        """Should correctly decode UTF-8 content."""
+        from src.core.providers.aniworld_provider import _decode_html_content
+        html = '<html><body><p>Titel mit Ümläüten</p></body></html>'
+        content = html.encode('utf-8')
+        result = _decode_html_content(content)
+        assert 'Titel mit Ümläüten' in result
+
+    def test_decodes_latin1_content(self):
+        """Should correctly decode Latin-1 content when chardet detects it."""
+        from src.core.providers.aniworld_provider import _decode_html_content
+        # Longer content for more reliable chardet detection
+        html = '<html><body><p>CafÉ and more text here</p></body></html>'
+        content = html.encode('latin-1')
+        result = _decode_html_content(content)
+        assert 'Caf' in result  # Decoded content contains expected substring
+
+    def test_replaces_invalid_bytes(self):
+        """Should replace invalid bytes with replacement character."""
+        from src.core.providers.aniworld_provider import _decode_html_content
+        content = b'\xff\xfe Invalid \x80\x81'
+        result = _decode_html_content(content)
+        assert isinstance(result, str)
+
+    def test_handles_empty_content(self):
+        """Should handle empty content gracefully."""
+        from src.core.providers.aniworld_provider import _decode_html_content
+        result = _decode_html_content(b'')
+        assert result == ''
diff --git a/tests/unit/test_enhanced_provider.py b/tests/unit/test_enhanced_provider.py
index 0ff6ad0..01494a5 100644
--- a/tests/unit/test_enhanced_provider.py
+++ b/tests/unit/test_enhanced_provider.py
@@ -929,7 +929,7 @@ class TestFfmpegHlsOptions:
 
         captured_opts = {}
 
-        def capture_ytdl_download(ydl_opts, link):
+        def capture_ytdl_download(self, temp_path, ydl_opts, link):
             captured_opts.update(ydl_opts)
             with open(temp_path, "wb") as f:
                 f.write(b"fake-video-data")
@@ -961,3 +961,53 @@ class TestFfmpegHlsOptions:
         assert captured_opts.get("hls_use_mpegts") is True, (
             f"Expected hls_use_mpegts=True, got {captured_opts.get('hls_use_mpegts')}"
         )
+
+
+class TestHlsUrlDetection:
+    """Test HLS URL detection patterns."""
+
+    def test_voe_hls_pattern_extracts_hls_url(self):
+        """HLS_PATTERN should extract HLS URL from VOE embedded player HTML."""
+        import base64
+        from src.core.providers.streaming.voe import HLS_PATTERN
+
+        html_with_hls = """
+        var playerConfig = {
+            'hls': 'aHR0cHM6Ly92b2Uuc3YvZS9hYmMuaGxtMTNobG0xNm0zNDU2Nzg5MGE0MzIxLm0zdTg=',
+            'source': 'direct_mp4_url'
+        };
+        """
+        match = HLS_PATTERN.search(html_with_hls)
+        assert match is not None
+        assert match.group("hls") == "aHR0cHM6Ly92b2Uuc3YvZS9hYmMuaGxtMTNobG0xNm0zNDU2Nzg5MGE0MzIxLm0zdTg="
+        assert base64.b64decode(match.group("hls")).decode("utf-8").endswith(".m3u8")
+
+    def test_voe_hls_pattern_returns_none_when_no_hls(self):
+        """HLS_PATTERN should return None when no HLS URL in HTML."""
+        from src.core.providers.streaming.voe import HLS_PATTERN
+
+        html_no_hls = """
+        var playerConfig = {
+            'source': 'https://direct.example.com/video.mp4'
+        };
+        """
+        match = HLS_PATTERN.search(html_no_hls)
+        assert match is None
+
+    def test_hls_url_detection_in_provider_flow(self):
+        """Captured HLS payload should base64-decode to the expected .m3u8 URL."""
+        import base64
+        from src.core.providers.streaming.voe import HLS_PATTERN
+
+        # Simulate VOE returning an HLS URL (base64 encoded .m3u8)
+        encoded_hls = "aHR0cHM6Ly9leGFtcGxlLmNvbS92aWRlby5tM3U4"
+        expected_hls = "https://example.com/video.m3u8"
+
+        html = f"var playerConfig = {{'hls': '{encoded_hls}'}};"
+
+        # The pattern captures the raw base64 payload; decoding is the caller's job
+        match = HLS_PATTERN.search(html)
+        assert match is not None
+        captured = match.group("hls")
+        assert captured == encoded_hls
+        assert base64.b64decode(captured).decode("utf-8") == expected_hls