feat(providers): detect HTML encoding before parsing

Add chardet-based _decode_html_content() to aniworld_provider and apply it to every BeautifulSoup parsing call, to prevent decoding warnings on pages with mismatched encoding declarations. The helper falls back to utf-8 with errors='replace' when the detection confidence is below 0.7. Also fix the test_enhanced_provider HLS test signature and add HLS pattern unit tests.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-25 16:30:36 +02:00
parent d5e955a731
commit ca93bb740a
6 changed files with 162 additions and 7 deletions
--- a/src/core/providers/aniworld_provider.py
+++ b/src/core/providers/aniworld_provider.py
@@ -9,6 +9,7 @@ import threading
 from pathlib import Path
 from urllib.parse import quote

+import chardet
 import requests
 from bs4 import BeautifulSoup
 from events import Events
@@ -80,6 +81,37 @@ if not download_error_logger.handlers:
 noKeyFound_logger = logging.getLogger()


+def _decode_html_content(content: bytes) -> str:
+    """Decode raw HTML bytes to text using chardet encoding detection.
+
+    The chardet guess is used only when it names an encoding with confidence
+    of at least 0.7; otherwise utf-8 is used. Decoding always passes
+    errors='replace', so undecodable bytes are substituted, not raised.
+
+    Args:
+        content: Raw HTML bytes from the response
+
+    Returns:
+        Decoded string content
+    """
+    detected = chardet.detect(content)
+    # chardet can report None for either value, so .get() defaults would not apply.
+    guess = detected.get('encoding')
+    confidence = detected.get('confidence') or 0.0
+    trusted = bool(guess) and confidence >= 0.7
+    if not trusted:
+        logger.debug(
+            "Low encoding confidence (%.2f) for detected encoding '%s', using utf-8",
+            confidence, guess)
+    encoding = guess if trusted else 'utf-8'
+
+    try:
+        return content.decode(encoding, errors='replace')
+    except (LookupError, UnicodeError) as exc:  # e.g. codec name unknown to Python
+        logger.warning("Failed to decode content with %s: %s, using utf-8 replace", encoding, exc)
+        return content.decode('utf-8', errors='replace')
+
+
 class AniworldLoader(Loader):
    def __init__(self) -> None:
        self.SUPPORTED_PROVIDERS = DEFAULT_PROVIDERS
@@ -231,7 +263,7 @@ class AniworldLoader(Loader):
        language_code = self._get_language_key(language)

        episode_soup = BeautifulSoup(
-            self._get_episode_html(season, episode, key).content,
+            _decode_html_content(self._get_episode_html(season, episode, key).content),
            'html.parser'
        )
        change_language_box_div = episode_soup.find(
@@ -692,7 +724,7 @@ class AniworldLoader(Loader):
        """Get anime title from series key."""
        logger.debug("Getting title for key: %s", key)
        soup = BeautifulSoup(
-            self._get_key_html(key).content,
+            _decode_html_content(self._get_key_html(key).content),
            'html.parser'
        )
        title_div = soup.find('div', class_='series-title')
@@ -723,7 +755,7 @@ class AniworldLoader(Loader):
        logger.debug("Getting year for key: %s", key)
        try:
            soup = BeautifulSoup(
-                self._get_key_html(key).content,
+                _decode_html_content(self._get_key_html(key).content),
                'html.parser'
            )
            
@@ -837,7 +869,7 @@ class AniworldLoader(Loader):
        """
        logger.debug("Extracting providers from HTML for S%02dE%03d (%s)", season, episode, key)
        soup = BeautifulSoup(
-            self._get_episode_html(season, episode, key).content,
+            _decode_html_content(self._get_episode_html(season, episode, key).content),
            'html.parser'
        )
        providers: dict[str, dict[int, str]] = {}
@@ -960,7 +992,7 @@ class AniworldLoader(Loader):
        base_url = f"{self.ANIWORLD_TO}/anime/stream/{safe_slug}/"
        logger.debug("Base URL: %s", base_url)
        response = requests.get(base_url, timeout=self.DEFAULT_REQUEST_TIMEOUT)
-        soup = BeautifulSoup(response.content, 'html.parser')
+        soup = BeautifulSoup(_decode_html_content(response.content), 'html.parser')

        season_meta = soup.find('meta', itemprop='numberOfSeasons')
        number_of_seasons = int(season_meta['content']) if season_meta else 0
@@ -975,7 +1007,7 @@ class AniworldLoader(Loader):
                season_url,
                timeout=self.DEFAULT_REQUEST_TIMEOUT,
            )
-            soup = BeautifulSoup(response.content, 'html.parser')
+            soup = BeautifulSoup(_decode_html_content(response.content), 'html.parser')

            episode_links = soup.find_all('a', href=True)
            unique_links = set(