feat(providers): detect HTML encoding before parsing

Add chardet-based _decode_html_content() to aniworld_provider and apply
it to every BeautifulSoup parsing call, so that pages whose declared
encoding does not match their actual bytes no longer trigger decoding
warnings. Decoding always uses errors='replace'; when chardet's
confidence is below 0.7 the detected encoding is ignored and utf-8 is
used instead.

Also fix test_enhanced_provider HLS test signature and add HLS
pattern unit tests.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-25 16:30:36 +02:00
parent d5e955a731
commit ca93bb740a
6 changed files with 162 additions and 7 deletions

View File

@@ -9,6 +9,7 @@ import threading
from pathlib import Path
from urllib.parse import quote
import chardet
import requests
from bs4 import BeautifulSoup
from events import Events
@@ -80,6 +81,37 @@ if not download_error_logger.handlers:
noKeyFound_logger = logging.getLogger()
def _decode_html_content(content: bytes) -> str:
"""Decode HTML content with encoding detection.
Uses chardet to detect the actual encoding of the content,
falling back to utf-8 with replacement error handling.
Args:
content: Raw HTML bytes from the response
Returns:
Decoded string content
"""
detected = chardet.detect(content)
encoding = detected.get('encoding', 'utf-8')
confidence = detected.get('confidence', 0)
if confidence < 0.7:
logger.debug(
"Low encoding confidence (%.2f) for detected encoding '%s', using utf-8",
confidence,
encoding
)
encoding = 'utf-8'
try:
return content.decode(encoding, errors='replace')
except Exception as exc:
logger.warning("Failed to decode content with %s: %s, using utf-8 replace", encoding, exc)
return content.decode('utf-8', errors='replace')
class AniworldLoader(Loader):
def __init__(self) -> None:
self.SUPPORTED_PROVIDERS = DEFAULT_PROVIDERS
@@ -231,7 +263,7 @@ class AniworldLoader(Loader):
language_code = self._get_language_key(language)
episode_soup = BeautifulSoup(
self._get_episode_html(season, episode, key).content,
_decode_html_content(self._get_episode_html(season, episode, key).content),
'html.parser'
)
change_language_box_div = episode_soup.find(
@@ -692,7 +724,7 @@ class AniworldLoader(Loader):
"""Get anime title from series key."""
logger.debug("Getting title for key: %s", key)
soup = BeautifulSoup(
self._get_key_html(key).content,
_decode_html_content(self._get_key_html(key).content),
'html.parser'
)
title_div = soup.find('div', class_='series-title')
@@ -723,7 +755,7 @@ class AniworldLoader(Loader):
logger.debug("Getting year for key: %s", key)
try:
soup = BeautifulSoup(
self._get_key_html(key).content,
_decode_html_content(self._get_key_html(key).content),
'html.parser'
)
@@ -837,7 +869,7 @@ class AniworldLoader(Loader):
"""
logger.debug("Extracting providers from HTML for S%02dE%03d (%s)", season, episode, key)
soup = BeautifulSoup(
self._get_episode_html(season, episode, key).content,
_decode_html_content(self._get_episode_html(season, episode, key).content),
'html.parser'
)
providers: dict[str, dict[int, str]] = {}
@@ -960,7 +992,7 @@ class AniworldLoader(Loader):
base_url = f"{self.ANIWORLD_TO}/anime/stream/{safe_slug}/"
logger.debug("Base URL: %s", base_url)
response = requests.get(base_url, timeout=self.DEFAULT_REQUEST_TIMEOUT)
soup = BeautifulSoup(response.content, 'html.parser')
soup = BeautifulSoup(_decode_html_content(response.content), 'html.parser')
season_meta = soup.find('meta', itemprop='numberOfSeasons')
number_of_seasons = int(season_meta['content']) if season_meta else 0
@@ -975,7 +1007,7 @@ class AniworldLoader(Loader):
season_url,
timeout=self.DEFAULT_REQUEST_TIMEOUT,
)
soup = BeautifulSoup(response.content, 'html.parser')
soup = BeautifulSoup(_decode_html_content(response.content), 'html.parser')
episode_links = soup.find_all('a', href=True)
unique_links = set(