feat(providers): detect HTML encoding before parsing
Add chardet-based _decode_html_content() to aniworld_provider. Apply to all BeautifulSoup parsing calls to prevent decoding warnings on pages with mismatched encoding declarations. Falls back to utf-8 with errors='replace' when confidence < 0.7. Also fix test_enhanced_provider HLS test signature and add HLS pattern unit tests. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -9,6 +9,7 @@ import threading
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
import chardet
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from events import Events
|
||||
@@ -80,6 +81,37 @@ if not download_error_logger.handlers:
|
||||
noKeyFound_logger = logging.getLogger()
|
||||
|
||||
|
||||
def _decode_html_content(content: bytes) -> str:
|
||||
"""Decode HTML content with encoding detection.
|
||||
|
||||
Uses chardet to detect the actual encoding of the content,
|
||||
falling back to utf-8 with replacement error handling.
|
||||
|
||||
Args:
|
||||
content: Raw HTML bytes from the response
|
||||
|
||||
Returns:
|
||||
Decoded string content
|
||||
"""
|
||||
detected = chardet.detect(content)
|
||||
encoding = detected.get('encoding', 'utf-8')
|
||||
confidence = detected.get('confidence', 0)
|
||||
|
||||
if confidence < 0.7:
|
||||
logger.debug(
|
||||
"Low encoding confidence (%.2f) for detected encoding '%s', using utf-8",
|
||||
confidence,
|
||||
encoding
|
||||
)
|
||||
encoding = 'utf-8'
|
||||
|
||||
try:
|
||||
return content.decode(encoding, errors='replace')
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to decode content with %s: %s, using utf-8 replace", encoding, exc)
|
||||
return content.decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
class AniworldLoader(Loader):
|
||||
def __init__(self) -> None:
|
||||
self.SUPPORTED_PROVIDERS = DEFAULT_PROVIDERS
|
||||
@@ -231,7 +263,7 @@ class AniworldLoader(Loader):
|
||||
language_code = self._get_language_key(language)
|
||||
|
||||
episode_soup = BeautifulSoup(
|
||||
self._get_episode_html(season, episode, key).content,
|
||||
_decode_html_content(self._get_episode_html(season, episode, key).content),
|
||||
'html.parser'
|
||||
)
|
||||
change_language_box_div = episode_soup.find(
|
||||
@@ -692,7 +724,7 @@ class AniworldLoader(Loader):
|
||||
"""Get anime title from series key."""
|
||||
logger.debug("Getting title for key: %s", key)
|
||||
soup = BeautifulSoup(
|
||||
self._get_key_html(key).content,
|
||||
_decode_html_content(self._get_key_html(key).content),
|
||||
'html.parser'
|
||||
)
|
||||
title_div = soup.find('div', class_='series-title')
|
||||
@@ -723,7 +755,7 @@ class AniworldLoader(Loader):
|
||||
logger.debug("Getting year for key: %s", key)
|
||||
try:
|
||||
soup = BeautifulSoup(
|
||||
self._get_key_html(key).content,
|
||||
_decode_html_content(self._get_key_html(key).content),
|
||||
'html.parser'
|
||||
)
|
||||
|
||||
@@ -837,7 +869,7 @@ class AniworldLoader(Loader):
|
||||
"""
|
||||
logger.debug("Extracting providers from HTML for S%02dE%03d (%s)", season, episode, key)
|
||||
soup = BeautifulSoup(
|
||||
self._get_episode_html(season, episode, key).content,
|
||||
_decode_html_content(self._get_episode_html(season, episode, key).content),
|
||||
'html.parser'
|
||||
)
|
||||
providers: dict[str, dict[int, str]] = {}
|
||||
@@ -960,7 +992,7 @@ class AniworldLoader(Loader):
|
||||
base_url = f"{self.ANIWORLD_TO}/anime/stream/{safe_slug}/"
|
||||
logger.debug("Base URL: %s", base_url)
|
||||
response = requests.get(base_url, timeout=self.DEFAULT_REQUEST_TIMEOUT)
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
soup = BeautifulSoup(_decode_html_content(response.content), 'html.parser')
|
||||
|
||||
season_meta = soup.find('meta', itemprop='numberOfSeasons')
|
||||
number_of_seasons = int(season_meta['content']) if season_meta else 0
|
||||
@@ -975,7 +1007,7 @@ class AniworldLoader(Loader):
|
||||
season_url,
|
||||
timeout=self.DEFAULT_REQUEST_TIMEOUT,
|
||||
)
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
soup = BeautifulSoup(_decode_html_content(response.content), 'html.parser')
|
||||
|
||||
episode_links = soup.find_all('a', href=True)
|
||||
unique_links = set(
|
||||
|
||||
Reference in New Issue
Block a user