feat(providers): detect HTML encoding before parsing

Add chardet-based _decode_html_content() to aniworld_provider. Apply
to all BeautifulSoup parsing calls to prevent decoding warnings on
pages with mismatched encoding declarations. Falls back to utf-8
with errors='replace' when confidence < 0.7.

Also fix test_enhanced_provider HLS test signature and add HLS
pattern unit tests.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-25 16:30:36 +02:00
parent d5e955a731
commit ca93bb740a
6 changed files with 162 additions and 7 deletions

View File

@@ -721,3 +721,37 @@ class TestAniworldHeaderParsing:
["not-a-header", "Key: value"]
)
assert result == {"Key": "value"}
class TestDecodeHtmlContent:
"""Test _decode_html_content function."""
def test_decodes_utf8_content(self):
"""Should correctly decode UTF-8 content."""
from src.core.providers.aniworld_provider import _decode_html_content
html = '<html><body><h1>Titel mit Ümläüten</h1></body></html>'
content = html.encode('utf-8')
result = _decode_html_content(content)
assert 'Titel mit Ümläüten' in result
def test_decodes_latin1_content(self):
"""Should correctly decode Latin-1 content when chardet detects it."""
from src.core.providers.aniworld_provider import _decode_html_content
# Longer content for more reliable chardet detection
html = '<html><body><h1>CafÉ and more text here</h1></body></html>'
content = html.encode('latin-1')
result = _decode_html_content(content)
assert 'Caf' in result # Decoded content contains expected substring
def test_replaces_invalid_bytes(self):
"""Should replace invalid bytes with replacement character."""
from src.core.providers.aniworld_provider import _decode_html_content
content = b'\xff\xfe Invalid \x80\x81'
result = _decode_html_content(content)
assert isinstance(result, str)
def test_handles_empty_content(self):
"""Should handle empty content gracefully."""
from src.core.providers.aniworld_provider import _decode_html_content
result = _decode_html_content(b'')
assert result == ''