feat(providers): detect HTML encoding before parsing
Add chardet-based _decode_html_content() to aniworld_provider. Apply to all BeautifulSoup parsing calls to prevent decoding warnings on pages with mismatched encoding declarations. Falls back to utf-8 with errors='replace' when confidence < 0.7. Also fix test_enhanced_provider HLS test signature and add HLS pattern unit tests. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -721,3 +721,37 @@ class TestAniworldHeaderParsing:
|
||||
["not-a-header", "Key: value"]
|
||||
)
|
||||
assert result == {"Key": "value"}
|
||||
|
||||
|
||||
class TestDecodeHtmlContent:
    """Unit tests for the ``_decode_html_content`` helper of the aniworld provider."""

    def test_decodes_utf8_content(self):
        """UTF-8 encoded markup keeps its umlauts after decoding."""
        from src.core.providers.aniworld_provider import _decode_html_content

        raw_bytes = (
            '<html><body><h1>Titel mit Ümläüten</h1></body></html>'.encode('utf-8')
        )

        decoded = _decode_html_content(raw_bytes)

        assert 'Titel mit Ümläüten' in decoded

    def test_decodes_latin1_content(self):
        """Latin-1 encoded markup decodes to text containing the ASCII prefix."""
        from src.core.providers.aniworld_provider import _decode_html_content

        # The sample is deliberately padded with extra text so that charset
        # detection has more material to work with.
        markup = '<html><body><h1>CafÉ and more text here</h1></body></html>'
        raw_bytes = markup.encode('latin-1')

        decoded = _decode_html_content(raw_bytes)

        # Only the ASCII part is checked; the accented character is not
        # asserted on.
        assert 'Caf' in decoded

    def test_replaces_invalid_bytes(self):
        """Bytes that do not form clean text still produce a ``str``."""
        from src.core.providers.aniworld_provider import _decode_html_content

        garbage = b'\xff\xfe Invalid \x80\x81'

        decoded = _decode_html_content(garbage)

        # Only the return type is verified here, not the exact replacement
        # characters in the output.
        assert isinstance(decoded, str)

    def test_handles_empty_content(self):
        """An empty byte string decodes to an empty string."""
        from src.core.providers.aniworld_provider import _decode_html_content

        assert _decode_html_content(b'') == ''
|
||||
|
||||
Reference in New Issue
Block a user