feat(NFO): add TMDB search fallback with alt_titles support

- New _search_with_fallback() method tries multiple strategies:
  1. Primary query with year filter (de-DE locale)
  2. Alternative titles with ja-JP / en-US locales
  3. English search (en-US)
  4. Search without year constraint
  5. Punctuation-normalized query
- create_nfo() accepts new alt_titles param for Japanese/title fallback
- Better match rate for anime with non-English titles

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-23 21:57:00 +02:00
parent 3f7651404d
commit 9a20541598
7 changed files with 588 additions and 43 deletions

View File

@@ -10,6 +10,7 @@ Example:
import logging
import re
import unicodedata
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
@@ -123,7 +124,8 @@ class NFOService:
year: Optional[int] = None,
download_poster: bool = True,
download_logo: bool = True,
download_fanart: bool = True
download_fanart: bool = True,
alt_titles: Optional[List[str]] = None
) -> Path:
"""Create tvshow.nfo by scraping TMDB.
@@ -135,6 +137,7 @@ class NFOService:
download_poster: Whether to download poster.jpg
download_logo: Whether to download logo.png
download_fanart: Whether to download fanart.jpg
alt_titles: Alternative titles (e.g., Japanese title) for fallback search
Returns:
Path to created NFO file
@@ -161,16 +164,11 @@ class NFOService:
try:
await self.tmdb_client._ensure_session()
# Search for TV show with clean name (without year)
logger.debug("Searching TMDB for: %s", search_name)
search_results = await self.tmdb_client.search_tv_show(search_name)
if not search_results.get("results"):
raise TMDBAPIError(f"No results found for: {search_name}")
# Find best match (consider year if provided)
tv_show = self._find_best_match(search_results["results"], search_name, year)
# Search for TV show - try multiple strategies
tv_show, search_source = await self._search_with_fallback(
search_name, year, alt_titles
)
tv_id = tv_show["id"]
logger.info("Found match: %s (ID: %s)", tv_show['name'], tv_id)
@@ -531,6 +529,137 @@ class NFOService:
# Return first result (usually best match)
return results[0]
async def _search_with_fallback(
self,
primary_query: str,
year: Optional[int],
alt_titles: Optional[List[str]] = None
) -> Tuple[Dict[str, Any], str]:
"""Search TMDB with fallback strategies.
Tries multiple search strategies in order:
1. Primary query with year filter
2. Alternative titles (e.g., Japanese name)
3. Multi-language search (en-US)
4. Search without year constraint
5. Punctuation-normalized search
Args:
primary_query: Primary search term
year: Release year for filtering
alt_titles: Alternative titles to try if primary fails
Returns:
Tuple of (matched TV show dict, source description string)
Raises:
TMDBAPIError: If all search strategies fail
"""
search_strategies = [
# Strategy 1: Primary query as-is
{"query": primary_query, "year": year, "lang": "de-DE", "desc": "primary"},
]
# Strategy 2: Try alt titles (typically Japanese)
if alt_titles:
for alt in alt_titles:
if alt != primary_query:
search_strategies.append(
{"query": alt, "year": year, "lang": "ja-JP", "desc": f"alt_title:{alt}"}
)
search_strategies.append(
{"query": alt, "year": year, "lang": "en-US", "desc": f"alt_title:{alt}"}
)
# Strategy 3: Try English search
search_strategies.append(
{"query": primary_query, "year": year, "lang": "en-US", "desc": "english"}
)
# Strategy 4: Try without year constraint
if year:
search_strategies.append(
{"query": primary_query, "year": None, "lang": "de-DE", "desc": "no_year"}
)
# Strategy 5: Normalize punctuation
normalized = self._normalize_query_for_search(primary_query)
if normalized != primary_query:
search_strategies.append(
{"query": normalized, "year": year, "lang": "de-DE", "desc": f"normalized:{normalized}"}
)
last_error = None
for strategy in search_strategies:
query = strategy["query"]
lang = strategy["lang"]
desc = strategy["desc"]
try:
logger.debug(
"TMDB search attempt: query='%s', lang=%s, year=%s, strategy=%s",
query, lang, strategy["year"], desc
)
search_results = await self.tmdb_client.search_tv_show(
query,
language=lang
)
if search_results.get("results"):
# Apply year filter if we have one
results = search_results["results"]
if strategy["year"]:
year_filtered = [
r for r in results
if r.get("first_air_date", "").startswith(str(strategy["year"]))
]
if year_filtered:
match = year_filtered[0]
else:
# Year didn't match, still use first result but log it
match = results[0]
logger.debug(
"Year %s not found in results for '%s', using: %s",
strategy["year"], query, match["name"]
)
else:
match = results[0]
logger.info(
"TMDB search succeeded: '%s' found via strategy '%s' (ID: %s)",
match["name"], desc, match["id"]
)
return match, desc
else:
logger.debug("No results for '%s' via %s", query, desc)
except TMDBAPIError as e:
last_error = e
logger.debug("Search strategy '%s' failed: %s", desc, e)
continue
# All strategies exhausted
raise TMDBAPIError(
f"No results found for: {primary_query} (tried {len(search_strategies)} strategies)"
)
def _normalize_query_for_search(self, query: str) -> str:
"""Normalize query by removing punctuation and special chars.
Args:
query: Original search query
Returns:
Query with punctuation removed
"""
# Remove common punctuation but keep CJK characters
normalized = unicodedata.normalize('NFKC', query)
# Remove punctuation but not CJK
normalized = re.sub(r'[^\w\s\u3000-\u9fff\u4e00-\u9faf]', '', normalized)
# Collapse multiple spaces
normalized = re.sub(r'\s+', ' ', normalized).strip()
return normalized
async def _download_media_files(

View File

@@ -39,6 +39,7 @@ class TMDBClient:
DEFAULT_BASE_URL = "https://api.themoviedb.org/3"
DEFAULT_IMAGE_BASE_URL = "https://image.tmdb.org/t/p"
NEGATIVE_CACHE_TTL = 86400 # 24 hours
def __init__(
self,
@@ -64,6 +65,7 @@ class TMDBClient:
self.max_connections = max_connections
self.session: Optional[aiohttp.ClientSession] = None
self._cache: Dict[str, Any] = {}
self._negative_cache: Dict[str, float] = {} # query -> timestamp when cached
# TMDB allows ~40 req/s; use 30 concurrent + per-second throttle to stay safe
self._semaphore = asyncio.Semaphore(30)
self._rate_limit_lock = asyncio.Lock()
@@ -116,6 +118,16 @@ class TMDBClient:
logger.debug("Cache hit for %s", endpoint)
return self._cache[cache_key]
# Check negative cache (cached empty results)
negative_cache_key = f"{endpoint}:{str(sorted(params.items()))}"
if negative_cache_key in self._negative_cache:
if time.monotonic() - self._negative_cache[negative_cache_key] < self.NEGATIVE_CACHE_TTL:
logger.debug("Negative cache hit for %s (cached empty result)", endpoint)
return {"results": []}
else:
# Expired negative cache entry
del self._negative_cache[negative_cache_key]
delay = 2
last_error = None
@@ -158,6 +170,10 @@ class TMDBClient:
resp.raise_for_status()
data = await resp.json()
self._cache[cache_key] = data
# Cache negative result if empty
if endpoint.startswith("search/") and not data.get("results"):
self._negative_cache[negative_cache_key] = time.monotonic()
logger.debug("Cached negative result for %s", endpoint)
return data
except asyncio.TimeoutError as e:
@@ -224,6 +240,34 @@ class TMDBClient:
{"query": query, "language": language, "page": page}
)
async def search_multi(
self,
query: str,
language: str = "en-US",
page: int = 1
) -> Dict[str, Any]:
"""Search for movies and TV shows by name using TMDB multi search.
Multi search returns both movies and TV shows, useful for anime
that might be indexed as movies on TMDB.
Args:
query: Search query (show name)
language: Language for results (default: English)
page: Page number for pagination
Returns:
Search results with list of movies and TV shows
Example:
>>> results = await client.search_multi("Suzume no Tojimari")
>>> shows = [r for r in results["results"] if r["media_type"] == "tv"]
"""
return await self._request(
"search/multi",
{"query": query, "language": language, "page": page}
)
async def get_tv_show_details(
self,
tv_id: int,
@@ -356,3 +400,25 @@ class TMDBClient:
"""Clear the request cache."""
self._cache.clear()
logger.debug("TMDB client cache cleared")
def clear_negative_cache(self):
"""Clear the negative result cache."""
self._negative_cache.clear()
logger.debug("TMDB negative cache cleared")
def cleanup_expired_negative_cache(self) -> int:
"""Remove expired entries from negative cache.
Returns:
Number of entries removed
"""
now = time.monotonic()
expired_keys = [
key for key, timestamp in self._negative_cache.items()
if now - timestamp >= self.NEGATIVE_CACHE_TTL
]
for key in expired_keys:
del self._negative_cache[key]
if expired_keys:
logger.debug("Removed %d expired negative cache entries", len(expired_keys))
return len(expired_keys)