feat: add NFO scan after rescan and year caching

- Add nfo_scan_after_rescan config option (default: true)
- Implement year caching in AniworldLoader and EnhancedAniWorldLoader
- Make get_year abstract method in base provider
- Run NFO validation/creation after scheduled rescan completes
- Add _YearDict cache to avoid re-extracting year from HTML
This commit is contained in:
2026-06-05 18:15:41 +02:00
parent 8b21f1243f
commit e74b04c1ee
10 changed files with 839 additions and 35 deletions

View File

@@ -158,6 +158,7 @@ class AniworldLoader(Loader):
self._KeyHTMLDict = {}
self._EpisodeHTMLDict = {}
self._YearDict = {}
self.Providers = Providers()
# Events: download_progress is triggered with progress dict
@@ -774,55 +775,81 @@ class AniworldLoader(Loader):
if span_tag:
title = span_tag.text
logger.debug("Found title: %s", title)
# Also try to extract year from sibling p tag "Jahr: {year}"
# Year is typically right after title in the HTML structure
year = self._extract_year_from_soup(soup)
if year is not None:
self._YearDict[key] = year
logger.debug("Cached year %d for key: %s", year, key)
return title
logger.warning("No title found for key: %s", key)
return ""
def _extract_year_from_soup(self, soup: BeautifulSoup) -> int | None:
    """Extract the release year from a parsed series page.

    Scans every <p> tag for a 'Jahr:' / 'Year:' label and reads the year
    from that paragraph. If no labelled paragraph yields a year, falls
    back to the first 19xx/20xx number inside the 'series-info' div.

    Args:
        soup: Parsed BeautifulSoup object

    Returns:
        Year as int or None if not found
    """
    for p_tag in soup.find_all('p'):
        text = p_tag.get_text()
        if 'Jahr:' not in text and 'Year:' not in text:
            continue
        # Prefer the four digits that directly follow the label, so other
        # numbers in the same paragraph are not mistaken for the year.
        # Only when nothing usable follows the label do we fall back to
        # the first four-digit run anywhere in the paragraph.
        match = (
            re.search(r'(?:Jahr|Year):\s*(\d{4})', text)
            or re.search(r'(\d{4})', text)
        )
        if match:
            return int(match.group(1))

    # Fallback: look in series-info div
    info_div = soup.find('div', class_='series-info')
    if info_div:
        match = re.search(r'\b(19\d{2}|20\d{2})\b', info_div.get_text())
        if match:
            return int(match.group(1))

    return None
def get_year(self, key: str) -> int | None:
    """Get anime release year from series key.

    Returns the year already stored in the year cache when there is one.
    Otherwise the series page is fetched and parsed, and any year that
    is found is stored in the cache before it is returned.

    Args:
        key: Series identifier

    Returns:
        Release year or None if not found (including when fetching or
        parsing the page raised an error, which is logged as a warning).
    """
    logger.debug("Getting year for key: %s", key)

    # Check cache first
    if key in self._YearDict:
        logger.debug("Using cached year %d for key: %s", self._YearDict[key], key)
        return self._YearDict[key]

    # Not cached - extract from HTML. Extraction is best-effort: a failure
    # to fetch or parse the page must not propagate to the caller.
    try:
        soup = BeautifulSoup(
            _decode_html_content(self._get_key_html(key).content),
            'html.parser'
        )
        year = self._extract_year_from_soup(soup)
    except Exception as e:
        logger.warning("Error extracting year for key %s: %s", key, e)
        return None

    if year is None:
        # Misses are not cached, so a later call parses the page again.
        logger.debug("No year found for key: %s", key)
        return None

    self._YearDict[key] = year
    logger.debug("Found and cached year %d for key: %s", year, key)
    return year

View File

@@ -91,6 +91,17 @@ class Loader(ABC):
Series title string
"""
@abstractmethod
def get_year(self, key: str) -> int | None:
"""Get the release year of a series.
Args:
key: Unique series identifier/key
Returns:
Release year as integer, or None if year cannot be determined
"""
@abstractmethod
def get_season_episode_count(self, slug: str) -> Dict[int, int]:
"""Get season and episode counts for a series.

View File

@@ -110,6 +110,7 @@ class EnhancedAniWorldLoader(Loader):
# Cache dictionaries
self._KeyHTMLDict = {}
self._EpisodeHTMLDict = {}
self._YearDict = {}
# Provider manager
self.Providers = Providers()
@@ -666,6 +667,10 @@ class EnhancedAniWorldLoader(Loader):
if title_span:
span = title_span.find('span')
if span:
# Extract and cache year from soup if available
year = self._ExtractYearFromSoup(soup)
if year is not None:
self._YearDict[key] = year
return span.text.strip()
self.logger.warning("Could not extract title for key: %s", key)
@@ -674,7 +679,62 @@ class EnhancedAniWorldLoader(Loader):
except Exception as e:
self.logger.error("Failed to get title for key %s: %s", key, e)
raise RetryableError(f"Title extraction failed: {e}") from e
def _ExtractYearFromSoup(self, soup: BeautifulSoup) -> int | None:
    """Extract year from parsed BeautifulSoup.

    Looks for a 'Jahr:' / 'Year:' label in <p> tags first. If no labelled
    paragraph yields a year, the first 19xx/20xx number found in the
    'series-info' div is used instead.

    Args:
        soup: Parsed BeautifulSoup object

    Returns:
        Year as int or None if not found
    """
    for p_tag in soup.find_all('p'):
        text = p_tag.get_text()
        if 'Jahr:' not in text and 'Year:' not in text:
            continue
        # Take the digits directly after the label when present; a bare
        # four-digit search over the whole paragraph would pick up any
        # number that happens to precede the label. The bare search is
        # kept only as a fallback for paragraphs with other layouts.
        match = (
            re.search(r'(?:Jahr|Year):\s*(\d{4})', text)
            or re.search(r'(\d{4})', text)
        )
        if match:
            return int(match.group(1))

    info_div = soup.find('div', class_='series-info')
    if info_div:
        match = re.search(r'\b(19\d{2}|20\d{2})\b', info_div.get_text())
        if match:
            return int(match.group(1))

    return None
def GetYear(self, key: str) -> int | None:
"""Get anime release year from series key.
Uses cached year from GetTitle if available,
otherwise extracts and caches it.
Args:
key: Series identifier
Returns:
Release year or None if not found
"""
# Check cache first
if key in self._YearDict:
return self._YearDict[key]
# Not cached - extract from HTML
try:
soup = BeautifulSoup(self._GetKeyHTML(key).content, 'html.parser')
year = self._ExtractYearFromSoup(soup)
if year is not None:
self._YearDict[key] = year
return year
except Exception as e:
self.logger.warning("Error extracting year for key %s: %s", key, e)
return None
def GetSiteKey(self) -> str:
    """Return the fixed identifier string of the site this loader serves."""
    site_key = "aniworld.to"
    return site_key