feat: add NFO scan after rescan and year caching
- Add nfo_scan_after_rescan config option (default: true)
- Implement year caching in AniworldLoader and EnhancedAniWorldLoader
- Make get_year an abstract method in the base provider
- Run NFO validation/creation after the scheduled rescan completes
- Add _YearDict cache to avoid re-extracting the year from HTML
This commit is contained in:
@@ -158,6 +158,7 @@ class AniworldLoader(Loader):
|
||||
|
||||
self._KeyHTMLDict = {}
|
||||
self._EpisodeHTMLDict = {}
|
||||
self._YearDict = {}
|
||||
self.Providers = Providers()
|
||||
|
||||
# Events: download_progress is triggered with progress dict
|
||||
@@ -774,55 +775,81 @@ class AniworldLoader(Loader):
|
||||
if span_tag:
|
||||
title = span_tag.text
|
||||
logger.debug("Found title: %s", title)
|
||||
|
||||
# Also try to extract year from sibling p tag "Jahr: {year}"
|
||||
# Year is typically right after title in the HTML structure
|
||||
year = self._extract_year_from_soup(soup)
|
||||
if year is not None:
|
||||
self._YearDict[key] = year
|
||||
logger.debug("Cached year %d for key: %s", year, key)
|
||||
|
||||
return title
|
||||
|
||||
logger.warning("No title found for key: %s", key)
|
||||
return ""
|
||||
|
||||
def _extract_year_from_soup(self, soup: BeautifulSoup) -> int | None:
|
||||
"""Extract year from BeautifulSoup object.
|
||||
|
||||
Looks for 'Jahr: {year}' pattern in p tags adjacent to series-title.
|
||||
|
||||
Args:
|
||||
soup: Parsed BeautifulSoup object
|
||||
|
||||
Returns:
|
||||
Year as int or None if not found
|
||||
"""
|
||||
# Try to find year in metadata
|
||||
for p_tag in soup.find_all('p'):
|
||||
text = p_tag.get_text()
|
||||
if 'Jahr:' in text or 'Year:' in text:
|
||||
match = re.search(r'(\d{4})', text)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
|
||||
# Fallback: look in series-info div
|
||||
info_div = soup.find('div', class_='series-info')
|
||||
if info_div:
|
||||
text = info_div.get_text()
|
||||
match = re.search(r'\b(19\d{2}|20\d{2})\b', text)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
|
||||
return None
|
||||
|
||||
def get_year(self, key: str) -> int | None:
    """Get anime release year from series key.

    Returns the year stored in ``self._YearDict`` when the key is already
    cached (the title lookup stores it there as a side effect). Otherwise
    the series page is parsed, the year is extracted with
    ``_extract_year_from_soup`` and a successful result is cached.

    Args:
        key: Series identifier

    Returns:
        Release year, or None if no year was found or extraction failed
    """
    logger.debug("Getting year for key: %s", key)

    # Check cache first
    if key in self._YearDict:
        logger.debug("Using cached year %d for key: %s", self._YearDict[key], key)
        return self._YearDict[key]

    # Not cached - extract from HTML
    try:
        soup = BeautifulSoup(
            _decode_html_content(self._get_key_html(key).content),
            'html.parser'
        )

        # Single extraction path: the shared helper replaces the inline
        # copy of the same logic, so every successful lookup reaches the
        # cache store below instead of returning early.
        year = self._extract_year_from_soup(soup)
        if year is not None:
            self._YearDict[key] = year
            logger.debug("Found and cached year %d for key: %s", year, key)
        else:
            logger.debug("No year found for key: %s", key)

        return year

    except Exception as e:
        # Best-effort lookup: a failed fetch or parse is reported as
        # "year unknown" rather than propagated to the caller.
        logger.warning("Error extracting year for key %s: %s", key, e)
        return None
|
||||
|
||||
@@ -91,6 +91,17 @@ class Loader(ABC):
|
||||
Series title string
|
||||
"""
|
||||
|
||||
@abstractmethod
def get_year(self, key: str) -> int | None:
    """Return the release year of the series identified by ``key``.

    Concrete loaders must implement this method.

    Args:
        key: Unique series identifier/key

    Returns:
        The release year as an integer, or None when the year cannot be
        determined
    """
|
||||
|
||||
@abstractmethod
|
||||
def get_season_episode_count(self, slug: str) -> Dict[int, int]:
|
||||
"""Get season and episode counts for a series.
|
||||
|
||||
@@ -110,6 +110,7 @@ class EnhancedAniWorldLoader(Loader):
|
||||
# Cache dictionaries
|
||||
self._KeyHTMLDict = {}
|
||||
self._EpisodeHTMLDict = {}
|
||||
self._YearDict = {}
|
||||
|
||||
# Provider manager
|
||||
self.Providers = Providers()
|
||||
@@ -666,6 +667,10 @@ class EnhancedAniWorldLoader(Loader):
|
||||
if title_span:
|
||||
span = title_span.find('span')
|
||||
if span:
|
||||
# Extract and cache year from soup if available
|
||||
year = self._ExtractYearFromSoup(soup)
|
||||
if year is not None:
|
||||
self._YearDict[key] = year
|
||||
return span.text.strip()
|
||||
|
||||
self.logger.warning("Could not extract title for key: %s", key)
|
||||
@@ -674,7 +679,62 @@ class EnhancedAniWorldLoader(Loader):
|
||||
except Exception as e:
|
||||
self.logger.error("Failed to get title for key %s: %s", key, e)
|
||||
raise RetryableError(f"Title extraction failed: {e}") from e
|
||||
|
||||
|
||||
def _ExtractYearFromSoup(self, soup: BeautifulSoup) -> int | None:
|
||||
"""Extract year from parsed BeautifulSoup.
|
||||
|
||||
Looks for 'Jahr: {year}' pattern in p tags.
|
||||
|
||||
Args:
|
||||
soup: Parsed BeautifulSoup object
|
||||
|
||||
Returns:
|
||||
Year as int or None if not found
|
||||
"""
|
||||
for p_tag in soup.find_all('p'):
|
||||
text = p_tag.get_text()
|
||||
if 'Jahr:' in text or 'Year:' in text:
|
||||
match = re.search(r'(\d{4})', text)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
|
||||
info_div = soup.find('div', class_='series-info')
|
||||
if info_div:
|
||||
text = info_div.get_text()
|
||||
match = re.search(r'\b(19\d{2}|20\d{2})\b', text)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
|
||||
return None
|
||||
|
||||
def GetYear(self, key: str) -> int | None:
    """Look up the release year for a series key.

    A year already stored in ``self._YearDict`` is returned directly.
    Otherwise the series page is fetched via ``_GetKeyHTML``, parsed, and
    the year found by ``_ExtractYearFromSoup`` is cached (when not None)
    and returned.

    Args:
        key: Series identifier

    Returns:
        Release year or None if not found (also None when fetching or
        parsing raises; the error is logged as a warning)
    """
    # NOTE(review): the base Loader declares an abstract ``get_year``; this
    # class exposes ``GetYear`` - confirm an alias exists elsewhere.

    # Cache hit: nothing to fetch or parse.
    try:
        return self._YearDict[key]
    except KeyError:
        pass

    # Cache miss: parse the series page and remember a successful result.
    try:
        parsed_page = BeautifulSoup(self._GetKeyHTML(key).content, 'html.parser')
        found_year = self._ExtractYearFromSoup(parsed_page)
        if found_year is None:
            return None
        self._YearDict[key] = found_year
        return found_year
    except Exception as e:
        self.logger.warning("Error extracting year for key %s: %s", key, e)
        return None
|
||||
|
||||
def GetSiteKey(self) -> str:
    """Return the identifier of the site this loader serves."""
    site_key = "aniworld.to"
    return site_key
|
||||
|
||||
Reference in New Issue
Block a user