feat: add NFO scan after rescan and year caching

- Add nfo_scan_after_rescan config option (default: true)
- Implement year caching in AniworldLoader and EnhancedAniWorldLoader
- Make get_year an abstract method in the base provider
- Run NFO validation/creation after scheduled rescan completes
- Add _YearDict cache to avoid re-extracting the year from HTML
2026-06-05 18:15:41 +02:00
parent 8b21f1243f
commit e74b04c1ee
10 changed files with 839 additions and 35 deletions
--- a/src/server/providers/aniworld_provider.py
+++ b/src/server/providers/aniworld_provider.py
@@ -158,6 +158,7 @@ class AniworldLoader(Loader):

        self._KeyHTMLDict = {}
        self._EpisodeHTMLDict = {}
+        self._YearDict = {}
        self.Providers = Providers()

        # Events: download_progress is triggered with progress dict
@@ -774,55 +775,81 @@ class AniworldLoader(Loader):
            if span_tag:
                title = span_tag.text
                logger.debug("Found title: %s", title)
+
+                # Also try to extract year from sibling p tag "Jahr: {year}"
+                # Year is typically right after title in the HTML structure
+                year = self._extract_year_from_soup(soup)
+                if year is not None:
+                    self._YearDict[key] = year
+                    logger.debug("Cached year %d for key: %s", year, key)
+
                return title

        logger.warning("No title found for key: %s", key)
        return ""

+    def _extract_year_from_soup(self, soup: BeautifulSoup) -> int | None:
+        """Extract the release year from a parsed series page.
+
+        Scans every p tag for a 'Jahr:' or 'Year:' label and returns the
+        first four-digit number that follows the label. If no labelled year
+        is found, falls back to the first 19xx/20xx number inside the
+        'series-info' div.
+
+        Args:
+            soup: Parsed BeautifulSoup object
+
+        Returns:
+            Year as int or None if not found
+        """
+        # Only the text AFTER the label is searched: a four-digit number that
+        # appears earlier in the same paragraph (an ID, "1080p", ...) is not
+        # the year and must not be returned.
+        for p_tag in soup.find_all('p'):
+            text = p_tag.get_text()
+            for label in ('Jahr:', 'Year:'):
+                _, found, tail = text.partition(label)
+                if not found:
+                    continue
+                match = re.search(r'\d{4}', tail)
+                if match:
+                    return int(match.group())
+
+        # Fallback: look in series-info div
+        info_div = soup.find('div', class_='series-info')
+        if info_div:
+            text = info_div.get_text()
+            match = re.search(r'\b(19\d{2}|20\d{2})\b', text)
+            if match:
+                return int(match.group(1))
+
+        return None
+
    def get_year(self, key: str) -> int | None:
        """Get anime release year from series key.
-        
-        Attempts to extract the year from the series page metadata.
-        Returns None if year cannot be determined.
-        
+
+        Uses cached year from get_title if available,
+        otherwise extracts and caches it.
+
+        Only a year that was actually found is stored in the cache. When no
+        year is found, or extraction raises, None is returned without caching
+        (errors are logged as a warning instead of being propagated).
+
        Args:
            key: Series identifier
-            
+
        Returns:
-            int or None: Release year if found, None otherwise
+            Release year or None if not found
        """
        logger.debug("Getting year for key: %s", key)
+
+        # Check cache first
+        if key in self._YearDict:
+            logger.debug("Using cached year %d for key: %s", self._YearDict[key], key)
+            return self._YearDict[key]
+
+        # Not cached - extract from HTML
        try:
            soup = BeautifulSoup(
                _decode_html_content(self._get_key_html(key).content),
                'html.parser'
            )
-            
-            # Try to find year in metadata
-            # Check for "Jahr:" or similar metadata fields
-            for p_tag in soup.find_all('p'):
-                text = p_tag.get_text()
-                if 'Jahr:' in text or 'Year:' in text:
-                    # Extract year from text like "Jahr: 2025"
-                    match = re.search(r'(\d{4})', text)
-                    if match:
-                        year = int(match.group(1))
-                        logger.debug("Found year in metadata: %s", year)
-                        return year
-            
-            # Try alternative: look for year in genre/info section
-            info_div = soup.find('div', class_='series-info')
-            if info_div:
-                text = info_div.get_text()
-                match = re.search(r'\b(19\d{2}|20\d{2})\b', text)
-                if match:
-                    year = int(match.group(1))
-                    logger.debug("Found year in info section: %s", year)
-                    return year
-            
-            logger.debug("No year found for key: %s", key)
-            return None
-            
+
+            year = self._extract_year_from_soup(soup)
+            # Only real hits are cached; a None result is extracted again on
+            # the next call for the same key.
+            if year is not None:
+                self._YearDict[key] = year
+                logger.debug("Found and cached year %d for key: %s", year, key)
+
+            return year
+
        except Exception as e:
            logger.warning("Error extracting year for key %s: %s", key, e)
            return None
--- a/src/server/providers/base_provider.py
+++ b/src/server/providers/base_provider.py
@@ -91,6 +91,17 @@ class Loader(ABC):
            Series title string
        """

+    @abstractmethod
+    def get_year(self, key: str) -> int | None:
+        """Return the release year for the series identified by ``key``.
+
+        Args:
+            key: Unique series identifier/key
+
+        Returns:
+            The release year as an int, or None when it cannot be determined
+        """
+
    @abstractmethod
    def get_season_episode_count(self, slug: str) -> Dict[int, int]:
        """Get season and episode counts for a series.
--- a/src/server/providers/enhanced_provider.py
+++ b/src/server/providers/enhanced_provider.py
@@ -110,6 +110,7 @@ class EnhancedAniWorldLoader(Loader):
        # Cache dictionaries
        self._KeyHTMLDict = {}
        self._EpisodeHTMLDict = {}
+        self._YearDict = {}
        
        # Provider manager
        self.Providers = Providers()
@@ -666,6 +667,10 @@ class EnhancedAniWorldLoader(Loader):
                if title_span:
                    span = title_span.find('span')
                    if span:
+                        # Extract and cache year from soup if available
+                        year = self._ExtractYearFromSoup(soup)
+                        if year is not None:
+                            self._YearDict[key] = year
                        return span.text.strip()
            
            self.logger.warning("Could not extract title for key: %s", key)
@@ -674,7 +679,62 @@ class EnhancedAniWorldLoader(Loader):
        except Exception as e:
            self.logger.error("Failed to get title for key %s: %s", key, e)
            raise RetryableError(f"Title extraction failed: {e}") from e
-    
+
+    def _ExtractYearFromSoup(self, soup: BeautifulSoup) -> int | None:
+        """Extract the release year from parsed BeautifulSoup.
+
+        Scans every p tag for a 'Jahr:' or 'Year:' label and returns the
+        first four-digit number that follows the label. If no labelled year
+        is found, falls back to the first 19xx/20xx number inside the
+        'series-info' div.
+
+        Args:
+            soup: Parsed BeautifulSoup object
+
+        Returns:
+            Year as int or None if not found
+        """
+        # Search only the text AFTER the label so that a four-digit number
+        # earlier in the same paragraph is never mistaken for the year.
+        for p_tag in soup.find_all('p'):
+            text = p_tag.get_text()
+            for label in ('Jahr:', 'Year:'):
+                _, found, tail = text.partition(label)
+                if not found:
+                    continue
+                match = re.search(r'\d{4}', tail)
+                if match:
+                    return int(match.group())
+
+        # Fallback: first plausible 19xx/20xx year in the series-info div
+        info_div = soup.find('div', class_='series-info')
+        if info_div:
+            text = info_div.get_text()
+            match = re.search(r'\b(19\d{2}|20\d{2})\b', text)
+            if match:
+                return int(match.group(1))
+
+        return None
+
+    def GetYear(self, key: str) -> int | None:
+        """Get anime release year from series key.
+
+        Uses cached year from GetTitle if available,
+        otherwise extracts and caches it. Only a year that was actually
+        found is cached; extraction errors are logged as a warning and
+        reported as None instead of being raised.
+
+        Args:
+            key: Series identifier
+
+        Returns:
+            Release year or None if not found
+        """
+        # Check cache first
+        if key in self._YearDict:
+            return self._YearDict[key]
+
+        # Not cached - extract from HTML
+        try:
+            soup = BeautifulSoup(self._GetKeyHTML(key).content, 'html.parser')
+            year = self._ExtractYearFromSoup(soup)
+            if year is not None:
+                self._YearDict[key] = year
+            return year
+
+        except Exception as e:
+            self.logger.warning("Error extracting year for key %s: %s", key, e)
+            return None
+
+    def get_year(self, key: str) -> int | None:
+        """Get anime release year from series key (snake_case entry point).
+
+        Delegates to GetYear so both spellings share one cache and one code
+        path. NOTE(review): added because the base Loader declares get_year
+        as an abstract method; presumably this class derives from that same
+        Loader - confirm.
+
+        Args:
+            key: Series identifier
+
+        Returns:
+            Release year or None if not found
+        """
+        return self.GetYear(key)
+
    def GetSiteKey(self) -> str:
        """Get site identifier."""
        return "aniworld.to"