feat: add NFO scan after rescan and year caching

- Add nfo_scan_after_rescan config option (default: true)
- Implement year caching in AniworldLoader and EnhancedAniWorldLoader
- Make get_year an abstract method in the base provider
- Run NFO validation/creation after scheduled rescan completes
- Add _YearDict cache to avoid re-extracting the year from HTML
2026-06-05 18:15:41 +02:00
parent 8b21f1243f
commit e74b04c1ee
10 changed files with 839 additions and 35 deletions
--- a/src/server/providers/aniworld_provider.py
+++ b/src/server/providers/aniworld_provider.py
@@ -158,6 +158,7 @@ class AniworldLoader(Loader):

        self._KeyHTMLDict = {}
        self._EpisodeHTMLDict = {}
+        self._YearDict = {}
        self.Providers = Providers()

        # Events: download_progress is triggered with progress dict
@@ -774,55 +775,81 @@ class AniworldLoader(Loader):
            if span_tag:
                title = span_tag.text
                logger.debug("Found title: %s", title)
+
+                # Also try to extract year from sibling p tag "Jahr: {year}"
+                # Year is typically right after title in the HTML structure
+                year = self._extract_year_from_soup(soup)
+                if year is not None:
+                    self._YearDict[key] = year
+                    logger.debug("Cached year %d for key: %s", year, key)
+
                return title

        logger.warning("No title found for key: %s", key)
        return ""

+    def _extract_year_from_soup(self, soup: BeautifulSoup) -> int | None:
+        """Extract year from BeautifulSoup object.
+
+        Looks for 'Jahr: {year}' pattern in p tags adjacent to series-title.
+
+        Args:
+            soup: Parsed BeautifulSoup object
+
+        Returns:
+            Year as int or None if not found
+        """
+        # Try to find year in metadata
+        for p_tag in soup.find_all('p'):
+            text = p_tag.get_text()
+            if 'Jahr:' in text or 'Year:' in text:
+                match = re.search(r'(\d{4})', text)
+                if match:
+                    return int(match.group(1))
+
+        # Fallback: look in series-info div
+        info_div = soup.find('div', class_='series-info')
+        if info_div:
+            text = info_div.get_text()
+            match = re.search(r'\b(19\d{2}|20\d{2})\b', text)
+            if match:
+                return int(match.group(1))
+
+        return None
+
    def get_year(self, key: str) -> int | None:
        """Get anime release year from series key.
-        
-        Attempts to extract the year from the series page metadata.
-        Returns None if year cannot be determined.
-        
+
+        Uses cached year from get_title if available,
+        otherwise extracts and caches it.
+
        Args:
            key: Series identifier
-            
+
        Returns:
-            int or None: Release year if found, None otherwise
+            Release year or None if not found
        """
        logger.debug("Getting year for key: %s", key)
+
+        # Check cache first
+        if key in self._YearDict:
+            logger.debug("Using cached year %d for key: %s", self._YearDict[key], key)
+            return self._YearDict[key]
+
+        # Not cached - extract from HTML
        try:
            soup = BeautifulSoup(
                _decode_html_content(self._get_key_html(key).content),
                'html.parser'
            )
-            
-            # Try to find year in metadata
-            # Check for "Jahr:" or similar metadata fields
-            for p_tag in soup.find_all('p'):
-                text = p_tag.get_text()
-                if 'Jahr:' in text or 'Year:' in text:
-                    # Extract year from text like "Jahr: 2025"
-                    match = re.search(r'(\d{4})', text)
-                    if match:
-                        year = int(match.group(1))
-                        logger.debug("Found year in metadata: %s", year)
-                        return year
-            
-            # Try alternative: look for year in genre/info section
-            info_div = soup.find('div', class_='series-info')
-            if info_div:
-                text = info_div.get_text()
-                match = re.search(r'\b(19\d{2}|20\d{2})\b', text)
-                if match:
-                    year = int(match.group(1))
-                    logger.debug("Found year in info section: %s", year)
-                    return year
-            
-            logger.debug("No year found for key: %s", key)
-            return None
-            
+
+            year = self._extract_year_from_soup(soup)
+            if year is not None:
+                self._YearDict[key] = year
+                logger.debug("Found and cached year %d for key: %s", year, key)
+
+            return year
+
        except Exception as e:
            logger.warning("Error extracting year for key %s: %s", key, e)
            return None