diff --git a/scripts/migrate_populate_year_from_folder.py b/scripts/migrate_populate_year_from_folder.py
new file mode 100644
index 0000000..7c59b8a
--- /dev/null
+++ b/scripts/migrate_populate_year_from_folder.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""Migration script to populate year for existing series from folder names.
+
+This script:
+1. Finds all series in the database with year=NULL
+2. Extracts year from their folder names using the same pattern as SerieScanner
+3. Updates the database records
+
+The script runs in dry-run mode unless ``--execute`` is passed.
+
+Usage:
+    python scripts/migrate_populate_year_from_folder.py [--dry-run | --execute]
+"""
+
+import argparse
+import asyncio
+import re
+import sys
+from pathlib import Path
+
+# Add project root to path so the ``src`` package resolves when the script
+# is run directly from a checkout.
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from sqlalchemy import select, update
+from src.server.database.models import AnimeSeries
+from src.server.database.service import DatabaseSession
+
+# Matches a parenthesised four-digit year such as "(2025)" anywhere in a name.
+_YEAR_PATTERN = re.compile(r'\((\d{4})\)')
+
+# Inclusive bounds for a year to be considered plausible.
+_MIN_YEAR = 1900
+_MAX_YEAR = 2100
+
+
+def extract_year_from_folder_name(folder_name: str) -> int | None:
+    """Extract year from folder name if present.
+
+    Same logic as SerieScanner._extract_year_from_folder_name: only the first
+    "(YYYY)" group in the name is considered.
+
+    Args:
+        folder_name: The folder name to check
+
+    Returns:
+        int or None: Year if found and between 1900 and 2100, None otherwise
+    """
+    if not folder_name:
+        return None
+
+    match = _YEAR_PATTERN.search(folder_name)
+    if match is None:
+        return None
+
+    # \d{4} only matches decimal digits, which int() always accepts, so no
+    # ValueError handling is needed here.
+    year = int(match.group(1))
+    return year if _MIN_YEAR <= year <= _MAX_YEAR else None
+
+
+async def migrate_year_from_folder(dry_run: bool = True) -> tuple[int, int]:
+    """Migrate year field for existing series.
+
+    Args:
+        dry_run: If True, only report what would be changed
+
+    Returns:
+        Tuple of (updated_count, skipped_count). In dry-run mode
+        updated_count is the number of rows that would be updated.
+    """
+    updated_count = 0
+    skipped_count = 0
+
+    # NOTE(review): no explicit commit is issued below; this assumes
+    # DatabaseSession commits on a clean exit - confirm before --execute.
+    async with DatabaseSession() as db:
+        # Find all series with NULL year
+        result = await db.execute(
+            select(AnimeSeries).where(AnimeSeries.year.is_(None))
+        )
+        series_list = result.scalars().all()
+
+        print(f"Found {len(series_list)} series with year=NULL")
+
+        for series in series_list:
+            year_from_folder = extract_year_from_folder_name(series.folder)
+
+            if year_from_folder is None:
+                print(f" {series.folder} -> (no year found)")
+                skipped_count += 1
+                continue
+
+            print(f" {series.folder} -> {year_from_folder}")
+
+            if not dry_run:
+                await db.execute(
+                    update(AnimeSeries)
+                    .where(AnimeSeries.id == series.id)
+                    .values(year=year_from_folder)
+                )
+
+            updated_count += 1
+
+    return updated_count, skipped_count
+
+
+def main() -> int:
+    """Parse command-line flags and run the migration.
+
+    Returns:
+        Process exit code: 0 on success, 1 if the migration raised.
+    """
+    parser = argparse.ArgumentParser(description="Migrate year from folder name")
+    # The two mode flags contradict each other, so let argparse reject the
+    # combination instead of silently letting --execute win.
+    mode = parser.add_mutually_exclusive_group()
+    mode.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be changed without making changes"
+    )
+    mode.add_argument(
+        "--execute",
+        action="store_true",
+        help="Actually execute the migration (disabled by default)"
+    )
+    args = parser.parse_args()
+
+    # Dry run is the default; only an explicit --execute writes changes.
+    dry_run = not args.execute
+
+    if dry_run:
+        print("=== DRY RUN MODE ===")
+        print("No changes will be made. Use --execute to apply changes.\n")
+
+    try:
+        updated, skipped = asyncio.run(migrate_year_from_folder(dry_run=dry_run))
+    except Exception as e:
+        # Top-level boundary: report the failure through the exit code.
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+
+    print(f"\n{'Would update' if dry_run else 'Updated'}: {updated} series")
+    print(f"Skipped (no year in folder): {skipped} series")
+
+    if dry_run:
+        print("\nRun with --execute to apply these changes.")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/core/SerieScanner.py b/src/core/SerieScanner.py
index 60b52a2..a159580 100644
--- a/src/core/SerieScanner.py
+++ b/src/core/SerieScanner.py
@@ -1,806 +1,807 @@
-"""
-SerieScanner - Scans directories for anime series and missing episodes.
-
-This module provides functionality to scan anime directories, identify
-missing episodes, and report progress through callback interfaces.
-
-Note:
-    This module is pure domain logic. Database operations are handled
-    by the service layer (AnimeService).
-"""
-from __future__ import annotations
-
-import logging
-import os
-import re
-import traceback
-import uuid
-from typing import Callable, Iterable, Iterator, Optional
-
-from events import Events
-
-from src.core.entities.series import Serie
-from src.core.exceptions.Exceptions import MatchNotFoundError, NoKeyFoundException
-from src.core.providers.base_provider import Loader
-
-logger = logging.getLogger(__name__)
-error_logger = logging.getLogger("error")
-no_key_found_logger = logging.getLogger("series.nokey")
-
-
-class SerieScanner:
-    """
-    Scans directories for anime series and identifies missing episodes.
-
-    Supports progress callbacks for real-time scanning updates.
-
-    Note:
-        This class is pure domain logic. Database operations are handled
-        by the service layer (AnimeService). Scan results are stored
-        in keyDict and can be retrieved after scanning.
- - Example: - scanner = SerieScanner("/path/to/anime", loader) - scanner.scan() - # Results are in scanner.keyDict - - # With DB lookup fallback: - scanner = SerieScanner("/path/to/anime", loader, - db_lookup=lambda folder: my_db.get_by_folder(folder)) - """ - - def __init__( - self, - basePath: str, - loader: Loader, - db_lookup: Optional[Callable[[str], Optional["Serie"]]] = None, - ) -> None: - """ - Initialize the SerieScanner. - - Args: - basePath: Base directory containing anime series - loader: Loader instance for fetching series information - db_lookup: Optional callable ``(folder_name) -> Serie | None``. - When provided, it is called as a fallback when neither a - ``key`` file nor a ``data`` file is found in the folder. - This allows the database to supply the series key for - folders that have never had a local key file. - - Raises: - ValueError: If basePath is invalid or doesn't exist - """ - # Validate basePath to prevent directory traversal attacks - if not basePath or not basePath.strip(): - raise ValueError("Base path cannot be empty") - - # Resolve to absolute path and validate it exists - abs_path = os.path.abspath(basePath) - if not os.path.exists(abs_path): - raise ValueError(f"Base path does not exist: {abs_path}") - if not os.path.isdir(abs_path): - raise ValueError(f"Base path is not a directory: {abs_path}") - - self.directory: str = abs_path - self.keyDict: dict[str, Serie] = {} - self.loader: Loader = loader - self._db_lookup: Optional[Callable[[str], Optional[Serie]]] = db_lookup - self._current_operation_id: Optional[str] = None - self.events = Events() - - self.events.on_progress = [] - self.events.on_error = [] - self.events.on_completion = [] - - logger.info("Initialized SerieScanner with base path: %s", abs_path) - - def _safe_call_event(self, event_handler, data: dict) -> None: - """Safely call an event handler if it exists. 
- - Args: - event_handler: Event handler attribute (e.g., self.events.on_progress) - data: Data dictionary to pass to the event handler - """ - if event_handler: - try: - # Event handlers are stored as lists, iterate over them - for handler in event_handler: - handler(data) - except Exception as e: - logger.error("Error calling event handler: %s", e, exc_info=True) - - def subscribe_on_progress(self, handler): - """ - Subscribe a handler to an event. - Args: - handler: Callable to handle the event - """ - if handler not in self.events.on_progress: - self.events.on_progress.append(handler) - - def unsubscribe_on_progress(self, handler): - """ - Unsubscribe a handler from an event. - Args: - handler: Callable to remove - """ - if handler in self.events.on_progress: - self.events.on_progress.remove(handler) - - def _extract_year_from_folder_name(self, folder_name: str) -> int | None: - """Extract year from folder name if present. - - Looks for year in format "(YYYY)" at the end of folder name. - - Args: - folder_name: The folder name to check - - Returns: - int or None: Year if found, None otherwise - - Example: - >>> _extract_year_from_folder_name("Dororo (2025)") - 2025 - >>> _extract_year_from_folder_name("Dororo") - None - """ - if not folder_name: - return None - - # Look for year in format (YYYY) - typically at end of name - match = re.search(r'\((\d{4})\)', folder_name) - if match: - try: - year = int(match.group(1)) - # Validate year is reasonable (between 1900 and 2100) - if 1900 <= year <= 2100: - logger.debug( - "Extracted year from folder name: %s -> %d", - folder_name, - year - ) - return year - except ValueError: - pass - - return None - - def subscribe_on_error(self, handler): - """ - Subscribe a handler to an event. - Args: - handler: Callable to handle the event - """ - if handler not in self.events.on_error: - self.events.on_error.append(handler) - - def unsubscribe_on_error(self, handler): - """ - Unsubscribe a handler from an event. 
- Args: - handler: Callable to remove - """ - if handler in self.events.on_error: - self.events.on_error.remove(handler) - - def subscribe_on_completion(self, handler): - """ - Subscribe a handler to an event. - Args: - handler: Callable to handle the event - """ - if handler not in self.events.on_completion: - self.events.on_completion.append(handler) - - def unsubscribe_on_completion(self, handler): - """ - Unsubscribe a handler from an event. - Args: - handler: Callable to remove - """ - if handler in self.events.on_completion: - self.events.on_completion.remove(handler) - - def reinit(self) -> None: - """Reinitialize the series dictionary (keyed by serie.key).""" - self.keyDict: dict[str, Serie] = {} - - def get_total_to_scan(self) -> int: - """Get the total number of folders to scan. - - Returns: - Total count of folders with MP4 files - """ - result = self.__find_mp4_files() - return sum(1 for _ in result) - - def scan(self) -> None: - """ - Scan directories for anime series and missing episodes. - - Results are stored in self.keyDict and can be retrieved after - scanning. Data files are also saved to disk for persistence. - - Raises: - Exception: If scan fails critically - """ - # Generate unique operation ID - self._current_operation_id = str(uuid.uuid4()) - - logger.info("Starting scan for missing episodes") - - # Notify scan starting - self._safe_call_event( - self.events.on_progress, - { - "operation_id": self._current_operation_id, - "phase": "STARTING", - "current": 0, - "total": 0, - "percentage": 0.0, - "message": "Initializing scan" - } - ) - - try: - # Get total items to process - total_to_scan = self.get_total_to_scan() - logger.info("Total folders to scan: %d", total_to_scan) - - # The scanner enumerates folders with mp4 files, loads existing - # metadata, calculates the missing episodes via the provider, and - # persists the refreshed metadata while emitting progress events. 
- result = self.__find_mp4_files() - counter = 0 - - for folder, mp4_files in result: - try: - counter += 1 - - # Calculate progress - if total_to_scan > 0: - percentage = (counter / total_to_scan) * 100 - else: - percentage = 0.0 - - # Notify progress - self._safe_call_event( - self.events.on_progress, - { - "operation_id": self._current_operation_id, - "phase": "IN_PROGRESS", - "current": counter, - "total": total_to_scan, - "percentage": percentage, - "message": f"Scanning: {folder}", - "details": f"Found {len(mp4_files)} episodes" - } - ) - - serie = self.__read_data_from_file(folder) - if serie is None or not serie.key or not serie.key.strip(): - # Fallback: ask the database for a matching series - if self._db_lookup is not None: - try: - serie = self._db_lookup(folder) - if serie: - logger.info( - "DB lookup resolved folder '%s' -> key='%s'", - folder, - serie.key, - ) - except Exception as exc: - logger.warning( - "DB lookup failed for folder '%s': %s", - folder, - exc, - ) - serie = None - - if serie is None or not serie.key or not serie.key.strip(): - logger.warning( - "No key or data file found for folder '%s', skipping", - folder, - ) - if ( - serie is not None - and serie.key - and serie.key.strip() - ): - # Try to extract year from folder name first - if not hasattr(serie, 'year') or not serie.year: - year_from_folder = self._extract_year_from_folder_name(folder) - if year_from_folder: - serie.year = year_from_folder - logger.info( - "Using year from folder name: %s (year=%d)", - folder, - year_from_folder - ) - else: - # If not in folder name, fetch from provider - try: - serie.year = self.loader.get_year(serie.key) - if serie.year: - logger.info( - "Fetched year from provider: %s (year=%d)", - serie.key, - serie.year - ) - except Exception as e: - logger.warning( - "Could not fetch year for %s: %s", - serie.key, - str(e) - ) - - # Delegate the provider to compare local files with - # remote metadata, yielding missing episodes per - # season. 
Results are saved back to disk so that both - # CLI and API consumers see consistent state. - missing_episodes, _site = ( - self.__get_missing_episodes_and_season( - serie.key, mp4_files - ) - ) - serie.episodeDict = missing_episodes - serie.folder = folder - data_path = os.path.join( - self.directory, folder, 'data' - ) - serie.save_to_file(data_path) - - # Store by key (primary identifier), not folder - if serie.key in self.keyDict: - logger.error( - "Duplicate series found with key '%s' " - "(folder: '%s')", - serie.key, - folder - ) - else: - self.keyDict[serie.key] = serie - logger.debug( - "Stored series with key '%s' (folder: '%s')", - serie.key, - folder - ) - no_key_found_logger.info( - "Saved Serie: '%s'", str(serie) - ) - - except NoKeyFoundException as nkfe: - # Log error and notify via callback - error_msg = f"Error processing folder '{folder}': {nkfe}" - logger.error(error_msg) - - self._safe_call_event( - self.events.on_error, - { - "operation_id": self._current_operation_id, - "error": nkfe, - "message": error_msg, - "recoverable": True, - "metadata": {"folder": folder, "key": None} - } - ) - except Exception as e: - # Log error and notify via callback - error_msg = ( - f"Folder: '{folder}' - " - f"Unexpected error: {e}" - ) - error_logger.error( - "%s\n%s", - error_msg, - traceback.format_exc() - ) - - self._safe_call_event( - self.events.on_error, - { - "operation_id": self._current_operation_id, - "error": e, - "message": error_msg, - "recoverable": True, - "metadata": {"folder": folder, "key": None} - } - ) - continue - - # Notify scan completion - self._safe_call_event( - self.events.on_completion, - { - "operation_id": self._current_operation_id, - "success": True, - "message": f"Scan completed. Processed {counter} folders.", - "statistics": { - "total_folders": counter, - "series_found": len(self.keyDict) - } - } - ) - - logger.info( - "Scan completed. 
Processed %d folders, found %d series", - counter, - len(self.keyDict) - ) - - except Exception as e: - # Critical error - notify and re-raise - error_msg = f"Critical scan error: {e}" - logger.error("%s\n%s", error_msg, traceback.format_exc()) - - self._safe_call_event( - self.events.on_error, - { - "operation_id": self._current_operation_id, - "error": e, - "message": error_msg, - "recoverable": False - } - ) - - self._safe_call_event( - self.events.on_completion, - { - "operation_id": self._current_operation_id, - "success": False, - "message": error_msg - } - ) - - raise - - def __find_mp4_files(self) -> Iterator[tuple[str, list[str]]]: - """Find all .mp4 files in the directory structure.""" - logger.info("Scanning for .mp4 files") - for anime_name in os.listdir(self.directory): - anime_path = os.path.join(self.directory, anime_name) - if os.path.isdir(anime_path): - mp4_files: list[str] = [] - has_files = False - for root, _, files in os.walk(anime_path): - for file in files: - if file.endswith(".mp4"): - mp4_files.append(os.path.join(root, file)) - has_files = True - yield anime_name, mp4_files if has_files else [] - - def __read_data_from_file(self, folder_name: str) -> Optional[Serie]: - """Read serie data from file or key file. - - Args: - folder_name: Filesystem folder name - (used only to locate data files) - - Returns: - Serie object with valid key if found, None otherwise - - Note: - The returned Serie will have its 'key' as the primary identifier. - The 'folder' field is metadata only. 
- """ - folder_path = os.path.join(self.directory, folder_name) - key = None - key_file = os.path.join(folder_path, 'key') - serie_file = os.path.join(folder_path, 'data') - - if os.path.exists(key_file): - with open(key_file, 'r', encoding='utf-8') as file: - key = file.read().strip() - logger.info( - "Key found for folder '%s': %s", - folder_name, - key - ) - return Serie(key, "", "aniworld.to", folder_name, dict()) - - if os.path.exists(serie_file): - with open(serie_file, "rb") as file: - logger.info( - "load serie_file from '%s': %s", - folder_name, - serie_file - ) - return Serie.load_from_file(serie_file) - - return None - - def __get_episode_and_season(self, filename: str) -> tuple[int, int]: - """Extract season and episode numbers from filename. - - Args: - filename: Filename to parse - - Returns: - Tuple of (season, episode) as integers - - Raises: - MatchNotFoundError: If pattern not found - """ - pattern = r'S(\d+)E(\d+)' - match = re.search(pattern, filename) - if match: - season = match.group(1) - episode = match.group(2) - logger.debug( - "Extracted season %s, episode %s from '%s'", - season, - episode, - filename - ) - return int(season), int(episode) - else: - logger.error( - "Failed to find season/episode pattern in '%s'", - filename - ) - raise MatchNotFoundError( - "Season and episode pattern not found in the filename." - ) - - def __get_episodes_and_seasons( - self, - mp4_files: Iterable[str] - ) -> dict[int, list[int]]: - """Get episodes grouped by season from mp4 files. 
- - Args: - mp4_files: List of MP4 filenames - - Returns: - Dictionary mapping season to list of episode numbers - """ - episodes_dict: dict[int, list[int]] = {} - - for file in mp4_files: - season, episode = self.__get_episode_and_season(file) - - if season in episodes_dict: - episodes_dict[season].append(episode) - else: - episodes_dict[season] = [episode] - return episodes_dict - - def __get_missing_episodes_and_season( - self, - key: str, - mp4_files: Iterable[str] - ) -> tuple[dict[int, list[int]], str]: - """Get missing episodes for a serie. - - Args: - key: Series key - mp4_files: List of MP4 filenames - - Returns: - Tuple of (episodes_dict, site_name) - """ - # key season , value count of episodes - expected_dict = self.loader.get_season_episode_count(key) - filedict = self.__get_episodes_and_seasons(mp4_files) - episodes_dict: dict[int, list[int]] = {} - for season, expected_count in expected_dict.items(): - existing_episodes = filedict.get(season, []) - missing_episodes = [ - ep for ep in range(1, expected_count + 1) - if ep not in existing_episodes - and self.loader.is_language(season, ep, key) - ] - - if missing_episodes: - episodes_dict[season] = missing_episodes - - return episodes_dict, "aniworld.to" - - def scan_single_series( - self, - key: str, - folder: str, - ) -> dict[int, list[int]]: - """ - Scan a single series for missing episodes. - - This method performs a targeted scan for only the specified series, - without triggering a full library rescan. It fetches available - episodes from the provider and compares with local files. - - Args: - key: The unique provider key for the series - folder: The filesystem folder name where the series is stored - - Returns: - dict[int, list[int]]: Dictionary mapping season numbers to lists - of missing episode numbers. Empty dict if no missing episodes. 
- - Raises: - ValueError: If key or folder is empty - - Example: - >>> scanner = SerieScanner("/path/to/anime", loader) - >>> missing = scanner.scan_single_series( - ... "attack-on-titan", - ... "Attack on Titan" - ... ) - >>> print(missing) - {1: [5, 6, 7], 2: [1, 2]} - """ - if not key or not key.strip(): - raise ValueError("Series key cannot be empty") - if not folder or not folder.strip(): - raise ValueError("Series folder cannot be empty") - - logger.info( - "Starting targeted scan for series: %s (folder: %s)", - key, - folder - ) - - # Generate unique operation ID for this targeted scan - operation_id = str(uuid.uuid4()) - # Notify scan starting - self._safe_call_event( - self.events.on_progress, - { - "operation_id": operation_id, - "phase": "STARTING", - "current": 0, - "total": 1, - "percentage": 0.0, - "message": f"Scanning series: {folder}", - "details": f"Key: {key}" - } - ) - - try: - # Get the folder path - folder_path = os.path.join(self.directory, folder) - - # Check if folder exists - if not os.path.isdir(folder_path): - logger.info( - "Series folder does not exist yet: %s - " - "will scan for available episodes from provider", - folder_path - ) - mp4_files: list[str] = [] - else: - # Find existing MP4 files in the folder - mp4_files = [] - for root, _, files in os.walk(folder_path): - for file in files: - if file.endswith(".mp4"): - mp4_files.append(os.path.join(root, file)) - - logger.debug( - "Found %d existing MP4 files in folder %s", - len(mp4_files), - folder - ) - - # Get missing episodes from provider - missing_episodes, site = self.__get_missing_episodes_and_season( - key, mp4_files - ) - - # Update progress - self._safe_call_event( - self.events.on_progress, - { - "operation_id": operation_id, - "phase": "IN_PROGRESS", - "current": 1, - "total": 1, - "percentage": 100.0, - "message": f"Scanned: {folder}", - "details": f"Found {sum(len(eps) for eps in missing_episodes.values())} missing episodes" - } - ) - - # Create or update Serie in 
keyDict - if key in self.keyDict: - # Update existing serie - self.keyDict[key].episodeDict = missing_episodes - logger.debug( - "Updated existing series %s with %d missing episodes", - key, - sum(len(eps) for eps in missing_episodes.values()) - ) - else: - # Try to extract year from folder name first - year = self._extract_year_from_folder_name(folder) - if year: - logger.info( - "Using year from folder name: %s (year=%d)", - folder, - year - ) - else: - # If not in folder name, fetch from provider - try: - year = self.loader.get_year(key) - if year: - logger.info( - "Fetched year from provider: %s (year=%d)", - key, - year - ) - except Exception as e: - logger.warning( - "Could not fetch year for %s: %s", - key, - str(e) - ) - - # Create new serie entry - serie = Serie( - key=key, - name="", # Will be populated by caller if needed - site=site, - folder=folder, - episodeDict=missing_episodes, - year=year - ) - self.keyDict[key] = serie - logger.debug( - "Created new series entry for %s with %d missing episodes (year=%s)", - key, - sum(len(eps) for eps in missing_episodes.values()), - year - ) - - # Notify completion - self._safe_call_event( - self.events.on_completion, - { - "operation_id": operation_id, - "success": True, - "message": f"Scan completed for {folder}", - "statistics": { - "missing_episodes": sum( - len(eps) for eps in missing_episodes.values() - ), - "seasons_with_missing": len(missing_episodes) - } - } - ) - - logger.info( - "Targeted scan completed for %s: %d missing episodes across %d seasons", - key, - sum(len(eps) for eps in missing_episodes.values()), - len(missing_episodes) - ) - - return missing_episodes - - except Exception as e: - error_msg = f"Failed to scan series {key}: {e}" - logger.error(error_msg, exc_info=True) - - # Notify error - self._safe_call_event( - self.events.on_error, - { - "operation_id": operation_id, - "error": e, - "message": error_msg, - "recoverable": True, - "metadata": {"key": key, "folder": folder} - } - ) - # 
Notify completion with failure - self._safe_call_event( - self.events.on_completion, - { - "operation_id": operation_id, - "success": False, - "message": error_msg - } - ) - # Return empty dict on error (scan failed but not critical) - return {} - +""" +SerieScanner - Scans directories for anime series and missing episodes. + +This module provides functionality to scan anime directories, identify +missing episodes, and report progress through callback interfaces. + +Note: + This module is pure domain logic. Database operations are handled + by the service layer (AnimeService). +""" +from __future__ import annotations + +import logging +import os +import re +import traceback +import uuid +from typing import Callable, Iterable, Iterator, Optional + +from events import Events + +from src.core.entities.series import Serie +from src.core.exceptions.Exceptions import MatchNotFoundError, NoKeyFoundException +from src.core.providers.base_provider import Loader + +logger = logging.getLogger(__name__) +error_logger = logging.getLogger("error") +no_key_found_logger = logging.getLogger("series.nokey") + + +class SerieScanner: + """ + Scans directories for anime series and identifies missing episodes. + + Supports progress callbacks for real-time scanning updates. + + Note: + This class is pure domain logic. Database operations are handled + by the service layer (AnimeService). Scan results are stored + in keyDict and can be retrieved after scanning. + + Example: + scanner = SerieScanner("/path/to/anime", loader) + scanner.scan() + # Results are in scanner.keyDict + + # With DB lookup fallback: + scanner = SerieScanner("/path/to/anime", loader, + db_lookup=lambda folder: my_db.get_by_folder(folder)) + """ + + def __init__( + self, + basePath: str, + loader: Loader, + db_lookup: Optional[Callable[[str], Optional["Serie"]]] = None, + ) -> None: + """ + Initialize the SerieScanner. 
+ + Args: + basePath: Base directory containing anime series + loader: Loader instance for fetching series information + db_lookup: Optional callable ``(folder_name) -> Serie | None``. + When provided, it is called as a fallback when neither a + ``key`` file nor a ``data`` file is found in the folder. + This allows the database to supply the series key for + folders that have never had a local key file. + + Raises: + ValueError: If basePath is invalid or doesn't exist + """ + # Validate basePath to prevent directory traversal attacks + if not basePath or not basePath.strip(): + raise ValueError("Base path cannot be empty") + + # Resolve to absolute path and validate it exists + abs_path = os.path.abspath(basePath) + if not os.path.exists(abs_path): + raise ValueError(f"Base path does not exist: {abs_path}") + if not os.path.isdir(abs_path): + raise ValueError(f"Base path is not a directory: {abs_path}") + + self.directory: str = abs_path + self.keyDict: dict[str, Serie] = {} + self.loader: Loader = loader + self._db_lookup: Optional[Callable[[str], Optional[Serie]]] = db_lookup + self._current_operation_id: Optional[str] = None + self.events = Events() + + self.events.on_progress = [] + self.events.on_error = [] + self.events.on_completion = [] + + logger.info("Initialized SerieScanner with base path: %s", abs_path) + + def _safe_call_event(self, event_handler, data: dict) -> None: + """Safely call an event handler if it exists. + + Args: + event_handler: Event handler attribute (e.g., self.events.on_progress) + data: Data dictionary to pass to the event handler + """ + if event_handler: + try: + # Event handlers are stored as lists, iterate over them + for handler in event_handler: + handler(data) + except Exception as e: + logger.error("Error calling event handler: %s", e, exc_info=True) + + def subscribe_on_progress(self, handler): + """ + Subscribe a handler to an event. 
+ Args: + handler: Callable to handle the event + """ + if handler not in self.events.on_progress: + self.events.on_progress.append(handler) + + def unsubscribe_on_progress(self, handler): + """ + Unsubscribe a handler from an event. + Args: + handler: Callable to remove + """ + if handler in self.events.on_progress: + self.events.on_progress.remove(handler) + + def _extract_year_from_folder_name(self, folder_name: str) -> int | None: + """Extract year from folder name if present. + + Looks for year in format "(YYYY)" at the end of folder name. + + Args: + folder_name: The folder name to check + + Returns: + int or None: Year if found, None otherwise + + Example: + >>> _extract_year_from_folder_name("Dororo (2025)") + 2025 + >>> _extract_year_from_folder_name("Dororo") + None + """ + if not folder_name: + return None + + # Look for year in format (YYYY) - typically at end of name + match = re.search(r'\((\d{4})\)', folder_name) + if match: + try: + year = int(match.group(1)) + # Validate year is reasonable (between 1900 and 2100) + if 1900 <= year <= 2100: + logger.debug( + "Extracted year from folder name: %s -> %d", + folder_name, + year + ) + return year + except ValueError: + pass + + return None + + def subscribe_on_error(self, handler): + """ + Subscribe a handler to an event. + Args: + handler: Callable to handle the event + """ + if handler not in self.events.on_error: + self.events.on_error.append(handler) + + def unsubscribe_on_error(self, handler): + """ + Unsubscribe a handler from an event. + Args: + handler: Callable to remove + """ + if handler in self.events.on_error: + self.events.on_error.remove(handler) + + def subscribe_on_completion(self, handler): + """ + Subscribe a handler to an event. + Args: + handler: Callable to handle the event + """ + if handler not in self.events.on_completion: + self.events.on_completion.append(handler) + + def unsubscribe_on_completion(self, handler): + """ + Unsubscribe a handler from an event. 
def reinit(self) -> None:
    """Reset the in-memory series registry.

    Replaces ``self.keyDict`` with a fresh, empty dictionary whose
    entries are keyed by ``serie.key``.
    """
    self.keyDict: dict[str, Serie] = {}


def get_total_to_scan(self) -> int:
    """Get the total number of folders to scan.

    Returns:
        Number of entries produced by the folder enumeration, i.e. one
        per immediate sub-directory of ``self.directory``. Folders that
        contain no MP4 file are counted as well.
    """
    return sum(1 for _ in self.__find_mp4_files())


def _list_mp4_files(self, root_path: str) -> list[str]:
    """Collect the paths of all ``.mp4`` files below ``root_path``.

    Args:
        root_path: Directory that is walked recursively

    Returns:
        List of file paths (joined with their containing directory);
        empty when nothing matches
    """
    mp4_files: list[str] = []
    for root, _, files in os.walk(root_path):
        mp4_files.extend(
            os.path.join(root, name)
            for name in files
            if name.endswith(".mp4")
        )
    return mp4_files


def _resolve_series_year(self, folder: str, key: str) -> Optional[int]:
    """Determine the release year for a series.

    The folder name is consulted first; only when it yields nothing is
    the provider asked. Provider failures are logged and never raised.

    Args:
        folder: Filesystem folder name of the series
        key: Provider key of the series

    Returns:
        The year if one could be determined, otherwise the falsy value
        returned by the folder-name extraction or by the provider
        (presumably ``None`` -- verify against the loader).
    """
    year = self._extract_year_from_folder_name(folder)
    if year:
        logger.info(
            "Using year from folder name: %s (year=%d)",
            folder,
            year
        )
        return year

    # If not in folder name, fetch from provider
    try:
        year = self.loader.get_year(key)
    except Exception as e:
        # Best effort only: a missing year must not abort the scan.
        logger.warning(
            "Could not fetch year for %s: %s",
            key,
            str(e)
        )
        return year

    if year:
        logger.info(
            "Fetched year from provider: %s (year=%d)",
            key,
            year
        )
    return year


def _load_serie_for_folder(self, folder: str) -> Optional[Serie]:
    """Load the Serie for a folder, falling back to the DB lookup.

    Args:
        folder: Filesystem folder name

    Returns:
        The Serie read from the folder's key/data file when it carries a
        non-blank key. Otherwise the result of ``self._db_lookup`` when
        such a callback is configured (``None`` if it raised), or the
        unusable file result when no callback is configured. Callers
        must still validate the key.
    """
    serie = self.__read_data_from_file(folder)
    if serie is not None and serie.key and serie.key.strip():
        return serie

    # Fallback: ask the database for a matching series
    if self._db_lookup is not None:
        try:
            serie = self._db_lookup(folder)
            if serie:
                logger.info(
                    "DB lookup resolved folder '%s' -> key='%s'",
                    folder,
                    serie.key,
                )
        except Exception as exc:
            logger.warning(
                "DB lookup failed for folder '%s': %s",
                folder,
                exc,
            )
            serie = None
    return serie


def scan(self) -> None:
    """
    Scan directories for anime series and missing episodes.

    Results are stored in self.keyDict and can be retrieved after
    scanning. Data files are also saved to disk for persistence.
    Progress, error and completion events are emitted through
    ``self._safe_call_event``. Errors for a single folder are reported
    as recoverable and do not stop the scan.

    Raises:
        Exception: If scan fails critically
    """
    # Generate unique operation ID
    self._current_operation_id = str(uuid.uuid4())

    logger.info("Starting scan for missing episodes")

    # Notify scan starting
    self._safe_call_event(
        self.events.on_progress,
        {
            "operation_id": self._current_operation_id,
            "phase": "STARTING",
            "current": 0,
            "total": 0,
            "percentage": 0.0,
            "message": "Initializing scan"
        }
    )

    try:
        # Get total items to process
        total_to_scan = self.get_total_to_scan()
        logger.info("Total folders to scan: %d", total_to_scan)

        # The scanner enumerates folders with mp4 files, loads existing
        # metadata, calculates the missing episodes via the provider, and
        # persists the refreshed metadata while emitting progress events.
        counter = 0

        for folder, mp4_files in self.__find_mp4_files():
            try:
                counter += 1

                # Calculate progress
                if total_to_scan > 0:
                    percentage = (counter / total_to_scan) * 100
                else:
                    percentage = 0.0

                # Notify progress
                self._safe_call_event(
                    self.events.on_progress,
                    {
                        "operation_id": self._current_operation_id,
                        "phase": "IN_PROGRESS",
                        "current": counter,
                        "total": total_to_scan,
                        "percentage": percentage,
                        "message": f"Scanning: {folder}",
                        "details": f"Found {len(mp4_files)} episodes"
                    }
                )

                serie = self._load_serie_for_folder(folder)
                if serie is None or not serie.key or not serie.key.strip():
                    logger.warning(
                        "No key or data file found for folder '%s', skipping",
                        folder,
                    )
                    continue

                # Only resolve the year when the serie does not already
                # carry one (folder name first, provider second).
                if not hasattr(serie, 'year') or not serie.year:
                    serie.year = self._resolve_series_year(
                        folder, serie.key
                    )

                # Delegate the provider to compare local files with
                # remote metadata, yielding missing episodes per
                # season. Results are saved back to disk so that both
                # CLI and API consumers see consistent state.
                missing_episodes, _site = (
                    self.__get_missing_episodes_and_season(
                        serie.key, mp4_files
                    )
                )
                serie.episodeDict = missing_episodes
                serie.folder = folder
                data_path = os.path.join(
                    self.directory, folder, 'data'
                )
                serie.save_to_file(data_path)

                # Store by key (primary identifier), not folder
                if serie.key in self.keyDict:
                    logger.error(
                        "Duplicate series found with key '%s' "
                        "(folder: '%s')",
                        serie.key,
                        folder
                    )
                else:
                    self.keyDict[serie.key] = serie
                    logger.debug(
                        "Stored series with key '%s' (folder: '%s')",
                        serie.key,
                        folder
                    )
                # NOTE(review): logged for every persisted serie, including
                # duplicates -- confirm this placement is intended.
                no_key_found_logger.info(
                    "Saved Serie: '%s'", str(serie)
                )

            except NoKeyFoundException as nkfe:
                # Log error and notify via callback
                error_msg = f"Error processing folder '{folder}': {nkfe}"
                logger.error(error_msg)

                self._safe_call_event(
                    self.events.on_error,
                    {
                        "operation_id": self._current_operation_id,
                        "error": nkfe,
                        "message": error_msg,
                        "recoverable": True,
                        "metadata": {"folder": folder, "key": None}
                    }
                )
            except Exception as e:
                # Log error and notify via callback
                error_msg = (
                    f"Folder: '{folder}' - "
                    f"Unexpected error: {e}"
                )
                error_logger.error(
                    "%s\n%s",
                    error_msg,
                    traceback.format_exc()
                )

                self._safe_call_event(
                    self.events.on_error,
                    {
                        "operation_id": self._current_operation_id,
                        "error": e,
                        "message": error_msg,
                        "recoverable": True,
                        "metadata": {"folder": folder, "key": None}
                    }
                )
                continue

        # Notify scan completion
        self._safe_call_event(
            self.events.on_completion,
            {
                "operation_id": self._current_operation_id,
                "success": True,
                "message": f"Scan completed. Processed {counter} folders.",
                "statistics": {
                    "total_folders": counter,
                    "series_found": len(self.keyDict)
                }
            }
        )

        logger.info(
            "Scan completed. Processed %d folders, found %d series",
            counter,
            len(self.keyDict)
        )

    except Exception as e:
        # Critical error - notify and re-raise
        error_msg = f"Critical scan error: {e}"
        logger.error("%s\n%s", error_msg, traceback.format_exc())

        self._safe_call_event(
            self.events.on_error,
            {
                "operation_id": self._current_operation_id,
                "error": e,
                "message": error_msg,
                "recoverable": False
            }
        )

        self._safe_call_event(
            self.events.on_completion,
            {
                "operation_id": self._current_operation_id,
                "success": False,
                "message": error_msg
            }
        )

        raise


def __find_mp4_files(self) -> Iterator[tuple[str, list[str]]]:
    """Find all .mp4 files in the directory structure.

    Yields:
        One ``(folder_name, mp4_paths)`` tuple per immediate
        sub-directory of ``self.directory``; ``mp4_paths`` is an empty
        list when the folder holds no MP4 file.
    """
    logger.info("Scanning for .mp4 files")
    for anime_name in os.listdir(self.directory):
        anime_path = os.path.join(self.directory, anime_name)
        if os.path.isdir(anime_path):
            yield anime_name, self._list_mp4_files(anime_path)


def __read_data_from_file(self, folder_name: str) -> Optional[Serie]:
    """Read serie data from file or key file.

    A ``key`` file takes precedence over a ``data`` file.

    Args:
        folder_name: Filesystem folder name
            (used only to locate data files)

    Returns:
        A Serie built from the ``key`` file (year taken from the folder
        name), or the Serie loaded from the ``data`` file, or None when
        neither file exists. The key is not validated here.

    Note:
        The returned Serie will have its 'key' as the primary identifier.
        The 'folder' field is metadata only.
    """
    folder_path = os.path.join(self.directory, folder_name)
    key_file = os.path.join(folder_path, 'key')
    serie_file = os.path.join(folder_path, 'data')

    if os.path.exists(key_file):
        with open(key_file, 'r', encoding='utf-8') as file:
            key = file.read().strip()
        logger.info(
            "Key found for folder '%s': %s",
            folder_name,
            key
        )
        year_from_folder = self._extract_year_from_folder_name(folder_name)
        return Serie(
            key, "", "aniworld.to", folder_name, dict(),
            year=year_from_folder
        )

    if os.path.exists(serie_file):
        # Serie.load_from_file takes the path itself, so no handle is
        # opened here.
        logger.info(
            "load serie_file from '%s': %s",
            folder_name,
            serie_file
        )
        return Serie.load_from_file(serie_file)

    return None


def __get_episode_and_season(self, filename: str) -> tuple[int, int]:
    """Extract season and episode numbers from filename.

    Args:
        filename: Filename or full path to parse

    Returns:
        Tuple of (season, episode) as integers

    Raises:
        MatchNotFoundError: If pattern not found
    """
    pattern = r'S(\d+)E(\d+)'
    # Callers pass full paths. Look at the file's own name first so that a
    # directory component resembling "S01E02" cannot shadow the real tag;
    # fall back to the whole path to keep accepting layouts where only a
    # parent directory carries the pattern.
    match = (
        re.search(pattern, os.path.basename(filename))
        or re.search(pattern, filename)
    )
    if not match:
        logger.error(
            "Failed to find season/episode pattern in '%s'",
            filename
        )
        raise MatchNotFoundError(
            "Season and episode pattern not found in the filename."
        )

    season = match.group(1)
    episode = match.group(2)
    logger.debug(
        "Extracted season %s, episode %s from '%s'",
        season,
        episode,
        filename
    )
    return int(season), int(episode)


def __get_episodes_and_seasons(
    self,
    mp4_files: Iterable[str]
) -> dict[int, list[int]]:
    """Get episodes grouped by season from mp4 files.

    Args:
        mp4_files: List of MP4 filenames

    Returns:
        Dictionary mapping season to list of episode numbers

    Raises:
        MatchNotFoundError: If any file lacks the season/episode pattern
    """
    episodes_dict: dict[int, list[int]] = {}

    for file in mp4_files:
        season, episode = self.__get_episode_and_season(file)
        episodes_dict.setdefault(season, []).append(episode)
    return episodes_dict


def __get_missing_episodes_and_season(
    self,
    key: str,
    mp4_files: Iterable[str]
) -> tuple[dict[int, list[int]], str]:
    """Get missing episodes for a serie.

    An episode counts as missing when it is within the provider's
    expected range for the season, is not present locally, and
    ``self.loader.is_language`` accepts it.

    Args:
        key: Series key
        mp4_files: List of MP4 filenames

    Returns:
        Tuple of (episodes_dict, site_name); seasons without missing
        episodes are omitted from the dictionary
    """
    # key season , value count of episodes
    expected_dict = self.loader.get_season_episode_count(key)
    filedict = self.__get_episodes_and_seasons(mp4_files)
    episodes_dict: dict[int, list[int]] = {}
    for season, expected_count in expected_dict.items():
        # Build the lookup once per season instead of scanning a list for
        # every candidate episode.
        existing_episodes = set(filedict.get(season, ()))
        missing_episodes = [
            ep for ep in range(1, expected_count + 1)
            if ep not in existing_episodes
            and self.loader.is_language(season, ep, key)
        ]

        if missing_episodes:
            episodes_dict[season] = missing_episodes

    return episodes_dict, "aniworld.to"


def scan_single_series(
    self,
    key: str,
    folder: str,
) -> dict[int, list[int]]:
    """
    Scan a single series for missing episodes.

    This method performs a targeted scan for only the specified series,
    without triggering a full library rescan. It fetches available
    episodes from the provider and compares with local files. The
    result is stored in ``self.keyDict`` (existing entries only get
    their ``episodeDict`` replaced).

    Args:
        key: The unique provider key for the series
        folder: The filesystem folder name where the series is stored

    Returns:
        dict[int, list[int]]: Dictionary mapping season numbers to lists
            of missing episode numbers. Empty dict if no missing episodes
            or if the scan failed (failures are reported via events).

    Raises:
        ValueError: If key or folder is empty

    Example:
        >>> scanner = SerieScanner("/path/to/anime", loader)
        >>> missing = scanner.scan_single_series(
        ...     "attack-on-titan",
        ...     "Attack on Titan"
        ... )
        >>> print(missing)
        {1: [5, 6, 7], 2: [1, 2]}
    """
    if not key or not key.strip():
        raise ValueError("Series key cannot be empty")
    if not folder or not folder.strip():
        raise ValueError("Series folder cannot be empty")

    logger.info(
        "Starting targeted scan for series: %s (folder: %s)",
        key,
        folder
    )

    # Generate unique operation ID for this targeted scan
    operation_id = str(uuid.uuid4())
    # Notify scan starting
    self._safe_call_event(
        self.events.on_progress,
        {
            "operation_id": operation_id,
            "phase": "STARTING",
            "current": 0,
            "total": 1,
            "percentage": 0.0,
            "message": f"Scanning series: {folder}",
            "details": f"Key: {key}"
        }
    )

    try:
        # Get the folder path
        folder_path = os.path.join(self.directory, folder)

        # Check if folder exists
        if not os.path.isdir(folder_path):
            logger.info(
                "Series folder does not exist yet: %s - "
                "will scan for available episodes from provider",
                folder_path
            )
            mp4_files: list[str] = []
        else:
            # Find existing MP4 files in the folder
            mp4_files = self._list_mp4_files(folder_path)
            logger.debug(
                "Found %d existing MP4 files in folder %s",
                len(mp4_files),
                folder
            )

        # Get missing episodes from provider
        missing_episodes, site = self.__get_missing_episodes_and_season(
            key, mp4_files
        )
        missing_count = sum(len(eps) for eps in missing_episodes.values())

        # Update progress
        self._safe_call_event(
            self.events.on_progress,
            {
                "operation_id": operation_id,
                "phase": "IN_PROGRESS",
                "current": 1,
                "total": 1,
                "percentage": 100.0,
                "message": f"Scanned: {folder}",
                "details": f"Found {missing_count} missing episodes"
            }
        )

        # Create or update Serie in keyDict
        if key in self.keyDict:
            # Update existing serie
            self.keyDict[key].episodeDict = missing_episodes
            logger.debug(
                "Updated existing series %s with %d missing episodes",
                key,
                missing_count
            )
        else:
            # Folder name first, provider second
            year = self._resolve_series_year(folder, key)

            # Create new serie entry
            serie = Serie(
                key=key,
                name="",  # Will be populated by caller if needed
                site=site,
                folder=folder,
                episodeDict=missing_episodes,
                year=year
            )
            self.keyDict[key] = serie
            logger.debug(
                "Created new series entry for %s with %d missing episodes (year=%s)",
                key,
                missing_count,
                year
            )

        # Notify completion
        self._safe_call_event(
            self.events.on_completion,
            {
                "operation_id": operation_id,
                "success": True,
                "message": f"Scan completed for {folder}",
                "statistics": {
                    "missing_episodes": missing_count,
                    "seasons_with_missing": len(missing_episodes)
                }
            }
        )

        logger.info(
            "Targeted scan completed for %s: %d missing episodes across %d seasons",
            key,
            missing_count,
            len(missing_episodes)
        )

        return missing_episodes

    except Exception as e:
        error_msg = f"Failed to scan series {key}: {e}"
        logger.error(error_msg, exc_info=True)

        # Notify error
        self._safe_call_event(
            self.events.on_error,
            {
                "operation_id": operation_id,
                "error": e,
                "message": error_msg,
                "recoverable": True,
                "metadata": {"key": key, "folder": folder}
            }
        )
        # Notify completion with failure
        self._safe_call_event(
            self.events.on_completion,
            {
                "operation_id": operation_id,
                "success": False,
                "message": error_msg
            }
        )
        # Return empty dict on error (scan failed but not critical)
        return {}