""" SerieScanner - Scans directories for anime series and missing episodes. This module provides functionality to scan anime directories, identify missing episodes, and report progress through callback interfaces. Note: This module is pure domain logic. Database operations are handled by the service layer (AnimeService). """ from __future__ import annotations import asyncio import logging import os import re import traceback import uuid from typing import Callable, Iterable, Iterator, Optional from events import Events from src.config.settings import settings from src.core.entities.series import Serie from src.core.exceptions.Exceptions import MatchNotFoundError, NoKeyFoundException from src.core.providers.base_provider import Loader from src.core.utils.key_utils import generate_key_from_folder from src.server.database.connection import get_sync_session from src.server.database.service import AnimeSeriesService, EpisodeService logger = logging.getLogger(__name__) error_logger = logging.getLogger("error") no_key_found_logger = logging.getLogger("series.nokey") class SerieScanner: """ Scans directories for anime series and identifies missing episodes. Supports progress callbacks for real-time scanning updates. Note: This class is pure domain logic. Database operations are handled by the service layer (AnimeService). Scan results are stored in keyDict and can be retrieved after scanning. Example: # Synchronous context (CLI): scanner = SerieScanner("/path/to/anime", loader) scanner.scan() # asyncio.run() used internally when no event loop # Asynchronous context (server/scheduler): # scan() detects running event loop and uses create_task() # internally, so no special handling needed by caller. # Results are in scanner.keyDict # With DB lookup fallback: scanner = SerieScanner("/path/to/anime", loader, db_lookup=lambda folder: my_db.get_by_folder(folder)) # With scan key overrides: overrides = {"Folder Name": "correct-provider-key"} scanner = SerieScanner("/path/to/anime", loader, scan_key_overrides=overrides) """ def __init__( self, basePath: str, loader: Loader, db_lookup: Optional[Callable[[str], Optional["Serie"]]] = None, scan_key_overrides: Optional[dict[str, str]] = None, ) -> None: """ Initialize the SerieScanner. Args: basePath: Base directory containing anime series loader: Loader instance for fetching series information db_lookup: Optional callable ``(folder_name) -> Serie | None``. When provided, it is called as a fallback when neither a ``key`` file nor a ``data`` file is found in the folder. This allows the database to supply the series key for folders that have never had a local key file. scan_key_overrides: Optional dict mapping folder names to provider keys. When a folder name is found in this dict, the override key is used instead of auto-generating from folder name. Format: {"Folder Name": "actual-provider-key"} Raises: ValueError: If basePath is invalid or doesn't exist """ # Validate basePath to prevent directory traversal attacks if not basePath or not basePath.strip(): raise ValueError("Base path cannot be empty") # Resolve to absolute path and validate it exists abs_path = os.path.abspath(basePath) if not os.path.exists(abs_path): raise ValueError(f"Base path does not exist: {abs_path}") if not os.path.isdir(abs_path): raise ValueError(f"Base path is not a directory: {abs_path}") self.directory: str = abs_path self.keyDict: dict[str, Serie] = {} self.loader: Loader = loader self._db_lookup: Optional[Callable[[str], Optional[Serie]]] = db_lookup self._scan_key_overrides: Optional[dict[str, str]] = scan_key_overrides self._current_operation_id: Optional[str] = None self.events = Events() self.events.on_progress = [] self.events.on_error = [] self.events.on_warning = [] self.events.on_completion = [] logger.info("Initialized SerieScanner with base path: %s", abs_path) def _safe_call_event(self, event_handler, data: dict) -> None: """Safely call an event handler if it exists. Args: event_handler: Event handler attribute (e.g., self.events.on_progress) data: Data dictionary to pass to the event handler """ if event_handler: try: # Event handlers are stored as lists, iterate over them for handler in event_handler: handler(data) except Exception as e: logger.error("Error calling event handler: %s", e, exc_info=True) def subscribe_on_progress(self, handler): """ Subscribe a handler to an event. Args: handler: Callable to handle the event """ if handler not in self.events.on_progress: self.events.on_progress.append(handler) def unsubscribe_on_progress(self, handler): """ Unsubscribe a handler from an event. Args: handler: Callable to remove """ if handler in self.events.on_progress: self.events.on_progress.remove(handler) def _extract_year_from_folder_name(self, folder_name: str) -> int | None: """Extract year from folder name if present. Looks for year in format "(YYYY)" at the end of folder name. Args: folder_name: The folder name to check Returns: int or None: Year if found, None otherwise Example: >>> _extract_year_from_folder_name("Dororo (2025)") 2025 >>> _extract_year_from_folder_name("Dororo") None """ if not folder_name: return None # Look for year in format (YYYY) - typically at end of name match = re.search(r'\((\d{4})\)', folder_name) if match: try: year = int(match.group(1)) # Validate year is reasonable (between 1900 and 2100) if 1900 <= year <= 2100: logger.debug( "Extracted year from folder name: %s -> %d", folder_name, year ) return year except ValueError: pass return None def subscribe_on_error(self, handler): """ Subscribe a handler to an event. Args: handler: Callable to handle the event """ if handler not in self.events.on_error: self.events.on_error.append(handler) def unsubscribe_on_error(self, handler): """ Unsubscribe a handler from an event. Args: handler: Callable to remove """ if handler in self.events.on_error: self.events.on_error.remove(handler) def subscribe_on_warning(self, handler): """ Subscribe a handler to an event. Args: handler: Callable to handle the event """ if handler not in self.events.on_warning: self.events.on_warning.append(handler) def unsubscribe_on_warning(self, handler): """ Unsubscribe a handler from an event. Args: handler: Callable to remove """ if handler in self.events.on_warning: self.events.on_warning.remove(handler) def subscribe_on_completion(self, handler): """ Subscribe a handler to an event. Args: handler: Callable to handle the event """ if handler not in self.events.on_completion: self.events.on_completion.append(handler) def unsubscribe_on_completion(self, handler): """ Unsubscribe a handler from an event. Args: handler: Callable to remove """ if handler in self.events.on_completion: self.events.on_completion.remove(handler) def reinit(self) -> None: """Reinitialize the series dictionary (keyed by serie.key).""" self.keyDict: dict[str, Serie] = {} async def _persist_serie_to_db(self, serie: Serie) -> None: """Persist serie to database (create or update). Args: serie: Serie domain object to persist """ try: from src.server.database.connection import get_async_session_factory db = get_async_session_factory() try: existing = await AnimeSeriesService.get_by_key(db, serie.key) if existing: await AnimeSeriesService.update( db, existing.id, name=serie.name, folder=serie.folder, year=serie.year ) await self._sync_episodes_to_db(db, existing.id, serie.episodeDict) else: anime_series = await AnimeSeriesService.create( db=db, key=serie.key, name=serie.name, site=serie.site, folder=serie.folder, year=serie.year ) for season, eps in serie.episodeDict.items(): for ep in eps: await EpisodeService.create( db=db, series_id=anime_series.id, season=season, episode_number=ep ) await db.commit() logger.debug( "Persisted serie '%s' (key=%s) to database", serie.name, serie.key ) except Exception as e: await db.rollback() logger.error( "Failed to persist serie '%s' to DB: %s", serie.key, e, exc_info=True ) raise finally: await db.close() except Exception as e: logger.error( "Could not persist serie '%s' to DB (DB unavailable?): %s", serie.key, e ) async def _sync_episodes_to_db( self, db, series_id: int, episode_dict: dict[int, list[int]] ) -> None: """Sync episodes to database, preserving downloaded flags. Adds missing episodes, removes episodes no longer missing, and preserves is_downloaded=True episodes. Args: db: Async database session series_id: Database ID of the series episode_dict: Dict mapping season -> list of episode numbers """ existing_episodes = await EpisodeService.get_by_series(db, series_id) existing_map = { (ep.season, ep.episode_number): ep for ep in existing_episodes } new_keys = set() for season, eps in episode_dict.items(): for ep_num in eps: new_keys.add((season, ep_num)) for (season, ep_num), ep in existing_map.items(): if (season, ep_num) not in new_keys: if ep.is_downloaded: logger.debug( "Preserving downloaded episode S%02dE%02d for series_id=%d", season, ep_num, series_id ) else: await EpisodeService.delete_by_series( db, series_id, season, ep_num ) for season, eps in episode_dict.items(): for ep_num in eps: if (season, ep_num) not in existing_map: await EpisodeService.create( db=db, series_id=series_id, season=season, episode_number=ep_num ) def get_total_to_scan(self) -> int: """Get the total number of folders to scan. Returns: Total count of folders with MP4 files """ result = self.__find_mp4_files() return sum(1 for _ in result) def scan(self) -> None: """ Scan directories for anime series and missing episodes. Results are stored in self.keyDict and can be retrieved after scanning. Data files are also saved to disk for persistence. Raises: Exception: If scan fails critically """ # Generate unique operation ID self._current_operation_id = str(uuid.uuid4()) logger.info("Starting scan for missing episodes") # Notify scan starting self._safe_call_event( self.events.on_progress, { "operation_id": self._current_operation_id, "phase": "STARTING", "current": 0, "total": 0, "percentage": 0.0, "message": "Initializing scan" } ) try: # Get total items to process total_to_scan = self.get_total_to_scan() logger.info("Total folders to scan: %d", total_to_scan) # The scanner enumerates folders with mp4 files, loads existing # metadata, calculates the missing episodes via the provider, and # persists the refreshed metadata while emitting progress events. result = self.__find_mp4_files() counter = 0 for folder, mp4_files in result: try: counter += 1 # Calculate progress if total_to_scan > 0: percentage = (counter / total_to_scan) * 100 else: percentage = 0.0 # Notify progress self._safe_call_event( self.events.on_progress, { "operation_id": self._current_operation_id, "phase": "IN_PROGRESS", "current": counter, "total": total_to_scan, "percentage": percentage, "message": f"Scanning: {folder}", "details": f"Found {len(mp4_files)} episodes" } ) serie = self.__read_data_from_file(folder) if serie is None or not serie.key or not serie.key.strip(): logger.warning( "No key or data file found for folder '%s', skipping", folder, ) if ( serie is not None and serie.key and serie.key.strip() ): # Try to extract year from folder name first if not hasattr(serie, 'year') or not serie.year: year_from_folder = self._extract_year_from_folder_name(folder) if year_from_folder: serie.year = year_from_folder logger.info( "Using year from folder name: %s (year=%d)", folder, year_from_folder ) else: # If not in folder name, fetch from provider try: serie.year = self.loader.get_year(serie.key) if serie.year: logger.info( "Fetched year from provider: %s (year=%d)", serie.key, serie.year ) except Exception as e: logger.warning( "Could not fetch year for %s: %s", serie.key, str(e) ) # Delegate the provider to compare local files with # remote metadata, yielding missing episodes per # season. Results are saved back to disk so that both # CLI and API consumers see consistent state. missing_episodes, _site = ( self.__get_missing_episodes_and_season( serie.key, mp4_files ) ) serie.episodeDict = missing_episodes serie.folder = folder # Persist to database (async) try: try: loop = asyncio.get_running_loop() except RuntimeError: # No running loop — safe to use asyncio.run() asyncio.run(self._persist_serie_to_db(serie)) else: # Already in async context — schedule as task asyncio.create_task(self._persist_serie_to_db(serie)) except Exception as e: logger.warning( "DB persistence failed for '%s', " "continuing without DB: %s", serie.key, e ) # Store by key (primary identifier), not folder if serie.key in self.keyDict: existing = self.keyDict[serie.key] logger.warning( "Duplicate series found with key '%s': " "folder '%s' maps to same key as existing folder '%s'. " "Skipping duplicate folder.", serie.key, folder, existing.folder ) self._safe_call_event( self.events.on_warning, { "operation_id": self._current_operation_id, "warning": "duplicate_key", "message": f"Duplicate series skipped: '{folder}' maps to key '{serie.key}' already used by '{existing.folder}'", "metadata": { "key": serie.key, "duplicate_folder": folder, "existing_folder": existing.folder, } } ) else: self.keyDict[serie.key] = serie logger.debug( "Stored series with key '%s' (folder: '%s')", serie.key, folder ) no_key_found_logger.info( "Saved Serie: '%s'", str(serie) ) except NoKeyFoundException as nkfe: # Log error and notify via callback error_msg = f"Error processing folder '{folder}': {nkfe}" logger.error(error_msg) self._safe_call_event( self.events.on_error, { "operation_id": self._current_operation_id, "error": nkfe, "message": error_msg, "recoverable": True, "metadata": {"folder": folder, "key": None} } ) except Exception as e: # Log error and notify via callback error_msg = ( f"Folder: '{folder}' - " f"Unexpected error: {e}" ) error_logger.error( "%s\n%s", error_msg, traceback.format_exc() ) self._safe_call_event( self.events.on_error, { "operation_id": self._current_operation_id, "error": e, "message": error_msg, "recoverable": True, "metadata": {"folder": folder, "key": None} } ) continue # Notify scan completion self._safe_call_event( self.events.on_completion, { "operation_id": self._current_operation_id, "success": True, "message": f"Scan completed. Processed {counter} folders.", "statistics": { "total_folders": counter, "series_found": len(self.keyDict) } } ) logger.info( "Scan completed. Processed %d folders, found %d series", counter, len(self.keyDict) ) except Exception as e: # Critical error - notify and re-raise error_msg = f"Critical scan error: {e}" logger.error("%s\n%s", error_msg, traceback.format_exc()) self._safe_call_event( self.events.on_error, { "operation_id": self._current_operation_id, "error": e, "message": error_msg, "recoverable": False } ) self._safe_call_event( self.events.on_completion, { "operation_id": self._current_operation_id, "success": False, "message": error_msg } ) raise def __find_mp4_files(self) -> Iterator[tuple[str, list[str]]]: """Find all .mp4 files in the directory structure.""" logger.info("Scanning for .mp4 files") for anime_name in os.listdir(self.directory): anime_path = os.path.join(self.directory, anime_name) if os.path.isdir(anime_path): if settings.should_ignore_folder(anime_name): logger.debug("Skipping ignored folder: %s", anime_name) continue mp4_files: list[str] = [] has_files = False for root, _, files in os.walk(anime_path): for file in files: if file.endswith(".mp4"): mp4_files.append(os.path.join(root, file)) has_files = True yield anime_name, mp4_files if has_files else [] def __read_data_from_file(self, folder_name: str) -> Optional[Serie]: """Load or discover a Serie for the given folder. Strategy: 1. Query DB by folder name 2. If found, return cached Serie object 3. If not in DB, fall back to provider search via _db_lookup callback 4. If still not found, try reading 'data' file for legacy deployments 5. Check user-provided key overrides in scan_key_overrides 6. Generate key from folder name as last resort Args: folder_name: Filesystem folder name Returns: Serie object with valid key if found, None otherwise Note: DB is the source of truth. File-based lookups (data files) are temporary backward compatibility for CLI-only deployments. """ # Step 1: Try DB lookup by folder name try: session = get_sync_session() try: anime_series = AnimeSeriesService.get_by_folder_sync(session, folder_name) if anime_series: # Reconstruct Serie from DB record episode_dict: dict[int, list[int]] = {} if anime_series.episodes: for ep in anime_series.episodes: season = ep.season or 1 if season not in episode_dict: episode_dict[season] = [] episode_dict[season].append(ep.episode_number or ep.number or 0) return Serie( key=anime_series.key, name=anime_series.name, site=anime_series.site, folder=anime_series.folder, episodeDict=episode_dict, year=anime_series.year ) finally: session.close() except Exception as exc: logger.warning( "DB lookup failed for folder '%s': %s", folder_name, exc ) # Step 2: Fall back to provider search callback if self._db_lookup is not None: try: serie = self._db_lookup(folder_name) if serie and serie.key and serie.key.strip(): logger.info( "Provider lookup resolved folder '%s' -> key='%s'", folder_name, serie.key ) return serie except Exception as exc: logger.warning( "Provider lookup failed for folder '%s': %s", folder_name, exc ) # Step 3: Legacy data file fallback (CLI-only deployments) folder_path = os.path.join(self.directory, folder_name) serie_file = os.path.join(folder_path, 'data') if os.path.exists(serie_file): with open(serie_file, "rb") as file: logger.info( "load serie_file from '%s': %s", folder_name, serie_file ) return Serie.load_from_file(serie_file) # Step 4: Check for user-provided key overrides before generating if self._scan_key_overrides and folder_name in self._scan_key_overrides: override_key = self._scan_key_overrides[folder_name] year_from_folder = self._extract_year_from_folder_name(folder_name) logger.info( "Using scan key override for folder '%s' -> key='%s'", folder_name, override_key ) return Serie( key=override_key, name="", # Name will be fetched from provider if needed site="aniworld.to", folder=folder_name, episodeDict=dict(), year=year_from_folder ) # Step 5: Generate key from folder name as last resort # This handles edge cases like non-Latin characters or special symbols try: generated_key = generate_key_from_folder(folder_name) year_from_folder = self._extract_year_from_folder_name(folder_name) logger.info( "Generated key for folder '%s' -> key='%s'", folder_name, generated_key ) return Serie( key=generated_key, name="", # Name will be fetched from provider if needed site="aniworld.to", folder=folder_name, episodeDict=dict(), year=year_from_folder ) except Exception as exc: logger.warning( "Failed to generate key for folder '%s': %s", folder_name, exc ) return None def __get_episode_and_season(self, filename: str) -> tuple[int, int]: """Extract season and episode numbers from filename. Args: filename: Filename to parse Returns: Tuple of (season, episode) as integers Raises: MatchNotFoundError: If pattern not found """ pattern = r'S(\d+)E(\d+)' match = re.search(pattern, filename) if match: season = match.group(1) episode = match.group(2) logger.debug( "Extracted season %s, episode %s from '%s'", season, episode, filename ) return int(season), int(episode) else: logger.error( "Failed to find season/episode pattern in '%s'", filename ) raise MatchNotFoundError( "Season and episode pattern not found in the filename." ) def __get_episodes_and_seasons( self, mp4_files: Iterable[str] ) -> dict[int, list[int]]: """Get episodes grouped by season from mp4 files. Args: mp4_files: List of MP4 filenames Returns: Dictionary mapping season to list of episode numbers """ episodes_dict: dict[int, list[int]] = {} for file in mp4_files: season, episode = self.__get_episode_and_season(file) if season in episodes_dict: episodes_dict[season].append(episode) else: episodes_dict[season] = [episode] return episodes_dict def __get_missing_episodes_and_season( self, key: str, mp4_files: Iterable[str] ) -> tuple[dict[int, list[int]], str]: """Get missing episodes for a serie. Args: key: Series key mp4_files: List of MP4 filenames Returns: Tuple of (episodes_dict, site_name) """ # key season , value count of episodes expected_dict = self.loader.get_season_episode_count(key) filedict = self.__get_episodes_and_seasons(mp4_files) episodes_dict: dict[int, list[int]] = {} for season, expected_count in expected_dict.items(): existing_episodes = filedict.get(season, []) missing_episodes = [ ep for ep in range(1, expected_count + 1) if ep not in existing_episodes and self.loader.is_language(season, ep, key) ] if missing_episodes: episodes_dict[season] = missing_episodes return episodes_dict, "aniworld.to" def scan_single_series( self, key: str, folder: str, ) -> dict[int, list[int]]: """ Scan a single series for missing episodes. This method performs a targeted scan for only the specified series, without triggering a full library rescan. It fetches available episodes from the provider and compares with local files. Args: key: The unique provider key for the series folder: The filesystem folder name where the series is stored Returns: dict[int, list[int]]: Dictionary mapping season numbers to lists of missing episode numbers. Empty dict if no missing episodes. Raises: ValueError: If key or folder is empty Example: >>> scanner = SerieScanner("/path/to/anime", loader) >>> missing = scanner.scan_single_series( ... "attack-on-titan", ... "Attack on Titan" ... ) >>> print(missing) {1: [5, 6, 7], 2: [1, 2]} """ if not key or not key.strip(): raise ValueError("Series key cannot be empty") if not folder or not folder.strip(): raise ValueError("Series folder cannot be empty") logger.info( "Starting targeted scan for series: %s (folder: %s)", key, folder ) # Generate unique operation ID for this targeted scan operation_id = str(uuid.uuid4()) # Notify scan starting self._safe_call_event( self.events.on_progress, { "operation_id": operation_id, "phase": "STARTING", "current": 0, "total": 1, "percentage": 0.0, "message": f"Scanning series: {folder}", "details": f"Key: {key}" } ) try: # Get the folder path folder_path = os.path.join(self.directory, folder) # Check if folder exists if not os.path.isdir(folder_path): logger.info( "Series folder does not exist yet: %s - " "will scan for available episodes from provider", folder_path ) mp4_files: list[str] = [] else: # Find existing MP4 files in the folder mp4_files = [] for root, _, files in os.walk(folder_path): for file in files: if file.endswith(".mp4"): mp4_files.append(os.path.join(root, file)) logger.debug( "Found %d existing MP4 files in folder %s", len(mp4_files), folder ) # Get missing episodes from provider missing_episodes, site = self.__get_missing_episodes_and_season( key, mp4_files ) # Update progress self._safe_call_event( self.events.on_progress, { "operation_id": operation_id, "phase": "IN_PROGRESS", "current": 1, "total": 1, "percentage": 100.0, "message": f"Scanned: {folder}", "details": f"Found {sum(len(eps) for eps in missing_episodes.values())} missing episodes" } ) # Create or update Serie in keyDict if key in self.keyDict: # Update existing serie self.keyDict[key].episodeDict = missing_episodes logger.debug( "Updated existing series %s with %d missing episodes", key, sum(len(eps) for eps in missing_episodes.values()) ) else: # Try to extract year from folder name first year = self._extract_year_from_folder_name(folder) if year: logger.info( "Using year from folder name: %s (year=%d)", folder, year ) else: # If not in folder name, fetch from provider try: year = self.loader.get_year(key) if year: logger.info( "Fetched year from provider: %s (year=%d)", key, year ) except Exception as e: logger.warning( "Could not fetch year for %s: %s", key, str(e) ) # Create new serie entry serie = Serie( key=key, name="", # Will be populated by caller if needed site=site, folder=folder, episodeDict=missing_episodes, year=year ) self.keyDict[key] = serie logger.debug( "Created new series entry for %s with %d missing episodes (year=%s)", key, sum(len(eps) for eps in missing_episodes.values()), year ) # Notify completion self._safe_call_event( self.events.on_completion, { "operation_id": operation_id, "success": True, "message": f"Scan completed for {folder}", "statistics": { "missing_episodes": sum( len(eps) for eps in missing_episodes.values() ), "seasons_with_missing": len(missing_episodes) } } ) logger.info( "Targeted scan completed for %s: %d missing episodes across %d seasons", key, sum(len(eps) for eps in missing_episodes.values()), len(missing_episodes) ) return missing_episodes except Exception as e: error_msg = f"Failed to scan series {key}: {e}" logger.error(error_msg, exc_info=True) # Notify error self._safe_call_event( self.events.on_error, { "operation_id": operation_id, "error": e, "message": error_msg, "recoverable": True, "metadata": {"key": key, "folder": folder} } ) # Notify completion with failure self._safe_call_event( self.events.on_completion, { "operation_id": operation_id, "success": False, "message": error_msg } ) # Return empty dict on error (scan failed but not critical) return {}