feat: add duplicate folder detection and /duplicate-folders API endpoint
- Add DuplicateFolderGroup and DuplicateFoldersResponse Pydantic models
- Add /duplicate-folders GET endpoint for listing pre-existing duplicates
- Add _scan_for_pre_existing_duplicates() function for NFO-based detection
- Add _try_merge_duplicate_group() for auto-merging empty/symlink-only duplicates
- Integrate duplicate detection into validate_and_rename_series_folders workflow
- Skip rename for flagged duplicates to prevent data loss during merge
This commit is contained in:
@@ -1,12 +1,13 @@
|
||||
import logging
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from src.core.entities.series import Serie
|
||||
from src.config.settings import settings
|
||||
from src.core.utils.key_utils import generate_key_from_folder, is_valid_key
|
||||
from src.server.database.service import AnimeSeriesService
|
||||
from src.server.exceptions import (
|
||||
@@ -26,6 +27,9 @@ from src.server.utils.dependencies import (
|
||||
)
|
||||
from src.server.utils.filesystem import sanitize_folder_name
|
||||
from src.server.utils.validators import validate_filter_value, validate_search_query
|
||||
from src.server.services.folder_rename_service import (
|
||||
_scan_for_pre_existing_duplicates,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -71,6 +75,100 @@ async def get_anime_status(
|
||||
) from exc
|
||||
|
||||
|
||||
class DuplicateFolderGroup(BaseModel):
    """One set of folders that were reported as duplicates of each other.

    Attributes:
        key: Identifier shared by every folder in the group.
        folders: Names of the folders that make up the group.
        folder_count: How many folders the group contains.
    """

    key: str = Field(
        ...,
        description="Series key (unique identifier)",
    )
    folders: List[str] = Field(
        ...,
        description="List of duplicate folder names",
    )
    folder_count: int = Field(
        ...,
        description="Number of duplicate folders",
    )
|
||||
|
||||
|
||||
class DuplicateFoldersResponse(BaseModel):
    """Payload returned when listing duplicate folders.

    Attributes:
        total_groups: Count of duplicate groups in ``duplicate_groups``.
        duplicate_groups: The duplicate folder groups themselves.
        message: Short summary intended for human readers.
    """

    total_groups: int = Field(
        ...,
        description="Total number of duplicate groups",
    )
    duplicate_groups: List[DuplicateFolderGroup] = Field(
        ...,
        description="List of duplicate folder groups",
    )
    message: str = Field(
        ...,
        description="Human-readable summary",
    )
|
||||
|
||||
|
||||
@router.get("/duplicate-folders", response_model=DuplicateFoldersResponse)
async def get_duplicate_folders(
    _auth: dict = Depends(require_auth),
) -> DuplicateFoldersResponse:
    """List all pre-existing duplicate folder groups.

    Scans the configured anime directory for folders whose ``tvshow.nfo``
    files place them in the same group, and returns those groups for manual
    review and cleanup.

    Args:
        _auth: Result of the ``require_auth`` dependency; unused beyond
            enforcing that the dependency runs.

    Returns:
        DuplicateFoldersResponse with the duplicate groups. When the anime
        directory is not configured or is not a directory, an empty response
        with an explanatory message is returned instead of an error.

    Raises:
        ServerError: If anything fails while scanning or building the
            response.

    Note:
        Not all duplicate folders are safe to merge - some may belong
        to different releases (e.g., dubbed vs. subbed). Review carefully
        before taking action.
    """
    # Function-scope import so this handler does not depend on a change to
    # the module's import block.
    import asyncio

    try:
        if not settings.anime_directory:
            return DuplicateFoldersResponse(
                total_groups=0,
                duplicate_groups=[],
                message="Anime directory not configured",
            )

        anime_dir = Path(settings.anime_directory)
        if not anime_dir.is_dir():
            return DuplicateFoldersResponse(
                total_groups=0,
                duplicate_groups=[],
                message=f"Anime directory not found: {anime_dir}",
            )

        # The scan is synchronous filesystem work; run it in the default
        # executor so the event loop keeps serving other requests meanwhile.
        loop = asyncio.get_running_loop()
        duplicates = await loop.run_in_executor(
            None, _scan_for_pre_existing_duplicates, anime_dir
        )

        groups = [
            DuplicateFolderGroup(
                key=dup.key,
                folders=dup.folders,
                folder_count=dup.count,
            )
            for dup in duplicates
        ]

        if groups:
            message = (
                f"Found {len(groups)} duplicate group(s). "
                "Review carefully - some duplicates may be different releases "
                "(e.g., dubbed vs. subbed)."
            )
        else:
            message = "No duplicate folders found."

        return DuplicateFoldersResponse(
            total_groups=len(groups),
            duplicate_groups=groups,
            message=message,
        )
    except Exception as exc:
        # logger.exception keeps the traceback, which a plain error() call
        # with only str(exc) would drop.
        logger.exception("Failed to scan for duplicate folders: %s", exc)
        raise ServerError(
            message=f"Failed to scan for duplicates: {exc}"
        ) from exc
|
||||
|
||||
|
||||
class AnimeSummary(BaseModel):
|
||||
"""Summary of an anime series with missing episodes.
|
||||
|
||||
|
||||
@@ -13,8 +13,9 @@ reflect the new paths.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
|
||||
from lxml import etree
|
||||
|
||||
@@ -34,6 +35,141 @@ logger = logging.getLogger(__name__)
|
||||
INVALID_PATH_CHARS = '<>:"/\\|?*\x00'
|
||||
|
||||
|
||||
class DuplicateGroup:
    """Bundle of folders that were grouped together as duplicates.

    Attributes:
        key: Identifier the folders were grouped under.
        folders: Folder names that belong to the group.
        nfo_paths: NFO file paths corresponding to ``folders``.
    """

    def __init__(self, key: str, folders: List[str], nfo_paths: List[Path]):
        self.key, self.folders, self.nfo_paths = key, folders, nfo_paths

    @property
    def count(self) -> int:
        """Number of folders in the group."""
        size = len(self.folders)
        return size

    def __repr__(self) -> str:
        # nfo_paths is intentionally left out, matching the short form used
        # in log output.
        return "DuplicateGroup(key={!r}, folders={})".format(
            self.key, self.folders
        )
|
||||
|
||||
|
||||
def _scan_for_pre_existing_duplicates(anime_dir: Path) -> List[DuplicateGroup]:
    """Scan the anime directory for pre-existing duplicate folders.

    Each immediate sub-directory that contains a ``tvshow.nfo`` file with both
    a title and a year is assigned the folder name computed from that title
    and year. Sub-directories that share the same computed name are reported
    together as one duplicate group.

    Args:
        anime_dir: Path to the anime directory to scan.

    Returns:
        List of DuplicateGroup objects, one per computed folder name that is
        shared by more than one sub-directory. The group's ``key`` is that
        computed folder name. Folders inside a group are listed in sorted
        path order, so the first entry is stable across runs.
    """
    # Expected folder name -> [(actual folder name, path to its NFO), ...]
    groups: Dict[str, List[Tuple[str, Path]]] = defaultdict(list)

    # Sorted so the order of folders within a group does not depend on the
    # filesystem's enumeration order.
    for series_dir in sorted(anime_dir.iterdir()):
        if not series_dir.is_dir():
            continue

        nfo_path = series_dir / "tvshow.nfo"
        # is_file() rather than exists(): a directory that happens to be
        # called tvshow.nfo is not something to parse.
        if not nfo_path.is_file():
            continue

        title, year = _parse_nfo_title_and_year(nfo_path)
        if not title or not year:
            continue

        expected_name = _compute_expected_folder_name(title, year)
        groups[expected_name].append((series_dir.name, nfo_path))

    # Only names claimed by more than one folder are duplicates.
    return [
        DuplicateGroup(
            key=expected_name,
            folders=[folder_name for folder_name, _ in members],
            nfo_paths=[nfo for _, nfo in members],
        )
        for expected_name, members in groups.items()
        if len(members) > 1
    ]
|
||||
|
||||
|
||||
def _is_symlink_to(item: Path, target: Path) -> bool:
    """Return True if *item* is a symlink that resolves exactly to *target*."""
    try:
        return item.is_symlink() and item.resolve() == target
    except (OSError, RuntimeError):
        # Broken or looping links cannot be shown to point at the target.
        return False


def _try_merge_duplicate_group(group: DuplicateGroup, dry_run: bool = False) -> bool:
    """Attempt to merge a duplicate group automatically.

    The first folder in the group is kept as the canonical one. Every other
    folder is removed only if it is empty, or if every entry in it is a
    symlink that resolves to the canonical folder. Folders that no longer
    exist are ignored. Processing stops at the first folder that cannot be
    handled, so earlier folders in the group may already have been removed
    when False is returned.

    Args:
        group: The DuplicateGroup to merge.
        dry_run: If True, only log the actions that would be taken.

    Returns:
        True if every non-canonical folder was removed (or, in dry-run mode,
        could be removed), or if the group has fewer than two folders.
        False if any folder could not be listed or deleted, or has contents
        that require a manual merge.
    """
    if len(group.folders) < 2:
        return True

    # Keep first folder as canonical, mark others for removal
    canonical = group.folders[0]
    to_remove = group.folders[1:]

    # All folders of a group live in the same parent directory; derive it
    # from the first NFO path (<parent>/<folder>/tvshow.nfo).
    parent_dir = group.nfo_paths[0].parent.parent
    canonical_target = (parent_dir / canonical).resolve()

    for folder in to_remove:
        folder_path = parent_dir / folder
        if not folder_path.exists():
            continue

        try:
            contents = list(folder_path.iterdir())
        except PermissionError:
            logger.warning("Permission denied accessing %s, skip merge", folder_path)
            return False
        except OSError as exc:
            logger.warning("Cannot list %s (%s), skip merge", folder_path, exc)
            return False

        if not contents:
            # Empty folder - safe to remove
            if dry_run:
                logger.info("[DRY-RUN] Would delete empty duplicate folder: %s", folder_path)
                continue
            try:
                folder_path.rmdir()
            except OSError as exc:
                logger.warning(
                    "Failed to delete empty duplicate folder %s: %s", folder_path, exc
                )
                return False
            logger.info("Deleted empty duplicate folder: %s", folder_path)
            continue

        if all(_is_symlink_to(item, canonical_target) for item in contents):
            if dry_run:
                logger.info("[DRY-RUN] Would remove symlinks in duplicate folder: %s", folder_path)
                continue
            try:
                for item in contents:
                    item.unlink()
                folder_path.rmdir()
            except OSError as exc:
                # unlink() can fail just like rmdir(); report it through the
                # return value instead of letting the error escape.
                logger.warning(
                    "Failed to remove symlink-only duplicate folder %s: %s",
                    folder_path,
                    exc,
                )
                return False
            logger.info("Removed symlink-only duplicate folder: %s", folder_path)
            continue

        # Cannot auto-merge - requires manual intervention
        logger.warning(
            "Cannot auto-merge duplicate folders for '%s': %s (manual merge required)",
            group.key,
            [canonical] + to_remove,
        )
        return False

    return True
|
||||
|
||||
|
||||
def _parse_nfo_title_and_year(nfo_path: Path) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""Parse a tvshow.nfo and return (title, year) text values.
|
||||
|
||||
@@ -383,6 +519,28 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
|
||||
|
||||
stats = {"scanned": 0, "renamed": 0, "skipped": 0, "errors": 0}
|
||||
|
||||
# Detect pre-existing duplicates before rename loop
|
||||
pre_existing_duplicates: Set[str] = set()
|
||||
duplicates = _scan_for_pre_existing_duplicates(anime_dir)
|
||||
for dup_group in duplicates:
|
||||
# Try automatic merge first
|
||||
if _try_merge_duplicate_group(dup_group, dry_run=dry_run):
|
||||
logger.info(
|
||||
"Auto-merged duplicate group for '%s' (%d folders)",
|
||||
dup_group.key,
|
||||
dup_group.count,
|
||||
)
|
||||
else:
|
||||
# Flag all folders in this group as pre-existing duplicates
|
||||
for folder in dup_group.folders:
|
||||
pre_existing_duplicates.add(folder)
|
||||
logger.warning(
|
||||
"Duplicate folders detected for series '%s': %s — "
|
||||
"manual cleanup required (different releases or non-empty duplicates)",
|
||||
dup_group.key,
|
||||
dup_group.folders,
|
||||
)
|
||||
|
||||
for series_dir in sorted(anime_dir.iterdir()):
|
||||
if not series_dir.is_dir():
|
||||
continue
|
||||
@@ -422,6 +580,15 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
|
||||
|
||||
expected_path = anime_dir / expected_name
|
||||
|
||||
# Check for pre-existing duplicate
|
||||
if current_name in pre_existing_duplicates:
|
||||
logger.warning(
|
||||
"Skipping rename for '%s' — pre-existing duplicate folder detected",
|
||||
current_name,
|
||||
)
|
||||
stats["errors"] += 1
|
||||
continue
|
||||
|
||||
# Check for duplicate target
|
||||
if expected_path.exists():
|
||||
logger.warning(
|
||||
|
||||
Reference in New Issue
Block a user