feat: add duplicate folder detection and /duplicate-folders API endpoint

- Add DuplicateFolderGroup and DuplicateFoldersResponse Pydantic models
- Add /duplicate-folders GET endpoint for listing pre-existing duplicates
- Add _scan_for_pre_existing_duplicates() function for NFO-based detection
- Add _try_merge_duplicate_group() for auto-merging empty/symlink-only duplicates
- Integrate duplicate detection into validate_and_rename_series_folders workflow
- Skip rename for flagged duplicates to prevent data loss during merge
This commit is contained in:
2026-05-28 21:46:08 +02:00
parent 239341629c
commit 1ef59c5283
2 changed files with 267 additions and 2 deletions

View File

@@ -1,12 +1,13 @@
import logging
import warnings
from pathlib import Path
from typing import Any, List, Optional
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, Field, field_validator
from sqlalchemy.ext.asyncio import AsyncSession
from src.core.entities.series import Serie
from src.config.settings import settings
from src.core.utils.key_utils import generate_key_from_folder, is_valid_key
from src.server.database.service import AnimeSeriesService
from src.server.exceptions import (
@@ -26,6 +27,9 @@ from src.server.utils.dependencies import (
)
from src.server.utils.filesystem import sanitize_folder_name
from src.server.utils.validators import validate_filter_value, validate_search_query
from src.server.services.folder_rename_service import (
_scan_for_pre_existing_duplicates,
)
logger = logging.getLogger(__name__)
@@ -71,6 +75,100 @@ async def get_anime_status(
) from exc
class DuplicateFolderGroup(BaseModel):
    """One set of on-disk folders that were detected as the same series.

    Attributes:
        key: Identifier shared by every folder in the group. The visible
            ``/duplicate-folders`` endpoint copies it from the scanner's
            group key.
        folders: Names of the folders that collide with each other.
        folder_count: How many folders the group contains.
    """

    key: str = Field(
        default=...,
        description="Series key (unique identifier)",
    )
    folders: List[str] = Field(
        default=...,
        description="List of duplicate folder names",
    )
    folder_count: int = Field(
        default=...,
        description="Number of duplicate folders",
    )
class DuplicateFoldersResponse(BaseModel):
    """Payload returned when listing duplicate folder groups.

    Attributes:
        total_groups: Count of duplicate groups in ``duplicate_groups``.
        duplicate_groups: The duplicate groups themselves.
        message: Short summary intended for a human reader.
    """

    total_groups: int = Field(
        default=...,
        description="Total number of duplicate groups",
    )
    duplicate_groups: List[DuplicateFolderGroup] = Field(
        default=...,
        description="List of duplicate folder groups",
    )
    message: str = Field(
        default=...,
        description="Human-readable summary",
    )
@router.get("/duplicate-folders", response_model=DuplicateFoldersResponse)
async def get_duplicate_folders(
    _auth: dict = Depends(require_auth),
) -> DuplicateFoldersResponse:
    """List all pre-existing duplicate folder groups.

    Scans the configured anime directory and reports every group of folders
    that the scanner considers to be the same series, for manual review and
    cleanup. Nothing is modified on disk by this endpoint.

    Args:
        _auth: Result of the ``require_auth`` dependency; unused beyond
            enforcing that the dependency ran.

    Returns:
        DuplicateFoldersResponse with the duplicate groups. When the anime
        directory is not configured or does not exist, an empty result with
        an explanatory message is returned instead of an error.

    Raises:
        ServerError: If scanning the directory fails for any reason.

    Note:
        Not all duplicate folders are safe to merge - some may belong
        to different releases (e.g., dubbed vs. subbed). Review carefully
        before taking action.
    """
    # Local import keeps this change self-contained (no new module-level
    # import needed); fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    try:
        if not settings.anime_directory:
            return DuplicateFoldersResponse(
                total_groups=0,
                duplicate_groups=[],
                message="Anime directory not configured",
            )

        anime_dir = Path(settings.anime_directory)
        if not anime_dir.is_dir():
            return DuplicateFoldersResponse(
                total_groups=0,
                duplicate_groups=[],
                message=f"Anime directory not found: {anime_dir}",
            )

        # The scan is synchronous filesystem work (directory listing plus NFO
        # parsing); run it in the threadpool so the event loop stays free.
        duplicates = await run_in_threadpool(
            _scan_for_pre_existing_duplicates, anime_dir
        )

        groups = [
            DuplicateFolderGroup(
                key=dup.key,
                folders=dup.folders,
                folder_count=dup.count,
            )
            for dup in duplicates
        ]

        if groups:
            message = (
                f"Found {len(groups)} duplicate group(s). "
                "Review carefully - some duplicates may be different releases "
                "(e.g., dubbed vs. subbed)."
            )
        else:
            message = "No duplicate folders found."

        return DuplicateFoldersResponse(
            total_groups=len(groups),
            duplicate_groups=groups,
            message=message,
        )
    except Exception as exc:
        # Top-level boundary: keep the traceback in the log, then translate to
        # the API's error type.
        logger.exception("Failed to scan for duplicate folders: %s", exc)
        raise ServerError(
            message=f"Failed to scan for duplicates: {exc}"
        ) from exc
class AnimeSummary(BaseModel):
"""Summary of an anime series with missing episodes.

View File

@@ -13,8 +13,9 @@ reflect the new paths.
from __future__ import annotations
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Set, Tuple
from lxml import etree
@@ -34,6 +35,141 @@ logger = logging.getLogger(__name__)
INVALID_PATH_CHARS = '<>:"/\\|?*\x00'
class DuplicateGroup:
    """Bundle of sibling folders that were grouped as one series.

    Attributes:
        key: Identifier the folders were grouped under. The scanner in this
            module passes the expected folder name computed from the NFO
            title and year.
        folders: Folder entries belonging to the group. The scanner in this
            module passes folder names, not full paths.
        nfo_paths: ``tvshow.nfo`` paths for the group; presumably parallel to
            ``folders`` (the scanner builds both from the same tuples).
    """

    def __init__(self, key: str, folders: List[str], nfo_paths: List[Path]):
        self.key, self.folders, self.nfo_paths = key, folders, nfo_paths

    def __repr__(self) -> str:
        return "DuplicateGroup(key={!r}, folders={})".format(
            self.key, self.folders
        )

    @property
    def count(self) -> int:
        """Number of folders in the group."""
        return len(self.folders)
def _scan_for_pre_existing_duplicates(anime_dir: Path) -> List[DuplicateGroup]:
    """Scan the anime directory for pre-existing duplicate folders.

    Every immediate sub-directory that contains a ``tvshow.nfo`` with both a
    title and a year is mapped to its expected folder name (computed from
    that title and year). Sub-directories sharing the same expected name are
    reported as duplicates of each other.

    Directories are visited in sorted order so that the result is
    deterministic: groups appear in sorted order of their first member, and
    the folders inside a group are sorted as well. Callers that treat the
    first folder of a group as the canonical one therefore always pick the
    same folder for the same directory contents.

    Args:
        anime_dir: Path to the anime directory to scan.

    Returns:
        List of DuplicateGroup objects, one per expected folder name that is
        shared by more than one folder. ``key`` is the expected folder name,
        ``folders`` holds the folder names and ``nfo_paths`` the matching
        NFO paths in the same order.
    """
    # expected folder name -> [(actual folder name, path to its tvshow.nfo)]
    by_expected_name: Dict[str, List[Tuple[str, Path]]] = defaultdict(list)

    # iterdir() yields entries in arbitrary order; sort for stable output.
    for series_dir in sorted(anime_dir.iterdir()):
        if not series_dir.is_dir():
            continue

        nfo_path = series_dir / "tvshow.nfo"
        # is_file() rather than exists(): a directory named tvshow.nfo is not
        # a usable NFO.
        if not nfo_path.is_file():
            continue

        title, year = _parse_nfo_title_and_year(nfo_path)
        if not title or not year:
            # Without both values no expected name can be derived.
            continue

        expected_name = _compute_expected_folder_name(title, year)
        by_expected_name[expected_name].append((series_dir.name, nfo_path))

    # Only names claimed by more than one folder are duplicates.
    return [
        DuplicateGroup(
            key=expected_name,
            folders=[name for name, _ in members],
            nfo_paths=[nfo for _, nfo in members],
        )
        for expected_name, members in by_expected_name.items()
        if len(members) > 1
    ]
def _is_symlink_into(item: Path, root: Path) -> bool:
    """Return True if *item* is a symlink whose target is *root* or lies below it."""
    if not item.is_symlink():
        return False
    try:
        target = item.resolve()
    except (OSError, RuntimeError):
        # Unresolvable link (e.g. a symlink loop): treat as not safe.
        return False
    return target == root or root in target.parents


def _try_merge_duplicate_group(group: DuplicateGroup, dry_run: bool = False) -> bool:
    """Attempt to merge a duplicate group automatically.

    The first folder of the group is kept as the canonical one. Every other
    folder is removed only when doing so cannot lose data, i.e. when it is
    empty or when every entry in it is a symlink that resolves to the
    canonical folder or to something inside it. As soon as one folder does
    not qualify, or a filesystem operation fails, the merge is abandoned.

    Args:
        group: The DuplicateGroup to merge.
        dry_run: If True, only log actions without executing them.

    Returns:
        True if every non-canonical folder was removed (or would be removed
        in dry-run mode, or no longer exists); False if manual intervention
        is required or an error occurred. Folders removed before a failure
        stay removed.
    """
    if len(group.folders) < 2:
        return True

    if not group.nfo_paths:
        # The parent directory is derived from the first NFO path below.
        logger.warning(
            "Cannot auto-merge duplicate folders for '%s': no NFO paths recorded",
            group.key,
        )
        return False

    # Keep first folder as canonical, mark others for removal
    canonical = group.folders[0]
    to_remove = group.folders[1:]

    # <anime_dir>/<folder>/tvshow.nfo -> <anime_dir>
    anime_root = group.nfo_paths[0].parent.parent
    canonical_root = (anime_root / canonical).resolve()

    for folder in to_remove:
        folder_path = anime_root / folder
        if not folder_path.exists():
            continue

        if folder_path.is_symlink():
            # Listing a symlinked folder would expose the entries of its
            # target; never delete through it.
            logger.warning(
                "Duplicate folder %s is itself a symlink, skip merge", folder_path
            )
            return False

        try:
            contents = list(folder_path.iterdir())
        except PermissionError:
            logger.warning("Permission denied accessing %s, skip merge", folder_path)
            return False
        except OSError as exc:
            logger.warning("Cannot list %s (%s), skip merge", folder_path, exc)
            return False

        if not contents:
            # Empty folder - safe to remove
            if dry_run:
                logger.info("[DRY-RUN] Would delete empty duplicate folder: %s", folder_path)
            else:
                try:
                    folder_path.rmdir()
                except OSError as exc:
                    logger.warning(
                        "Failed to delete empty duplicate folder %s: %s",
                        folder_path,
                        exc,
                    )
                    return False
                logger.info("Deleted empty duplicate folder: %s", folder_path)
            continue

        # Only links pointing into the canonical folder can be dropped
        # without losing anything.
        if all(_is_symlink_into(item, canonical_root) for item in contents):
            if dry_run:
                logger.info("[DRY-RUN] Would remove symlinks in duplicate folder: %s", folder_path)
            else:
                try:
                    for item in contents:
                        item.unlink()
                    folder_path.rmdir()
                except OSError as exc:
                    logger.warning(
                        "Failed to remove symlink-only duplicate folder %s: %s",
                        folder_path,
                        exc,
                    )
                    return False
                logger.info("Removed symlink-only duplicate folder: %s", folder_path)
            continue

        # Cannot auto-merge - requires manual intervention
        logger.warning(
            "Cannot auto-merge duplicate folders for '%s': %s (manual merge required)",
            group.key,
            [canonical] + to_remove,
        )
        return False

    return True
def _parse_nfo_title_and_year(nfo_path: Path) -> Tuple[Optional[str], Optional[str]]:
"""Parse a tvshow.nfo and return (title, year) text values.
@@ -383,6 +519,28 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
stats = {"scanned": 0, "renamed": 0, "skipped": 0, "errors": 0}
# Detect pre-existing duplicates before the rename loop. Folders of groups
# that cannot be merged automatically are collected here so the rename loop
# can leave them untouched.
pre_existing_duplicates: Set[str] = set()
duplicates = _scan_for_pre_existing_duplicates(anime_dir)
for dup_group in duplicates:
    # Try automatic merge first
    if _try_merge_duplicate_group(dup_group, dry_run=dry_run):
        logger.info(
            "Auto-merged duplicate group for '%s' (%d folders)",
            dup_group.key,
            dup_group.count,
        )
        continue

    # Flag all folders in this group as pre-existing duplicates
    pre_existing_duplicates.update(dup_group.folders)
    # The two literals are concatenated; the separator keeps the folder list
    # from running straight into the following sentence.
    logger.warning(
        "Duplicate folders detected for series '%s': %s — "
        "manual cleanup required (different releases or non-empty duplicates)",
        dup_group.key,
        dup_group.folders,
    )
for series_dir in sorted(anime_dir.iterdir()):
if not series_dir.is_dir():
continue
@@ -422,6 +580,15 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
expected_path = anime_dir / expected_name
# Check for pre-existing duplicate
if current_name in pre_existing_duplicates:
logger.warning(
"Skipping rename for '%s' — pre-existing duplicate folder detected",
current_name,
)
stats["errors"] += 1
continue
# Check for duplicate target
if expected_path.exists():
logger.warning(