feat: add duplicate folder detection and /duplicate-folders API endpoint

- Add DuplicateFolderGroup and DuplicateFoldersResponse Pydantic models
- Add /duplicate-folders GET endpoint for listing pre-existing duplicates
- Add _scan_for_pre_existing_duplicates() function for NFO-based detection
- Add _try_merge_duplicate_group() for auto-merging empty/symlink-only duplicates
- Integrate duplicate detection into validate_and_rename_series_folders workflow
- Skip rename for flagged duplicates to prevent data loss during merge
This commit is contained in:
2026-05-28 21:46:08 +02:00
parent 239341629c
commit 1ef59c5283
2 changed files with 267 additions and 2 deletions

View File

@@ -13,8 +13,9 @@ reflect the new paths.
from __future__ import annotations
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Set, Tuple
from lxml import etree
@@ -34,6 +35,141 @@ logger = logging.getLogger(__name__)
INVALID_PATH_CHARS = '<>:"/\\|?*\x00'
class DuplicateGroup:
    """Bundle of folders that were identified as belonging to one series.

    Attributes:
        key: Identifier shared by every folder in the group.
        folders: Folder entries that map to this series.
        nfo_paths: NFO file paths, expected to be parallel to ``folders``.
    """

    def __init__(self, key: str, folders: List[str], nfo_paths: List[Path]):
        self.key, self.folders, self.nfo_paths = key, folders, nfo_paths

    @property
    def count(self) -> int:
        """Number of folders contained in this group."""
        return len(self.folders)

    def __repr__(self) -> str:
        # Only key and folders are shown; NFO paths are omitted for brevity.
        return "DuplicateGroup(key={!r}, folders={})".format(self.key, self.folders)
def _scan_for_pre_existing_duplicates(anime_dir: Path) -> List[DuplicateGroup]:
    """Scan the anime directory for folders that describe the same series.

    Every sub-directory that holds a ``tvshow.nfo`` with both a title and a
    year is bucketed under the folder name it is expected to carry. Buckets
    with more than one member are reported as duplicates.

    Directories are visited in sorted order so that the first folder of each
    group (the one ``_try_merge_duplicate_group`` keeps as canonical) is
    chosen deterministically instead of depending on filesystem enumeration
    order.

    Args:
        anime_dir: Path to the anime directory to scan.

    Returns:
        List of DuplicateGroup objects, one per expected folder name that is
        shared by two or more existing folders.
    """
    # expected folder name -> [(actual folder name, path of its tvshow.nfo)]
    groups: Dict[str, List[Tuple[str, Path]]] = defaultdict(list)

    for series_dir in sorted(anime_dir.iterdir()):
        if not series_dir.is_dir():
            continue

        nfo_path = series_dir / "tvshow.nfo"
        if not nfo_path.exists():
            continue

        title, year = _parse_nfo_title_and_year(nfo_path)
        if not title or not year:
            # Without both values the expected name cannot be computed.
            continue

        expected_name = _compute_expected_folder_name(title, year)
        groups[expected_name].append((series_dir.name, nfo_path))

    # Only expected names claimed by more than one folder are duplicates.
    return [
        DuplicateGroup(
            key=expected_name,
            folders=[name for name, _ in members],
            nfo_paths=[nfo for _, nfo in members],
        )
        for expected_name, members in groups.items()
        if len(members) > 1
    ]
def _try_merge_duplicate_group(group: DuplicateGroup, dry_run: bool = False) -> bool:
    """Attempt to merge a duplicate group automatically.

    The first folder of the group is kept as the canonical one. Every other
    folder is removed if it is empty or if all of its entries are symlinks
    that resolve to the canonical folder. Folders that no longer exist are
    ignored. Processing stops at the first folder that cannot be handled;
    folders removed before that point stay removed.

    Args:
        group: The DuplicateGroup to merge.
        dry_run: If True, only log the actions without executing them.

    Returns:
        True if every non-canonical folder was removed (or would be, in a
        dry run) or was already gone; False if any folder needs manual
        intervention or a filesystem operation failed.
    """
    if len(group.folders) < 2:
        return True

    canonical = group.folders[0]
    to_remove = group.folders[1:]
    # The first NFO lives at <parent>/<folder>/tvshow.nfo, so two levels up
    # is the directory that holds all folders of the group.
    parent_dir = group.nfo_paths[0].parent.parent

    for folder in to_remove:
        folder_path = parent_dir / folder
        if not folder_path.exists():
            continue

        try:
            contents = list(folder_path.iterdir())
        except PermissionError:
            logger.warning("Permission denied accessing %s, skip merge", folder_path)
            return False
        except OSError as exc:
            logger.warning("Cannot list %s (%s), skip merge", folder_path, exc)
            return False

        # NOTE(review): folders reported by _scan_for_pre_existing_duplicates
        # contain a tvshow.nfo, so this branch presumably only fires when the
        # folder was emptied in the meantime - confirm this is intended.
        if not contents:
            if dry_run:
                logger.info("[DRY-RUN] Would delete empty duplicate folder: %s", folder_path)
                continue
            try:
                folder_path.rmdir()
            except OSError as exc:
                logger.warning(
                    "Failed to delete empty duplicate folder %s: %s", folder_path, exc
                )
                return False
            logger.info("Deleted empty duplicate folder: %s", folder_path)
            continue

        try:
            # Resolve the canonical target once per folder, not once per entry.
            canonical_target = (folder_path.parent / canonical).resolve()
            only_canonical_links = all(
                item.is_symlink() and item.resolve() == canonical_target
                for item in contents
            )
        except (OSError, RuntimeError) as exc:
            # resolve() can fail, e.g. on symlink loops; treat as not mergeable.
            logger.warning("Cannot inspect contents of %s (%s), skip merge", folder_path, exc)
            return False

        if not only_canonical_links:
            # Real data (or foreign links) present: never delete automatically.
            logger.warning(
                "Cannot auto-merge duplicate folders for '%s': %s (manual merge required)",
                group.key,
                [canonical] + to_remove,
            )
            return False

        if dry_run:
            logger.info("[DRY-RUN] Would remove symlinks in duplicate folder: %s", folder_path)
            continue

        try:
            for item in contents:
                item.unlink()
            folder_path.rmdir()
        except OSError as exc:
            logger.warning(
                "Failed to remove symlink-only duplicate folder %s: %s", folder_path, exc
            )
            return False
        logger.info("Removed symlink-only duplicate folder: %s", folder_path)

    return True
def _parse_nfo_title_and_year(nfo_path: Path) -> Tuple[Optional[str], Optional[str]]:
"""Parse a tvshow.nfo and return (title, year) text values.
@@ -383,6 +519,28 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
stats = {"scanned": 0, "renamed": 0, "skipped": 0, "errors": 0}
# Detect pre-existing duplicates before the rename loop. Groups that can be
# merged automatically are resolved here; folders of all other groups are
# recorded by name so the rename loop can leave them untouched.
pre_existing_duplicates: Set[str] = set()
duplicates = _scan_for_pre_existing_duplicates(anime_dir)
for dup_group in duplicates:
    # Try automatic merge first
    if _try_merge_duplicate_group(dup_group, dry_run=dry_run):
        logger.info(
            "Auto-merged duplicate group for '%s' (%d folders)",
            dup_group.key,
            dup_group.count,
        )
        continue

    # Flag all folders in this group as pre-existing duplicates
    pre_existing_duplicates.update(dup_group.folders)
    logger.warning(
        # The two literals are concatenated, so the separator must be explicit.
        "Duplicate folders detected for series '%s': %s — "
        "manual cleanup required (different releases or non-empty duplicates)",
        dup_group.key,
        dup_group.folders,
    )
for series_dir in sorted(anime_dir.iterdir()):
if not series_dir.is_dir():
continue
@@ -422,6 +580,15 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
expected_path = anime_dir / expected_name
# Check for pre-existing duplicate
if current_name in pre_existing_duplicates:
logger.warning(
"Skipping rename for '%s' — pre-existing duplicate folder detected",
current_name,
)
stats["errors"] += 1
continue
# Check for duplicate target
if expected_path.exists():
logger.warning(