feat: add duplicate folder detection and /duplicate-folders API endpoint
- Add DuplicateFolderGroup and DuplicateFoldersResponse Pydantic models - Add /duplicate-folders GET endpoint for listing pre-existing duplicates - Add _scan_for_pre_existing_duplicates() function for NFO-based detection - Add _try_merge_duplicate_group() for auto-merging empty/symlink-only duplicates - Integrate duplicate detection into validate_and_rename_series_folders workflow - Skip rename for flagged duplicates to prevent data loss during merge
This commit is contained in:
@@ -13,8 +13,9 @@ reflect the new paths.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
|
||||
from lxml import etree
|
||||
|
||||
@@ -34,6 +35,141 @@ logger = logging.getLogger(__name__)
|
||||
INVALID_PATH_CHARS = '<>:"/\\|?*\x00'
|
||||
|
||||
|
||||
class DuplicateGroup:
|
||||
"""Represents a group of duplicate folders for the same series.
|
||||
|
||||
Attributes:
|
||||
key: The series key (folder name before rename).
|
||||
folders: List of folder paths that map to this series.
|
||||
nfo_paths: List of corresponding NFO file paths.
|
||||
"""
|
||||
|
||||
def __init__(self, key: str, folders: List[str], nfo_paths: List[Path]):
|
||||
self.key = key
|
||||
self.folders = folders
|
||||
self.nfo_paths = nfo_paths
|
||||
|
||||
@property
|
||||
def count(self) -> int:
|
||||
return len(self.folders)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"DuplicateGroup(key={self.key!r}, folders={self.folders})"
|
||||
|
||||
|
||||
def _scan_for_pre_existing_duplicates(anime_dir: Path) -> List[DuplicateGroup]:
|
||||
"""Scan anime directory for pre-existing duplicate folders.
|
||||
|
||||
Groups folders by the series key extracted from their NFO files.
|
||||
Folders with the same title+year (same expected name) are flagged as duplicates.
|
||||
|
||||
Args:
|
||||
anime_dir: Path to the anime directory to scan.
|
||||
|
||||
Returns:
|
||||
List of DuplicateGroup objects, one per series with duplicate folders.
|
||||
"""
|
||||
# Group folders by their expected name (title+year from NFO)
|
||||
groups: Dict[str, List[Tuple[str, Path]]] = defaultdict(list)
|
||||
|
||||
for series_dir in anime_dir.iterdir():
|
||||
if not series_dir.is_dir():
|
||||
continue
|
||||
nfo_path = series_dir / "tvshow.nfo"
|
||||
if not nfo_path.exists():
|
||||
continue
|
||||
title, year = _parse_nfo_title_and_year(nfo_path)
|
||||
if not title or not year:
|
||||
continue
|
||||
expected_name = _compute_expected_folder_name(title, year)
|
||||
groups[expected_name].append((series_dir.name, nfo_path))
|
||||
|
||||
# Filter to only groups with more than one folder
|
||||
duplicates = []
|
||||
for key, items in groups.items():
|
||||
if len(items) > 1:
|
||||
folders = [item[0] for item in items]
|
||||
nfo_paths = [item[1] for item in items]
|
||||
duplicates.append(DuplicateGroup(key=key, folders=folders, nfo_paths=nfo_paths))
|
||||
|
||||
return duplicates
|
||||
|
||||
|
||||
def _try_merge_duplicate_group(group: DuplicateGroup, dry_run: bool = False) -> bool:
|
||||
"""Attempt to merge a duplicate group automatically.
|
||||
|
||||
Uses the first folder as the canonical one and removes others if they are
|
||||
empty or contain only symlinks.
|
||||
|
||||
Args:
|
||||
group: The DuplicateGroup to merge.
|
||||
dry_run: If True, only log actions without executing them.
|
||||
|
||||
Returns:
|
||||
True if merge was successful, False otherwise.
|
||||
"""
|
||||
if len(group.folders) < 2:
|
||||
return True
|
||||
|
||||
# Keep first folder as canonical, mark others for removal
|
||||
canonical = group.folders[0]
|
||||
to_remove = group.folders[1:]
|
||||
|
||||
for folder in to_remove:
|
||||
folder_path = group.nfo_paths[0].parent.parent / folder # same parent dir
|
||||
if not folder_path.exists():
|
||||
continue
|
||||
|
||||
# Check if folder is empty or only has symlinks
|
||||
try:
|
||||
contents = list(folder_path.iterdir())
|
||||
except PermissionError:
|
||||
logger.warning("Permission denied accessing %s, skip merge", folder_path)
|
||||
return False
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
if not contents:
|
||||
# Empty folder - safe to remove
|
||||
if dry_run:
|
||||
logger.info("[DRY-RUN] Would delete empty duplicate folder: %s", folder_path)
|
||||
else:
|
||||
try:
|
||||
folder_path.rmdir()
|
||||
logger.info("Deleted empty duplicate folder: %s", folder_path)
|
||||
except OSError:
|
||||
return False
|
||||
continue
|
||||
|
||||
# Check if all contents are symlinks pointing to canonical
|
||||
all_symlinks = all(
|
||||
item.is_symlink() and item.resolve() == (folder_path.parent / canonical).resolve()
|
||||
for item in contents
|
||||
)
|
||||
if all_symlinks:
|
||||
if dry_run:
|
||||
logger.info("[DRY-RUN] Would remove symlinks in duplicate folder: %s", folder_path)
|
||||
else:
|
||||
for item in contents:
|
||||
item.unlink()
|
||||
try:
|
||||
folder_path.rmdir()
|
||||
logger.info("Removed symlink-only duplicate folder: %s", folder_path)
|
||||
except OSError:
|
||||
return False
|
||||
continue
|
||||
|
||||
# Cannot auto-merge - requires manual intervention
|
||||
logger.warning(
|
||||
"Cannot auto-merge duplicate folders for '%s': %s (manual merge required)",
|
||||
group.key,
|
||||
[canonical] + to_remove,
|
||||
)
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _parse_nfo_title_and_year(nfo_path: Path) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""Parse a tvshow.nfo and return (title, year) text values.
|
||||
|
||||
@@ -383,6 +519,28 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
|
||||
|
||||
stats = {"scanned": 0, "renamed": 0, "skipped": 0, "errors": 0}
|
||||
|
||||
# Detect pre-existing duplicates before rename loop
|
||||
pre_existing_duplicates: Set[str] = set()
|
||||
duplicates = _scan_for_pre_existing_duplicates(anime_dir)
|
||||
for dup_group in duplicates:
|
||||
# Try automatic merge first
|
||||
if _try_merge_duplicate_group(dup_group, dry_run=dry_run):
|
||||
logger.info(
|
||||
"Auto-merged duplicate group for '%s' (%d folders)",
|
||||
dup_group.key,
|
||||
dup_group.count,
|
||||
)
|
||||
else:
|
||||
# Flag all folders in this group as pre-existing duplicates
|
||||
for folder in dup_group.folders:
|
||||
pre_existing_duplicates.add(folder)
|
||||
logger.warning(
|
||||
"Duplicate folders detected for series '%s': %s — "
|
||||
"manual cleanup required (different releases or non-empty duplicates)",
|
||||
dup_group.key,
|
||||
dup_group.folders,
|
||||
)
|
||||
|
||||
for series_dir in sorted(anime_dir.iterdir()):
|
||||
if not series_dir.is_dir():
|
||||
continue
|
||||
@@ -422,6 +580,15 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
|
||||
|
||||
expected_path = anime_dir / expected_name
|
||||
|
||||
# Check for pre-existing duplicate
|
||||
if current_name in pre_existing_duplicates:
|
||||
logger.warning(
|
||||
"Skipping rename for '%s' — pre-existing duplicate folder detected",
|
||||
current_name,
|
||||
)
|
||||
stats["errors"] += 1
|
||||
continue
|
||||
|
||||
# Check for duplicate target
|
||||
if expected_path.exists():
|
||||
logger.warning(
|
||||
|
||||
Reference in New Issue
Block a user