feat: add duplicate folder detection and /duplicate-folders API endpoint
- Add DuplicateFolderGroup and DuplicateFoldersResponse Pydantic models
- Add /duplicate-folders GET endpoint for listing pre-existing duplicates
- Add _scan_for_pre_existing_duplicates() function for NFO-based detection
- Add _try_merge_duplicate_group() for auto-merging empty/symlink-only duplicates
- Integrate duplicate detection into validate_and_rename_series_folders workflow
- Skip rename for flagged duplicates to prevent data loss during merge
This commit is contained in:
@@ -1,12 +1,13 @@
|
|||||||
import logging
|
import logging
|
||||||
import warnings
|
import warnings
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any, List, Optional
|
from typing import Any, List, Optional
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, status
|
from fastapi import APIRouter, Depends, HTTPException, status
|
||||||
from pydantic import BaseModel, Field, field_validator
|
from pydantic import BaseModel, Field, field_validator
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from src.core.entities.series import Serie
|
from src.config.settings import settings
|
||||||
from src.core.utils.key_utils import generate_key_from_folder, is_valid_key
|
from src.core.utils.key_utils import generate_key_from_folder, is_valid_key
|
||||||
from src.server.database.service import AnimeSeriesService
|
from src.server.database.service import AnimeSeriesService
|
||||||
from src.server.exceptions import (
|
from src.server.exceptions import (
|
||||||
@@ -26,6 +27,9 @@ from src.server.utils.dependencies import (
|
|||||||
)
|
)
|
||||||
from src.server.utils.filesystem import sanitize_folder_name
|
from src.server.utils.filesystem import sanitize_folder_name
|
||||||
from src.server.utils.validators import validate_filter_value, validate_search_query
|
from src.server.utils.validators import validate_filter_value, validate_search_query
|
||||||
|
from src.server.services.folder_rename_service import (
|
||||||
|
_scan_for_pre_existing_duplicates,
|
||||||
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -71,6 +75,100 @@ async def get_anime_status(
|
|||||||
) from exc
|
) from exc
|
||||||
|
|
||||||
|
|
||||||
|
class DuplicateFolderGroup(BaseModel):
    """A group of folders that were detected as duplicates of one series.

    Attributes:
        key: Grouping key of the series. The duplicate scan groups folders
            by the folder name expected from the ``tvshow.nfo`` title and
            year, so this is that expected folder name, not a
            provider-assigned identifier.
        folders: Names of the folders that belong to this group.
        folder_count: Number of folders in the group.
    """

    key: str = Field(..., description="Series key (unique identifier)")
    folders: List[str] = Field(..., description="List of duplicate folder names")
    folder_count: int = Field(..., description="Number of duplicate folders")
|
||||||
|
|
||||||
|
|
||||||
|
class DuplicateFoldersResponse(BaseModel):
    """Payload returned by the duplicate-folders listing.

    Attributes:
        total_groups: How many duplicate groups were found.
        duplicate_groups: The duplicate groups themselves.
        message: Short summary intended for human readers.
    """

    total_groups: int = Field(
        ...,
        description="Total number of duplicate groups",
    )
    duplicate_groups: List[DuplicateFolderGroup] = Field(
        ...,
        description="List of duplicate folder groups",
    )
    message: str = Field(
        ...,
        description="Human-readable summary",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/duplicate-folders", response_model=DuplicateFoldersResponse)
async def get_duplicate_folders(
    _auth: dict = Depends(require_auth),
) -> DuplicateFoldersResponse:
    """List all pre-existing duplicate folder groups.

    Scans the anime directory for folders with tvshow.nfo files that
    map to the same series key. Returns groups of duplicates for
    manual review and cleanup.

    Args:
        _auth: Result of the ``require_auth`` dependency; unused beyond
            enforcing authentication.

    Returns:
        DuplicateFoldersResponse with groups of duplicate folders. When the
        anime directory is not configured or does not exist, an empty
        response with an explanatory message is returned instead of an
        error.

    Raises:
        ServerError: If anything fails while scanning or building the
            response.

    Note:
        Not all duplicate folders are safe to merge - some may belong
        to different releases (e.g., dubbed vs. subbed). Review carefully
        before taking action.
    """
    # Imported locally so this handler carries its own dependency;
    # fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    try:
        if not settings.anime_directory:
            return DuplicateFoldersResponse(
                total_groups=0,
                duplicate_groups=[],
                message="Anime directory not configured",
            )

        anime_dir = Path(settings.anime_directory)
        if not anime_dir.is_dir():
            return DuplicateFoldersResponse(
                total_groups=0,
                duplicate_groups=[],
                message=f"Anime directory not found: {anime_dir}",
            )

        # The scan walks the directory and parses one NFO per folder using
        # blocking I/O; run it in the threadpool so the event loop stays
        # responsive for other requests.
        duplicates = await run_in_threadpool(
            _scan_for_pre_existing_duplicates, anime_dir
        )

        groups = [
            DuplicateFolderGroup(
                key=dup.key,
                folders=dup.folders,
                folder_count=dup.count,
            )
            for dup in duplicates
        ]

        if groups:
            message = (
                f"Found {len(groups)} duplicate group(s). "
                "Review carefully - some duplicates may be different releases "
                "(e.g., dubbed vs. subbed)."
            )
        else:
            message = "No duplicate folders found."

        return DuplicateFoldersResponse(
            total_groups=len(groups),
            duplicate_groups=groups,
            message=message,
        )
    except Exception as exc:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("Failed to scan for duplicate folders: %s", exc)
        raise ServerError(
            message=f"Failed to scan for duplicates: {str(exc)}"
        ) from exc
|
||||||
|
|
||||||
|
|
||||||
class AnimeSummary(BaseModel):
|
class AnimeSummary(BaseModel):
|
||||||
"""Summary of an anime series with missing episodes.
|
"""Summary of an anime series with missing episodes.
|
||||||
|
|
||||||
|
|||||||
@@ -13,8 +13,9 @@ reflect the new paths.
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional, Set, Tuple
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
@@ -34,6 +35,141 @@ logger = logging.getLogger(__name__)
|
|||||||
INVALID_PATH_CHARS = '<>:"/\\|?*\x00'
|
INVALID_PATH_CHARS = '<>:"/\\|?*\x00'
|
||||||
|
|
||||||
|
|
||||||
|
class DuplicateGroup:
    """Represents a group of duplicate folders for the same series.

    Attributes:
        key: Identifier shared by every folder in the group. The scanner in
            this module passes the expected folder name computed from the
            NFO title and year (i.e. the name a rename would produce).
        folders: Folder names that map to this series.
        nfo_paths: NFO file paths, in the same order as ``folders``.
    """

    def __init__(self, key: str, folders: List[str], nfo_paths: List[Path]):
        self.key = key
        # Copy the sequences so later mutation of the caller's lists cannot
        # silently change this group's membership.
        self.folders = list(folders)
        self.nfo_paths = list(nfo_paths)

    @property
    def count(self) -> int:
        """Number of folders in the group."""
        return len(self.folders)

    def __repr__(self) -> str:
        return f"DuplicateGroup(key={self.key!r}, folders={self.folders})"
|
||||||
|
|
||||||
|
|
||||||
|
def _scan_for_pre_existing_duplicates(anime_dir: Path) -> List[DuplicateGroup]:
    """Scan anime directory for pre-existing duplicate folders.

    Every sub-directory that contains a ``tvshow.nfo`` with both a title and
    a year is grouped by the folder name expected for that title and year.
    Any expected name shared by more than one folder is reported as a
    duplicate group. Folders without an NFO, or whose NFO lacks a title or
    a year, are ignored.

    Args:
        anime_dir: Path to the anime directory to scan.

    Returns:
        List of DuplicateGroup objects, one per expected folder name that is
        shared by several folders. Each group's ``key`` is that expected
        name, and its folders are listed in sorted path order.
    """
    # expected folder name -> [(actual folder name, NFO path), ...]
    groups: Dict[str, List[Tuple[str, Path]]] = defaultdict(list)

    # iterdir() order is filesystem-dependent. Sorting makes the result
    # reproducible (including which folder comes first in a group) and
    # matches the sorted iteration of the rename loop.
    for series_dir in sorted(anime_dir.iterdir()):
        if not series_dir.is_dir():
            continue
        nfo_path = series_dir / "tvshow.nfo"
        if not nfo_path.exists():
            continue
        title, year = _parse_nfo_title_and_year(nfo_path)
        if not title or not year:
            continue
        expected_name = _compute_expected_folder_name(title, year)
        groups[expected_name].append((series_dir.name, nfo_path))

    # Only names claimed by more than one folder are duplicates.
    return [
        DuplicateGroup(
            key=expected_name,
            folders=[folder_name for folder_name, _ in items],
            nfo_paths=[nfo for _, nfo in items],
        )
        for expected_name, items in groups.items()
        if len(items) > 1
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _try_merge_duplicate_group(group: DuplicateGroup, dry_run: bool = False) -> bool:
    """Attempt to merge a duplicate group automatically.

    Uses the first folder as the canonical one and removes others if they are
    empty or contain only symlinks that resolve to the canonical folder.
    Folders that no longer exist are skipped. Processing stops at the first
    folder that cannot be handled, so earlier folders of the group may
    already have been removed when False is returned.

    Args:
        group: The DuplicateGroup to merge.
        dry_run: If True, only log actions without executing them.

    Returns:
        True if every non-canonical folder was removed (or, in dry-run mode,
        would have been removed) or was already absent; False if any folder
        needs manual intervention or a filesystem operation failed.
    """
    if len(group.folders) < 2:
        return True

    # Keep first folder as canonical, mark others for removal
    canonical = group.folders[0]
    to_remove = group.folders[1:]

    # The folder holding the first NFO sits directly in the scanned
    # directory, so its parent is the directory all group folders live in.
    base_dir = group.nfo_paths[0].parent.parent
    canonical_path = base_dir / canonical

    for folder in to_remove:
        folder_path = base_dir / folder
        if not folder_path.exists():
            continue

        # Check if folder is empty or only has symlinks
        try:
            contents = list(folder_path.iterdir())
        except PermissionError:
            logger.warning("Permission denied accessing %s, skip merge", folder_path)
            return False
        except OSError as exc:
            logger.warning("Could not list %s (%s), skip merge", folder_path, exc)
            return False

        if not contents:
            # Empty folder - safe to remove
            if dry_run:
                logger.info("[DRY-RUN] Would delete empty duplicate folder: %s", folder_path)
            else:
                try:
                    folder_path.rmdir()
                except OSError as exc:
                    logger.warning(
                        "Failed to delete empty duplicate folder %s: %s",
                        folder_path,
                        exc,
                    )
                    return False
                logger.info("Deleted empty duplicate folder: %s", folder_path)
            continue

        # Check if all contents are symlinks pointing to canonical
        try:
            # Resolve the canonical folder once per duplicate, not per entry.
            canonical_target = canonical_path.resolve()
            all_symlinks = all(
                item.is_symlink() and item.resolve() == canonical_target
                for item in contents
            )
        except OSError as exc:
            logger.warning(
                "Could not resolve entries of %s (%s), skip merge", folder_path, exc
            )
            return False

        if all_symlinks:
            if dry_run:
                logger.info("[DRY-RUN] Would remove symlinks in duplicate folder: %s", folder_path)
            else:
                # Unlinking can fail just like rmdir; report failure through
                # the return value instead of letting OSError escape.
                try:
                    for item in contents:
                        item.unlink()
                    folder_path.rmdir()
                except OSError as exc:
                    logger.warning(
                        "Failed to remove symlink-only duplicate folder %s: %s",
                        folder_path,
                        exc,
                    )
                    return False
                logger.info("Removed symlink-only duplicate folder: %s", folder_path)
            continue

        # Cannot auto-merge - requires manual intervention
        logger.warning(
            "Cannot auto-merge duplicate folders for '%s': %s (manual merge required)",
            group.key,
            [canonical] + to_remove,
        )
        return False

    return True
|
||||||
|
|
||||||
|
|
||||||
def _parse_nfo_title_and_year(nfo_path: Path) -> Tuple[Optional[str], Optional[str]]:
|
def _parse_nfo_title_and_year(nfo_path: Path) -> Tuple[Optional[str], Optional[str]]:
|
||||||
"""Parse a tvshow.nfo and return (title, year) text values.
|
"""Parse a tvshow.nfo and return (title, year) text values.
|
||||||
|
|
||||||
@@ -383,6 +519,28 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
|
|||||||
|
|
||||||
stats = {"scanned": 0, "renamed": 0, "skipped": 0, "errors": 0}
|
stats = {"scanned": 0, "renamed": 0, "skipped": 0, "errors": 0}
|
||||||
|
|
||||||
|
# Detect pre-existing duplicates before rename loop
|
||||||
|
pre_existing_duplicates: Set[str] = set()
|
||||||
|
duplicates = _scan_for_pre_existing_duplicates(anime_dir)
|
||||||
|
for dup_group in duplicates:
|
||||||
|
# Try automatic merge first
|
||||||
|
if _try_merge_duplicate_group(dup_group, dry_run=dry_run):
|
||||||
|
logger.info(
|
||||||
|
"Auto-merged duplicate group for '%s' (%d folders)",
|
||||||
|
dup_group.key,
|
||||||
|
dup_group.count,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Flag all folders in this group as pre-existing duplicates
|
||||||
|
for folder in dup_group.folders:
|
||||||
|
pre_existing_duplicates.add(folder)
|
||||||
|
logger.warning(
|
||||||
|
"Duplicate folders detected for series '%s': %s — "
|
||||||
|
"manual cleanup required (different releases or non-empty duplicates)",
|
||||||
|
dup_group.key,
|
||||||
|
dup_group.folders,
|
||||||
|
)
|
||||||
|
|
||||||
for series_dir in sorted(anime_dir.iterdir()):
|
for series_dir in sorted(anime_dir.iterdir()):
|
||||||
if not series_dir.is_dir():
|
if not series_dir.is_dir():
|
||||||
continue
|
continue
|
||||||
@@ -422,6 +580,15 @@ async def validate_and_rename_series_folders(dry_run: bool = False) -> Dict[str,
|
|||||||
|
|
||||||
expected_path = anime_dir / expected_name
|
expected_path = anime_dir / expected_name
|
||||||
|
|
||||||
|
# Check for pre-existing duplicate
|
||||||
|
if current_name in pre_existing_duplicates:
|
||||||
|
logger.warning(
|
||||||
|
"Skipping rename for '%s' — pre-existing duplicate folder detected",
|
||||||
|
current_name,
|
||||||
|
)
|
||||||
|
stats["errors"] += 1
|
||||||
|
continue
|
||||||
|
|
||||||
# Check for duplicate target
|
# Check for duplicate target
|
||||||
if expected_path.exists():
|
if expected_path.exists():
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
|||||||
Reference in New Issue
Block a user