Add DataMigrationService for file-to-database migration (Task 1)

This commit is contained in:
Lukas 2025-12-01 18:09:38 +01:00
parent 0222262f8f
commit 7e2d3dd5ab
3 changed files with 980 additions and 1 deletions

View File

@ -65,7 +65,7 @@ The current implementation stores anime series metadata in `data` files (JSON fo
---
### Task 1: Create Data File Migration Service
**File:** `src/server/services/data_migration_service.py`

View File

@ -0,0 +1,413 @@
"""Data migration service for migrating file-based storage to database.
This module provides functionality to migrate anime series data from
legacy file-based storage (data files without .json extension) to the
SQLite database using the AnimeSeries model.
The migration service:
- Scans anime directories for existing data files
- Reads Serie objects from data files
- Migrates them to the database using AnimeSeriesService
- Handles errors gracefully without stopping the migration
- Provides detailed migration results
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.exc import IntegrityError
from src.core.entities.series import Serie
from src.server.database.service import AnimeSeriesService
logger = logging.getLogger(__name__)
@dataclass
class MigrationResult:
    """Outcome summary of one data-file migration run.

    Attributes:
        total_found: How many data files were discovered.
        migrated: How many files were written to the database.
        skipped: How many files were already present with identical data.
        failed: How many files could not be migrated.
        errors: Human-readable messages, one per failure.
    """

    total_found: int = 0
    migrated: int = 0
    skipped: int = 0
    failed: int = 0
    errors: List[str] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Normalize an explicitly passed ``errors=None`` to an empty list."""
        if self.errors is None:
            self.errors = []
class DataMigrationError(Exception):
    """Root of the data-migration exception hierarchy."""


class DataFileReadError(DataMigrationError):
    """Signals that a data file could not be read or parsed."""
class DataMigrationService:
    """Service for migrating data files to database.

    This service handles the migration of anime series data from
    file-based storage to the database. It scans directories for
    data files, reads Serie objects, and creates AnimeSeries records.

    Example:
        ```python
        service = DataMigrationService()
        # Check if migration is needed (synchronous call - no await)
        if service.is_migration_needed("/path/to/anime"):
            async with get_db_session() as db:
                result = await service.migrate_all("/path/to/anime", db)
                print(f"Migrated {result.migrated} series")
        ```
    """

    def __init__(self) -> None:
        """Initialize the data migration service (stateless; nothing to set up)."""

    def scan_for_data_files(self, anime_directory: str) -> List[Path]:
        """Scan for data files in the anime directory.

        Finds all 'data' files (JSON format without extension) in
        the anime directory structure. Each series folder may contain
        a 'data' file with series metadata. An empty/invalid path, a
        missing directory, or I/O errors are logged and yield an empty
        (or partial) result instead of raising.

        Args:
            anime_directory: Path to the anime directory containing
                series folders

        Returns:
            Sorted list of Path objects pointing to data files
        """
        if not anime_directory or not anime_directory.strip():
            logger.warning("Empty anime directory provided")
            return []
        base_path = Path(anime_directory)
        if not base_path.exists():
            logger.warning(
                "Anime directory does not exist: %s",
                anime_directory
            )
            return []
        if not base_path.is_dir():
            logger.warning(
                "Anime directory is not a directory: %s",
                anime_directory
            )
            return []
        data_files: List[Path] = []
        try:
            # Iterate through all subdirectories (series folders)
            for folder in base_path.iterdir():
                if not folder.is_dir():
                    continue
                # Check for 'data' file in each series folder
                data_file = folder / "data"
                if data_file.is_file():
                    data_files.append(data_file)
                    logger.debug("Found data file: %s", data_file)
        except PermissionError as e:
            logger.error(
                "Permission denied scanning directory %s: %s",
                anime_directory,
                e
            )
        except OSError as e:
            logger.error(
                "OS error scanning directory %s: %s",
                anime_directory,
                e
            )
        # Sort so results are deterministic regardless of filesystem order.
        data_files.sort()
        logger.info(
            "Found %d data files in %s",
            len(data_files),
            anime_directory
        )
        return data_files

    def _read_data_file(self, data_path: Path) -> Serie:
        """Read a Serie object from a data file.

        Args:
            data_path: Path to the data file

        Returns:
            The parsed Serie object (never None; all failures raise)

        Raises:
            DataFileReadError: If the file cannot be read or parsed,
                or if the serie key is missing or empty
        """
        try:
            serie = Serie.load_from_file(str(data_path))
            # Validate the serie has required fields
            if not serie.key or not serie.key.strip():
                raise DataFileReadError(
                    f"Data file {data_path} has empty or missing key"
                )
            logger.debug(
                "Successfully read serie '%s' from %s",
                serie.key,
                data_path
            )
            return serie
        except DataFileReadError:
            # Re-raise our own validation error unchanged; without this
            # it would be swallowed by the generic handler below and
            # re-wrapped with a misleading "Error reading" message.
            raise
        except FileNotFoundError as e:
            raise DataFileReadError(
                f"Data file not found: {data_path}"
            ) from e
        except PermissionError as e:
            raise DataFileReadError(
                f"Permission denied reading data file: {data_path}"
            ) from e
        except (ValueError, KeyError, TypeError) as e:
            raise DataFileReadError(
                f"Invalid data in file {data_path}: {e}"
            ) from e
        except Exception as e:
            raise DataFileReadError(
                f"Error reading data file {data_path}: {e}"
            ) from e

    async def migrate_data_file(
        self,
        data_path: Path,
        db: AsyncSession
    ) -> bool:
        """Migrate a single data file to the database.

        Reads the data file, checks if the series already exists in the
        database, and creates a new record if it doesn't exist. If the
        series exists, updates the episode_dict when it has changed.

        Args:
            data_path: Path to the data file
            db: Async database session

        Returns:
            True if the series was migrated (created or updated),
            False if skipped (already exists with same data)

        Raises:
            DataFileReadError: If the file cannot be read
            DataMigrationError: If database operation fails
        """
        # Read the data file; _read_data_file raises on any failure,
        # so no None-check is needed here.
        serie = self._read_data_file(data_path)
        # Check if series already exists in database
        existing = await AnimeSeriesService.get_by_key(db, serie.key)
        if existing is not None:
            # Check if episode_dict has changed
            existing_dict = existing.episode_dict or {}
            new_dict = serie.episodeDict or {}
            # Convert keys to strings for comparison (JSON stores keys as strings)
            new_dict_str_keys = {
                str(k): v for k, v in new_dict.items()
            }
            if existing_dict == new_dict_str_keys:
                logger.debug(
                    "Series '%s' already exists with same data, skipping",
                    serie.key
                )
                return False
            # Update episode_dict if different
            try:
                await AnimeSeriesService.update(
                    db,
                    existing.id,
                    episode_dict=new_dict_str_keys
                )
            except Exception as e:
                # Wrap like the create path so callers see one error type.
                raise DataMigrationError(
                    f"Failed to update series '{serie.key}' in database: {e}"
                ) from e
            logger.info(
                "Updated episode_dict for existing series '%s'",
                serie.key
            )
            return True
        # Create new series in database
        try:
            # Convert episode_dict keys to strings for JSON storage
            episode_dict_for_db = {
                str(k): v for k, v in (serie.episodeDict or {}).items()
            }
            await AnimeSeriesService.create(
                db,
                key=serie.key,
                name=serie.name,
                site=serie.site,
                folder=serie.folder,
                episode_dict=episode_dict_for_db,
            )
            logger.info(
                "Migrated series '%s' to database",
                serie.key
            )
            return True
        except IntegrityError as e:
            # Race condition - series was created by another process
            logger.warning(
                "Series '%s' was already created (race condition): %s",
                serie.key,
                e
            )
            return False
        except Exception as e:
            raise DataMigrationError(
                f"Failed to create series '{serie.key}' in database: {e}"
            ) from e

    async def migrate_all(
        self,
        anime_directory: str,
        db: AsyncSession
    ) -> MigrationResult:
        """Migrate all data files from anime directory to database.

        Scans the anime directory for data files and migrates each one
        to the database. Errors are logged but do not stop the migration.

        Args:
            anime_directory: Path to the anime directory
            db: Async database session

        Returns:
            MigrationResult with counts and error messages
        """
        result = MigrationResult()
        # Scan for data files
        data_files = self.scan_for_data_files(anime_directory)
        result.total_found = len(data_files)
        if result.total_found == 0:
            logger.info("No data files found to migrate")
            return result
        logger.info(
            "Starting migration of %d data files",
            result.total_found
        )
        # Migrate each file; per-file failures are recorded, not raised.
        # DataFileReadError must come before DataMigrationError (its base).
        for data_path in data_files:
            try:
                migrated = await self.migrate_data_file(data_path, db)
                if migrated:
                    result.migrated += 1
                else:
                    result.skipped += 1
            except DataFileReadError as e:
                result.failed += 1
                error_msg = f"Failed to read {data_path}: {e}"
                result.errors.append(error_msg)
                logger.error(error_msg)
            except DataMigrationError as e:
                result.failed += 1
                error_msg = f"Failed to migrate {data_path}: {e}"
                result.errors.append(error_msg)
                logger.error(error_msg)
            except Exception as e:
                result.failed += 1
                error_msg = f"Unexpected error migrating {data_path}: {e}"
                result.errors.append(error_msg)
                logger.exception(error_msg)
        # Commit all changes in one transaction at the end.
        try:
            await db.commit()
        except Exception as e:
            logger.error("Failed to commit migration: %s", e)
            result.errors.append(f"Failed to commit migration: {e}")
        logger.info(
            "Migration complete: %d migrated, %d skipped, %d failed",
            result.migrated,
            result.skipped,
            result.failed
        )
        return result

    def is_migration_needed(self, anime_directory: str) -> bool:
        """Check if there are data files to migrate.

        This is a synchronous check (no database access).

        Args:
            anime_directory: Path to the anime directory

        Returns:
            True if data files exist, False otherwise
        """
        data_files = self.scan_for_data_files(anime_directory)
        needs_migration = len(data_files) > 0
        if needs_migration:
            logger.info(
                "Migration needed: found %d data files",
                len(data_files)
            )
        else:
            logger.debug("No migration needed: no data files found")
        return needs_migration
# Module-level holder for the lazily created singleton.
_data_migration_service: Optional[DataMigrationService] = None


def get_data_migration_service() -> DataMigrationService:
    """Return the process-wide DataMigrationService, creating it on first use.

    Returns:
        The shared DataMigrationService instance
    """
    global _data_migration_service
    if _data_migration_service is None:
        _data_migration_service = DataMigrationService()
    return _data_migration_service
def reset_data_migration_service() -> None:
    """Discard the cached singleton so the next access builds a fresh one.

    Intended for test isolation only.
    """
    global _data_migration_service
    _data_migration_service = None

View File

@ -0,0 +1,566 @@
"""Unit tests for DataMigrationService.
This module contains comprehensive tests for the data migration service,
including scanning for data files, migrating individual files,
batch migration, and error handling.
"""
import json
import tempfile
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from src.core.entities.series import Serie
from src.server.services.data_migration_service import (
DataFileReadError,
DataMigrationError,
DataMigrationService,
MigrationResult,
get_data_migration_service,
reset_data_migration_service,
)
class TestMigrationResult:
    """Tests for the MigrationResult dataclass."""

    def test_migration_result_defaults(self):
        """A freshly constructed result has zero counters and no errors."""
        res = MigrationResult()
        assert res.errors == []
        assert res.failed == 0
        assert res.skipped == 0
        assert res.migrated == 0
        assert res.total_found == 0

    def test_migration_result_with_values(self):
        """Explicitly supplied field values are stored unchanged."""
        res = MigrationResult(
            total_found=10,
            migrated=5,
            skipped=3,
            failed=2,
            errors=["Error 1", "Error 2"],
        )
        assert res.errors == ["Error 1", "Error 2"]
        assert (res.total_found, res.migrated) == (10, 5)
        assert (res.skipped, res.failed) == (3, 2)

    def test_migration_result_post_init_none_errors(self):
        """__post_init__ replaces a None errors attribute with an empty list."""
        res = MigrationResult()
        res.errors = None
        res.__post_init__()
        assert res.errors == []
class TestDataMigrationServiceScan:
    """Tests for DataMigrationService.scan_for_data_files."""

    def test_scan_empty_directory(self):
        """A directory with no series folders yields an empty list."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            assert svc.scan_for_data_files(root) == []

    def test_scan_empty_string(self):
        """An empty path string yields an empty list."""
        assert DataMigrationService().scan_for_data_files("") == []

    def test_scan_whitespace_string(self):
        """A whitespace-only path string yields an empty list."""
        assert DataMigrationService().scan_for_data_files(" ") == []

    def test_scan_nonexistent_directory(self):
        """A path that does not exist yields an empty list."""
        svc = DataMigrationService()
        assert svc.scan_for_data_files("/nonexistent/path") == []

    def test_scan_file_instead_of_directory(self):
        """A path pointing at a regular file yields an empty list."""
        svc = DataMigrationService()
        with tempfile.NamedTemporaryFile() as handle:
            assert svc.scan_for_data_files(handle.name) == []

    def test_scan_finds_data_files(self):
        """Data files inside series folders are discovered."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            base = Path(root)
            for folder_name, payload in (
                ("Attack on Titan (2013)", '{"key": "aot", "name": "AOT"}'),
                ("One Piece", '{"key": "one-piece", "name": "OP"}'),
            ):
                series_dir = base / folder_name
                series_dir.mkdir()
                (series_dir / "data").write_text(payload)
            # A folder without a data file must not contribute an entry.
            (base / "No Data Here").mkdir()
            found = svc.scan_for_data_files(root)
            assert len(found) == 2
            assert all(isinstance(entry, Path) for entry in found)
            assert {entry.name for entry in found} == {"data"}

    def test_scan_ignores_files_in_root(self):
        """A 'data' file directly in the anime root is not picked up."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            base = Path(root)
            (base / "data").write_text('{"key": "root"}')
            series_dir = base / "Series One"
            series_dir.mkdir()
            (series_dir / "data").write_text('{"key": "series-one"}')
            found = svc.scan_for_data_files(root)
            assert [entry.parent.name for entry in found] == ["Series One"]

    def test_scan_ignores_nested_data_files(self):
        """Only data files one level below the root are discovered."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            series_dir = Path(root) / "Series One"
            series_dir.mkdir()
            (series_dir / "data").write_text('{"key": "series-one"}')
            # Deeper 'data' files (e.g. per-season) must be ignored.
            nested = series_dir / "Season 1"
            nested.mkdir()
            (nested / "data").write_text('{"key": "nested"}')
            found = svc.scan_for_data_files(root)
            assert len(found) == 1
            assert found[0].parent.name == "Series One"
class TestDataMigrationServiceReadFile:
    """Tests for DataMigrationService._read_data_file."""

    def test_read_valid_data_file(self):
        """A well-formed data file round-trips into a Serie."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            target = Path(root) / "data"
            payload = {
                "key": "attack-on-titan",
                "name": "Attack on Titan",
                "site": "aniworld.to",
                "folder": "Attack on Titan (2013)",
                "episodeDict": {"1": [1, 2, 3]},
            }
            target.write_text(json.dumps(payload))
            serie = svc._read_data_file(target)
            assert serie is not None
            assert serie.folder == "Attack on Titan (2013)"
            assert serie.site == "aniworld.to"
            assert serie.name == "Attack on Titan"
            assert serie.key == "attack-on-titan"

    def test_read_file_not_found(self):
        """Reading a nonexistent path raises DataFileReadError."""
        svc = DataMigrationService()
        with pytest.raises(DataFileReadError) as exc_info:
            svc._read_data_file(Path("/nonexistent/data"))
        message = str(exc_info.value)
        assert "not found" in message.lower() or "Error reading" in message

    def test_read_file_empty_key(self):
        """A data file whose key is empty raises DataFileReadError."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            target = Path(root) / "data"
            payload = {
                "key": "",
                "name": "No Key Series",
                "site": "aniworld.to",
                "folder": "Test",
                "episodeDict": {},
            }
            target.write_text(json.dumps(payload))
            with pytest.raises(DataFileReadError) as exc_info:
                svc._read_data_file(target)
            # Either our own validation or Serie's ValueError mentions the key.
            lowered = str(exc_info.value).lower()
            assert "empty" in lowered or "key" in lowered

    def test_read_file_invalid_json(self):
        """Malformed JSON raises DataFileReadError."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            target = Path(root) / "data"
            target.write_text("not valid json {{{")
            with pytest.raises(DataFileReadError):
                svc._read_data_file(target)

    def test_read_file_missing_required_fields(self):
        """A data file lacking the 'key' field raises DataFileReadError."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            target = Path(root) / "data"
            target.write_text('{"name": "Test", "site": "test.com"}')
            with pytest.raises(DataFileReadError):
                svc._read_data_file(target)
class TestDataMigrationServiceMigrateSingle:
    """Test migrating single data files."""

    @pytest.fixture
    def mock_db(self):
        """Create a mock database session."""
        return AsyncMock()

    @pytest.fixture
    def sample_serie(self):
        """Create a sample Serie for testing."""
        # Note: episodeDict uses int keys here; the service stringifies
        # them before database comparison/storage.
        return Serie(
            key="attack-on-titan",
            name="Attack on Titan",
            site="aniworld.to",
            folder="Attack on Titan (2013)",
            episodeDict={1: [1, 2, 3], 2: [1, 2]}
        )

    @pytest.mark.asyncio
    async def test_migrate_new_series(self, mock_db, sample_serie):
        """Test migrating a new series to database."""
        service = DataMigrationService()
        with tempfile.TemporaryDirectory() as tmp_dir:
            data_file = Path(tmp_dir) / "data"
            sample_serie.save_to_file(str(data_file))
            # Patch the read step so the test exercises only the DB branch.
            with patch.object(
                service,
                '_read_data_file',
                return_value=sample_serie
            ):
                with patch(
                    'src.server.services.data_migration_service.AnimeSeriesService'
                ) as MockService:
                    # get_by_key -> None means "not in DB yet" (create path).
                    MockService.get_by_key = AsyncMock(return_value=None)
                    MockService.create = AsyncMock()
                    result = await service.migrate_data_file(data_file, mock_db)
                    assert result is True
                    MockService.create.assert_called_once()
                    # Verify the key was passed correctly
                    call_kwargs = MockService.create.call_args.kwargs
                    assert call_kwargs['key'] == "attack-on-titan"
                    assert call_kwargs['name'] == "Attack on Titan"

    @pytest.mark.asyncio
    async def test_migrate_existing_series_same_data(self, mock_db, sample_serie):
        """Test migrating series that already exists with same data."""
        service = DataMigrationService()
        # Create mock existing series with same episode_dict
        # (string keys, as the database stores them).
        existing = MagicMock()
        existing.id = 1
        existing.episode_dict = {"1": [1, 2, 3], "2": [1, 2]}
        with patch.object(
            service,
            '_read_data_file',
            return_value=sample_serie
        ):
            with patch(
                'src.server.services.data_migration_service.AnimeSeriesService'
            ) as MockService:
                MockService.get_by_key = AsyncMock(return_value=existing)
                result = await service.migrate_data_file(
                    Path("/fake/data"),
                    mock_db
                )
                # Identical data -> skip (False), and no create call.
                assert result is False
                MockService.create.assert_not_called()

    @pytest.mark.asyncio
    async def test_migrate_existing_series_different_data(self, mock_db):
        """Test migrating series that exists with different episode_dict."""
        service = DataMigrationService()
        # Serie with new episodes
        serie = Serie(
            key="attack-on-titan",
            name="Attack on Titan",
            site="aniworld.to",
            folder="AOT",
            episodeDict={1: [1, 2, 3, 4, 5]}  # More episodes than existing
        )
        # Existing series has fewer episodes
        existing = MagicMock()
        existing.id = 1
        existing.episode_dict = {"1": [1, 2, 3]}
        with patch.object(
            service,
            '_read_data_file',
            return_value=serie
        ):
            with patch(
                'src.server.services.data_migration_service.AnimeSeriesService'
            ) as MockService:
                MockService.get_by_key = AsyncMock(return_value=existing)
                MockService.update = AsyncMock()
                result = await service.migrate_data_file(
                    Path("/fake/data"),
                    mock_db
                )
                # Changed data -> update path (True).
                assert result is True
                MockService.update.assert_called_once()

    @pytest.mark.asyncio
    async def test_migrate_read_error(self, mock_db):
        """Test migration handles read errors properly."""
        service = DataMigrationService()
        # A read failure must propagate as DataFileReadError unchanged.
        with patch.object(
            service,
            '_read_data_file',
            side_effect=DataFileReadError("Cannot read file")
        ):
            with pytest.raises(DataFileReadError):
                await service.migrate_data_file(Path("/fake/data"), mock_db)
class TestDataMigrationServiceMigrateAll:
    """Test batch migration of data files."""

    @pytest.fixture
    def mock_db(self):
        """Create a mock database session with an awaitable commit."""
        db = AsyncMock()
        db.commit = AsyncMock()
        return db

    @pytest.mark.asyncio
    async def test_migrate_all_empty_directory(self, mock_db):
        """Test migration with no data files."""
        service = DataMigrationService()
        with tempfile.TemporaryDirectory() as tmp_dir:
            result = await service.migrate_all(tmp_dir, mock_db)
            assert result.total_found == 0
            assert result.migrated == 0
            assert result.skipped == 0
            assert result.failed == 0
            assert result.errors == []

    @pytest.mark.asyncio
    async def test_migrate_all_success(self, mock_db):
        """Test successful migration of multiple files."""
        service = DataMigrationService()
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Create test data files
            for i in range(3):
                series_dir = Path(tmp_dir) / f"Series {i}"
                series_dir.mkdir()
                data = {
                    "key": f"series-{i}",
                    "name": f"Series {i}",
                    "site": "aniworld.to",
                    "folder": f"Series {i}",
                    "episodeDict": {}
                }
                (series_dir / "data").write_text(json.dumps(data))
            with patch(
                'src.server.services.data_migration_service.AnimeSeriesService'
            ) as MockService:
                # No series pre-exist, so every file follows the create path.
                MockService.get_by_key = AsyncMock(return_value=None)
                MockService.create = AsyncMock()
                result = await service.migrate_all(tmp_dir, mock_db)
                assert result.total_found == 3
                assert result.migrated == 3
                assert result.skipped == 0
                assert result.failed == 0

    @pytest.mark.asyncio
    async def test_migrate_all_with_errors(self, mock_db):
        """Test migration continues after individual file errors."""
        service = DataMigrationService()
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Create valid data file
            valid_dir = Path(tmp_dir) / "Valid Series"
            valid_dir.mkdir()
            valid_data = {
                "key": "valid-series",
                "name": "Valid Series",
                "site": "aniworld.to",
                "folder": "Valid Series",
                "episodeDict": {}
            }
            (valid_dir / "data").write_text(json.dumps(valid_data))
            # Create invalid data file
            invalid_dir = Path(tmp_dir) / "Invalid Series"
            invalid_dir.mkdir()
            (invalid_dir / "data").write_text("not valid json")
            with patch(
                'src.server.services.data_migration_service.AnimeSeriesService'
            ) as MockService:
                MockService.get_by_key = AsyncMock(return_value=None)
                MockService.create = AsyncMock()
                result = await service.migrate_all(tmp_dir, mock_db)
                # The broken file is counted as failed; the valid one
                # still migrates, proving errors don't abort the batch.
                assert result.total_found == 2
                assert result.migrated == 1
                assert result.failed == 1
                assert len(result.errors) == 1

    @pytest.mark.asyncio
    async def test_migrate_all_with_skips(self, mock_db):
        """Test migration correctly counts skipped files."""
        service = DataMigrationService()
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Create data files
            for i in range(2):
                series_dir = Path(tmp_dir) / f"Series {i}"
                series_dir.mkdir()
                data = {
                    "key": f"series-{i}",
                    "name": f"Series {i}",
                    "site": "aniworld.to",
                    "folder": f"Series {i}",
                    "episodeDict": {}
                }
                (series_dir / "data").write_text(json.dumps(data))
            # Mock: first series doesn't exist, second already exists
            # with identical (empty) episode data -> should be skipped.
            existing = MagicMock()
            existing.id = 2
            existing.episode_dict = {}
            with patch(
                'src.server.services.data_migration_service.AnimeSeriesService'
            ) as MockService:
                # NOTE(review): side_effect order assumes the scan returns
                # "Series 0" before "Series 1" — holds for sorted results.
                MockService.get_by_key = AsyncMock(
                    side_effect=[None, existing]
                )
                MockService.create = AsyncMock()
                result = await service.migrate_all(tmp_dir, mock_db)
                assert result.total_found == 2
                assert result.migrated == 1
                assert result.skipped == 1
class TestDataMigrationServiceIsMigrationNeeded:
    """Tests for DataMigrationService.is_migration_needed."""

    def test_migration_needed_with_data_files(self):
        """A directory containing a data file reports migration needed."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            series_dir = Path(root) / "Test Series"
            series_dir.mkdir()
            (series_dir / "data").write_text('{"key": "test"}')
            assert svc.is_migration_needed(root) is True

    def test_migration_not_needed_empty_directory(self):
        """An empty directory reports no migration needed."""
        svc = DataMigrationService()
        with tempfile.TemporaryDirectory() as root:
            assert svc.is_migration_needed(root) is False

    def test_migration_not_needed_nonexistent_directory(self):
        """A missing directory reports no migration needed."""
        svc = DataMigrationService()
        assert svc.is_migration_needed("/nonexistent/path") is False
class TestDataMigrationServiceSingleton:
    """Tests for the module-level singleton accessors."""

    def test_get_service_returns_same_instance(self):
        """Repeated gets after a reset yield one shared instance."""
        reset_data_migration_service()
        first = get_data_migration_service()
        second = get_data_migration_service()
        assert first is second

    def test_reset_service_creates_new_instance(self):
        """A reset forces the next get to build a fresh instance."""
        before = get_data_migration_service()
        reset_data_migration_service()
        after = get_data_migration_service()
        assert before is not after

    def test_service_is_correct_type(self):
        """The accessor hands back a DataMigrationService."""
        reset_data_migration_service()
        assert isinstance(get_data_migration_service(), DataMigrationService)