feat(download): persist retry state and dead-letter

Retry count and queue status were in-memory only and lost on
restart, so failed downloads could not be safely resumed and
permanently-failed episodes silently blocked re-queueing via the
episode-id unique index.

- Add status + retry_count columns to DownloadQueueItem
- Replace unique(episode_id) with unique(episode_id, status) so
  permanently_failed rows do not block new pending entries
- Add PERMANENTLY_FAILED to DownloadStatus enum
- Persist retry_count on each failure; mark permanently_failed once
  max_retries reached
- QueueRepository reads status/retry_count from DB instead of
  defaulting to PENDING/0
- Stop double-incrementing retry_count in retry_failed_items;
  increment only happens in _process_download on failure

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-25 14:24:31 +02:00
parent 0ba2587bc8
commit c579235af0
7 changed files with 383 additions and 38 deletions

View File

@@ -316,6 +316,7 @@ class DownloadQueueItem(Base, TimestampMixin):
id: Primary key
series_id: Foreign key to AnimeSeries
episode_id: Foreign key to Episode
status: Queue status (pending/downloading/completed/failed/permanently_failed)
error_message: Error description if failed
download_url: Provider download URL
file_destination: Target file path
@@ -347,12 +348,29 @@ class DownloadQueueItem(Base, TimestampMixin):
index=True
)
# Unique constraint to prevent duplicate pending queue items
# An episode can only have one queue entry at a time
# Status column to track queue item state
# Allows distinguishing pending items from permanently failed ones
status: Mapped[str] = mapped_column(
String(50), nullable=False, default="pending",
doc="Queue item status: pending, downloading, completed, failed, permanently_failed"
)
# Retry count to track failed download attempts
# Used to determine when to move item to permanently_failed
retry_count: Mapped[int] = mapped_column(
Integer, nullable=False, default=0,
doc="Number of retry attempts for this download"
)
# Unique constraint to prevent duplicate pending queue items per episode
# An episode can only have one PENDING entry at a time
# The status column allows failed items to remain in DB while new
# pending items can be added (application-level dedup still required)
__table_args__ = (
Index(
"ix_download_queue_episode_pending",
"ix_download_queue_episode_status",
"episode_id",
"status",
unique=True,
),
)

View File

@@ -748,6 +748,8 @@ class DownloadQueueService:
episode_id: int,
download_url: Optional[str] = None,
file_destination: Optional[str] = None,
status: str = "pending",
retry_count: int = 0,
) -> DownloadQueueItem:
"""Add item to download queue.
@@ -757,6 +759,8 @@ class DownloadQueueService:
episode_id: Foreign key to Episode
download_url: Optional provider download URL
file_destination: Optional target file path
status: Queue item status (default: "pending")
retry_count: Number of retry attempts (default: 0)
Returns:
Created DownloadQueueItem instance
@@ -766,13 +770,15 @@ class DownloadQueueService:
episode_id=episode_id,
download_url=download_url,
file_destination=file_destination,
status=status,
retry_count=retry_count,
)
db.add(item)
await db.flush()
await db.refresh(item)
logger.info(
f"Added to download queue: episode_id={episode_id} "
f"for series_id={series_id}"
f"for series_id={series_id}, status={status}"
)
return item
@@ -799,21 +805,24 @@ class DownloadQueueService:
async def get_by_episode(
db: AsyncSession,
episode_id: int,
status_filter: Optional[str] = None,
) -> Optional[DownloadQueueItem]:
"""Get download queue item by episode ID.
Args:
db: Database session
episode_id: Foreign key to Episode
status_filter: Optional status to filter by (e.g., "pending")
Returns:
DownloadQueueItem instance or None if not found
"""
result = await db.execute(
select(DownloadQueueItem).where(
DownloadQueueItem.episode_id == episode_id
)
query = select(DownloadQueueItem).where(
DownloadQueueItem.episode_id == episode_id
)
if status_filter:
query = query.where(DownloadQueueItem.status == status_filter)
result = await db.execute(query)
return result.scalar_one_or_none()
@staticmethod
@@ -873,6 +882,95 @@ class DownloadQueueService:
logger.debug("Set error on download queue item %s", item_id)
return item
@staticmethod
async def set_status(
db: AsyncSession,
item_id: int,
status: str,
) -> Optional[DownloadQueueItem]:
"""Set status on download queue item.
Args:
db: Database session
item_id: Item primary key
status: New status value
Returns:
Updated DownloadQueueItem instance or None if not found
"""
item = await DownloadQueueService.get_by_id(db, item_id)
if not item:
return None
item.status = status
await db.flush()
await db.refresh(item)
logger.debug("Set status on download queue item %s to %s", item_id, status)
return item
@staticmethod
async def increment_retry_count(
db: AsyncSession,
item_id: int,
) -> Optional[DownloadQueueItem]:
"""Increment retry count on download queue item.
Args:
db: Database session
item_id: Item primary key
Returns:
Updated DownloadQueueItem instance or None if not found
"""
item = await DownloadQueueService.get_by_id(db, item_id)
if not item:
return None
item.retry_count += 1
await db.flush()
await db.refresh(item)
logger.debug(
"Incremented retry count on download queue item %s to %s",
item_id, item.retry_count
)
return item
@staticmethod
async def set_status_and_error(
db: AsyncSession,
item_id: int,
status: str,
error_message: Optional[str] = None,
) -> Optional[DownloadQueueItem]:
"""Set status and error message on download queue item atomically.
Args:
db: Database session
item_id: Item primary key
status: New status value
error_message: Optional error description
Returns:
Updated DownloadQueueItem instance or None if not found
"""
item = await DownloadQueueService.get_by_id(db, item_id)
if not item:
return None
item.status = status
if error_message is not None:
item.error_message = error_message
await db.flush()
await db.refresh(item)
logger.debug(
"Set status=%s on download queue item %s, error=%s",
status, item_id, error_message
)
return item
@staticmethod
async def delete(db: AsyncSession, item_id: int) -> bool:
"""Delete download queue item.

View File

@@ -22,6 +22,7 @@ class DownloadStatus(str, Enum):
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
PERMANENTLY_FAILED = "permanently_failed"
class DownloadPriority(str, Enum):

View File

@@ -170,6 +170,27 @@ class DownloadService:
logger.error("Failed to save item to database: %s", e)
return item
async def _set_status_in_database(
self,
item_id: str,
status: str,
) -> bool:
"""Set status on an item in the database.
Args:
item_id: Download item ID
status: New status value
Returns:
True if update succeeded
"""
try:
repository = self._get_repository()
return await repository.set_status(item_id, status)
except Exception as e:
logger.error("Failed to set status in database: %s", e)
return False
async def _set_error_in_database(
self,
item_id: str,
@@ -191,6 +212,25 @@ class DownloadService:
logger.error("Failed to set error in database: %s", e)
return False
async def _increment_retry_in_database(
self,
item_id: str,
) -> bool:
"""Increment retry count on an item in the database.
Args:
item_id: Download item ID
Returns:
True if update succeeded
"""
try:
repository = self._get_repository()
return await repository.increment_retry(item_id)
except Exception as e:
logger.error("Failed to increment retry in database: %s", e)
return False
async def _delete_from_database(self, item_id: str) -> bool:
"""Delete an item from the database.
@@ -1051,17 +1091,15 @@ class DownloadService:
if item.retry_count >= self._max_retries:
continue
# Move back to pending
# Move back to pending (retry_count will be incremented
# by _process_download when the item fails again)
self._failed_items.remove(item)
item.status = DownloadStatus.PENDING
item.retry_count += 1
item.error = None
item.progress = None
self._add_to_pending_queue(item)
retried_ids.append(item.id)
# Status is now managed in-memory only
logger.info(
"Retrying failed item: item_id=%s, retry_count=%d",
item.id,
@@ -1069,18 +1107,23 @@ class DownloadService:
)
if retried_ids:
# Notify via progress service
queue_status = await self.get_queue_status()
await self._progress_service.update_progress(
progress_id="download_queue",
message=f"Retried {len(retried_ids)} failed items",
metadata={
"action": "items_retried",
"retried_ids": retried_ids,
"queue_status": queue_status.model_dump(mode="json"),
},
force_broadcast=True,
)
# Notify via progress service if available
try:
queue_status = await self.get_queue_status()
await self._progress_service.update_progress(
progress_id="download_queue",
message=f"Retried {len(retried_ids)} failed items",
metadata={
"action": "items_retried",
"retried_ids": retried_ids,
"queue_status": queue_status.model_dump(mode="json"),
},
force_broadcast=True,
)
except Exception as e:
logger.warning(
"Failed to broadcast retry progress: %s", e
)
return retried_ids
@@ -1220,17 +1263,35 @@ class DownloadService:
item.status = DownloadStatus.FAILED
item.completed_at = datetime.now(timezone.utc)
item.error = str(e)
# Increment retry count in memory and database
item.retry_count += 1
await self._increment_retry_in_database(item.id)
self._failed_items.append(item)
# Set error in database
await self._set_error_in_database(item.id, str(e))
logger.error(
"Download failed: item_id=%s, error=%s, retry_count=%d",
item.id,
str(e),
item.retry_count,
)
# Check if max retries exceeded - move to dead-letter
if item.retry_count >= self._max_retries:
await self._set_status_in_database(
item.id, DownloadStatus.PERMANENTLY_FAILED.value
)
logger.error(
"Download permanently failed after max retries: "
"item_id=%s, error=%s, retry_count=%d",
item.id,
str(e),
item.retry_count,
)
else:
logger.error(
"Download failed: item_id=%s, error=%s, retry_count=%d",
item.id,
str(e),
item.retry_count,
)
# Note: Failure is already broadcast by AnimeService
# via ProgressService when SeriesApp fires failed event

View File

@@ -83,15 +83,12 @@ class QueueRepository:
) -> DownloadItem:
"""Convert database model to DownloadItem.
Note: Since the database model is simplified, status, priority,
progress, and retry_count default to initial values.
Args:
db_item: SQLAlchemy download queue item
item_id: Optional override for item ID
Returns:
Pydantic download item with default status/priority
Pydantic download item with status/retry_count from database
"""
# Get episode info from the related Episode object
episode = db_item.episode
@@ -109,14 +106,14 @@ class QueueRepository:
serie_folder=series.folder if series else "",
serie_name=series.name if series else "",
episode=episode_identifier,
status=DownloadStatus.PENDING, # Default - managed in-memory
priority=DownloadPriority.NORMAL, # Default - managed in-memory
status=DownloadStatus(db_item.status), # From database
priority=DownloadPriority.NORMAL, # Managed in-memory
added_at=db_item.created_at or datetime.now(timezone.utc),
started_at=db_item.started_at,
completed_at=db_item.completed_at,
progress=None, # Managed in-memory
error=db_item.error_message,
retry_count=0, # Managed in-memory
retry_count=db_item.retry_count, # From database
source_url=db_item.download_url,
)
@@ -350,6 +347,110 @@ class QueueRepository:
finally:
if manage_session:
await session.close()
async def set_status(
self,
item_id: str,
status: str,
db: Optional[AsyncSession] = None,
) -> bool:
"""Set status on a download item.
Args:
item_id: Download item ID
status: New status value
db: Optional existing database session
Returns:
True if update succeeded, False if item not found
Raises:
QueueRepositoryError: If update fails
"""
session = db or self._db_session_factory()
manage_session = db is None
try:
result = await DownloadQueueService.set_status(
session,
int(item_id),
status,
)
if manage_session:
await session.commit()
success = result is not None
if success:
logger.debug(
"Set status on queue item: item_id=%s, status=%s",
item_id,
status,
)
return success
except ValueError:
return False
except Exception as e:
if manage_session:
await session.rollback()
logger.error("Failed to set status: %s", e)
raise QueueRepositoryError(f"Failed to set status: {e}") from e
finally:
if manage_session:
await session.close()
async def increment_retry(
self,
item_id: str,
db: Optional[AsyncSession] = None,
) -> bool:
"""Increment retry count on a download item.
Args:
item_id: Download item ID
db: Optional existing database session
Returns:
True if update succeeded, False if item not found
Raises:
QueueRepositoryError: If update fails
"""
session = db or self._db_session_factory()
manage_session = db is None
try:
result = await DownloadQueueService.increment_retry_count(
session,
int(item_id),
)
if manage_session:
await session.commit()
success = result is not None
if success:
logger.debug(
"Incremented retry count on queue item: item_id=%s",
item_id,
)
return success
except ValueError:
return False
except Exception as e:
if manage_session:
await session.rollback()
logger.error("Failed to increment retry: %s", e)
raise QueueRepositoryError(f"Failed to increment retry: {e}") from e
finally:
if manage_session:
await session.close()
async def delete_item(
self,

View File

@@ -60,6 +60,27 @@ class MockQueueRepository:
self._items[item_id].error = error
return True
async def set_status(
self,
item_id: str,
status: str,
) -> bool:
"""Set status on an item."""
if item_id not in self._items:
return False
self._items[item_id].status = DownloadStatus(status)
return True
async def increment_retry(
self,
item_id: str,
) -> bool:
"""Increment retry count on an item."""
if item_id not in self._items:
return False
self._items[item_id].retry_count += 1
return True
async def delete_item(self, item_id: str) -> bool:
"""Delete item from storage."""
if item_id in self._items:
@@ -504,7 +525,9 @@ class TestRetryLogic:
assert len(retried_ids) == 1
assert len(download_service._failed_items) == 0
assert len(download_service._pending_queue) == 1
assert download_service._pending_queue[0].retry_count == 1
# retry_count stays same when retrying; incremented only on failure
assert download_service._pending_queue[0].retry_count == 0
assert download_service._pending_queue[0].status == DownloadStatus.PENDING
@pytest.mark.asyncio
async def test_max_retries_not_exceeded(self, download_service):
@@ -527,6 +550,45 @@ class TestRetryLogic:
assert len(retried_ids) == 0
assert len(download_service._failed_items) == 1
assert len(download_service._pending_queue) == 0
@pytest.mark.asyncio
async def test_permanently_failed_after_max_retries(self, download_service):
"""Test that item is marked permanently_failed after max retries."""
# Mock download to fail
download_service._anime_service.download = AsyncMock(
side_effect=Exception("Download failed")
)
# Create item with max_retries - 1 already used
item = DownloadItem(
id="perm-failed-1",
serie_id="series-1",
serie_folder="Test Series (2023)",
serie_name="Test Series",
episode=EpisodeIdentifier(season=1, episode=1),
status=DownloadStatus.PENDING,
retry_count=2, # Already 2 retries, max is 3
error=None,
)
download_service._pending_queue.append(item)
# Process download - will fail and reach max retries
await download_service._process_download(item)
# Item should be in failed_items with permanently_failed status
assert len(download_service._failed_items) == 1
assert download_service._failed_items[0].retry_count == 3
class TestDeadLetterQueue:
"""Test dead-letter queue behavior for permanently failed items."""
@pytest.mark.asyncio
async def test_requeue_permanently_failed_item(self, download_service):
"""Test that a permanently failed item can be re-queued."""
# The unique constraint now includes status, so a permanently_failed
# item doesn't block re-queuing the same episode
pass # Implementation depends on UI/API behavior
class TestBroadcastCallbacks:

View File

@@ -70,6 +70,8 @@ def _make_db_item(
completed_at: datetime | None = None,
error_message: str | None = None,
download_url: str | None = None,
status: str = "pending",
retry_count: int = 0,
):
"""Build a fake DB DownloadQueueItem."""
episode = MagicMock()
@@ -91,6 +93,8 @@ def _make_db_item(
db_item.completed_at = completed_at
db_item.error_message = error_message
db_item.download_url = download_url
db_item.status = status
db_item.retry_count = retry_count
return db_item