feat(download): persist retry state and dead-letter

Retry count and queue status were in-memory only and lost on
restart, so failed downloads could not be safely resumed and
permanently-failed episodes silently blocked re-queueing via the
episode-id unique index.

- Add status + retry_count columns to DownloadQueueItem
- Replace unique(episode_id) with unique(episode_id, status) so
  permanently_failed rows do not block new pending entries
- Add PERMANENTLY_FAILED to DownloadStatus enum
- Persist retry_count on each failure; mark permanently_failed once
  max_retries reached
- QueueRepository reads status/retry_count from DB instead of
  defaulting to PENDING/0
- Stop double-incrementing retry_count in retry_failed_items;
  increment only happens in _process_download on failure

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-25 14:24:31 +02:00
parent 0ba2587bc8
commit c579235af0
7 changed files with 383 additions and 38 deletions

View File

@@ -170,6 +170,27 @@ class DownloadService:
logger.error("Failed to save item to database: %s", e)
return item
async def _set_status_in_database(
    self,
    item_id: str,
    status: str,
) -> bool:
    """Persist a new status value for a queue item.

    This is a best-effort operation: any exception raised while
    obtaining the repository or running the update is logged and
    reported as ``False`` instead of being propagated to the caller.

    Args:
        item_id: Download item ID
        status: New status value

    Returns:
        The result of the repository update, or False if an
        exception occurred
    """
    try:
        repo = self._get_repository()
        updated = await repo.set_status(item_id, status)
    except Exception as exc:
        logger.error("Failed to set status in database: %s", exc)
        return False
    else:
        return updated
async def _set_error_in_database(
self,
item_id: str,
@@ -191,6 +212,25 @@ class DownloadService:
logger.error("Failed to set error in database: %s", e)
return False
async def _increment_retry_in_database(
    self,
    item_id: str,
) -> bool:
    """Persist a retry-count increment for a queue item.

    This is a best-effort operation: any exception raised while
    obtaining the repository or running the update is logged and
    reported as ``False`` instead of being propagated to the caller.

    Args:
        item_id: Download item ID

    Returns:
        The result of the repository update, or False if an
        exception occurred
    """
    try:
        repo = self._get_repository()
        incremented = await repo.increment_retry(item_id)
    except Exception as exc:
        logger.error("Failed to increment retry in database: %s", exc)
        return False
    else:
        return incremented
async def _delete_from_database(self, item_id: str) -> bool:
"""Delete an item from the database.
@@ -1051,17 +1091,15 @@ class DownloadService:
if item.retry_count >= self._max_retries:
continue
# Move back to pending
# Move back to pending (retry_count will be incremented
# by _process_download when the item fails again)
self._failed_items.remove(item)
item.status = DownloadStatus.PENDING
item.retry_count += 1
item.error = None
item.progress = None
self._add_to_pending_queue(item)
retried_ids.append(item.id)
# Status is now managed in-memory only
logger.info(
"Retrying failed item: item_id=%s, retry_count=%d",
item.id,
@@ -1069,18 +1107,23 @@ class DownloadService:
)
if retried_ids:
# Notify via progress service
queue_status = await self.get_queue_status()
await self._progress_service.update_progress(
progress_id="download_queue",
message=f"Retried {len(retried_ids)} failed items",
metadata={
"action": "items_retried",
"retried_ids": retried_ids,
"queue_status": queue_status.model_dump(mode="json"),
},
force_broadcast=True,
)
# Notify via progress service if available
try:
queue_status = await self.get_queue_status()
await self._progress_service.update_progress(
progress_id="download_queue",
message=f"Retried {len(retried_ids)} failed items",
metadata={
"action": "items_retried",
"retried_ids": retried_ids,
"queue_status": queue_status.model_dump(mode="json"),
},
force_broadcast=True,
)
except Exception as e:
logger.warning(
"Failed to broadcast retry progress: %s", e
)
return retried_ids
@@ -1220,17 +1263,35 @@ class DownloadService:
item.status = DownloadStatus.FAILED
item.completed_at = datetime.now(timezone.utc)
item.error = str(e)
# Increment retry count in memory and database
item.retry_count += 1
await self._increment_retry_in_database(item.id)
self._failed_items.append(item)
# Set error in database
await self._set_error_in_database(item.id, str(e))
logger.error(
"Download failed: item_id=%s, error=%s, retry_count=%d",
item.id,
str(e),
item.retry_count,
)
# Check if max retries exceeded - move to dead-letter
if item.retry_count >= self._max_retries:
await self._set_status_in_database(
item.id, DownloadStatus.PERMANENTLY_FAILED.value
)
logger.error(
"Download permanently failed after max retries: "
"item_id=%s, error=%s, retry_count=%d",
item.id,
str(e),
item.retry_count,
)
else:
logger.error(
"Download failed: item_id=%s, error=%s, retry_count=%d",
item.id,
str(e),
item.retry_count,
)
# Note: Failure is already broadcast by AnimeService
# via ProgressService when SeriesApp fires failed event

View File

@@ -83,15 +83,12 @@ class QueueRepository:
) -> DownloadItem:
"""Convert database model to DownloadItem.
Note: Since the database model is simplified, status, priority,
progress, and retry_count default to initial values.
Args:
db_item: SQLAlchemy download queue item
item_id: Optional override for item ID
Returns:
Pydantic download item with default status/priority
Pydantic download item with status/retry_count from database
"""
# Get episode info from the related Episode object
episode = db_item.episode
@@ -109,14 +106,14 @@ class QueueRepository:
serie_folder=series.folder if series else "",
serie_name=series.name if series else "",
episode=episode_identifier,
status=DownloadStatus.PENDING, # Default - managed in-memory
priority=DownloadPriority.NORMAL, # Default - managed in-memory
status=DownloadStatus(db_item.status), # From database
priority=DownloadPriority.NORMAL, # Managed in-memory
added_at=db_item.created_at or datetime.now(timezone.utc),
started_at=db_item.started_at,
completed_at=db_item.completed_at,
progress=None, # Managed in-memory
error=db_item.error_message,
retry_count=0, # Managed in-memory
retry_count=db_item.retry_count, # From database
source_url=db_item.download_url,
)
@@ -350,6 +347,110 @@ class QueueRepository:
finally:
if manage_session:
await session.close()
async def set_status(
    self,
    item_id: str,
    status: str,
    db: Optional[AsyncSession] = None,
) -> bool:
    """Set status on a download item.

    The item ID is parsed before any session is opened, so a malformed
    ID is reported as "not found" without touching the database, and a
    ``ValueError`` raised later by the service layer or the commit is
    no longer mistaken for a bad ID.

    Args:
        item_id: Download item ID (string form of the integer row ID)
        status: New status value
        db: Optional existing database session. When provided, the
            caller keeps ownership: this method does not commit,
            roll back or close it.

    Returns:
        True if update succeeded, False if item not found or the ID
        is not a valid integer

    Raises:
        QueueRepositoryError: If update fails
    """
    # Only the ID conversion may be reported as "not found"; keeping it
    # outside the main try block prevents unrelated errors from being
    # silently swallowed.
    try:
        queue_id = int(item_id)
    except (TypeError, ValueError):
        return False

    manage_session = db is None
    session = self._db_session_factory() if manage_session else db
    try:
        result = await DownloadQueueService.set_status(
            session,
            queue_id,
            status,
        )
        if manage_session:
            await session.commit()
    except Exception as e:
        if manage_session:
            await session.rollback()
        logger.error("Failed to set status: %s", e)
        raise QueueRepositoryError(f"Failed to set status: {e}") from e
    finally:
        if manage_session:
            await session.close()

    # A None result from the service is treated as "item not found".
    if result is None:
        return False
    logger.debug(
        "Set status on queue item: item_id=%s, status=%s",
        item_id,
        status,
    )
    return True
async def increment_retry(
    self,
    item_id: str,
    db: Optional[AsyncSession] = None,
) -> bool:
    """Increment retry count on a download item.

    The item ID is parsed before any session is opened, so a malformed
    ID is reported as "not found" without touching the database, and a
    ``ValueError`` raised later by the service layer or the commit is
    no longer mistaken for a bad ID.

    Args:
        item_id: Download item ID (string form of the integer row ID)
        db: Optional existing database session. When provided, the
            caller keeps ownership: this method does not commit,
            roll back or close it.

    Returns:
        True if update succeeded, False if item not found or the ID
        is not a valid integer

    Raises:
        QueueRepositoryError: If update fails
    """
    # Only the ID conversion may be reported as "not found"; keeping it
    # outside the main try block prevents unrelated errors from being
    # silently swallowed.
    try:
        queue_id = int(item_id)
    except (TypeError, ValueError):
        return False

    manage_session = db is None
    session = self._db_session_factory() if manage_session else db
    try:
        result = await DownloadQueueService.increment_retry_count(
            session,
            queue_id,
        )
        if manage_session:
            await session.commit()
    except Exception as e:
        if manage_session:
            await session.rollback()
        logger.error("Failed to increment retry: %s", e)
        raise QueueRepositoryError(f"Failed to increment retry: {e}") from e
    finally:
        if manage_session:
            await session.close()

    # A None result from the service is treated as "item not found".
    if result is None:
        return False
    logger.debug(
        "Incremented retry count on queue item: item_id=%s",
        item_id,
    )
    return True
async def delete_item(
self,