feat(backend): implement graceful shutdown for container stop
Graceful shutdown ensures in-flight operations complete before the process exits:

- Lifespan shutdown handler drains pending tasks with a 25s timeout
- Scheduler stops accepting new jobs immediately
- HTTP session, external logging, scheduler lock, and DB connection are closed cleanly
- The 25s Python timeout leaves a 5s margin before Docker's 30s SIGKILL

Files changed:

- backend/app/main.py: enhanced _lifespan shutdown with task drain
- Docker/Dockerfile.backend: documented signal handling in header
- Docker/docker-compose.yml: added stop_grace_period: 30s
- Docker/compose.prod.yml: added stop_grace_period: 30s
- Docs/Deployment.md: new Graceful Shutdown section with sequence table
- Docs/TROUBLESHOOTING.md: new Graceful Shutdown Issues section

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -234,24 +234,70 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
log.info("bangui_shutting_down")
|
||||
scheduler.shutdown(wait=False)
|
||||
await http_session.close()
|
||||
# Grace period for pending tasks to complete before hard shutdown.
|
||||
# Docker stop sends SIGTERM; uvicorn catches it and calls lifespan shutdown.
|
||||
# We use a shorter timeout here (25s) to leave a safety margin before
|
||||
# Docker's 30s kill timeout kicks in.
|
||||
graceful_timeout: float = 25.0
|
||||
|
||||
# Shutdown external logging handler
|
||||
log.info("bangui_shutting_down", timeout_seconds=graceful_timeout)
|
||||
|
||||
# 1. Signal scheduler to stop accepting new jobs.
|
||||
# APScheduler's shutdown(wait=False) prevents new jobs from being submitted
|
||||
# while allowing currently-running jobs to complete.
|
||||
scheduler.shutdown(wait=False)
|
||||
log.debug("scheduler_stopped_accepting_jobs")
|
||||
|
||||
# 2. Drain in-flight tasks: wait for running background jobs to complete.
|
||||
# This gives blocklist imports, geo resolutions, and history syncs time to finish.
|
||||
# Tasks that exceed the timeout are cancelled — the finally block in each
|
||||
# task's coroutine handles cleanup.
|
||||
import asyncio # noqa: TC003
|
||||
|
||||
pending_tasks: list[asyncio.Task[Any]] = [
|
||||
t for t in asyncio.all_tasks() if not t.done()
|
||||
]
|
||||
if pending_tasks:
|
||||
log.info(
|
||||
"waiting_for_pending_tasks",
|
||||
count=len(pending_tasks),
|
||||
timeout_seconds=graceful_timeout,
|
||||
)
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
asyncio.gather(*pending_tasks, return_exceptions=True),
|
||||
timeout=graceful_timeout,
|
||||
)
|
||||
log.debug("pending_tasks_completed")
|
||||
except TimeoutError:
|
||||
log.warning(
|
||||
"pending_tasks_timeout",
|
||||
cancelled_count=len(pending_tasks),
|
||||
)
|
||||
|
||||
# 3. Close HTTP session to release connections.
|
||||
await http_session.close()
|
||||
log.debug("http_session_closed")
|
||||
|
||||
# 4. Shutdown external logging handler.
|
||||
if _external_log_handler:
|
||||
try:
|
||||
await _external_log_handler.shutdown()
|
||||
log.debug("external_logging_shutdown_complete")
|
||||
except Exception as exc:
|
||||
log.error("external_logging_shutdown_failed", error=str(exc))
|
||||
|
||||
# Release the scheduler lock to allow other instances to take over
|
||||
# 5. Release the scheduler lock so other instances can take over immediately.
|
||||
# During rolling deployments or restarts, this allows the new instance to
|
||||
# acquire the lock without waiting for TTL expiry.
|
||||
try:
|
||||
await release_scheduler_lock(startup_db)
|
||||
log.debug("scheduler_lock_released")
|
||||
except Exception as e:
|
||||
log.error("scheduler_lock_release_failed", error=str(e))
|
||||
finally:
|
||||
await startup_db.close()
|
||||
|
||||
# 6. Close the database connection.
|
||||
await startup_db.close()
|
||||
log.info("bangui_shut_down")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user