feat(backend): implement graceful shutdown for container stop

Graceful shutdown ensures in-flight operations complete before the process exits:

- Lifespan shutdown handler drains pending tasks with a 25s timeout
- Scheduler stops accepting new jobs immediately
- HTTP session, external logging, scheduler lock, and DB connection are closed cleanly
- The 25s Python timeout leaves a 5s margin before Docker's 30s SIGKILL

Files changed:

- backend/app/main.py: enhanced _lifespan shutdown with task drain
- Docker/Dockerfile.backend: documented signal handling in header
- Docker/docker-compose.yml: added stop_grace_period: 30s
- Docker/compose.prod.yml: added stop_grace_period: 30s
- Docs/Deployment.md: new Graceful Shutdown section with sequence table
- Docs/TROUBLESHOOTING.md: new Graceful Shutdown Issues section

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-02 22:47:10 +02:00
parent f6c3c02183
commit b631c1c546
10 changed files with 383 additions and 20 deletions
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -234,24 +234,70 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    try:
        yield
    finally:
-        log.info("bangui_shutting_down")
-        scheduler.shutdown(wait=False)
-        await http_session.close()
+        # Grace period for pending tasks to complete before hard shutdown.
+        # Docker stop sends SIGTERM; uvicorn catches it and calls lifespan shutdown.
+        # We use a shorter timeout here (25s) to leave a safety margin before
+        # Docker's 30s kill timeout kicks in.
+        graceful_timeout: float = 25.0

-        # Shutdown external logging handler
+        log.info("bangui_shutting_down", timeout_seconds=graceful_timeout)
+
+        # 1. Signal scheduler to stop accepting new jobs.
+        # APScheduler's shutdown(wait=False) prevents new jobs from being submitted
+        # while allowing currently-running jobs to complete.
+        scheduler.shutdown(wait=False)
+        log.debug("scheduler_stopped_accepting_jobs")
+
+        # 2. Drain in-flight tasks: wait for running background jobs to complete.
+        # This gives blocklist imports, geo resolutions, and history syncs time to finish.
+        # Tasks that exceed the timeout are cancelled — the finally block in each
+        # task's coroutine handles cleanup.
+        import asyncio  # noqa: TC003
+
+        pending_tasks: list[asyncio.Task[Any]] = [
+            t for t in asyncio.all_tasks() if not t.done()
+        ]
+        if pending_tasks:
+            log.info(
+                "waiting_for_pending_tasks",
+                count=len(pending_tasks),
+                timeout_seconds=graceful_timeout,
+            )
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*pending_tasks, return_exceptions=True),
+                    timeout=graceful_timeout,
+                )
+                log.debug("pending_tasks_completed")
+            except TimeoutError:
+                log.warning(
+                    "pending_tasks_timeout",
+                    cancelled_count=len(pending_tasks),
+                )
+
+        # 3. Close HTTP session to release connections.
+        await http_session.close()
+        log.debug("http_session_closed")
+
+        # 4. Shutdown external logging handler.
        if _external_log_handler:
            try:
                await _external_log_handler.shutdown()
+                log.debug("external_logging_shutdown_complete")
            except Exception as exc:
                log.error("external_logging_shutdown_failed", error=str(exc))

-        # Release the scheduler lock to allow other instances to take over
+        # 5. Release the scheduler lock so other instances can take over immediately.
+        # During rolling deployments or restarts, this allows the new instance to
+        # acquire the lock without waiting for TTL expiry.
        try:
            await release_scheduler_lock(startup_db)
+            log.debug("scheduler_lock_released")
        except Exception as e:
            log.error("scheduler_lock_release_failed", error=str(e))
-        finally:
-            await startup_db.close()
+
+        # 6. Close the database connection.
+        await startup_db.close()
        log.info("bangui_shut_down")