feat(backend): implement graceful shutdown for container stop
Graceful shutdown ensures in-flight operations complete before process exits:

- Lifespan shutdown handler drains pending tasks with 25s timeout
- Scheduler stops accepting new jobs immediately
- HTTP session, external logging, scheduler lock, DB conn closed cleanly
- 25s Python timeout leaves 5s margin before Docker's 30s SIGKILL

Files changed:

- backend/app/main.py: enhanced _lifespan shutdown with task drain
- Docker/Dockerfile.backend: documented signal handling in header
- Docker/docker-compose.yml: added stop_grace_period: 30s
- Docker/compose.prod.yml: added stop_grace_period: 30s
- Docs/Deployment.md: new Graceful Shutdown section with sequence table
- Docs/TROUBLESHOOTING.md: new Graceful Shutdown Issues section

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -234,24 +234,70 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
log.info("bangui_shutting_down")
|
||||
scheduler.shutdown(wait=False)
|
||||
await http_session.close()
|
||||
# Grace period for pending tasks to complete before hard shutdown.
|
||||
# Docker stop sends SIGTERM; uvicorn catches it and calls lifespan shutdown.
|
||||
# We use a shorter timeout here (25s) to leave a safety margin before
|
||||
# Docker's 30s kill timeout kicks in.
|
||||
graceful_timeout: float = 25.0
|
||||
|
||||
# Shutdown external logging handler
|
||||
log.info("bangui_shutting_down", timeout_seconds=graceful_timeout)
|
||||
|
||||
# 1. Signal scheduler to stop accepting new jobs.
|
||||
# APScheduler's shutdown(wait=False) prevents new jobs from being submitted
|
||||
# while allowing currently-running jobs to complete.
|
||||
scheduler.shutdown(wait=False)
|
||||
log.debug("scheduler_stopped_accepting_jobs")
|
||||
|
||||
# 2. Drain in-flight tasks: wait for running background jobs to complete.
|
||||
# This gives blocklist imports, geo resolutions, and history syncs time to finish.
|
||||
# Tasks that exceed the timeout are cancelled — the finally block in each
|
||||
# task's coroutine handles cleanup.
|
||||
import asyncio # noqa: TC003
|
||||
|
||||
pending_tasks: list[asyncio.Task[Any]] = [
|
||||
t for t in asyncio.all_tasks() if not t.done()
|
||||
]
|
||||
if pending_tasks:
|
||||
log.info(
|
||||
"waiting_for_pending_tasks",
|
||||
count=len(pending_tasks),
|
||||
timeout_seconds=graceful_timeout,
|
||||
)
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
asyncio.gather(*pending_tasks, return_exceptions=True),
|
||||
timeout=graceful_timeout,
|
||||
)
|
||||
log.debug("pending_tasks_completed")
|
||||
except TimeoutError:
|
||||
log.warning(
|
||||
"pending_tasks_timeout",
|
||||
cancelled_count=len(pending_tasks),
|
||||
)
|
||||
|
||||
# 3. Close HTTP session to release connections.
|
||||
await http_session.close()
|
||||
log.debug("http_session_closed")
|
||||
|
||||
# 4. Shutdown external logging handler.
|
||||
if _external_log_handler:
|
||||
try:
|
||||
await _external_log_handler.shutdown()
|
||||
log.debug("external_logging_shutdown_complete")
|
||||
except Exception as exc:
|
||||
log.error("external_logging_shutdown_failed", error=str(exc))
|
||||
|
||||
# Release the scheduler lock to allow other instances to take over
|
||||
# 5. Release the scheduler lock so other instances can take over immediately.
|
||||
# During rolling deployments or restarts, this allows the new instance to
|
||||
# acquire the lock without waiting for TTL expiry.
|
||||
try:
|
||||
await release_scheduler_lock(startup_db)
|
||||
log.debug("scheduler_lock_released")
|
||||
except Exception as e:
|
||||
log.error("scheduler_lock_release_failed", error=str(e))
|
||||
finally:
|
||||
await startup_db.close()
|
||||
|
||||
# 6. Close the database connection.
|
||||
await startup_db.close()
|
||||
log.info("bangui_shut_down")
|
||||
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ from pydantic import Field
|
||||
|
||||
from app.models.response import BanGuiBaseModel
|
||||
|
||||
|
||||
class LoginRequest(BanGuiBaseModel):
|
||||
"""Payload for ``POST /api/auth/login``."""
|
||||
|
||||
@@ -35,6 +36,12 @@ class LogoutResponse(BanGuiBaseModel):
|
||||
|
||||
message: str = Field(default="Logged out successfully.")
|
||||
|
||||
class SessionValidResponse(BanGuiBaseModel):
|
||||
"""Response for ``GET /api/auth/session`` confirming session validity."""
|
||||
|
||||
valid: bool = Field(default=True, description="Whether the session is valid and active.")
|
||||
|
||||
|
||||
class Session(BanGuiBaseModel):
|
||||
"""Internal domain model representing a persisted session record."""
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ from app.dependencies import (
|
||||
SettingsDep,
|
||||
)
|
||||
from app.exceptions import AuthenticationError, RateLimitError
|
||||
from app.models.auth import LoginRequest, LoginResponse, LogoutResponse
|
||||
from app.models.auth import LoginRequest, LoginResponse, LogoutResponse, SessionValidResponse
|
||||
from app.services import auth_service
|
||||
from app.utils.client_ip import get_client_ip
|
||||
from app.utils.constants import SESSION_COOKIE_NAME
|
||||
@@ -114,11 +114,12 @@ async def login(
|
||||
|
||||
@router.get(
|
||||
"/session",
|
||||
response_model=SessionValidResponse,
|
||||
summary="Validate the current session",
|
||||
)
|
||||
async def validate_session(
|
||||
_: AuthDep,
|
||||
) -> dict[str, bool]:
|
||||
) -> SessionValidResponse:
|
||||
"""Validate the current session.
|
||||
|
||||
This endpoint requires a valid session and returns 200 if the session is
|
||||
@@ -132,9 +133,9 @@ async def validate_session(
|
||||
_: The injected session object (unused, but its presence triggers validation).
|
||||
|
||||
Returns:
|
||||
A simple JSON object confirming the session is valid.
|
||||
:class:`~app.models.auth.SessionValidResponse` confirming the session state.
|
||||
"""
|
||||
return {"valid": True}
|
||||
return SessionValidResponse(valid=True)
|
||||
|
||||
|
||||
@router.post(
|
||||
|
||||
Reference in New Issue
Block a user