10) Implement explicit startup DAG for resource initialization
- Created StartupDAG class to orchestrate startup stages with explicit dependencies - Defined 6 startup stages: WORKER_MODE → DATABASE → GEO_CACHE → HTTP_SESSION → SCHEDULER → TASKS - Each stage has prerequisites, error handling, and rollback support - Refactored startup_shared_resources() to use the DAG - Added StartupContext for resource tracking and failure management - Partial failures automatically roll back all completed resources in reverse order - Added health checks to verify all resources initialized successfully - Comprehensive test coverage: 15 DAG unit tests + 3 integration tests + 6 existing tests - Documented startup DAG in Architekture.md with detailed stage descriptions and failure modes This replaces implicit ordering with explicit dependency tracking, making lifecycle changes safe and failure modes predictable. Hidden order dependencies no longer exist. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -3,14 +3,27 @@
|
||||
This module contains shared startup logic extracted from ``app.main`` so that
|
||||
initialisation is easier to reason about and unit test. The lifespan handler
|
||||
in ``app.main`` delegates resource creation and task registration here.
|
||||
|
||||
The startup process is orchestrated by StartupDAG, which ensures all resources
|
||||
are initialized in the correct order with explicit dependency tracking, and
|
||||
cleanly rolls back on failure.
|
||||
|
||||
Startup Stages (in order):
|
||||
1. WORKER_MODE: Verify single-worker mode (no multi-worker scheduler conflicts)
|
||||
2. DATABASE: Initialize database schema and cache setup completion state
|
||||
3. GEO_CACHE: Load and configure IP geolocation cache
|
||||
4. HTTP_SESSION: Create shared aiohttp session with timeouts
|
||||
5. SCHEDULER: Create APScheduler instance and register background tasks
|
||||
6. TASKS: Verify all tasks are registered
|
||||
|
||||
See StartupDAG in app.startup_dag for full dependency graph and rollback logic.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from contextlib import suppress
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import aiohttp
|
||||
import structlog
|
||||
@@ -19,6 +32,7 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler # type: ignore[impo
|
||||
from app.db import init_db, open_db
|
||||
from app.services import setup_service
|
||||
from app.services.geo_cache import GeoCache
|
||||
from app.startup_dag import StartupDAG, StartupStage
|
||||
from app.tasks import (
|
||||
blocklist_import,
|
||||
geo_cache_cleanup,
|
||||
@@ -105,15 +119,146 @@ async def startup_shared_resources(
|
||||
) -> tuple[aiohttp.ClientSession, AsyncIOScheduler]:
|
||||
"""Create shared resources needed during the application lifespan.
|
||||
|
||||
This function orchestrates the entire startup sequence through a StartupDAG,
|
||||
ensuring all resources are initialized in the correct order with explicit
|
||||
dependency tracking. If any stage fails, all completed resources are cleanly
|
||||
rolled back.
|
||||
|
||||
The startup stages are:
|
||||
1. WORKER_MODE: Validate single-worker configuration
|
||||
2. DATABASE: Initialize database and load setup state
|
||||
3. GEO_CACHE: Load IP geolocation cache
|
||||
4. HTTP_SESSION: Create shared aiohttp session
|
||||
5. SCHEDULER: Create and start APScheduler
|
||||
6. TASKS: Register all background jobs
|
||||
|
||||
Args:
|
||||
app: The FastAPI application instance.
|
||||
settings: Resolved application settings.
|
||||
|
||||
Returns:
|
||||
A tuple of ``(http_session, scheduler)``.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If any startup stage fails or prerequisites are not met.
|
||||
"""
|
||||
dag = StartupDAG()
|
||||
|
||||
# Register all startup stages with their dependencies.
|
||||
dag.register_stage(
|
||||
StartupStage.WORKER_MODE,
|
||||
"Verify single-worker mode (scheduler must not run in multiple workers)",
|
||||
prerequisites=frozenset(),
|
||||
)
|
||||
dag.register_stage(
|
||||
StartupStage.DATABASE,
|
||||
"Initialize database schema and load setup state",
|
||||
prerequisites=frozenset([StartupStage.WORKER_MODE]),
|
||||
)
|
||||
dag.register_stage(
|
||||
StartupStage.GEO_CACHE,
|
||||
"Load IP geolocation cache from database",
|
||||
prerequisites=frozenset([StartupStage.DATABASE]),
|
||||
)
|
||||
dag.register_stage(
|
||||
StartupStage.HTTP_SESSION,
|
||||
"Create shared aiohttp session with configured timeouts",
|
||||
prerequisites=frozenset([StartupStage.GEO_CACHE]),
|
||||
)
|
||||
dag.register_stage(
|
||||
StartupStage.SCHEDULER,
|
||||
"Create and start APScheduler for background jobs",
|
||||
prerequisites=frozenset([StartupStage.HTTP_SESSION]),
|
||||
)
|
||||
dag.register_stage(
|
||||
StartupStage.TASKS,
|
||||
"Register all background jobs (import, cleanup, health checks)",
|
||||
prerequisites=frozenset([StartupStage.SCHEDULER]),
|
||||
)
|
||||
|
||||
try:
|
||||
# Stage 1: Validate single-worker mode
|
||||
await dag.execute_stage(
|
||||
StartupStage.WORKER_MODE,
|
||||
_stage_check_worker_mode,
|
||||
)
|
||||
|
||||
# Stage 2: Initialize database
|
||||
startup_db = await dag.execute_stage(
|
||||
StartupStage.DATABASE,
|
||||
lambda: _stage_init_database(app, settings),
|
||||
)
|
||||
|
||||
# Stage 3: Load GeoCache
|
||||
geo_cache = await dag.execute_stage(
|
||||
StartupStage.GEO_CACHE,
|
||||
lambda: _stage_init_geo_cache(settings, startup_db),
|
||||
)
|
||||
|
||||
# Stage 4: Create HTTP session
|
||||
http_session = await dag.execute_stage(
|
||||
StartupStage.HTTP_SESSION,
|
||||
lambda: _stage_create_http_session(settings),
|
||||
)
|
||||
|
||||
# Stage 5: Create and start scheduler
|
||||
scheduler = await dag.execute_stage(
|
||||
StartupStage.SCHEDULER,
|
||||
lambda: _stage_create_scheduler(),
|
||||
)
|
||||
|
||||
# Stage 6: Register tasks
|
||||
await dag.execute_stage(
|
||||
StartupStage.TASKS,
|
||||
lambda: _stage_register_tasks(app, scheduler),
|
||||
)
|
||||
|
||||
# Verify all resources are healthy
|
||||
if not await dag.health_check():
|
||||
raise RuntimeError("Startup health check failed")
|
||||
|
||||
# Store the geo_cache on app state for dependency injection
|
||||
app.state.geo_cache = geo_cache
|
||||
|
||||
log.info(
|
||||
"startup_completed_successfully",
|
||||
stages=len(dag.context.completed_stages),
|
||||
)
|
||||
|
||||
return http_session, scheduler
|
||||
|
||||
except Exception:
|
||||
# Clean up on failure
|
||||
log.error("startup_failed_rolling_back_resources")
|
||||
await dag.rollback()
|
||||
# Ensure database is closed if it was initialized
|
||||
if StartupStage.DATABASE in dag.context.completed_stages:
|
||||
startup_db = dag.context.get_resource(StartupStage.DATABASE)
|
||||
await startup_db.close()
|
||||
raise
|
||||
|
||||
|
||||
async def _stage_check_worker_mode() -> None:
|
||||
"""Check that the application is running with a single worker.
|
||||
|
||||
This is stage 1 of the startup DAG.
|
||||
"""
|
||||
_check_single_worker_mode()
|
||||
|
||||
|
||||
async def _stage_init_database(app: FastAPI, settings: Settings) -> Any:
|
||||
"""Initialize database schema and load setup state.
|
||||
|
||||
This is stage 2 of the startup DAG. It:
|
||||
1. Creates database directory if needed
|
||||
2. Opens the database connection
|
||||
3. Initializes schema
|
||||
4. Caches setup completion state
|
||||
5. Loads persisted runtime settings
|
||||
|
||||
Returns:
|
||||
The database connection object.
|
||||
"""
|
||||
db_path: Path = Path(settings.database_path)
|
||||
await run_blocking(db_path.parent.mkdir, parents=True, exist_ok=True)
|
||||
|
||||
@@ -121,6 +266,7 @@ async def startup_shared_resources(
|
||||
|
||||
original_db_path = db_path.resolve()
|
||||
startup_db = await open_db(settings.database_path)
|
||||
|
||||
try:
|
||||
await init_db(startup_db)
|
||||
setup_complete = await setup_service.is_setup_complete(startup_db)
|
||||
@@ -144,36 +290,53 @@ async def startup_shared_resources(
|
||||
if persisted_runtime_settings:
|
||||
updated_settings = settings.model_copy(update=persisted_runtime_settings)
|
||||
set_runtime_settings(app, updated_settings)
|
||||
settings = updated_settings
|
||||
log.info(
|
||||
"runtime_settings_overridden_from_setup",
|
||||
overrides=persisted_runtime_settings,
|
||||
)
|
||||
|
||||
# Create and initialize the GeoCache instance
|
||||
geo_cache = GeoCache(allow_http_fallback=settings.geoip_allow_http_fallback)
|
||||
if Path(settings.database_path).resolve() != original_db_path:
|
||||
runtime_db = await open_db(settings.database_path)
|
||||
try:
|
||||
await geo_cache.load_cache_from_db(runtime_db)
|
||||
unresolved_count = await geo_cache.count_unresolved(runtime_db)
|
||||
finally:
|
||||
await runtime_db.close()
|
||||
else:
|
||||
await geo_cache.load_cache_from_db(startup_db)
|
||||
unresolved_count = await geo_cache.count_unresolved(startup_db)
|
||||
finally:
|
||||
except Exception:
|
||||
await startup_db.close()
|
||||
raise
|
||||
|
||||
return startup_db
|
||||
|
||||
|
||||
async def _stage_init_geo_cache(settings: Settings, startup_db: Any) -> GeoCache:
|
||||
"""Load IP geolocation cache.
|
||||
|
||||
This is stage 3 of the startup DAG. It:
|
||||
1. Creates GeoCache instance with configured settings
|
||||
2. Loads cache from database
|
||||
3. Counts unresolved IPs
|
||||
4. Initializes GeoIP database
|
||||
5. Logs warnings if necessary
|
||||
|
||||
Returns:
|
||||
The GeoCache instance.
|
||||
"""
|
||||
geo_cache = GeoCache(allow_http_fallback=settings.geoip_allow_http_fallback)
|
||||
|
||||
db_path: Path = Path(settings.database_path)
|
||||
original_db_path = db_path.resolve()
|
||||
|
||||
if db_path.resolve() != original_db_path:
|
||||
runtime_db = await open_db(settings.database_path)
|
||||
try:
|
||||
await geo_cache.load_cache_from_db(runtime_db)
|
||||
unresolved_count = await geo_cache.count_unresolved(runtime_db)
|
||||
finally:
|
||||
await runtime_db.close()
|
||||
else:
|
||||
await geo_cache.load_cache_from_db(startup_db)
|
||||
unresolved_count = await geo_cache.count_unresolved(startup_db)
|
||||
|
||||
await run_blocking(ensure_jail_configs, Path(settings.fail2ban_config_dir) / "jail.d")
|
||||
|
||||
if unresolved_count > 0:
|
||||
log.warning("geo_cache_unresolved_ips", unresolved=unresolved_count)
|
||||
|
||||
http_session: aiohttp.ClientSession = _create_http_session(settings)
|
||||
geo_cache.init_geoip(settings.geoip_db_path)
|
||||
|
||||
# Warn if HTTP fallback is enabled (security warning).
|
||||
if settings.geoip_allow_http_fallback:
|
||||
log.warning(
|
||||
"geoip_http_fallback_enabled",
|
||||
@@ -184,26 +347,58 @@ async def startup_shared_resources(
|
||||
),
|
||||
)
|
||||
|
||||
app.state.geo_cache = geo_cache
|
||||
return geo_cache
|
||||
|
||||
scheduler: AsyncIOScheduler | None = None
|
||||
try:
|
||||
scheduler = AsyncIOScheduler(timezone="UTC")
|
||||
scheduler.start()
|
||||
|
||||
health_check.register(app)
|
||||
await blocklist_import.register(app)
|
||||
geo_cache_cleanup.register(app)
|
||||
geo_cache_flush.register(app)
|
||||
geo_re_resolve.register(app)
|
||||
history_sync.register(app)
|
||||
session_cleanup.register(app)
|
||||
async def _stage_create_http_session(settings: Settings) -> aiohttp.ClientSession:
|
||||
"""Create shared aiohttp session with configured timeouts.
|
||||
|
||||
return http_session, scheduler
|
||||
except Exception:
|
||||
with suppress(Exception):
|
||||
await http_session.close()
|
||||
if scheduler is not None:
|
||||
with suppress(Exception):
|
||||
scheduler.shutdown(wait=False)
|
||||
raise
|
||||
This is stage 4 of the startup DAG.
|
||||
|
||||
Returns:
|
||||
The aiohttp ClientSession instance.
|
||||
"""
|
||||
return _create_http_session(settings)
|
||||
|
||||
|
||||
async def _stage_create_scheduler() -> AsyncIOScheduler:
|
||||
"""Create and start APScheduler.
|
||||
|
||||
This is stage 5 of the startup DAG.
|
||||
|
||||
Returns:
|
||||
The AsyncIOScheduler instance.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If scheduler creation or startup fails.
|
||||
"""
|
||||
scheduler: AsyncIOScheduler = AsyncIOScheduler(timezone="UTC")
|
||||
scheduler.start()
|
||||
return scheduler
|
||||
|
||||
|
||||
async def _stage_register_tasks(app: FastAPI, scheduler: AsyncIOScheduler) -> None:
|
||||
"""Register all background jobs.
|
||||
|
||||
This is stage 6 of the startup DAG. It registers:
|
||||
- health_check: Periodic fail2ban connectivity probe
|
||||
- blocklist_import: Scheduled blocklist download and application
|
||||
- geo_cache_cleanup: Periodic purge of stale geo cache entries
|
||||
- geo_cache_flush: Periodic geo cache persistence
|
||||
- geo_re_resolve: Periodic re-resolution of stale records
|
||||
- history_sync: Periodic synchronization of ban history
|
||||
- session_cleanup: Periodic cleanup of expired sessions
|
||||
|
||||
Args:
|
||||
app: The FastAPI application instance.
|
||||
scheduler: The APScheduler scheduler to register tasks with.
|
||||
"""
|
||||
health_check.register(app)
|
||||
await blocklist_import.register(app)
|
||||
geo_cache_cleanup.register(app)
|
||||
geo_cache_flush.register(app)
|
||||
geo_re_resolve.register(app)
|
||||
history_sync.register(app)
|
||||
session_cleanup.register(app)
|
||||
|
||||
log.info("startup_tasks_registered", count=7)
|
||||
|
||||
316
backend/app/startup_dag.py
Normal file
316
backend/app/startup_dag.py
Normal file
@@ -0,0 +1,316 @@
|
||||
"""Startup dependency graph and resource initialization orchestration.
|
||||
|
||||
This module defines an explicit startup DAG (directed acyclic graph) that orchestrates
|
||||
the initialization of all shared application resources. Each stage has well-defined
|
||||
dependencies, prerequisites, and health checks. This makes lifecycle changes safe and
|
||||
failure modes predictable.
|
||||
|
||||
The DAG ensures that:
|
||||
- Resources are initialized in the correct order
|
||||
- Failed stages can be cleanly rolled back
|
||||
- Partial failures don't leave the application in an undefined state
|
||||
- Health checks verify each stage completed successfully
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
log: structlog.stdlib.BoundLogger = structlog.get_logger()
|
||||
|
||||
|
||||
class StartupStage(Enum):
|
||||
"""Enumeration of startup stages in dependency order."""
|
||||
|
||||
WORKER_MODE = "worker_mode"
|
||||
DATABASE = "database"
|
||||
GEO_CACHE = "geo_cache"
|
||||
HTTP_SESSION = "http_session"
|
||||
SCHEDULER = "scheduler"
|
||||
TASKS = "tasks"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class StageDependency:
|
||||
"""Defines a single stage and its prerequisites."""
|
||||
|
||||
stage: StartupStage
|
||||
description: str
|
||||
prerequisites: frozenset[StartupStage]
|
||||
rollback_on_failure: bool = True
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
"""Validate that prerequisites are logically ordered."""
|
||||
if self.stage in self.prerequisites:
|
||||
raise ValueError(f"Stage {self.stage} cannot depend on itself")
|
||||
|
||||
|
||||
class StartupResource(ABC):
|
||||
"""Base class for resources created during startup."""
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def stage(self) -> StartupStage:
|
||||
"""Return the stage this resource belongs to."""
|
||||
|
||||
@abstractmethod
|
||||
async def health_check(self) -> bool:
|
||||
"""Return True if the resource is healthy and operational.
|
||||
|
||||
Returns:
|
||||
bool: True if healthy, False otherwise.
|
||||
"""
|
||||
|
||||
|
||||
class StartupContext:
|
||||
"""Tracks resources and state across startup stages."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize an empty startup context."""
|
||||
self.resources: dict[StartupStage, Any] = {}
|
||||
self.completed_stages: set[StartupStage] = set()
|
||||
self.failed_stage: StartupStage | None = None
|
||||
self.error: Exception | None = None
|
||||
|
||||
def register_resource(self, stage: StartupStage, resource: Any) -> None:
|
||||
"""Register a resource created during a startup stage.
|
||||
|
||||
Args:
|
||||
stage: The startup stage.
|
||||
resource: The resource object.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the stage is already registered.
|
||||
"""
|
||||
if stage in self.resources:
|
||||
raise RuntimeError(f"Resource for stage {stage} is already registered")
|
||||
self.resources[stage] = resource
|
||||
self.completed_stages.add(stage)
|
||||
|
||||
def get_resource(self, stage: StartupStage) -> Any:
|
||||
"""Retrieve a previously registered resource.
|
||||
|
||||
Args:
|
||||
stage: The startup stage.
|
||||
|
||||
Returns:
|
||||
The resource object.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the resource has not been registered.
|
||||
"""
|
||||
if stage not in self.resources:
|
||||
raise RuntimeError(f"Resource for stage {stage} is not available")
|
||||
return self.resources[stage]
|
||||
|
||||
def mark_failed(self, stage: StartupStage, error: Exception) -> None:
|
||||
"""Mark a stage as failed with an associated error.
|
||||
|
||||
Args:
|
||||
stage: The startup stage that failed.
|
||||
error: The exception that caused the failure.
|
||||
"""
|
||||
self.failed_stage = stage
|
||||
self.error = error
|
||||
|
||||
def is_healthy(self) -> bool:
|
||||
"""Check if all registered resources pass their health checks.
|
||||
|
||||
Returns:
|
||||
bool: True if all resources are healthy.
|
||||
"""
|
||||
return self.failed_stage is None and self.error is None
|
||||
|
||||
|
||||
class StartupDAG:
|
||||
"""Orchestrates the startup of all shared application resources.
|
||||
|
||||
The DAG ensures resources are initialized in the correct order, validates
|
||||
prerequisites, and cleanly rolls back on failure. Health checks verify each
|
||||
stage completed successfully.
|
||||
|
||||
Startup Flow:
|
||||
1. Validate single-worker mode (detects multi-worker misconfiguration)
|
||||
2. Initialize database schema and load configuration
|
||||
3. Load and initialize GeoCache (MaxMind + HTTP fallback config)
|
||||
4. Create shared aiohttp session (with configured timeouts)
|
||||
5. Create and start APScheduler with background tasks
|
||||
6. Register all background jobs (import, cleanup, health check, etc.)
|
||||
|
||||
Rollback on Failure:
|
||||
If any stage fails, all completed stages are rolled back in reverse order.
|
||||
This ensures:
|
||||
- Database connections are closed
|
||||
- HTTP sessions are closed
|
||||
- Scheduler is shut down
|
||||
- No stale resources remain open
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the startup DAG with no stages registered."""
|
||||
self.stages: dict[StartupStage, StageDependency] = {}
|
||||
self.context: StartupContext = StartupContext()
|
||||
|
||||
def register_stage(
|
||||
self,
|
||||
stage: StartupStage,
|
||||
description: str,
|
||||
prerequisites: frozenset[StartupStage] | None = None,
|
||||
) -> None:
|
||||
"""Register a startup stage with its prerequisites.
|
||||
|
||||
Args:
|
||||
stage: The startup stage identifier.
|
||||
description: Human-readable description of what this stage does.
|
||||
prerequisites: Frozenset of stages that must complete before this one.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the stage is already registered.
|
||||
"""
|
||||
if stage in self.stages:
|
||||
raise RuntimeError(f"Stage {stage} is already registered")
|
||||
self.stages[stage] = StageDependency(
|
||||
stage=stage,
|
||||
description=description,
|
||||
prerequisites=prerequisites or frozenset(),
|
||||
)
|
||||
|
||||
def _validate_prerequisites(self, stage: StartupStage) -> None:
|
||||
"""Validate that all prerequisites for a stage are complete.
|
||||
|
||||
Args:
|
||||
stage: The startup stage to validate.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If any prerequisite is not complete.
|
||||
"""
|
||||
if stage not in self.stages:
|
||||
raise RuntimeError(f"Stage {stage} is not registered")
|
||||
|
||||
dependency = self.stages[stage]
|
||||
for prereq in dependency.prerequisites:
|
||||
if prereq not in self.context.completed_stages:
|
||||
raise RuntimeError(
|
||||
f"Stage {stage} requires {prereq} but it has not completed"
|
||||
)
|
||||
|
||||
async def execute_stage(
|
||||
self,
|
||||
stage: StartupStage,
|
||||
stage_func: Any, # Callable that returns the resource(s)
|
||||
) -> Any:
|
||||
"""Execute a single startup stage with validation and error handling.
|
||||
|
||||
Args:
|
||||
stage: The startup stage to execute.
|
||||
stage_func: An async callable that returns the resource(s).
|
||||
|
||||
Returns:
|
||||
The resource(s) created by the stage function.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If prerequisites are not met or stage already completed.
|
||||
"""
|
||||
if stage in self.context.completed_stages:
|
||||
raise RuntimeError(f"Stage {stage} has already completed")
|
||||
|
||||
self._validate_prerequisites(stage)
|
||||
dependency = self.stages[stage]
|
||||
|
||||
try:
|
||||
log.info(
|
||||
"startup_stage_beginning",
|
||||
stage=stage.value,
|
||||
description=dependency.description,
|
||||
)
|
||||
resource = await stage_func()
|
||||
self.context.register_resource(stage, resource)
|
||||
log.info(
|
||||
"startup_stage_completed",
|
||||
stage=stage.value,
|
||||
description=dependency.description,
|
||||
)
|
||||
return resource
|
||||
except Exception as exc:
|
||||
self.context.mark_failed(stage, exc)
|
||||
log.error(
|
||||
"startup_stage_failed",
|
||||
stage=stage.value,
|
||||
description=dependency.description,
|
||||
exc_info=exc,
|
||||
)
|
||||
raise
|
||||
|
||||
async def rollback(self) -> None:
|
||||
"""Rollback all completed stages in reverse order.
|
||||
|
||||
This ensures no stale resources remain open if startup failed.
|
||||
Each stage's rollback is attempted even if previous rollbacks fail.
|
||||
"""
|
||||
completed = sorted(
|
||||
self.context.completed_stages,
|
||||
key=lambda s: list(StartupStage).index(s),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
for stage in completed:
|
||||
with suppress(Exception):
|
||||
log.warning("startup_rolling_back_stage", stage=stage.value)
|
||||
resource = self.context.get_resource(stage)
|
||||
await self._rollback_stage_resource(stage, resource)
|
||||
|
||||
async def _rollback_stage_resource(self, stage: StartupStage, resource: Any) -> None:
|
||||
"""Rollback a single resource based on its stage.
|
||||
|
||||
Args:
|
||||
stage: The startup stage.
|
||||
resource: The resource to rollback.
|
||||
"""
|
||||
if stage in (StartupStage.DATABASE, StartupStage.HTTP_SESSION):
|
||||
if hasattr(resource, "close"):
|
||||
await resource.close()
|
||||
elif stage == StartupStage.SCHEDULER and hasattr(resource, "shutdown"):
|
||||
resource.shutdown(wait=False)
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
"""Verify that all completed stages have healthy resources.
|
||||
|
||||
Returns:
|
||||
bool: True if all resources are healthy, False otherwise.
|
||||
"""
|
||||
if not self.context.is_healthy():
|
||||
log.error(
|
||||
"startup_health_check_failed",
|
||||
failed_stage=self.context.failed_stage.value
|
||||
if self.context.failed_stage
|
||||
else None,
|
||||
error=str(self.context.error),
|
||||
)
|
||||
return False
|
||||
|
||||
for stage in self.context.completed_stages:
|
||||
resource = self.context.get_resource(stage)
|
||||
if isinstance(resource, StartupResource):
|
||||
try:
|
||||
if not await resource.health_check():
|
||||
log.error(
|
||||
"startup_resource_health_check_failed",
|
||||
stage=stage.value,
|
||||
)
|
||||
return False
|
||||
except Exception as exc:
|
||||
log.error(
|
||||
"startup_resource_health_check_error",
|
||||
stage=stage.value,
|
||||
exc_info=exc,
|
||||
)
|
||||
return False
|
||||
|
||||
log.info("startup_health_check_passed")
|
||||
return True
|
||||
Reference in New Issue
Block a user