"""Startup dependency graph and resource initialization orchestration. This module defines an explicit startup DAG (directed acyclic graph) that orchestrates the initialization of all shared application resources. Each stage has well-defined dependencies, prerequisites, and health checks. This makes lifecycle changes safe and failure modes predictable. The DAG ensures that: - Resources are initialized in the correct order - Failed stages can be cleanly rolled back - Partial failures don't leave the application in an undefined state - Health checks verify each stage completed successfully """ from __future__ import annotations from abc import ABC, abstractmethod from contextlib import suppress from dataclasses import dataclass from enum import Enum from typing import Any import structlog log: structlog.stdlib.BoundLogger = structlog.get_logger() class StartupStage(Enum): """Enumeration of startup stages in dependency order.""" WORKER_MODE = "worker_mode" DATABASE = "database" GEO_CACHE = "geo_cache" HTTP_SESSION = "http_session" SCHEDULER = "scheduler" TASKS = "tasks" @dataclass(frozen=True) class StageDependency: """Defines a single stage and its prerequisites.""" stage: StartupStage description: str prerequisites: frozenset[StartupStage] rollback_on_failure: bool = True def __post_init__(self) -> None: """Validate that prerequisites are logically ordered.""" if self.stage in self.prerequisites: raise ValueError(f"Stage {self.stage} cannot depend on itself") class StartupResource(ABC): """Base class for resources created during startup.""" @property @abstractmethod def stage(self) -> StartupStage: """Return the stage this resource belongs to.""" @abstractmethod async def health_check(self) -> bool: """Return True if the resource is healthy and operational. Returns: bool: True if healthy, False otherwise. """ class StartupContext: """Tracks resources and state across startup stages.""" def __init__(self) -> None: """Initialize an empty startup context.""" self.resources: dict[StartupStage, Any] = {} self.completed_stages: set[StartupStage] = set() self.failed_stage: StartupStage | None = None self.error: Exception | None = None def register_resource(self, stage: StartupStage, resource: Any) -> None: """Register a resource created during a startup stage. Args: stage: The startup stage. resource: The resource object. Raises: RuntimeError: If the stage is already registered. """ if stage in self.resources: raise RuntimeError(f"Resource for stage {stage} is already registered") self.resources[stage] = resource self.completed_stages.add(stage) def get_resource(self, stage: StartupStage) -> Any: """Retrieve a previously registered resource. Args: stage: The startup stage. Returns: The resource object. Raises: RuntimeError: If the resource has not been registered. """ if stage not in self.resources: raise RuntimeError(f"Resource for stage {stage} is not available") return self.resources[stage] def mark_failed(self, stage: StartupStage, error: Exception) -> None: """Mark a stage as failed with an associated error. Args: stage: The startup stage that failed. error: The exception that caused the failure. """ self.failed_stage = stage self.error = error def is_healthy(self) -> bool: """Check if all registered resources pass their health checks. Returns: bool: True if all resources are healthy. """ return self.failed_stage is None and self.error is None class StartupDAG: """Orchestrates the startup of all shared application resources. The DAG ensures resources are initialized in the correct order, validates prerequisites, and cleanly rolls back on failure. Health checks verify each stage completed successfully. Startup Flow: 1. Validate single-worker mode (detects multi-worker misconfiguration) 2. Initialize database schema and load configuration 3. Load and initialize GeoCache (MaxMind + HTTP fallback config) 4. Create shared aiohttp session (with configured timeouts) 5. Create and start APScheduler with background tasks 6. Register all background jobs (import, cleanup, health check, etc.) Rollback on Failure: If any stage fails, all completed stages are rolled back in reverse order. This ensures: - Database connections are closed - HTTP sessions are closed - Scheduler is shut down - No stale resources remain open """ def __init__(self) -> None: """Initialize the startup DAG with no stages registered.""" self.stages: dict[StartupStage, StageDependency] = {} self.context: StartupContext = StartupContext() def register_stage( self, stage: StartupStage, description: str, prerequisites: frozenset[StartupStage] | None = None, ) -> None: """Register a startup stage with its prerequisites. Args: stage: The startup stage identifier. description: Human-readable description of what this stage does. prerequisites: Frozenset of stages that must complete before this one. Raises: RuntimeError: If the stage is already registered. """ if stage in self.stages: raise RuntimeError(f"Stage {stage} is already registered") self.stages[stage] = StageDependency( stage=stage, description=description, prerequisites=prerequisites or frozenset(), ) def _validate_prerequisites(self, stage: StartupStage) -> None: """Validate that all prerequisites for a stage are complete. Args: stage: The startup stage to validate. Raises: RuntimeError: If any prerequisite is not complete. """ if stage not in self.stages: raise RuntimeError(f"Stage {stage} is not registered") dependency = self.stages[stage] for prereq in dependency.prerequisites: if prereq not in self.context.completed_stages: raise RuntimeError( f"Stage {stage} requires {prereq} but it has not completed" ) async def execute_stage( self, stage: StartupStage, stage_func: Any, # Callable that returns the resource(s) ) -> Any: """Execute a single startup stage with validation and error handling. Args: stage: The startup stage to execute. stage_func: An async callable that returns the resource(s). Returns: The resource(s) created by the stage function. Raises: RuntimeError: If prerequisites are not met or stage already completed. """ if stage in self.context.completed_stages: raise RuntimeError(f"Stage {stage} has already completed") self._validate_prerequisites(stage) dependency = self.stages[stage] try: log.info( "startup_stage_beginning", stage=stage.value, description=dependency.description, ) resource = await stage_func() self.context.register_resource(stage, resource) log.info( "startup_stage_completed", stage=stage.value, description=dependency.description, ) return resource except Exception as exc: self.context.mark_failed(stage, exc) log.error( "startup_stage_failed", stage=stage.value, description=dependency.description, exc_info=exc, ) raise async def rollback(self) -> None: """Rollback all completed stages in reverse order. This ensures no stale resources remain open if startup failed. Each stage's rollback is attempted even if previous rollbacks fail. """ completed = sorted( self.context.completed_stages, key=lambda s: list(StartupStage).index(s), reverse=True, ) for stage in completed: with suppress(Exception): log.warning("startup_rolling_back_stage", stage=stage.value) resource = self.context.get_resource(stage) await self._rollback_stage_resource(stage, resource) async def _rollback_stage_resource(self, stage: StartupStage, resource: Any) -> None: """Rollback a single resource based on its stage. Args: stage: The startup stage. resource: The resource to rollback. """ if stage in (StartupStage.DATABASE, StartupStage.HTTP_SESSION): if hasattr(resource, "close"): await resource.close() elif stage == StartupStage.SCHEDULER and hasattr(resource, "shutdown"): resource.shutdown(wait=False) async def health_check(self) -> bool: """Verify that all completed stages have healthy resources. Returns: bool: True if all resources are healthy, False otherwise. """ if not self.context.is_healthy(): log.error( "startup_health_check_failed", failed_stage=self.context.failed_stage.value if self.context.failed_stage else None, error=str(self.context.error), ) return False for stage in self.context.completed_stages: resource = self.context.get_resource(stage) if isinstance(resource, StartupResource): try: if not await resource.health_check(): log.error( "startup_resource_health_check_failed", stage=stage.value, ) return False except Exception as exc: log.error( "startup_resource_health_check_error", stage=stage.value, exc_info=exc, ) return False log.info("startup_health_check_passed") return True