- Remove structlog dependency from backend/pyproject.toml - Add app.utils.logging_compat shim for keyword-arg logging API - Add app.utils.json_formatter for JSON log output with extra fields - Update all backend modules to use logging_compat.get_logger() - Update docstrings in log_sanitizer.py and json_formatter.py - Update test comment in test_async_utils.py - Record 406 failing tests in Docs/Tasks.md for tracking
317 lines
11 KiB
Python
317 lines
11 KiB
Python
"""Startup dependency graph and resource initialization orchestration.

This module defines an explicit startup DAG (directed acyclic graph) that orchestrates
the initialization of all shared application resources. Each stage has well-defined
dependencies, prerequisites, and health checks. This makes lifecycle changes safe and
failure modes predictable.

The DAG ensures that:
- Resources are initialized in the correct order
- Failed stages can be cleanly rolled back
- Partial failures don't leave the application in an undefined state
- Health checks verify each stage completed successfully
"""
|
|
|
|
from __future__ import annotations

import inspect
from abc import ABC, abstractmethod
from collections.abc import Awaitable, Callable
from contextlib import suppress
from dataclasses import dataclass
from enum import Enum
from typing import Any

from app.utils.logging_compat import get_logger
|
|
|
|
# Module-level logger. Calls in this module pass an event name plus
# structured fields as keyword arguments (e.g. ``stage=...``).
log = get_logger(__name__)
|
|
|
|
|
|
class StartupStage(Enum):
    """Identifiers for the startup stages, declared in dependency order.

    The declaration order is significant: ``StartupDAG.rollback`` sorts the
    completed stages by their position in this enum and tears them down in
    reverse, so new members must be inserted where they belong in the
    startup sequence.
    """

    # Single-worker mode validation.
    WORKER_MODE = "worker_mode"
    # Database schema initialization and configuration loading.
    DATABASE = "database"
    # GeoCache loading and initialization.
    GEO_CACHE = "geo_cache"
    # Shared HTTP client session.
    HTTP_SESSION = "http_session"
    # Scheduler creation and start.
    SCHEDULER = "scheduler"
    # Registration of background jobs.
    TASKS = "tasks"
|
|
|
|
|
|
@dataclass(frozen=True)
class StageDependency:
    """Immutable description of one startup stage and the stages it requires.

    Attributes:
        stage: The stage being described.
        description: Human-readable summary of what the stage does.
        prerequisites: Stages that must have completed before this one runs.
            Any iterable is accepted and stored as a ``frozenset`` so the
            instance stays hashable.
        rollback_on_failure: Rollback flag, defaulting to True.
            NOTE(review): nothing in the visible part of this module reads
            this flag - confirm whether callers elsewhere rely on it.
    """

    stage: StartupStage
    description: str
    prerequisites: frozenset[StartupStage]
    rollback_on_failure: bool = True

    def __post_init__(self) -> None:
        """Normalize ``prerequisites`` and reject a self-referencing stage.

        Only the self-dependency case is checked here; cycles spanning
        several stages are not detected by this method.

        Raises:
            ValueError: If the stage lists itself as a prerequisite.
        """
        if not isinstance(self.prerequisites, frozenset):
            # frozen=True blocks ordinary attribute assignment, so the
            # normalized value has to be written via object.__setattr__.
            object.__setattr__(self, "prerequisites", frozenset(self.prerequisites))
        if self.stage in self.prerequisites:
            raise ValueError(f"Stage {self.stage} cannot depend on itself")
|
|
|
|
|
|
class StartupResource(ABC):
    """Abstract interface for a resource produced by a startup stage.

    Concrete subclasses must supply both the ``stage`` property and the
    ``health_check`` coroutine. ``StartupDAG.health_check`` only probes
    registered resources that are instances of this class.
    """

    @property
    @abstractmethod
    def stage(self) -> StartupStage:
        """Identify the startup stage that owns this resource."""

    @abstractmethod
    async def health_check(self) -> bool:
        """Report whether the resource is currently usable.

        Returns:
            bool: True when the resource is healthy, False otherwise.
        """
|
|
|
|
|
|
class StartupContext:
    """Mutable record of what startup has produced so far.

    Holds the resource created by each stage, the set of stages that have
    finished, and - when something went wrong - the failing stage together
    with the exception it raised.
    """

    def __init__(self) -> None:
        """Create a context with no resources, no completed stages and no failure."""
        self.resources: dict[StartupStage, Any] = {}
        self.completed_stages: set[StartupStage] = set()
        self.failed_stage: StartupStage | None = None
        self.error: Exception | None = None

    def register_resource(self, stage: StartupStage, resource: Any) -> None:
        """Store the resource produced by ``stage`` and mark the stage complete.

        Args:
            stage: The startup stage that produced the resource.
            resource: The object to remember for that stage.

        Raises:
            RuntimeError: If a resource was already stored for ``stage``.
        """
        duplicate = stage in self.resources
        if duplicate:
            raise RuntimeError(f"Resource for stage {stage} is already registered")

        # Storing the resource and marking completion always happen together.
        self.completed_stages.add(stage)
        self.resources[stage] = resource

    def get_resource(self, stage: StartupStage) -> Any:
        """Look up the resource stored for ``stage``.

        Args:
            stage: The startup stage whose resource is wanted.

        Returns:
            The object previously passed to ``register_resource``.

        Raises:
            RuntimeError: If nothing has been stored for ``stage``.
        """
        if stage in self.resources:
            return self.resources[stage]
        raise RuntimeError(f"Resource for stage {stage} is not available")

    def mark_failed(self, stage: StartupStage, error: Exception) -> None:
        """Record which stage failed and the exception responsible.

        Args:
            stage: The startup stage that failed.
            error: The exception that caused the failure.
        """
        self.failed_stage, self.error = stage, error

    def is_healthy(self) -> bool:
        """Tell whether any failure has been recorded on this context.

        This inspects only ``failed_stage`` and ``error``; it does not run
        the health checks of the registered resources.

        Returns:
            bool: True when neither a failed stage nor an error is set.
        """
        failure_recorded = self.failed_stage is not None or self.error is not None
        return not failure_recorded
|
|
|
|
|
|
class StartupDAG:
    """Orchestrates the startup of all shared application resources.

    The DAG validates prerequisites before each stage runs, records the
    resource each stage produces, and offers rollback and health-check
    helpers over the stages that completed.

    Startup Flow:
        1. Validate single-worker mode (detects multi-worker misconfiguration)
        2. Initialize database schema and load configuration
        3. Load and initialize GeoCache (MaxMind + HTTP fallback config)
        4. Create shared aiohttp session (with configured timeouts)
        5. Create and start APScheduler with background tasks
        6. Register all background jobs (import, cleanup, health check, etc.)

    Rollback on Failure:
        ``execute_stage`` records the failure and re-raises; it does not
        roll anything back itself. The caller is expected to invoke
        ``rollback()``, which walks the completed stages in reverse
        ``StartupStage`` declaration order and:
        - closes the DATABASE and HTTP_SESSION resources (``close()``)
        - shuts down the SCHEDULER resource (``shutdown(wait=False)``)
        Other stages have no teardown action here. A failing teardown is
        logged and does not stop the remaining stages from being rolled back.
    """

    def __init__(self) -> None:
        """Initialize the startup DAG with no stages registered."""
        self.stages: dict[StartupStage, StageDependency] = {}
        self.context: StartupContext = StartupContext()

    def register_stage(
        self,
        stage: StartupStage,
        description: str,
        prerequisites: frozenset[StartupStage] | None = None,
    ) -> None:
        """Register a startup stage with its prerequisites.

        Args:
            stage: The startup stage identifier.
            description: Human-readable description of what this stage does.
            prerequisites: Stages that must complete before this one. ``None``
                or an empty collection means the stage has no prerequisites.

        Raises:
            RuntimeError: If the stage is already registered.
            ValueError: If the stage lists itself as a prerequisite (raised
                by ``StageDependency``).
        """
        if stage in self.stages:
            raise RuntimeError(f"Stage {stage} is already registered")
        self.stages[stage] = StageDependency(
            stage=stage,
            description=description,
            # Always store an immutable, hashable collection even when the
            # caller handed over a plain set or list.
            prerequisites=frozenset(prerequisites or ()),
        )

    def _validate_prerequisites(self, stage: StartupStage) -> None:
        """Validate that all prerequisites for a stage are complete.

        Args:
            stage: The startup stage to validate.

        Raises:
            RuntimeError: If the stage is not registered or any prerequisite
                has not completed.
        """
        if stage not in self.stages:
            raise RuntimeError(f"Stage {stage} is not registered")

        dependency = self.stages[stage]
        for prereq in dependency.prerequisites:
            if prereq not in self.context.completed_stages:
                raise RuntimeError(
                    f"Stage {stage} requires {prereq} but it has not completed"
                )

    async def execute_stage(
        self,
        stage: StartupStage,
        stage_func: Callable[[], Awaitable[Any]],
    ) -> Any:
        """Execute a single startup stage with validation and error handling.

        Args:
            stage: The startup stage to execute.
            stage_func: A zero-argument async callable that returns the
                resource(s) created by the stage.

        Returns:
            The resource(s) created by the stage function.

        Raises:
            RuntimeError: If the stage already completed, is not registered,
                or has unmet prerequisites.
            Exception: Whatever ``stage_func`` raises is recorded on the
                context, logged, and re-raised unchanged.
        """
        if stage in self.context.completed_stages:
            raise RuntimeError(f"Stage {stage} has already completed")

        # Prerequisite problems are raised before the try block, so they are
        # not recorded as a stage failure on the context.
        self._validate_prerequisites(stage)
        dependency = self.stages[stage]

        try:
            log.info(
                "startup_stage_beginning",
                stage=stage.value,
                description=dependency.description,
            )
            resource = await stage_func()
            self.context.register_resource(stage, resource)
            log.info(
                "startup_stage_completed",
                stage=stage.value,
                description=dependency.description,
            )
            return resource
        except Exception as exc:
            self.context.mark_failed(stage, exc)
            log.error(
                "startup_stage_failed",
                stage=stage.value,
                description=dependency.description,
                exc_info=exc,
            )
            raise

    async def rollback(self) -> None:
        """Roll back all completed stages in reverse declaration order.

        Stages are ordered by their position in ``StartupStage`` (not by the
        order in which they actually completed) and processed last-to-first.
        Teardown is best-effort: an error while rolling back one stage is
        logged and the remaining stages are still attempted.
        """
        # Build the ordering once instead of once per sort key.
        definition_order = list(StartupStage)
        completed = sorted(
            self.context.completed_stages,
            key=definition_order.index,
            reverse=True,
        )

        for stage in completed:
            try:
                log.warning("startup_rolling_back_stage", stage=stage.value)
                resource = self.context.get_resource(stage)
                await self._rollback_stage_resource(stage, resource)
            except Exception as exc:  # noqa: BLE001 - rollback must keep going
                # Keep going, but leave a trace of what could not be cleaned up.
                log.error(
                    "startup_rollback_stage_failed",
                    stage=stage.value,
                    exc_info=exc,
                )

    async def _rollback_stage_resource(self, stage: StartupStage, resource: Any) -> None:
        """Roll back a single resource based on its stage.

        DATABASE and HTTP_SESSION resources are closed via ``close()``; the
        SCHEDULER resource is stopped via ``shutdown(wait=False)``. Resources
        lacking the relevant method, and all other stages, are left alone.
        The teardown call may be synchronous or return an awaitable; an
        awaitable result is awaited.

        Args:
            stage: The startup stage.
            resource: The resource to roll back.
        """
        if stage in (StartupStage.DATABASE, StartupStage.HTTP_SESSION):
            if hasattr(resource, "close"):
                outcome = resource.close()
                if inspect.isawaitable(outcome):
                    await outcome
        elif stage == StartupStage.SCHEDULER and hasattr(resource, "shutdown"):
            outcome = resource.shutdown(wait=False)
            if inspect.isawaitable(outcome):
                await outcome

    async def health_check(self) -> bool:
        """Verify that startup recorded no failure and resources are healthy.

        Only resources that are ``StartupResource`` instances are probed;
        any other registered object is skipped. The first unhealthy or
        erroring resource ends the check.

        Returns:
            bool: True if no failure is recorded and every probed resource
            reports healthy, False otherwise.
        """
        if not self.context.is_healthy():
            failed_stage = self.context.failed_stage
            error = self.context.error
            log.error(
                "startup_health_check_failed",
                failed_stage=failed_stage.value if failed_stage else None,
                # Avoid logging the literal string "None" when no error is set.
                error=str(error) if error is not None else None,
            )
            return False

        for stage in self.context.completed_stages:
            resource = self.context.get_resource(stage)
            if isinstance(resource, StartupResource):
                try:
                    if not await resource.health_check():
                        log.error(
                            "startup_resource_health_check_failed",
                            stage=stage.value,
                        )
                        return False
                except Exception as exc:
                    log.error(
                        "startup_resource_health_check_error",
                        stage=stage.value,
                        exc_info=exc,
                    )
                    return False

        log.info("startup_health_check_passed")
        return True
|