Files
BanGUI/backend/app/tasks/scheduler_lock_heartbeat.py
Lukas 0133489920 Update observability docs and task utilities
- Add Observability.md documentation
- Standardize task logging with correlation_id support
- Add log_sanitizer utility for PII masking
- Update Tasks.md tracking
- Update geo_cache tasks and other task modules with correlation_id

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-03 11:52:09 +02:00

136 lines
4.7 KiB
Python

"""Scheduler lock heartbeat background task.
Registers an APScheduler job that periodically updates the scheduler lock's
heartbeat timestamp. This prevents the lock from being considered stale
if the running instance experiences temporary delays or high load.
Without this heartbeat, stale lock detection (based on TTL) could incorrectly
determine that the scheduler instance has crashed when it's merely busy, and
a new instance could take over.
Correlation IDs are propagated through the task using :mod:`app.utils.correlation`
so that task logs can be correlated across runs.
"""
from __future__ import annotations
import uuid
from typing import TYPE_CHECKING
import structlog
from app.tasks.db import task_db
from app.tasks.timeout_utils import run_with_timeout
from app.utils.correlation import get_correlation_id, reset_correlation_id, set_correlation_id
from app.utils.runtime_state import get_effective_settings
from app.utils.scheduler_lock import update_scheduler_lock_heartbeat
if TYPE_CHECKING:
from fastapi import FastAPI
from app.config import Settings
# Module-level structlog logger used by every function in this module.
log: structlog.stdlib.BoundLogger = structlog.get_logger()
#: How often the heartbeat job fires (seconds). Must be significantly less than
#: the lock TTL to allow multiple missed heartbeats before lock expiry.
SCHEDULER_LOCK_HEARTBEAT_INTERVAL: int = 5
#: Stable APScheduler job ID — ensures re-registration replaces, not duplicates.
JOB_ID: str = "scheduler_lock_heartbeat"
# NOTE(review): this timeout equals SCHEDULER_LOCK_HEARTBEAT_INTERVAL (both 5 s),
# so a run that uses its full timeout is still in flight when the next interval
# fires — confirm this is intended or shorten the timeout.
#: Maximum seconds to allow for scheduler lock heartbeat to complete.
TASK_TIMEOUT_SECONDS: int = 5
async def _update_heartbeat_with_resources(
    settings: Settings,
    correlation_id: str | None = None,
) -> None:
    """Run one heartbeat update inside a correlation-ID context.

    A correlation ID is installed with ``set_correlation_id`` for the duration
    of the update and removed again with ``reset_correlation_id`` afterwards,
    whether or not the update raises.  The actual work (database access,
    timeout handling and logging) is delegated to
    ``_do_update_heartbeat_with_settings``.

    Args:
        settings: The resolved application settings used for database access.
        correlation_id: Correlation ID to attach to this run.  When ``None``
            a fresh random UUID4 string is generated; any other value,
            including an empty string, is used unchanged.
    """
    # Only ``None`` triggers generation of a new ID.
    effective_id = str(uuid.uuid4()) if correlation_id is None else correlation_id
    context_token = set_correlation_id(effective_id)
    try:
        await _do_update_heartbeat_with_settings(settings)
    finally:
        # Always hand the token back so the ID does not outlive this run.
        reset_correlation_id(context_token)
async def _do_update_heartbeat_with_settings(settings: Settings) -> None:
    """Perform one heartbeat update under a timeout and log the outcome.

    Opens a database handle via ``task_db(settings)``, calls
    ``update_scheduler_lock_heartbeat`` on it, and logs at debug level when
    the call reports success or at warning level when it does not.  The whole
    operation is handed to ``run_with_timeout`` with a limit of
    ``TASK_TIMEOUT_SECONDS`` seconds.

    Neither a ``TimeoutError`` nor any other :class:`Exception` propagates to
    the caller: both are logged at error level and swallowed, so a failed
    heartbeat does not crash the scheduler.  Unexpected errors are logged
    together with their traceback so they can be diagnosed from the logs.

    The correlation ID included in every log entry is read with
    ``get_correlation_id``; the caller is expected to have set it.

    Args:
        settings: The resolved application settings used for database access.
    """

    async def _do_update() -> None:
        # One short-lived database handle per heartbeat.
        async with task_db(settings) as db:
            success = await update_scheduler_lock_heartbeat(db)
            if success:
                log.debug(
                    "scheduler_lock_heartbeat_updated",
                    correlation_id=get_correlation_id(),
                )
            else:
                log.warning(
                    "scheduler_lock_heartbeat_failed",
                    correlation_id=get_correlation_id(),
                    message="Failed to update heartbeat; we no longer hold the lock. "
                    "Another instance may have taken over or the database connection failed.",
                )

    try:
        await run_with_timeout("scheduler_lock_heartbeat", _do_update(), TASK_TIMEOUT_SECONDS)
    except TimeoutError:
        log.error(
            "scheduler_lock_heartbeat_timeout",
            correlation_id=get_correlation_id(),
            timeout_seconds=TASK_TIMEOUT_SECONDS,
            message="Heartbeat update exceeded timeout. The database may be slow or unresponsive.",
        )
    except Exception as exc:
        # Deliberate catch-all: a heartbeat failure must not crash the
        # scheduler.  ``exc_info=True`` keeps the exception type and traceback
        # in the log entry; ``str(exc)`` alone is often empty or ambiguous.
        log.error(
            "scheduler_lock_heartbeat_error",
            correlation_id=get_correlation_id(),
            error=str(exc),
            message="Unexpected error during heartbeat update.",
            exc_info=True,
        )
async def _update_heartbeat(app: FastAPI) -> None:
    """Resolve the app's effective settings and run one heartbeat update.

    Unlike a job registered with pre-resolved settings, this wrapper calls
    ``get_effective_settings`` on every invocation.

    Args:
        app: The application instance passed to ``get_effective_settings``.
    """
    effective_settings = get_effective_settings(app)
    await _update_heartbeat_with_resources(effective_settings)
def register(app: FastAPI) -> None:
    """Schedule the scheduler lock heartbeat job, replacing any existing one.

    Must be called after the scheduler has been started (i.e., inside the
    lifespan handler, after ``scheduler.start()``).

    The effective settings are resolved once, here, and that same object is
    passed to every run of the job through its keyword arguments.  The job is
    added under the fixed ID ``JOB_ID`` with ``replace_existing=True``.

    Args:
        app: The :class:`fastapi.FastAPI` application instance whose
            ``app.state.scheduler`` will receive the job.
    """
    # Resolve settings before touching the scheduler (same order as before).
    job_kwargs = {"settings": get_effective_settings(app)}
    scheduler = app.state.scheduler
    scheduler.add_job(
        _update_heartbeat_with_resources,
        id=JOB_ID,
        trigger="interval",
        seconds=SCHEDULER_LOCK_HEARTBEAT_INTERVAL,
        kwargs=job_kwargs,
        replace_existing=True,
    )
    log.info(
        "scheduler_lock_heartbeat_scheduled",
        interval_seconds=SCHEDULER_LOCK_HEARTBEAT_INTERVAL,
    )