Files
BanGUI/backend/app/tasks/scheduler_lock_heartbeat.py
Lukas 05c3b564ae Refactor scheduler lock implementation with heartbeat mechanism
- Add heartbeat-based lock renewal in scheduler_lock_heartbeat.py
- Update scheduler_lock.py with improved lock management
- Add comprehensive tests for scheduler lock functionality
- Update deployment and task documentation

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-30 22:10:38 +02:00

112 lines
3.8 KiB
Python

"""Scheduler lock heartbeat background task.
Registers an APScheduler job that periodically updates the scheduler lock's
heartbeat timestamp. This prevents the lock from being considered stale
while the running instance is merely delayed or under high load.

Without this heartbeat, stale-lock detection (based on the lock TTL) could
incorrectly conclude that the scheduler instance has crashed when it is only
busy, and a new instance could take over.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import structlog
from app.tasks.db import task_db
from app.tasks.timeout_utils import run_with_timeout
from app.utils.runtime_state import get_effective_settings
from app.utils.scheduler_lock import update_scheduler_lock_heartbeat
if TYPE_CHECKING:
from fastapi import FastAPI
from app.config import Settings
# Module-level structlog logger shared by the functions in this file.
log: structlog.stdlib.BoundLogger = structlog.get_logger()

#: How often the heartbeat job fires (seconds); passed as ``seconds=`` to the
#: interval trigger in ``register``. Must be significantly less than the lock
#: TTL (defined outside this module) to allow multiple missed heartbeats
#: before lock expiry.
SCHEDULER_LOCK_HEARTBEAT_INTERVAL: int = 5

#: Stable APScheduler job ID — ensures re-registration replaces, not duplicates
#: (``register`` passes it as ``id=`` together with ``replace_existing=True``).
JOB_ID: str = "scheduler_lock_heartbeat"

#: Maximum seconds to allow for scheduler lock heartbeat to complete; handed to
#: ``run_with_timeout`` as the time budget for one update attempt.
#: NOTE(review): this equals SCHEDULER_LOCK_HEARTBEAT_INTERVAL, so an attempt
#: that uses its whole budget finishes just as the next run becomes due —
#: confirm this is intended.
TASK_TIMEOUT_SECONDS: int = 5
async def _update_heartbeat_with_resources(settings: Settings) -> None:
    """Refresh the scheduler lock heartbeat once, logging instead of raising.

    Opens a task database handle with ``task_db(settings)``, asks
    ``update_scheduler_lock_heartbeat`` to renew the heartbeat, and logs the
    outcome. The whole attempt is executed through ``run_with_timeout`` with
    a budget of ``TASK_TIMEOUT_SECONDS`` so that a slow database cannot hold
    up the scheduler.

    Failure handling, all of which keeps the application running:

    * a falsy result from the update is logged as a warning (the lock is
      presumably no longer held by this instance);
    * a ``TimeoutError`` is logged as an error;
    * any other ``Exception`` is logged as an error and swallowed.

    Args:
        settings: The resolved application settings used for database access.
    """

    async def _renew_once() -> None:
        # One renewal attempt: open the DB handle, update, report the result.
        async with task_db(settings) as connection:
            if await update_scheduler_lock_heartbeat(connection):
                log.debug("scheduler_lock_heartbeat_updated")
                return
            log.warning(
                "scheduler_lock_heartbeat_failed",
                message="Failed to update heartbeat; we no longer hold the lock. "
                "Another instance may have taken over or the database connection failed.",
            )

    attempt = _renew_once()
    try:
        await run_with_timeout("scheduler_lock_heartbeat", attempt, TASK_TIMEOUT_SECONDS)
    except TimeoutError:
        # Must precede the generic handler: TimeoutError is itself an Exception.
        log.error(
            "scheduler_lock_heartbeat_timeout",
            timeout_seconds=TASK_TIMEOUT_SECONDS,
            message="Heartbeat update exceeded timeout. The database may be slow or unresponsive.",
        )
    except Exception as exc:
        # Deliberate best-effort: a broken heartbeat must not take the job down.
        log.error(
            "scheduler_lock_heartbeat_error",
            error=str(exc),
            message="Unexpected error during heartbeat update.",
        )
async def _update_heartbeat(app: FastAPI) -> None:
    """Run one heartbeat update with the settings currently effective for *app*.

    Args:
        app: The FastAPI application whose effective settings are resolved
            through ``get_effective_settings`` at call time.
    """
    effective_settings = get_effective_settings(app)
    await _update_heartbeat_with_resources(effective_settings)
def register(app: FastAPI) -> None:
    """Schedule the scheduler lock heartbeat job, replacing any existing one.

    Must be called after the scheduler has been started (i.e., inside the
    lifespan handler, after ``scheduler.start()``).

    The effective settings are resolved once, here, and the same object is
    handed to every run of the job through its ``kwargs``.

    Args:
        app: The :class:`fastapi.FastAPI` application instance whose
            ``app.state.scheduler`` will receive the job.
    """
    job_kwargs = {"settings": get_effective_settings(app)}
    scheduler = app.state.scheduler

    # A fixed job id plus replace_existing=True makes repeated registration
    # overwrite the previous job instead of adding a second one.
    scheduler.add_job(
        _update_heartbeat_with_resources,
        trigger="interval",
        seconds=SCHEDULER_LOCK_HEARTBEAT_INTERVAL,
        kwargs=job_kwargs,
        id=JOB_ID,
        replace_existing=True,
    )

    log.info(
        "scheduler_lock_heartbeat_scheduled",
        interval_seconds=SCHEDULER_LOCK_HEARTBEAT_INTERVAL,
    )