Refactor scheduler lock implementation with heartbeat mechanism

- Add heartbeat-based lock renewal in scheduler_lock_heartbeat.py
- Update scheduler_lock.py with improved lock management
- Add comprehensive tests for scheduler lock functionality
- Update deployment and task documentation

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-04-30 22:10:38 +02:00
parent f9e283541b
commit 05c3b564ae
5 changed files with 163 additions and 55 deletions

View File

@@ -27,8 +27,9 @@ if TYPE_CHECKING:
log: structlog.stdlib.BoundLogger = structlog.get_logger()
#: How often the heartbeat job fires (seconds). Must be less than the lock TTL.
SCHEDULER_LOCK_HEARTBEAT_INTERVAL: int = 10
#: How often the heartbeat job fires (seconds). Must be significantly less than
#: the lock TTL to allow multiple missed heartbeats before lock expiry.
SCHEDULER_LOCK_HEARTBEAT_INTERVAL: int = 5
#: Stable APScheduler job ID — ensures re-registration replaces, not duplicates.
JOB_ID: str = "scheduler_lock_heartbeat"
@@ -44,6 +45,10 @@ async def _update_heartbeat_with_resources(settings: Settings) -> None:
a warning but don't crash the scheduler. This allows the running
application to continue even if something went wrong.
The heartbeat must complete within TASK_TIMEOUT_SECONDS to prevent
scheduler starvation. If it exceeds this timeout, an error is logged
and the task is cancelled.
Args:
settings: The resolved application settings used for database access.
"""
@@ -57,10 +62,24 @@ async def _update_heartbeat_with_resources(settings: Settings) -> None:
else:
log.warning(
"scheduler_lock_heartbeat_failed",
message="Failed to update heartbeat; we may have lost the lock.",
message="Failed to update heartbeat; we no longer hold the lock. "
"Another instance may have taken over or the database connection failed.",
)
await run_with_timeout("scheduler_lock_heartbeat", _do_update(), TASK_TIMEOUT_SECONDS)
try:
await run_with_timeout("scheduler_lock_heartbeat", _do_update(), TASK_TIMEOUT_SECONDS)
except TimeoutError:
log.error(
"scheduler_lock_heartbeat_timeout",
timeout_seconds=TASK_TIMEOUT_SECONDS,
message="Heartbeat update exceeded timeout. The database may be slow or unresponsive.",
)
except Exception as e:
log.error(
"scheduler_lock_heartbeat_error",
error=str(e),
message="Unexpected error during heartbeat update.",
)
async def _update_heartbeat(app: FastAPI) -> None:

View File

@@ -51,11 +51,16 @@ log: structlog.stdlib.BoundLogger = structlog.get_logger()
# Lock record expires if heartbeat hasn't been updated for this many seconds.
# This prevents stale locks from a crashed instance from blocking new startups.
# Set conservatively to allow temporary delays (e.g., high load) before considering
# the lock abandoned.
SCHEDULER_LOCK_TTL_SECONDS: int = 60
# Heartbeat interval: how often to update the lock's heartbeat_at timestamp.
# Must be less than TTL to prevent premature expiration.
SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS: int = 10
# Must be significantly less than TTL (at least 3-4x smaller) to allow multiple
# consecutive missed heartbeats before the lock is considered stale.
# With TTL=60s and interval=5s, 12 heartbeat intervals fit within one TTL, so
# the lock tolerates about 11 consecutive missed heartbeats before expiring,
# providing robust protection against temporary delays.
SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS: int = 5
async def init_scheduler_lock_table(db: aiosqlite.Connection) -> None: