384 lines
14 KiB
Python
384 lines
14 KiB
Python
"""Database-based scheduler lock for single-executor enforcement.
|
|
|
|
This module implements a database-backed lock mechanism that ensures only one
|
|
BanGUI instance runs the background scheduler, even in container orchestration
|
|
environments where multiple instances might start simultaneously.
|
|
|
|
The lock uses atomic database operations to prevent race conditions:
|
|
- Lock acquisition is atomic: INSERT ... ON CONFLICT with BEGIN IMMEDIATE transaction
|
|
- Lock stealing: If heartbeat exceeds timeout, lock can be taken by another instance
|
|
- Heartbeat update is conditional: UPDATE only if we still hold the lock
|
|
- Stale lock detection uses heartbeat timestamps with configurable timeout
|
|
|
|
This approach is more reliable than filesystem-based locking in containerized
|
|
environments because:
|
|
1. Database transactions are atomic (no TOCTOU race windows)
|
|
2. No NFS/network filesystem edge cases
|
|
3. Stale lock detection is timestamp-based, not PID-based (PID reuse is unreliable)
|
|
4. Works across container restarts and rolling deployments
|
|
|
|
The lock record stores:
|
|
- id: Always 1 (singleton table)
|
|
- pid: Process ID of the lock holder
|
|
- hostname: Container/host name for debugging
|
|
- created_at: When the lock was first acquired
|
|
- heartbeat_at: When the lock was last confirmed alive (updated periodically)
|
|
- heartbeat_timeout: Seconds after which lock is considered stale (default 300)
|
|
|
|
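On startup:
1. Open a BEGIN IMMEDIATE transaction and read the existing lock row, if any
2. If another instance holds a lock whose heartbeat is still within heartbeat_timeout, roll back and report that the lock is unavailable
3. Otherwise (no lock, a stale lock, or a lock already held by this process), write the row with INSERT ... ON CONFLICT(id) DO UPDATE and commit
4. If the write raises IntegrityError, the lock is likewise treated as held by another instance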
|
|
|
|
On running (periodic):
|
|
- Update heartbeat_at to keep the lock alive and prevent false positives
|
|
|
|
On shutdown:
|
|
- Delete the lock (this instance is no longer running the scheduler)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import socket
|
|
import time
|
|
from typing import Any
|
|
|
|
import aiosqlite
|
|
|
|
from app.utils.logging_compat import get_logger
|
|
|
|
# Module-level logger; every call in this module passes an event name plus
# keyword fields.
log = get_logger(__name__)

# Lock record expires if heartbeat hasn't been updated for this many seconds.
# This prevents stale locks from a crashed instance from blocking new startups.
# Set conservatively to allow temporary delays (e.g., high load) before considering
# the lock abandoned.
# NOTE(review): none of the functions visible in this module read this constant;
# their staleness checks use the heartbeat_timeout value stored on the lock row,
# which is written from SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS. Confirm whether
# other modules depend on this value before changing or removing it.
SCHEDULER_LOCK_TTL_SECONDS: int = 60

# Heartbeat interval: how often to update the lock's heartbeat_at timestamp.
# Must be significantly less than TTL (at least 3-4x smaller) to allow multiple
# consecutive missed heartbeats before the lock is considered stale.
# With TTL=60s and interval=5s, the lock survives ~12 missed heartbeats before
# expiring, providing robust protection against temporary delays.
# NOTE(review): relative to the 300s timeout stored on the lock row, the margin
# is 300 / 5 = 60 intervals rather than 12.
SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS: int = 5

# Default heartbeat timeout: how long to wait before considering a lock stale
# when another instance tries to acquire it. This is the max time a lock holder
# can go without updating heartbeat before someone else can steal it.
# This value is written to the heartbeat_timeout column each time the lock is
# acquired and is also the column's schema default.
SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS: int = 300
|
|
|
|
|
|
async def init_scheduler_lock_table(db: aiosqlite.Connection) -> None:
    """Create the scheduler_lock table if it doesn't exist.

    This is called during database schema initialization and is safe to call
    multiple times (CREATE TABLE IF NOT EXISTS is idempotent).

    The table is a singleton: the CHECK constraint on ``id`` only permits the
    row with ``id = 1``.

    Args:
        db: The SQLite database connection.
    """
    # SQLite does not accept bound parameters inside a column DEFAULT clause,
    # so the default has to be part of the statement text. int() guarantees
    # that only a numeric literal is ever interpolated into the SQL.
    default_timeout = int(SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS)

    await db.execute(
        f"""
        CREATE TABLE IF NOT EXISTS scheduler_lock (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            pid INTEGER NOT NULL,
            hostname TEXT NOT NULL,
            created_at REAL NOT NULL,
            heartbeat_at REAL NOT NULL,
            heartbeat_timeout REAL NOT NULL DEFAULT {default_timeout}
        );
        """
    )
    await db.commit()
|
|
|
|
|
|
async def is_lock_stale(heartbeat_at: float, timeout: float, now: float) -> bool:
    """Report whether a lock's last heartbeat is older than its timeout.

    Args:
        heartbeat_at: Timestamp of the most recent heartbeat on the lock record.
        timeout: Number of seconds a heartbeat remains valid.
        now: Timestamp to evaluate against.

    Returns:
        True when strictly more than ``timeout`` seconds separate ``now`` from
        ``heartbeat_at``; False otherwise (including the exact boundary).
    """
    seconds_since_heartbeat = now - heartbeat_at
    return seconds_since_heartbeat > timeout
|
|
|
|
|
|
async def _rollback_scheduler_lock_txn(db: aiosqlite.Connection) -> None:
    """Roll back the lock-acquisition transaction without raising."""
    try:
        await db.rollback()
    except Exception as rollback_error:
        # Best effort: the caller is already on a failure path and must not
        # have its original outcome replaced by a rollback error.
        log.warning("scheduler_lock_rollback_failed", error=str(rollback_error))


async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
    """Try to acquire the scheduler lock.

    Runs inside a BEGIN IMMEDIATE transaction so that the read of the current
    lock row and the subsequent write cannot interleave with another writer:

    - If no lock exists: the row is inserted, lock acquired
    - If the lock is already held by this process (same pid AND hostname):
      the row is rewritten, lock refreshed
    - If the lock is stale (heartbeat timeout exceeded): the row is
      overwritten, lock stolen
    - If a valid lock is held by another process: nothing is written

    The holder is identified by pid together with hostname. A pid alone is not
    unique across hosts or containers, so comparing only the pid would let an
    instance on another host take over a live lock.

    Args:
        db: The SQLite database connection.

    Returns:
        True if the lock was successfully acquired, False if held by another instance.

    Raises:
        RuntimeError: If database operations fail.
    """
    now = time.time()
    pid = os.getpid()
    hostname = socket.gethostname()
    # Tracks whether *this call* opened a transaction, so that the error paths
    # never roll back work they did not start (e.g. when BEGIN itself fails).
    in_transaction = False

    try:
        await db.execute("BEGIN IMMEDIATE")
        in_transaction = True

        cursor = await db.execute(
            "SELECT pid, hostname, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
        )
        row = await cursor.fetchone()

        if row is not None:
            lock_pid, lock_hostname, lock_heartbeat, lock_timeout = row
            held_by_us = lock_pid == pid and lock_hostname == hostname
            heartbeat_age = now - lock_heartbeat

            if not held_by_us and heartbeat_age <= lock_timeout:
                # Another process holds a valid lock - cannot acquire
                await db.rollback()
                in_transaction = False
                log.warning(
                    "scheduler_lock_held_by_other_instance",
                    our_pid=pid,
                    lock_pid=lock_pid,
                    lock_hostname=lock_hostname,
                    lock_heartbeat_age_seconds=heartbeat_age,
                )
                return False
            # Otherwise the row is ours (refresh) or stale (steal); either way
            # it is overwritten below.

        await db.execute(
            """
            INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
            VALUES (1, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                pid = excluded.pid,
                hostname = excluded.hostname,
                created_at = excluded.created_at,
                heartbeat_at = excluded.heartbeat_at,
                heartbeat_timeout = excluded.heartbeat_timeout
            """,
            (pid, hostname, now, now, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
        )
        await db.commit()
        in_transaction = False

        log.info(
            "scheduler_lock_acquired",
            pid=pid,
            hostname=hostname,
        )
        return True

    except aiosqlite.IntegrityError:
        # The write violated a table constraint. Treat this as "lock not
        # acquired" and report the current holder if it can be read.
        try:
            cursor = await db.execute(
                "SELECT pid, hostname, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
            )
            row = await cursor.fetchone()
            if row:
                lock_pid, lock_hostname, lock_heartbeat, lock_timeout = row
                heartbeat_age = now - lock_heartbeat
                log.warning(
                    "scheduler_lock_held_by_other_instance",
                    our_pid=pid,
                    lock_pid=lock_pid,
                    lock_hostname=lock_hostname,
                    heartbeat_age_seconds=heartbeat_age,
                    heartbeat_timeout=lock_timeout,
                )
        except Exception as e:
            log.warning("scheduler_lock_held_but_could_not_read_holder", error=str(e))
        finally:
            # Release the write lock taken by BEGIN IMMEDIATE; without this the
            # open transaction keeps blocking other writers.
            if in_transaction:
                await _rollback_scheduler_lock_txn(db)
        return False

    except Exception as e:
        if in_transaction:
            await _rollback_scheduler_lock_txn(db)
        raise RuntimeError(f"Failed to acquire scheduler lock due to database error: {e}") from e
|
|
|
|
|
|
async def release_scheduler_lock(db: aiosqlite.Connection) -> None:
    """Release the scheduler lock.

    This function should be called during application shutdown. It removes the
    lock record, allowing other instances to acquire it.

    The row is deleted only when both its pid and hostname match this process.
    A pid alone is not unique across hosts or containers, so matching on pid
    only could delete a lock that is held by an instance on another host.

    Args:
        db: The SQLite database connection.

    Raises:
        RuntimeError: If database operations fail.
    """
    pid = os.getpid()
    hostname = socket.gethostname()

    try:
        cursor = await db.execute(
            "DELETE FROM scheduler_lock WHERE id = 1 AND pid = ? AND hostname = ?",
            (pid, hostname),
        )
        await db.commit()

        if cursor.rowcount == 0:
            # This shouldn't happen in normal operation, but log it for visibility
            log.warning(
                "scheduler_lock_release_mismatch",
                our_pid=pid,
                message="Tried to release lock but we don't hold it. Another instance may have replaced us.",
            )
        else:
            log.info("scheduler_lock_released", pid=pid)

    except Exception as e:
        raise RuntimeError(f"Failed to release scheduler lock: {e}") from e
|
|
|
|
|
|
async def update_scheduler_lock_heartbeat(db: aiosqlite.Connection) -> bool:
    """Update the heartbeat timestamp to keep the lock alive.

    This function should be called periodically (every
    SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS seconds) to prevent the lock
    from being considered stale. It only succeeds if this process still holds
    the lock.

    Ownership is checked against both pid and hostname. A pid alone is not
    unique across hosts or containers, so matching on pid only could refresh a
    lock held by another instance and wrongly report that we still own it.

    Error handling: If the heartbeat update fails due to a database error, this
    function returns False (indicating lock loss) rather than raising an exception.
    This prevents the scheduler from crashing due to transient database issues,
    allowing the running application to continue and potentially recover the lock
    if it still holds it.

    Args:
        db: The SQLite database connection.

    Returns:
        True if the heartbeat was updated (we still hold the lock), False if
        we no longer hold the lock or a database error occurred.
    """
    now = time.time()
    pid = os.getpid()
    hostname = socket.gethostname()

    try:
        cursor = await db.execute(
            "UPDATE scheduler_lock SET heartbeat_at = ? WHERE id = 1 AND pid = ? AND hostname = ?",
            (now, pid, hostname),
        )
        await db.commit()

        if cursor.rowcount == 0:
            # No row matched: the lock row is missing or belongs to someone else.
            log.warning(
                "scheduler_lock_heartbeat_lost",
                our_pid=pid,
                message="Heartbeat update failed; we no longer hold the lock.",
            )
            return False

        log.debug("scheduler_lock_heartbeat_updated", pid=pid)
        return True

    except Exception as e:
        # Don't crash the scheduler if heartbeat update fails - log and return False
        log.error(
            "scheduler_lock_heartbeat_error",
            our_pid=pid,
            error=str(e),
            message="Heartbeat update failed due to database error. Will retry on next interval.",
        )
        return False
|
|
|
|
|
|
async def get_scheduler_lock_info(db: aiosqlite.Connection) -> dict[str, Any] | None:
    """Fetch the current scheduler lock record, if one exists.

    Intended for debugging and monitoring. Any failure while querying is
    logged as a warning and reported as "no lock" rather than raised.

    Args:
        db: The SQLite database connection.

    Returns:
        A dict with keys: pid, hostname, created_at, heartbeat_at, heartbeat_timeout,
        or None if no lock row exists or the query failed.
    """
    try:
        result = await db.execute(
            "SELECT pid, hostname, created_at, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
        )
        record = await result.fetchone()
        if not record:
            return None

        # Unpacking stays inside the try so an unexpected row shape is handled
        # by the same warning path as a failed query.
        holder_pid, holder_host, acquired_at, last_beat, beat_timeout = record
        return dict(
            pid=holder_pid,
            hostname=holder_host,
            created_at=acquired_at,
            heartbeat_at=last_beat,
            heartbeat_timeout=beat_timeout,
        )
    except Exception as exc:
        log.warning("scheduler_lock_info_query_failed", error=str(exc))
        return None
|
|
|
|
|
|
async def get_lock_health(db: aiosqlite.Connection) -> dict[str, Any]:
    """Summarize the scheduler lock's state for monitoring.

    Combines the stored lock record with the current time to report whether a
    lock exists, how old its heartbeat is, and whether it has gone stale.

    Args:
        db: The SQLite database connection.

    Returns:
        A dict with keys:
        - has_lock: bool indicating if a lock exists
        - is_stale: bool indicating if lock is stale (heartbeat timeout exceeded)
        - pid: int or None
        - hostname: str or None
        - heartbeat_age_seconds: float or None (time since last heartbeat)
        - created_at: float or None
        - heartbeat_timeout: float or None
        - stale_reason: str or None (why lock is considered stale)
    """
    lock = await get_scheduler_lock_info(db)
    current_time = time.time()

    if lock is None:
        # No lock row: every descriptive field is reported as None.
        empty_fields = dict.fromkeys(
            (
                "pid",
                "hostname",
                "heartbeat_age_seconds",
                "created_at",
                "heartbeat_timeout",
                "stale_reason",
            )
        )
        return {"has_lock": False, "is_stale": False, **empty_fields}

    last_beat = lock["heartbeat_at"]
    timeout = lock["heartbeat_timeout"]
    age = current_time - last_beat
    stale = await is_lock_stale(last_beat, timeout, current_time)

    reason: str | None = (
        f"heartbeat_age ({age:.1f}s) > timeout ({timeout:.1f}s)" if stale else None
    )

    health: dict[str, Any] = {}
    health["has_lock"] = True
    health["is_stale"] = stale
    health["pid"] = lock["pid"]
    health["hostname"] = lock["hostname"]
    health["heartbeat_age_seconds"] = age
    health["created_at"] = lock["created_at"]
    health["heartbeat_timeout"] = timeout
    health["stale_reason"] = reason
    return health
|