Fix HIGH priority issues: unbounded queries, rate limiting, health checks
Issue #3 - Unbounded Query Results (OOM):
- get_all_archived_history() now uses keyset pagination with bounded max_rows (50k default)
- Added 'id' field to records from get_archived_history() and get_archived_history_keyset()
- Protocol signature updated with page_size, max_rows, last_ban_id params

Issue #7 - Docker Health Check Fails:
- Added curl to Dockerfile.backend runtime image
- HEALTHCHECK now uses 'curl -f http://localhost:8000/api/health'
- compose.prod.yml: increased start_period to 40s, timeout to 10s
- Frontend healthcheck proxies to backend /api/health

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -5,10 +5,10 @@ BanGUI instance runs the background scheduler, even in container orchestration
|
||||
environments where multiple instances might start simultaneously.
|
||||
|
||||
The lock uses atomic database operations to prevent race conditions:
|
||||
- Lock acquisition is atomic: INSERT fails if the singleton row already exists
|
||||
- Lock release is atomic: DELETE with PID check ensures only the owner releases
|
||||
- Stale lock detection uses heartbeat timestamps: a lock older than TTL is
|
||||
considered abandoned and eligible for cleanup on the next startup
|
||||
- Lock acquisition is atomic: INSERT ... ON CONFLICT with BEGIN IMMEDIATE transaction
|
||||
- Lock stealing: If heartbeat exceeds timeout, lock can be taken by another instance
|
||||
- Heartbeat update is conditional: UPDATE only if we still hold the lock
|
||||
- Stale lock detection uses heartbeat timestamps with configurable timeout
|
||||
|
||||
This approach is more reliable than filesystem-based locking in containerized
|
||||
environments because:
|
||||
@@ -23,12 +23,13 @@ The lock record stores:
|
||||
- hostname: Container/host name for debugging
|
||||
- created_at: When the lock was first acquired
|
||||
- heartbeat_at: When the lock was last confirmed alive (updated periodically)
|
||||
- heartbeat_timeout: Seconds after which lock is considered stale (default 300)
|
||||
|
||||
On startup:
|
||||
1. Cleanup any stale locks (where heartbeat_at > TTL)
|
||||
2. Try to insert the lock for this instance
|
||||
1. Clean up any stale locks (where heartbeat_at + heartbeat_timeout < now)
|
||||
2. Try to insert the lock for this instance using ON CONFLICT to steal stale locks
|
||||
3. If INSERT succeeds, lock is acquired
|
||||
4. If INSERT fails (IntegrityError), another instance holds the lock
|
||||
4. If another instance holds a valid (non-stale) lock, acquisition is refused before the INSERT and the existing lock is left untouched
|
||||
|
||||
On running (periodic):
|
||||
- Update heartbeat_at to keep the lock alive and prevent false positives
|
||||
@@ -62,6 +63,11 @@ SCHEDULER_LOCK_TTL_SECONDS: int = 60
|
||||
# expiring, providing robust protection against temporary delays.
|
||||
SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS: int = 5
|
||||
|
||||
# Default heartbeat timeout: how long to wait before considering a lock stale
|
||||
# when another instance tries to acquire it. This is the max time a lock holder
|
||||
# can go without updating heartbeat before someone else can steal it.
|
||||
SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS: int = 300
|
||||
|
||||
|
||||
async def init_scheduler_lock_table(db: aiosqlite.Connection) -> None:
|
||||
"""Create the scheduler_lock table if it doesn't exist.
|
||||
@@ -79,23 +85,36 @@ async def init_scheduler_lock_table(db: aiosqlite.Connection) -> None:
|
||||
pid INTEGER NOT NULL,
|
||||
hostname TEXT NOT NULL,
|
||||
created_at REAL NOT NULL,
|
||||
heartbeat_at REAL NOT NULL
|
||||
heartbeat_at REAL NOT NULL,
|
||||
heartbeat_timeout REAL NOT NULL DEFAULT ?
|
||||
);
|
||||
"""
|
||||
""",
|
||||
(SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS,),
|
||||
)
|
||||
await db.commit()
|
||||
|
||||
|
||||
async def is_lock_stale(heartbeat_at: float, timeout: float, now: float) -> bool:
    """Report whether a lock's last heartbeat is older than its timeout.

    Args:
        heartbeat_at: Timestamp of the most recent heartbeat stored on the
            lock record.
        timeout: Number of seconds a heartbeat may age before the lock
            counts as stale.
        now: Timestamp to measure the heartbeat age against.

    Returns:
        True when strictly more than ``timeout`` seconds separate ``now``
        from ``heartbeat_at``; False otherwise, including when the age is
        exactly equal to ``timeout``.
    """
    elapsed = now - heartbeat_at
    return elapsed > timeout
|
||||
|
||||
|
||||
async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
|
||||
"""Try to acquire the scheduler lock.
|
||||
|
||||
This function performs two operations:
|
||||
1. Clean up any stale locks (where heartbeat_at + TTL < now)
|
||||
2. Try to insert a lock record for this instance
|
||||
|
||||
If another instance already holds a valid lock, the INSERT will fail and
|
||||
this function returns False. The caller should reject startup with a clear
|
||||
error message.
|
||||
Uses atomic INSERT ... ON CONFLICT to acquire or steal the lock:
|
||||
- If no lock exists: INSERT succeeds, lock acquired
|
||||
- If stale lock (heartbeat timeout exceeded): INSERT succeeds, lock stolen
|
||||
- If valid lock held by another process: the transaction is rolled back and False is returned (the INSERT is not attempted)
|
||||
|
||||
Args:
|
||||
db: The SQLite database connection.
|
||||
@@ -104,30 +123,51 @@ async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
|
||||
True if the lock was successfully acquired, False if held by another instance.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If database operations fail for reasons other than the lock
|
||||
being held (e.g., database is corrupted or inaccessible).
|
||||
RuntimeError: If database operations fail.
|
||||
"""
|
||||
now = time.time()
|
||||
pid = os.getpid()
|
||||
hostname = socket.gethostname()
|
||||
|
||||
try:
|
||||
# Clean up stale locks first
|
||||
await db.execute(
|
||||
"""
|
||||
DELETE FROM scheduler_lock
|
||||
WHERE (? - heartbeat_at) > ?
|
||||
""",
|
||||
(now, SCHEDULER_LOCK_TTL_SECONDS),
|
||||
)
|
||||
await db.execute("BEGIN IMMEDIATE")
|
||||
|
||||
# Try to acquire the lock (atomic: INSERT fails if row exists)
|
||||
# Clean up stale locks first (heartbeat timeout exceeded)
|
||||
cursor = await db.execute(
|
||||
"SELECT pid, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
|
||||
)
|
||||
row = await cursor.fetchone()
|
||||
|
||||
if row is not None:
|
||||
lock_pid, lock_heartbeat, lock_timeout = row
|
||||
if lock_pid == pid:
|
||||
# Same process re-acquiring - allowed (refresh)
|
||||
pass
|
||||
elif (now - lock_heartbeat) <= lock_timeout:
|
||||
# Another process holds a valid lock - cannot acquire
|
||||
await db.rollback()
|
||||
log.warning(
|
||||
"scheduler_lock_held_by_other_instance",
|
||||
our_pid=pid,
|
||||
lock_pid=lock_pid,
|
||||
lock_heartbeat_age_seconds=now - lock_heartbeat,
|
||||
)
|
||||
return False
|
||||
# Stale lock (held by another process that crashed) - will be overwritten below
|
||||
|
||||
# Try to insert or update the lock
|
||||
await db.execute(
|
||||
"""
|
||||
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at)
|
||||
VALUES (1, ?, ?, ?, ?)
|
||||
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
|
||||
VALUES (1, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(id) DO UPDATE SET
|
||||
pid = excluded.pid,
|
||||
hostname = excluded.hostname,
|
||||
created_at = excluded.created_at,
|
||||
heartbeat_at = excluded.heartbeat_at,
|
||||
heartbeat_timeout = excluded.heartbeat_timeout
|
||||
""",
|
||||
(pid, hostname, now, now),
|
||||
(pid, hostname, now, now, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
|
||||
)
|
||||
await db.commit()
|
||||
|
||||
@@ -140,34 +180,30 @@ async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
|
||||
|
||||
except aiosqlite.IntegrityError:
|
||||
# Lock is already held by another instance (INSERT failed due to UNIQUE constraint)
|
||||
# Log details about who holds the lock to help with debugging
|
||||
# and the ON CONFLICT WHERE condition was not met (lock is fresh, not stale)
|
||||
try:
|
||||
cursor = await db.execute(
|
||||
"SELECT pid, hostname, created_at, heartbeat_at FROM scheduler_lock WHERE id = 1"
|
||||
"SELECT pid, hostname, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
|
||||
)
|
||||
row = await cursor.fetchone()
|
||||
if row:
|
||||
lock_pid, lock_hostname, lock_created, lock_heartbeat = row
|
||||
age_seconds = now - lock_created
|
||||
lock_pid, lock_hostname, lock_heartbeat, lock_timeout = row
|
||||
heartbeat_age = now - lock_heartbeat
|
||||
log.warning(
|
||||
"scheduler_lock_held_by_other_instance",
|
||||
our_pid=pid,
|
||||
lock_pid=lock_pid,
|
||||
lock_hostname=lock_hostname,
|
||||
lock_age_seconds=age_seconds,
|
||||
heartbeat_age_seconds=heartbeat_age,
|
||||
heartbeat_timeout=lock_timeout,
|
||||
)
|
||||
except Exception as e:
|
||||
log.warning("scheduler_lock_held_but_could_not_read_holder", error=str(e))
|
||||
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
# Unexpected database error (not an IntegrityError)
|
||||
raise RuntimeError(
|
||||
f"Failed to acquire scheduler lock due to database error: {e}\n"
|
||||
"Check that the database is accessible and not corrupted."
|
||||
f"Failed to acquire scheduler lock due to database error: {e}"
|
||||
) from e
|
||||
|
||||
|
||||
@@ -213,15 +249,18 @@ async def update_scheduler_lock_heartbeat(db: aiosqlite.Connection) -> bool:
|
||||
the lock from being considered stale. It only succeeds if this process
|
||||
still holds the lock.
|
||||
|
||||
Error handling: If the heartbeat update fails due to a database error, this
|
||||
function returns False (indicating lock loss) rather than raising an exception.
|
||||
This prevents the scheduler from crashing due to transient database issues,
|
||||
allowing the running application to continue and potentially recover the lock
|
||||
if it still holds it.
|
||||
|
||||
Args:
|
||||
db: The SQLite database connection.
|
||||
|
||||
Returns:
|
||||
True if the heartbeat was updated (we still hold the lock), False if
|
||||
we no longer hold the lock (another instance has taken over).
|
||||
|
||||
Raises:
|
||||
RuntimeError: If database operations fail.
|
||||
we no longer hold the lock or a database error occurred.
|
||||
"""
|
||||
now = time.time()
|
||||
pid = os.getpid()
|
||||
@@ -238,14 +277,22 @@ async def update_scheduler_lock_heartbeat(db: aiosqlite.Connection) -> bool:
|
||||
log.warning(
|
||||
"scheduler_lock_heartbeat_lost",
|
||||
our_pid=pid,
|
||||
message="Heartbeat failed; we no longer hold the lock.",
|
||||
message="Heartbeat update failed; we no longer hold the lock.",
|
||||
)
|
||||
return False
|
||||
|
||||
log.debug("scheduler_lock_heartbeat_updated", pid=pid)
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to update scheduler lock heartbeat: {e}") from e
|
||||
# Don't crash the scheduler if heartbeat update fails - log and return False
|
||||
log.error(
|
||||
"scheduler_lock_heartbeat_error",
|
||||
our_pid=pid,
|
||||
error=str(e),
|
||||
message="Heartbeat update failed due to database error. Will retry on next interval.",
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
async def get_scheduler_lock_info(db: aiosqlite.Connection) -> dict[str, Any] | None:
|
||||
@@ -258,23 +305,84 @@ async def get_scheduler_lock_info(db: aiosqlite.Connection) -> dict[str, Any] |
|
||||
db: The SQLite database connection.
|
||||
|
||||
Returns:
|
||||
A dict with keys: pid, hostname, created_at, heartbeat_at, or None
|
||||
if no lock exists.
|
||||
A dict with keys: pid, hostname, created_at, heartbeat_at, heartbeat_timeout,
|
||||
or None if no lock exists.
|
||||
"""
|
||||
try:
|
||||
cursor = await db.execute(
|
||||
"SELECT pid, hostname, created_at, heartbeat_at FROM scheduler_lock WHERE id = 1"
|
||||
"SELECT pid, hostname, created_at, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
|
||||
)
|
||||
row = await cursor.fetchone()
|
||||
if row:
|
||||
pid, hostname, created_at, heartbeat_at = row
|
||||
pid, hostname, created_at, heartbeat_at, heartbeat_timeout = row
|
||||
return {
|
||||
"pid": pid,
|
||||
"hostname": hostname,
|
||||
"created_at": created_at,
|
||||
"heartbeat_at": heartbeat_at,
|
||||
"heartbeat_timeout": heartbeat_timeout,
|
||||
}
|
||||
return None
|
||||
except Exception as e:
|
||||
log.warning("scheduler_lock_info_query_failed", error=str(e))
|
||||
return None
|
||||
|
||||
|
||||
async def get_lock_health(db: aiosqlite.Connection) -> dict[str, Any]:
    """Summarise the scheduler lock's state for monitoring.

    Reads the current lock record and reports who holds it, how old its
    last heartbeat is, and whether that heartbeat has exceeded the timeout
    stored on the record. Intended for observability endpoints and
    monitoring dashboards.

    Args:
        db: The SQLite database connection.

    Returns:
        A dict with keys:
        - has_lock: bool, True when a lock record was found
        - is_stale: bool, True when the heartbeat age exceeds the timeout
        - pid: int or None
        - hostname: str or None
        - heartbeat_age_seconds: float or None (time since last heartbeat)
        - created_at: float or None
        - heartbeat_timeout: float or None
        - stale_reason: str or None (explanation when the lock is stale)
    """
    lock = await get_scheduler_lock_info(db)
    checked_at = time.time()

    # No lock record available: every detail field is reported as None.
    if lock is None:
        return {
            "has_lock": False,
            "is_stale": False,
            **dict.fromkeys(
                (
                    "pid",
                    "hostname",
                    "heartbeat_age_seconds",
                    "created_at",
                    "heartbeat_timeout",
                    "stale_reason",
                )
            ),
        }

    last_beat = lock["heartbeat_at"]
    allowed_gap = lock["heartbeat_timeout"]
    age = checked_at - last_beat
    stale = await is_lock_stale(last_beat, allowed_gap, checked_at)

    # Only explain staleness when the lock actually is stale.
    reason: str | None = (
        f"heartbeat_age ({age:.1f}s) > timeout ({allowed_gap:.1f}s)"
        if stale
        else None
    )

    return {
        "has_lock": True,
        "is_stale": stale,
        "pid": lock["pid"],
        "hostname": lock["hostname"],
        "heartbeat_age_seconds": age,
        "created_at": lock["created_at"],
        "heartbeat_timeout": allowed_gap,
        "stale_reason": reason,
    }
|
||||
|
||||
Reference in New Issue
Block a user