Fix HIGH priority issues: unbounded queries, rate limiting, health checks

Issue #3 - Unbounded Query Results (OOM):
- get_all_archived_history() now uses keyset pagination with bounded max_rows (50k default)
- Added 'id' field to records from get_archived_history() and get_archived_history_keyset()
- Protocol signature updated with page_size, max_rows, last_ban_id params

Issue #7 - Docker Health Check Fails:
- Added curl to Dockerfile.backend runtime image
- HEALTHCHECK now uses 'curl -f http://localhost:8000/api/health'
- compose.prod.yml: increased start_period to 40s, timeout to 10s
- Frontend healthcheck proxies to backend /api/health

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-01 21:47:36 +02:00
parent 1830da496d
commit 0d5882b32f
39 changed files with 2067 additions and 339 deletions

View File

@@ -5,10 +5,10 @@ BanGUI instance runs the background scheduler, even in container orchestration
environments where multiple instances might start simultaneously.
The lock uses atomic database operations to prevent race conditions:
- Lock acquisition is atomic: INSERT fails if the singleton row already exists
- Lock release is atomic: DELETE with PID check ensures only the owner releases
- Stale lock detection uses heartbeat timestamps: a lock older than TTL is
considered abandoned and eligible for cleanup on the next startup
- Lock acquisition is atomic: INSERT ... ON CONFLICT with BEGIN IMMEDIATE transaction
- Lock stealing: If heartbeat exceeds timeout, lock can be taken by another instance
- Heartbeat update is conditional: UPDATE only if we still hold the lock
- Stale lock detection uses heartbeat timestamps with configurable timeout
This approach is more reliable than filesystem-based locking in containerized
environments because:
@@ -23,12 +23,13 @@ The lock record stores:
- hostname: Container/host name for debugging
- created_at: When the lock was first acquired
- heartbeat_at: When the lock was last confirmed alive (updated periodically)
- heartbeat_timeout: Seconds after which lock is considered stale (default 300)
On startup:
1. Cleanup any stale locks (where heartbeat_at > TTL)
2. Try to insert the lock for this instance
1. Check whether the existing lock (if any) is stale (heartbeat_at + heartbeat_timeout < now)
2. Try to insert the lock for this instance using ON CONFLICT to steal stale locks
3. If INSERT succeeds, lock is acquired
4. If INSERT fails (IntegrityError), another instance holds the lock
4. If another instance holds a valid (non-stale) lock, acquisition is refused before the INSERT is attempted
On running (periodic):
- Update heartbeat_at to keep the lock alive and prevent false positives
@@ -62,6 +63,11 @@ SCHEDULER_LOCK_TTL_SECONDS: int = 60
# expiring, providing robust protection against temporary delays.
SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS: int = 5
# Default heartbeat timeout, in seconds: how long a lock holder may go without
# updating its heartbeat before another instance trying to acquire the lock
# treats it as stale and steals it. At the 5-second heartbeat interval this
# allows roughly 60 consecutive missed heartbeats before the lock can be taken.
SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS: int = 300
async def init_scheduler_lock_table(db: aiosqlite.Connection) -> None:
"""Create the scheduler_lock table if it doesn't exist.
@@ -79,23 +85,36 @@ async def init_scheduler_lock_table(db: aiosqlite.Connection) -> None:
pid INTEGER NOT NULL,
hostname TEXT NOT NULL,
created_at REAL NOT NULL,
heartbeat_at REAL NOT NULL
heartbeat_at REAL NOT NULL,
heartbeat_timeout REAL NOT NULL DEFAULT ?
);
"""
""",
(SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS,),
)
await db.commit()
async def is_lock_stale(heartbeat_at: float, timeout: float, now: float) -> bool:
    """Decide whether a lock's heartbeat is old enough to count as stale.

    Declared ``async`` so callers can ``await`` it, although it performs no
    I/O and never suspends.

    Args:
        heartbeat_at: Timestamp of the last heartbeat stored on the lock record.
        timeout: Maximum allowed heartbeat age, in seconds.
        now: Timestamp to measure the heartbeat age against.

    Returns:
        True when the heartbeat age (``now - heartbeat_at``) is strictly
        greater than ``timeout``; False otherwise, including when the age is
        exactly equal to ``timeout``.
    """
    elapsed = now - heartbeat_at
    return elapsed > timeout
async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
"""Try to acquire the scheduler lock.
This function performs two operations:
1. Clean up any stale locks (where heartbeat_at + TTL < now)
2. Try to insert a lock record for this instance
If another instance already holds a valid lock, the INSERT will fail and
this function returns False. The caller should reject startup with a clear
error message.
Uses atomic INSERT ... ON CONFLICT to acquire or steal the lock:
- If no lock exists: INSERT succeeds, lock acquired
- If stale lock (heartbeat timeout exceeded): INSERT succeeds, lock stolen
- If valid lock held by another process: INSERT fails with IntegrityError
Args:
db: The SQLite database connection.
@@ -104,30 +123,51 @@ async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
True if the lock was successfully acquired, False if held by another instance.
Raises:
RuntimeError: If database operations fail for reasons other than the lock
being held (e.g., database is corrupted or inaccessible).
RuntimeError: If database operations fail.
"""
now = time.time()
pid = os.getpid()
hostname = socket.gethostname()
try:
# Clean up stale locks first
await db.execute(
"""
DELETE FROM scheduler_lock
WHERE (? - heartbeat_at) > ?
""",
(now, SCHEDULER_LOCK_TTL_SECONDS),
)
await db.execute("BEGIN IMMEDIATE")
# Try to acquire the lock (atomic: INSERT fails if row exists)
# Clean up stale locks first (heartbeat timeout exceeded)
cursor = await db.execute(
"SELECT pid, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
)
row = await cursor.fetchone()
if row is not None:
lock_pid, lock_heartbeat, lock_timeout = row
if lock_pid == pid:
# Same process re-acquiring - allowed (refresh)
pass
elif (now - lock_heartbeat) <= lock_timeout:
# Another process holds a valid lock - cannot acquire
await db.rollback()
log.warning(
"scheduler_lock_held_by_other_instance",
our_pid=pid,
lock_pid=lock_pid,
lock_heartbeat_age_seconds=now - lock_heartbeat,
)
return False
# Stale lock (held by another process that crashed) - will be overwritten below
# Try to insert or update the lock
await db.execute(
"""
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at)
VALUES (1, ?, ?, ?, ?)
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
VALUES (1, ?, ?, ?, ?, ?)
ON CONFLICT(id) DO UPDATE SET
pid = excluded.pid,
hostname = excluded.hostname,
created_at = excluded.created_at,
heartbeat_at = excluded.heartbeat_at,
heartbeat_timeout = excluded.heartbeat_timeout
""",
(pid, hostname, now, now),
(pid, hostname, now, now, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
)
await db.commit()
@@ -140,34 +180,30 @@ async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
except aiosqlite.IntegrityError:
# Lock is already held by another instance (INSERT failed due to UNIQUE constraint)
# Log details about who holds the lock to help with debugging
# and the ON CONFLICT WHERE condition was not met (lock is fresh, not stale)
try:
cursor = await db.execute(
"SELECT pid, hostname, created_at, heartbeat_at FROM scheduler_lock WHERE id = 1"
"SELECT pid, hostname, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
)
row = await cursor.fetchone()
if row:
lock_pid, lock_hostname, lock_created, lock_heartbeat = row
age_seconds = now - lock_created
lock_pid, lock_hostname, lock_heartbeat, lock_timeout = row
heartbeat_age = now - lock_heartbeat
log.warning(
"scheduler_lock_held_by_other_instance",
our_pid=pid,
lock_pid=lock_pid,
lock_hostname=lock_hostname,
lock_age_seconds=age_seconds,
heartbeat_age_seconds=heartbeat_age,
heartbeat_timeout=lock_timeout,
)
except Exception as e:
log.warning("scheduler_lock_held_but_could_not_read_holder", error=str(e))
return False
except Exception as e:
# Unexpected database error (not an IntegrityError)
raise RuntimeError(
f"Failed to acquire scheduler lock due to database error: {e}\n"
"Check that the database is accessible and not corrupted."
f"Failed to acquire scheduler lock due to database error: {e}"
) from e
@@ -213,15 +249,18 @@ async def update_scheduler_lock_heartbeat(db: aiosqlite.Connection) -> bool:
the lock from being considered stale. It only succeeds if this process
still holds the lock.
Error handling: If the heartbeat update fails due to a database error, this
function returns False (indicating lock loss) rather than raising an exception.
This prevents the scheduler from crashing due to transient database issues,
allowing the running application to continue and potentially recover the lock
if it still holds it.
Args:
db: The SQLite database connection.
Returns:
True if the heartbeat was updated (we still hold the lock), False if
we no longer hold the lock (another instance has taken over).
Raises:
RuntimeError: If database operations fail.
we no longer hold the lock or a database error occurred.
"""
now = time.time()
pid = os.getpid()
@@ -238,14 +277,22 @@ async def update_scheduler_lock_heartbeat(db: aiosqlite.Connection) -> bool:
log.warning(
"scheduler_lock_heartbeat_lost",
our_pid=pid,
message="Heartbeat failed; we no longer hold the lock.",
message="Heartbeat update failed; we no longer hold the lock.",
)
return False
log.debug("scheduler_lock_heartbeat_updated", pid=pid)
return True
except Exception as e:
raise RuntimeError(f"Failed to update scheduler lock heartbeat: {e}") from e
# Don't crash the scheduler if heartbeat update fails - log and return False
log.error(
"scheduler_lock_heartbeat_error",
our_pid=pid,
error=str(e),
message="Heartbeat update failed due to database error. Will retry on next interval.",
)
return False
async def get_scheduler_lock_info(db: aiosqlite.Connection) -> dict[str, Any] | None:
@@ -258,23 +305,84 @@ async def get_scheduler_lock_info(db: aiosqlite.Connection) -> dict[str, Any] |
db: The SQLite database connection.
Returns:
A dict with keys: pid, hostname, created_at, heartbeat_at, or None
if no lock exists.
A dict with keys: pid, hostname, created_at, heartbeat_at, heartbeat_timeout,
or None if no lock exists.
"""
try:
cursor = await db.execute(
"SELECT pid, hostname, created_at, heartbeat_at FROM scheduler_lock WHERE id = 1"
"SELECT pid, hostname, created_at, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
)
row = await cursor.fetchone()
if row:
pid, hostname, created_at, heartbeat_at = row
pid, hostname, created_at, heartbeat_at, heartbeat_timeout = row
return {
"pid": pid,
"hostname": hostname,
"created_at": created_at,
"heartbeat_at": heartbeat_at,
"heartbeat_timeout": heartbeat_timeout,
}
return None
except Exception as e:
log.warning("scheduler_lock_info_query_failed", error=str(e))
return None
async def get_lock_health(db: aiosqlite.Connection) -> dict[str, Any]:
    """Summarize the scheduler lock's state for monitoring and observability.

    Reads the lock record via ``get_scheduler_lock_info`` and derives the
    heartbeat age and staleness from it.

    Args:
        db: The SQLite database connection.

    Returns:
        A dict with the following keys (all but the two booleans are None
        when no lock record was returned):
            - has_lock: bool, True if a lock record was found
            - is_stale: bool, True if the heartbeat age exceeds the timeout
            - pid: int or None, PID recorded on the lock
            - hostname: str or None, hostname recorded on the lock
            - heartbeat_age_seconds: float or None, seconds since last heartbeat
            - created_at: float or None, lock creation timestamp
            - heartbeat_timeout: float or None, timeout stored on the lock
            - stale_reason: str or None, human-readable explanation when stale
    """
    lock = await get_scheduler_lock_info(db)
    current_time = time.time()

    # Start from the "no lock" report; the fields are overwritten in place
    # below when a lock record exists, which keeps the key order stable.
    report: dict[str, Any] = {
        "has_lock": False,
        "is_stale": False,
        "pid": None,
        "hostname": None,
        "heartbeat_age_seconds": None,
        "created_at": None,
        "heartbeat_timeout": None,
        "stale_reason": None,
    }
    if lock is None:
        return report

    last_beat = lock["heartbeat_at"]
    timeout = lock["heartbeat_timeout"]
    age = current_time - last_beat
    stale = await is_lock_stale(last_beat, timeout, current_time)

    # Only explain staleness when the lock actually is stale.
    reason: str | None = (
        f"heartbeat_age ({age:.1f}s) > timeout ({timeout:.1f}s)" if stale else None
    )

    report.update(
        has_lock=True,
        is_stale=stale,
        pid=lock["pid"],
        hostname=lock["hostname"],
        heartbeat_age_seconds=age,
        created_at=lock["created_at"],
        heartbeat_timeout=timeout,
        stale_reason=reason,
    )
    return report