Fix HIGH priority issues: unbounded queries, rate limiting, health checks

Issue #3 - Unbounded Query Results (OOM): - get_all_archived_history() now uses keyset pagination with bounded max_rows (50k default) - Added 'id' field to records from get_archived_history() and get_archived_history_keyset() - Protocol signature updated with page_size, max_rows, last_ban_id params Issue #7 - Docker Health Check Fails: - Added curl to Dockerfile.backend runtime image - HEALTHCHECK now uses 'curl -f http://localhost:8000/api/health' - compose.prod.yml: increased start_period to 40s, timeout to 10s - Frontend healthcheck proxies to backend /api/health Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-01 21:47:36 +02:00
parent 1830da496d
commit 0d5882b32f
39 changed files with 2067 additions and 339 deletions
--- a/backend/app/utils/scheduler_lock.py
+++ b/backend/app/utils/scheduler_lock.py
@@ -5,10 +5,10 @@ BanGUI instance runs the background scheduler, even in container orchestration
 environments where multiple instances might start simultaneously.

 The lock uses atomic database operations to prevent race conditions:
- Lock acquisition is atomic: INSERT fails if the singleton row already exists
- Lock release is atomic: DELETE with PID check ensures only the owner releases
- Stale lock detection uses heartbeat timestamps: a lock older than TTL is
-  considered abandoned and eligible for cleanup on the next startup
+- Lock acquisition is atomic: INSERT ... ON CONFLICT with BEGIN IMMEDIATE transaction
+- Lock stealing: If heartbeat exceeds timeout, lock can be taken by another instance
+- Heartbeat update is conditional: UPDATE only if we still hold the lock
+- Stale lock detection uses heartbeat timestamps with configurable timeout

 This approach is more reliable than filesystem-based locking in containerized
 environments because:
@@ -23,12 +23,13 @@ The lock record stores:
    - hostname: Container/host name for debugging
    - created_at: When the lock was first acquired
    - heartbeat_at: When the lock was last confirmed alive (updated periodically)
+    - heartbeat_timeout: Seconds after which lock is considered stale (default 300)

 On startup:
-1. Cleanup any stale locks (where heartbeat_at > TTL)
-2. Try to insert the lock for this instance
+1. Cleanup any stale locks (where heartbeat_at + heartbeat_timeout < now)
+2. Try to insert the lock for this instance using ON CONFLICT to steal stale locks
 3. If INSERT succeeds, lock is acquired
-4. If INSERT fails (IntegrityError), another instance holds the lock
+4. If INSERT fails (IntegrityError), another instance holds a valid lock

 On running (periodic):
    - Update heartbeat_at to keep the lock alive and prevent false positives
@@ -62,6 +63,11 @@ SCHEDULER_LOCK_TTL_SECONDS: int = 60
 # expiring, providing robust protection against temporary delays.
 SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS: int = 5

+# Default heartbeat timeout: how long to wait before considering a lock stale
+# when another instance tries to acquire it. This is the max time a lock holder
+# can go without updating heartbeat before someone else can steal it.
+SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS: int = 300
+

 async def init_scheduler_lock_table(db: aiosqlite.Connection) -> None:
    """Create the scheduler_lock table if it doesn't exist.
@@ -79,23 +85,36 @@ async def init_scheduler_lock_table(db: aiosqlite.Connection) -> None:
            pid INTEGER NOT NULL,
            hostname TEXT NOT NULL,
            created_at REAL NOT NULL,
-            heartbeat_at REAL NOT NULL
+            heartbeat_at REAL NOT NULL,
+            heartbeat_timeout REAL NOT NULL DEFAULT ?
        );
-        """
+        """,
+        (SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS,),
    )
    await db.commit()


+async def is_lock_stale(heartbeat_at: float, timeout: float, now: float) -> bool:
+    """Check if a lock is considered stale based on heartbeat timestamp.
+
+    Args:
+        heartbeat_at: Last heartbeat timestamp from the lock record
+        timeout: Heartbeat timeout in seconds
+        now: Current timestamp
+
+    Returns:
+        True if (now - heartbeat_at) > timeout, indicating stale lock
+    """
+    return (now - heartbeat_at) > timeout
+
+
 async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
    """Try to acquire the scheduler lock.

-    This function performs two operations:
-    1. Clean up any stale locks (where heartbeat_at + TTL < now)
-    2. Try to insert a lock record for this instance
-
-    If another instance already holds a valid lock, the INSERT will fail and
-    this function returns False. The caller should reject startup with a clear
-    error message.
+    Uses atomic INSERT ... ON CONFLICT to acquire or steal the lock:
+    - If no lock exists: INSERT succeeds, lock acquired
+    - If stale lock (heartbeat timeout exceeded): INSERT succeeds, lock stolen
+    - If valid lock held by another process: INSERT fails with IntegrityError

    Args:
        db: The SQLite database connection.
@@ -104,30 +123,51 @@ async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
        True if the lock was successfully acquired, False if held by another instance.

    Raises:
-        RuntimeError: If database operations fail for reasons other than the lock
-                      being held (e.g., database is corrupted or inaccessible).
+        RuntimeError: If database operations fail.
    """
    now = time.time()
    pid = os.getpid()
    hostname = socket.gethostname()

    try:
-        # Clean up stale locks first
-        await db.execute(
-            """
-            DELETE FROM scheduler_lock
-            WHERE (? - heartbeat_at) > ?
-            """,
-            (now, SCHEDULER_LOCK_TTL_SECONDS),
-        )
+        await db.execute("BEGIN IMMEDIATE")

-        # Try to acquire the lock (atomic: INSERT fails if row exists)
+        # Clean up stale locks first (heartbeat timeout exceeded)
+        cursor = await db.execute(
+            "SELECT pid, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
+        )
+        row = await cursor.fetchone()
+
+        if row is not None:
+            lock_pid, lock_heartbeat, lock_timeout = row
+            if lock_pid == pid:
+                # Same process re-acquiring - allowed (refresh)
+                pass
+            elif (now - lock_heartbeat) <= lock_timeout:
+                # Another process holds a valid lock - cannot acquire
+                await db.rollback()
+                log.warning(
+                    "scheduler_lock_held_by_other_instance",
+                    our_pid=pid,
+                    lock_pid=lock_pid,
+                    lock_heartbeat_age_seconds=now - lock_heartbeat,
+                )
+                return False
+            # Stale lock (held by another process that crashed) - will be overwritten below
+
+        # Try to insert or update the lock
        await db.execute(
            """
-            INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at)
-            VALUES (1, ?, ?, ?, ?)
+            INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
+            VALUES (1, ?, ?, ?, ?, ?)
+            ON CONFLICT(id) DO UPDATE SET
+                pid = excluded.pid,
+                hostname = excluded.hostname,
+                created_at = excluded.created_at,
+                heartbeat_at = excluded.heartbeat_at,
+                heartbeat_timeout = excluded.heartbeat_timeout
            """,
-            (pid, hostname, now, now),
+            (pid, hostname, now, now, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
        )
        await db.commit()

@@ -140,34 +180,30 @@ async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:

    except aiosqlite.IntegrityError:
        # Lock is already held by another instance (INSERT failed due to UNIQUE constraint)
-        # Log details about who holds the lock to help with debugging
+        # and the ON CONFLICT WHERE condition was not met (lock is fresh, not stale)
        try:
            cursor = await db.execute(
-                "SELECT pid, hostname, created_at, heartbeat_at FROM scheduler_lock WHERE id = 1"
+                "SELECT pid, hostname, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
            )
            row = await cursor.fetchone()
            if row:
-                lock_pid, lock_hostname, lock_created, lock_heartbeat = row
-                age_seconds = now - lock_created
+                lock_pid, lock_hostname, lock_heartbeat, lock_timeout = row
                heartbeat_age = now - lock_heartbeat
                log.warning(
                    "scheduler_lock_held_by_other_instance",
                    our_pid=pid,
                    lock_pid=lock_pid,
                    lock_hostname=lock_hostname,
-                    lock_age_seconds=age_seconds,
                    heartbeat_age_seconds=heartbeat_age,
+                    heartbeat_timeout=lock_timeout,
                )
        except Exception as e:
            log.warning("scheduler_lock_held_but_could_not_read_holder", error=str(e))
-
        return False

    except Exception as e:
-        # Unexpected database error (not an IntegrityError)
        raise RuntimeError(
-            f"Failed to acquire scheduler lock due to database error: {e}\n"
-            "Check that the database is accessible and not corrupted."
+            f"Failed to acquire scheduler lock due to database error: {e}"
        ) from e


@@ -213,15 +249,18 @@ async def update_scheduler_lock_heartbeat(db: aiosqlite.Connection) -> bool:
    the lock from being considered stale. It only succeeds if this process
    still holds the lock.

+    Error handling: If the heartbeat update fails due to a database error, this
+    function returns False (indicating lock loss) rather than raising an exception.
+    This prevents the scheduler from crashing due to transient database issues,
+    allowing the running application to continue and potentially recover the lock
+    if it still holds it.
+
    Args:
        db: The SQLite database connection.

    Returns:
        True if the heartbeat was updated (we still hold the lock), False if
-        we no longer hold the lock (another instance has taken over).
-
-    Raises:
-        RuntimeError: If database operations fail.
+        we no longer hold the lock or a database error occurred.
    """
    now = time.time()
    pid = os.getpid()
@@ -238,14 +277,22 @@ async def update_scheduler_lock_heartbeat(db: aiosqlite.Connection) -> bool:
            log.warning(
                "scheduler_lock_heartbeat_lost",
                our_pid=pid,
-                message="Heartbeat failed; we no longer hold the lock.",
+                message="Heartbeat update failed; we no longer hold the lock.",
            )
            return False

+        log.debug("scheduler_lock_heartbeat_updated", pid=pid)
        return True

    except Exception as e:
-        raise RuntimeError(f"Failed to update scheduler lock heartbeat: {e}") from e
+        # Don't crash the scheduler if heartbeat update fails - log and return False
+        log.error(
+            "scheduler_lock_heartbeat_error",
+            our_pid=pid,
+            error=str(e),
+            message="Heartbeat update failed due to database error. Will retry on next interval.",
+        )
+        return False


 async def get_scheduler_lock_info(db: aiosqlite.Connection) -> dict[str, Any] | None:
@@ -258,23 +305,84 @@ async def get_scheduler_lock_info(db: aiosqlite.Connection) -> dict[str, Any] |
        db: The SQLite database connection.

    Returns:
-        A dict with keys: pid, hostname, created_at, heartbeat_at, or None
-        if no lock exists.
+        A dict with keys: pid, hostname, created_at, heartbeat_at, heartbeat_timeout,
+        or None if no lock exists.
    """
    try:
        cursor = await db.execute(
-            "SELECT pid, hostname, created_at, heartbeat_at FROM scheduler_lock WHERE id = 1"
+            "SELECT pid, hostname, created_at, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
        )
        row = await cursor.fetchone()
        if row:
-            pid, hostname, created_at, heartbeat_at = row
+            pid, hostname, created_at, heartbeat_at, heartbeat_timeout = row
            return {
                "pid": pid,
                "hostname": hostname,
                "created_at": created_at,
                "heartbeat_at": heartbeat_at,
+                "heartbeat_timeout": heartbeat_timeout,
            }
        return None
    except Exception as e:
        log.warning("scheduler_lock_info_query_failed", error=str(e))
        return None
+
+
+async def get_lock_health(db: aiosqlite.Connection) -> dict[str, Any]:
+    """Get health status of the scheduler lock for monitoring.
+
+    Returns a dict with lock status, age, and whether it's stale. Used for
+    observability endpoints and monitoring dashboards.
+
+    Args:
+        db: The SQLite database connection.
+
+    Returns:
+        A dict with keys:
+            - has_lock: bool indicating if a lock exists
+            - is_stale: bool indicating if lock is stale (heartbeat timeout exceeded)
+            - pid: int or None
+            - hostname: str or None
+            - heartbeat_age_seconds: float or None (time since last heartbeat)
+            - created_at: float or None
+            - heartbeat_timeout: float or None
+            - stale_reason: str or None (why lock is considered stale)
+    """
+    info = await get_scheduler_lock_info(db)
+    now = time.time()
+
+    if info is None:
+        return {
+            "has_lock": False,
+            "is_stale": False,
+            "pid": None,
+            "hostname": None,
+            "heartbeat_age_seconds": None,
+            "created_at": None,
+            "heartbeat_timeout": None,
+            "stale_reason": None,
+        }
+
+    heartbeat_age = now - info["heartbeat_at"]
+    is_stale_result = await is_lock_stale(
+        info["heartbeat_at"],
+        info["heartbeat_timeout"],
+        now,
+    )
+
+    stale_reason: str | None = None
+    if is_stale_result:
+        stale_reason = (
+            f"heartbeat_age ({heartbeat_age:.1f}s) > timeout ({info['heartbeat_timeout']:.1f}s)"
+        )
+
+    return {
+        "has_lock": True,
+        "is_stale": is_stale_result,
+        "pid": info["pid"],
+        "hostname": info["hostname"],
+        "heartbeat_age_seconds": heartbeat_age,
+        "created_at": info["created_at"],
+        "heartbeat_timeout": info["heartbeat_timeout"],
+        "stale_reason": stale_reason,
+    }