# BanGUI/backend/app/utils/scheduler_lock.py

"""Database-based scheduler lock for single-executor enforcement.

This module implements a database-backed lock mechanism that ensures only one
BanGUI instance runs the background scheduler, even in container orchestration
environments where multiple instances might start simultaneously.

The lock uses atomic database operations to prevent race conditions:
- Lock acquisition is atomic: INSERT ... ON CONFLICT with BEGIN IMMEDIATE transaction
- Lock stealing: If heartbeat exceeds timeout, lock can be taken by another instance
- Heartbeat update is conditional: UPDATE only if we still hold the lock
- Stale lock detection uses heartbeat timestamps with configurable timeout

This approach is more reliable than filesystem-based locking in containerized
environments because:
1. Database transactions are atomic (no TOCTOU race windows)
2. No NFS/network filesystem edge cases
3. Stale lock detection is timestamp-based, not PID-based (PID reuse is unreliable)
4. Works across container restarts and rolling deployments

The lock record stores:
    - id: Always 1 (singleton table)
    - pid: Process ID of the lock holder
    - hostname: Container/host name for debugging
    - created_at: When the lock was first acquired
    - heartbeat_at: When the lock was last confirmed alive (updated periodically)
    - heartbeat_timeout: Seconds after which lock is considered stale (default 300)

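On startup:
1. Open a BEGIN IMMEDIATE transaction and read the existing lock row, if any
2. If another process holds the lock and its heartbeat is still within
   heartbeat_timeout, roll back and report that the lock was not acquired
3. Otherwise (no row, our own row, or a stale row where
   now - heartbeat_at > heartbeat_timeout) write this instance's details with
   INSERT ... ON CONFLICT(id) DO UPDATE, replacing any stale holder
4. Commit; the lock is now held by this instance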

On running (periodic):
    - Update heartbeat_at to keep the lock alive and prevent false positives

On shutdown:
    - Delete the lock (this instance is no longer running the scheduler)
"""

from __future__ import annotations

import os
import socket
import time
from typing import Any

import aiosqlite
from app.utils.logging_compat import get_logger

log = get_logger(__name__)

# Lock record expires if heartbeat hasn't been updated for this many seconds.
# This prevents stale locks from a crashed instance from blocking new startups.
# Set conservatively to allow temporary delays (e.g., high load) before considering
# the lock abandoned.
# NOTE(review): nothing in the visible part of this module reads this constant;
# the staleness window that acquire_scheduler_lock() stores on the lock row is
# SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS (below). Confirm which value callers
# elsewhere are expected to rely on.
SCHEDULER_LOCK_TTL_SECONDS: int = 60

# Heartbeat interval: how often to update the lock's heartbeat_at timestamp.
# Must be significantly less than TTL (at least 3-4x smaller) to allow multiple
# consecutive missed heartbeats before the lock is considered stale.
# With TTL=60s and interval=5s, the lock survives ~12 missed heartbeats before
# expiring, providing robust protection against temporary delays.
# (Measured against the 300s heartbeat timeout below, that is ~60 intervals.)
SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS: int = 5

# Default heartbeat timeout: how long to wait before considering a lock stale
# when another instance tries to acquire it. This is the max time a lock holder
# can go without updating heartbeat before someone else can steal it.
# This value is used both as the column default of scheduler_lock.heartbeat_timeout
# and as the heartbeat_timeout written by acquire_scheduler_lock().
SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS: int = 300


async def init_scheduler_lock_table(db: aiosqlite.Connection) -> None:
    """Create the scheduler_lock table if it doesn't exist.

    This is called during database schema initialization and is safe to call
    multiple times (CREATE TABLE IF NOT EXISTS is idempotent).

    The table is a singleton: ``id`` is the primary key and is constrained to
    the value 1, so at most one lock row can exist.

    Args:
        db: The SQLite database connection.
    """
    # SQLite does not accept bound parameters ("?") inside a column DEFAULT
    # clause, so the default has to be written into the DDL text. Coercing to
    # float first guarantees that only a numeric literal is interpolated.
    default_timeout = float(SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS)
    await db.execute(
        f"""
        CREATE TABLE IF NOT EXISTS scheduler_lock (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            pid INTEGER NOT NULL,
            hostname TEXT NOT NULL,
            created_at REAL NOT NULL,
            heartbeat_at REAL NOT NULL,
            heartbeat_timeout REAL NOT NULL DEFAULT {default_timeout!r}
        );
        """
    )
    await db.commit()


async def is_lock_stale(heartbeat_at: float, timeout: float, now: float) -> bool:
    """Report whether a lock's last heartbeat is older than its timeout.

    The comparison is strict: a heartbeat that is exactly ``timeout`` seconds
    old is still considered fresh.

    Args:
        heartbeat_at: Timestamp of the most recent heartbeat on the lock record
        timeout: Number of seconds a heartbeat may age before the lock is stale
        now: Timestamp to measure the heartbeat age against

    Returns:
        True when ``now - heartbeat_at`` is strictly greater than ``timeout``
    """
    seconds_since_heartbeat = now - heartbeat_at
    return seconds_since_heartbeat > timeout


async def _rollback_quietly(db: aiosqlite.Connection) -> None:
    """Roll back the connection's open transaction without raising.

    Used on error paths where the original failure is what the caller needs
    to see; a failing rollback is logged as a warning and otherwise ignored.

    Args:
        db: The SQLite database connection.
    """
    try:
        await db.rollback()
    except Exception as rollback_error:
        log.warning("scheduler_lock_rollback_failed", error=str(rollback_error))


async def acquire_scheduler_lock(db: aiosqlite.Connection) -> bool:
    """Try to acquire the scheduler lock for this process.

    Inside a ``BEGIN IMMEDIATE`` transaction the existing lock row (if any) is
    read and one of the following happens:
    - No row exists: the row is inserted, lock acquired
    - The row belongs to this process (same pid AND hostname): it is refreshed
    - The row belongs to someone else and its heartbeat is within its
      heartbeat_timeout: the transaction is rolled back and False is returned
    - The row belongs to someone else but its heartbeat is older than its
      heartbeat_timeout: the row is overwritten, lock stolen

    Ownership is decided on pid and hostname together. The pid alone is not
    enough: processes in separate PID namespaces (e.g. one per container) can
    have identical pids, and would otherwise be mistaken for "this process".

    Args:
        db: The SQLite database connection.

    Returns:
        True if the lock was successfully acquired, False if held by another instance.

    Raises:
        RuntimeError: If database operations fail.
    """
    now = time.time()
    pid = os.getpid()
    hostname = socket.gethostname()

    # Tracks whether *this call* opened a transaction, so error paths never
    # roll back a transaction that the caller had open before calling us
    # (in which case BEGIN IMMEDIATE itself is what failed).
    transaction_open = False

    try:
        await db.execute("BEGIN IMMEDIATE")
        transaction_open = True

        cursor = await db.execute(
            "SELECT pid, hostname, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
        )
        row = await cursor.fetchone()

        if row is not None:
            lock_pid, lock_hostname, lock_heartbeat, lock_timeout = row
            held_by_us = lock_pid == pid and lock_hostname == hostname
            heartbeat_age = now - lock_heartbeat

            if not held_by_us and heartbeat_age <= lock_timeout:
                # Another process holds a valid lock - cannot acquire
                await db.rollback()
                transaction_open = False
                log.warning(
                    "scheduler_lock_held_by_other_instance",
                    our_pid=pid,
                    lock_pid=lock_pid,
                    lock_hostname=lock_hostname,
                    lock_heartbeat_age_seconds=heartbeat_age,
                )
                return False
            # Otherwise the row is ours (refresh) or stale (holder stopped
            # heartbeating) - either way it is overwritten below.

        # Insert the row, or overwrite the existing one. The staleness check
        # above already ran under the same write transaction, so no other
        # writer can have changed the row in between.
        await db.execute(
            """
            INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
            VALUES (1, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                pid = excluded.pid,
                hostname = excluded.hostname,
                created_at = excluded.created_at,
                heartbeat_at = excluded.heartbeat_at,
                heartbeat_timeout = excluded.heartbeat_timeout
            """,
            (pid, hostname, now, now, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
        )
        await db.commit()
        transaction_open = False

        log.info(
            "scheduler_lock_acquired",
            pid=pid,
            hostname=hostname,
        )
        return True

    except aiosqlite.IntegrityError:
        # Defensive: the upsert above resolves primary-key conflicts itself,
        # so this is only reachable if another table constraint rejects the
        # write. Treat it as "not acquired" and report the current holder.
        if transaction_open:
            await _rollback_quietly(db)
        try:
            cursor = await db.execute(
                "SELECT pid, hostname, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
            )
            row = await cursor.fetchone()
            if row:
                lock_pid, lock_hostname, lock_heartbeat, lock_timeout = row
                heartbeat_age = now - lock_heartbeat
                log.warning(
                    "scheduler_lock_held_by_other_instance",
                    our_pid=pid,
                    lock_pid=lock_pid,
                    lock_hostname=lock_hostname,
                    heartbeat_age_seconds=heartbeat_age,
                    heartbeat_timeout=lock_timeout,
                )
        except Exception as e:
            log.warning("scheduler_lock_held_but_could_not_read_holder", error=str(e))
        return False

    except Exception as e:
        # Do not leave the write transaction (and its database lock) open.
        if transaction_open:
            await _rollback_quietly(db)
        raise RuntimeError(
            f"Failed to acquire scheduler lock due to database error: {e}"
        ) from e


async def release_scheduler_lock(db: aiosqlite.Connection) -> None:
    """Release the scheduler lock.

    This function should be called during application shutdown. It removes the
    lock record, allowing other instances to acquire it.

    The row is only deleted when both its pid and hostname match this process.
    Matching on pid alone is not safe: processes in separate PID namespaces
    (e.g. one per container) can share a pid, and one of them could then
    delete a lock that is actively held by the other.

    Args:
        db: The SQLite database connection.

    Raises:
        RuntimeError: If database operations fail.
    """
    pid = os.getpid()

    try:
        hostname = socket.gethostname()
        cursor = await db.execute(
            "DELETE FROM scheduler_lock WHERE id = 1 AND pid = ? AND hostname = ?",
            (pid, hostname),
        )
        await db.commit()

        if cursor.rowcount == 0:
            # This shouldn't happen in normal operation, but log it for visibility
            log.warning(
                "scheduler_lock_release_mismatch",
                our_pid=pid,
                our_hostname=hostname,
                message="Tried to release lock but we don't hold it. Another instance may have replaced us.",
            )
        else:
            log.info("scheduler_lock_released", pid=pid)

    except Exception as e:
        raise RuntimeError(f"Failed to release scheduler lock: {e}") from e


async def update_scheduler_lock_heartbeat(db: aiosqlite.Connection) -> bool:
    """Update the heartbeat timestamp to keep the lock alive.

    This function should be called periodically (see
    SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS) to prevent the lock from being
    considered stale. It only succeeds if this process still holds the lock.

    The row is only updated when both its pid and hostname match this process.
    Matching on pid alone is not safe: processes in separate PID namespaces
    (e.g. one per container) can share a pid, so a non-holder could otherwise
    keep another instance's lock alive and wrongly believe it owns it.

    Error handling: If the heartbeat update fails due to a database error, this
    function returns False (indicating lock loss) rather than raising an exception.
    This prevents the scheduler from crashing due to transient database issues,
    allowing the running application to continue and potentially recover the lock
    if it still holds it.

    Args:
        db: The SQLite database connection.

    Returns:
        True if the heartbeat was updated (we still hold the lock), False if
        we no longer hold the lock or a database error occurred.
    """
    now = time.time()
    pid = os.getpid()

    try:
        # Resolved inside the try so that this function keeps its
        # "never raises" contract even if the hostname lookup fails.
        hostname = socket.gethostname()
        cursor = await db.execute(
            "UPDATE scheduler_lock SET heartbeat_at = ? WHERE id = 1 AND pid = ? AND hostname = ?",
            (now, pid, hostname),
        )
        await db.commit()

        if cursor.rowcount == 0:
            # We no longer hold the lock
            log.warning(
                "scheduler_lock_heartbeat_lost",
                our_pid=pid,
                message="Heartbeat update failed; we no longer hold the lock.",
            )
            return False

        log.debug("scheduler_lock_heartbeat_updated", pid=pid)
        return True

    except Exception as e:
        # Don't crash the scheduler if heartbeat update fails - log and return False
        log.error(
            "scheduler_lock_heartbeat_error",
            our_pid=pid,
            error=str(e),
            message="Heartbeat update failed due to database error. Will retry on next interval.",
        )
        return False


async def get_scheduler_lock_info(db: aiosqlite.Connection) -> dict[str, Any] | None:
    """Read the singleton scheduler lock row, if there is one.

    Intended for debugging and monitoring. Any error raised while querying is
    logged as a warning and reported to the caller as "no lock" (None).

    Args:
        db: The SQLite database connection.

    Returns:
        A dict with keys: pid, hostname, created_at, heartbeat_at, heartbeat_timeout,
        or None if no lock row exists or the query failed.
    """
    try:
        cursor = await db.execute(
            "SELECT pid, hostname, created_at, heartbeat_at, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
        )
        record = await cursor.fetchone()

        # Guard clause: nothing stored means nobody holds the lock.
        if not record:
            return None

        holder_pid, holder_hostname, acquired_at, last_heartbeat, allowed_gap = record
        return dict(
            pid=holder_pid,
            hostname=holder_hostname,
            created_at=acquired_at,
            heartbeat_at=last_heartbeat,
            heartbeat_timeout=allowed_gap,
        )
    except Exception as e:
        log.warning("scheduler_lock_info_query_failed", error=str(e))
        return None


async def get_lock_health(db: aiosqlite.Connection) -> dict[str, Any]:
    """Summarize the scheduler lock's state for monitoring.

    Combines the stored lock record with the current time to report how old
    the last heartbeat is and whether that age exceeds the record's timeout.

    Args:
        db: The SQLite database connection.

    Returns:
        A dict with keys:
            - has_lock: bool, True when a lock record exists
            - is_stale: bool, True when the heartbeat age exceeds the timeout
            - pid: int or None
            - hostname: str or None
            - heartbeat_age_seconds: float or None (time since last heartbeat)
            - created_at: float or None
            - heartbeat_timeout: float or None
            - stale_reason: str or None (explanation, set only when stale)
    """
    lock = await get_scheduler_lock_info(db)
    now = time.time()

    if lock is None:
        # No record: every descriptive field is reported as None.
        detail_keys = (
            "pid",
            "hostname",
            "heartbeat_age_seconds",
            "created_at",
            "heartbeat_timeout",
            "stale_reason",
        )
        return {"has_lock": False, "is_stale": False, **dict.fromkeys(detail_keys)}

    last_heartbeat = lock["heartbeat_at"]
    allowed_gap = lock["heartbeat_timeout"]
    age = now - last_heartbeat
    stale = await is_lock_stale(last_heartbeat, allowed_gap, now)

    # Only a stale lock gets a human-readable explanation.
    reason: str | None = (
        f"heartbeat_age ({age:.1f}s) > timeout ({allowed_gap:.1f}s)" if stale else None
    )

    health: dict[str, Any] = {"has_lock": True, "is_stale": stale}
    health["pid"] = lock["pid"]
    health["hostname"] = lock["hostname"]
    health["heartbeat_age_seconds"] = age
    health["created_at"] = lock["created_at"]
    health["heartbeat_timeout"] = allowed_gap
    health["stale_reason"] = reason
    return health