# BanGUI/backend/app/tasks/scheduler_lock_heartbeat.py

"""Scheduler lock heartbeat background task.

Registers an APScheduler job that periodically updates the scheduler lock's
heartbeat timestamp. This prevents the lock from being considered stale
if the running instance experiences temporary delays or high load.

Without this heartbeat, stale lock detection (based on TTL) could incorrectly
determine that the scheduler instance has crashed when it's merely busy, and
a new instance could take over.

Each run is tagged with a correlation ID via :mod:`app.utils.correlation`
(a fresh UUID unless the caller supplies one) so that all log entries emitted
by a single run can be correlated with each other.
"""

from __future__ import annotations

import uuid
from typing import TYPE_CHECKING

from app.utils.logging_compat import get_logger

from app.tasks.db import task_db
from app.tasks.timeout_utils import run_with_timeout
from app.utils.correlation import get_correlation_id, reset_correlation_id, set_correlation_id
from app.utils.runtime_state import get_effective_settings
from app.utils.scheduler_lock import update_scheduler_lock_heartbeat

if TYPE_CHECKING:
    from fastapi import FastAPI

    from app.config import Settings

# Module-level logger; the calls in this module pass an event name plus
# structured fields as keyword arguments.
log = get_logger(__name__)

#: How often the heartbeat job fires (seconds). Must be significantly less than
#: the lock TTL to allow multiple missed heartbeats before lock expiry.
SCHEDULER_LOCK_HEARTBEAT_INTERVAL: int = 5

#: Stable APScheduler job ID — ensures re-registration replaces, not duplicates.
JOB_ID: str = "scheduler_lock_heartbeat"

#: Maximum seconds to allow for a single scheduler lock heartbeat run to
#: complete; this is the limit handed to ``run_with_timeout``.
#: NOTE(review): this equals SCHEDULER_LOCK_HEARTBEAT_INTERVAL, so a run that
#: uses its whole budget finishes just as the next firing is due — confirm the
#: scheduler's overlap settings (e.g. ``max_instances``) make that acceptable.
TASK_TIMEOUT_SECONDS: int = 5


async def _update_heartbeat_with_resources(
    settings: Settings,
    correlation_id: str | None = None,
) -> None:
    """Run one heartbeat update inside a correlation-ID context.

    Installs a correlation ID for the duration of the run, delegates the
    actual work to :func:`_do_update_heartbeat_with_settings`, and always
    restores the previous correlation context afterwards — even when the
    delegate raises.

    The delegate is the part that enforces ``TASK_TIMEOUT_SECONDS`` and logs
    (rather than propagates) timeouts and other errors, so a failed heartbeat
    does not crash the scheduler.

    Args:
        settings: The resolved application settings used for database access.
        correlation_id: Optional correlation ID from the triggering request.
            When omitted, a random UUID4 string is generated for this run.
    """
    # Only mint a new ID when the caller did not hand one in.
    run_id = str(uuid.uuid4()) if correlation_id is None else correlation_id

    context_token = set_correlation_id(run_id)
    try:
        await _do_update_heartbeat_with_settings(settings)
    finally:
        # Undo the set above regardless of how the update ended.
        reset_correlation_id(context_token)


async def _do_update_heartbeat_with_settings(settings: Settings) -> None:
    """Refresh the scheduler lock heartbeat, bounded by a timeout.

    Expects the caller to have set the correlation context already; every log
    entry emitted here reads the ID back through ``get_correlation_id()``.

    The update is best-effort. A timeout or any other exception is logged and
    swallowed, so nothing propagates out of this coroutine into the scheduler.

    Args:
        settings: The resolved application settings passed to ``task_db``.
    """

    async def _do_update() -> None:
        # The outcome is logged only after the task_db context has exited.
        async with task_db(settings) as db:
            success = await update_scheduler_lock_heartbeat(db)

        if success:
            log.debug("scheduler_lock_heartbeat_updated", correlation_id=get_correlation_id())
        else:
            log.warning(
                "scheduler_lock_heartbeat_failed",
                correlation_id=get_correlation_id(),
                message="Failed to update heartbeat; we no longer hold the lock. "
                "Another instance may have taken over or the database connection failed.",
            )

    try:
        await run_with_timeout("scheduler_lock_heartbeat", _do_update(), TASK_TIMEOUT_SECONDS)
    except TimeoutError:
        # Must precede the generic handler: TimeoutError is an Exception subclass.
        log.error(
            "scheduler_lock_heartbeat_timeout",
            correlation_id=get_correlation_id(),
            timeout_seconds=TASK_TIMEOUT_SECONDS,
            message="Heartbeat update exceeded timeout. The database may be slow or unresponsive.",
        )
    except Exception as exc:
        # Deliberate catch-all boundary: a heartbeat failure must not take the
        # scheduler down. Record the exception class as well as its text,
        # because str(exc) is empty for exceptions raised without a message
        # and would otherwise leave nothing to diagnose from.
        log.error(
            "scheduler_lock_heartbeat_error",
            correlation_id=get_correlation_id(),
            error=str(exc),
            error_type=type(exc).__name__,
            message="Unexpected error during heartbeat update.",
        )


async def _update_heartbeat(app: FastAPI) -> None:
    """Run one heartbeat update with the settings currently effective for *app*.

    Resolves the settings at call time via ``get_effective_settings`` and
    forwards them to :func:`_update_heartbeat_with_resources`; no correlation
    ID is supplied, so that function generates one.

    Args:
        app: The :class:`fastapi.FastAPI` application instance to read the
            effective settings from.
    """
    current_settings = get_effective_settings(app)
    await _update_heartbeat_with_resources(current_settings)


def register(app: FastAPI) -> None:
    """Schedule the lock heartbeat job on the application's scheduler.

    The job is added under the fixed ID ``JOB_ID`` with
    ``replace_existing=True``, so calling this again replaces the earlier
    registration instead of adding a second job. The settings are resolved
    once, here, and handed to every run as the ``settings`` keyword argument.

    Must be called after the scheduler has been started (i.e., inside the
    lifespan handler, after ``scheduler.start()``). If ``app.state`` has no
    ``scheduler`` attribute (or it is ``None``), a warning is logged and
    nothing is registered.

    Args:
        app: The :class:`fastapi.FastAPI` application instance whose
            ``app.state.scheduler`` will receive the job.
    """
    resolved_settings = get_effective_settings(app)

    job_scheduler = getattr(app.state, "scheduler", None)
    if job_scheduler is None:
        # Tests or standalone usage may not have attached a scheduler yet;
        # skip registration quietly instead of failing.
        log.warning("scheduler_lock_heartbeat_no_scheduler")
        return

    job_options = {
        "trigger": "interval",
        "seconds": SCHEDULER_LOCK_HEARTBEAT_INTERVAL,
        "kwargs": {"settings": resolved_settings},
        "id": JOB_ID,
        "replace_existing": True,
    }
    job_scheduler.add_job(_update_heartbeat_with_resources, **job_options)

    log.info(
        "scheduler_lock_heartbeat_scheduled",
        interval_seconds=SCHEDULER_LOCK_HEARTBEAT_INTERVAL,
    )