- Add heartbeat-based lock renewal in scheduler_lock_heartbeat.py - Update scheduler_lock.py with improved lock management - Add comprehensive tests for scheduler lock functionality - Update deployment and task documentation Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
296 lines
9.4 KiB
Python
296 lines
9.4 KiB
Python
"""Tests for the scheduler lock mechanism.
|
|
|
|
These tests verify that the database-backed scheduler lock correctly enforces
|
|
single-executor safety across multiple startup attempts, including stale lock
|
|
cleanup, heartbeat updates, and multi-process race condition prevention.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import time
|
|
from typing import Any
|
|
|
|
import aiosqlite
|
|
import pytest
|
|
|
|
from app.utils.scheduler_lock import (
|
|
SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS,
|
|
SCHEDULER_LOCK_TTL_SECONDS,
|
|
acquire_scheduler_lock,
|
|
get_scheduler_lock_info,
|
|
release_scheduler_lock,
|
|
update_scheduler_lock_heartbeat,
|
|
)
|
|
|
|
|
|
@pytest.fixture
|
|
async def lock_db(tmp_path: Any) -> aiosqlite.Connection:
|
|
"""Create a temporary database with scheduler_lock table."""
|
|
db_path = tmp_path / "test.db"
|
|
db = await aiosqlite.connect(str(db_path))
|
|
await db.execute(
|
|
"""
|
|
CREATE TABLE scheduler_lock (
|
|
id INTEGER PRIMARY KEY CHECK (id = 1),
|
|
pid INTEGER NOT NULL,
|
|
hostname TEXT NOT NULL,
|
|
created_at REAL NOT NULL,
|
|
heartbeat_at REAL NOT NULL
|
|
);
|
|
"""
|
|
)
|
|
await db.commit()
|
|
yield db
|
|
await db.close()
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_acquire_scheduler_lock_success(lock_db: aiosqlite.Connection) -> None:
|
|
"""Test successful lock acquisition."""
|
|
result = await acquire_scheduler_lock(lock_db)
|
|
assert result is True
|
|
|
|
# Verify the lock is in the database
|
|
cursor = await lock_db.execute("SELECT COUNT(*) FROM scheduler_lock")
|
|
count = await cursor.fetchone()
|
|
assert count[0] == 1
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_acquire_scheduler_lock_fails_when_held(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test that lock acquisition fails if already held."""
|
|
# First instance acquires the lock
|
|
result1 = await acquire_scheduler_lock(lock_db)
|
|
assert result1 is True
|
|
|
|
# Second instance tries to acquire, should fail
|
|
result2 = await acquire_scheduler_lock(lock_db)
|
|
assert result2 is False
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_acquire_scheduler_lock_cleans_stale_locks(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test that stale locks are automatically cleaned up."""
|
|
# Insert a stale lock manually (old heartbeat)
|
|
now = time.time()
|
|
stale_heartbeat = now - SCHEDULER_LOCK_TTL_SECONDS - 10
|
|
await lock_db.execute(
|
|
"""
|
|
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at)
|
|
VALUES (1, 9999, 'stale-host', ?, ?)
|
|
""",
|
|
(now - 100, stale_heartbeat),
|
|
)
|
|
await lock_db.commit()
|
|
|
|
# New instance should clean up the stale lock and acquire
|
|
result = await acquire_scheduler_lock(lock_db)
|
|
assert result is True
|
|
|
|
# Verify the old lock is gone and new one is in place
|
|
cursor = await lock_db.execute(
|
|
"SELECT pid, hostname FROM scheduler_lock WHERE id = 1"
|
|
)
|
|
row = await cursor.fetchone()
|
|
assert row is not None
|
|
pid, hostname = row
|
|
assert pid == os.getpid()
|
|
assert hostname is not None
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_release_scheduler_lock_success(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test successful lock release."""
|
|
# Acquire the lock
|
|
await acquire_scheduler_lock(lock_db)
|
|
|
|
# Release it
|
|
await release_scheduler_lock(lock_db)
|
|
|
|
# Verify the lock is gone
|
|
cursor = await lock_db.execute("SELECT COUNT(*) FROM scheduler_lock")
|
|
count = await cursor.fetchone()
|
|
assert count[0] == 0
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_release_scheduler_lock_not_held(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test that releasing a lock we don't hold is safe."""
|
|
# Try to release without acquiring — should not crash
|
|
await release_scheduler_lock(lock_db)
|
|
|
|
# Verify the lock is still empty
|
|
cursor = await lock_db.execute("SELECT COUNT(*) FROM scheduler_lock")
|
|
count = await cursor.fetchone()
|
|
assert count[0] == 0
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_update_scheduler_lock_heartbeat_success(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test successful heartbeat update."""
|
|
# Acquire the lock
|
|
await acquire_scheduler_lock(lock_db)
|
|
|
|
# Get the original heartbeat
|
|
cursor = await lock_db.execute("SELECT heartbeat_at FROM scheduler_lock WHERE id = 1")
|
|
original_row = await cursor.fetchone()
|
|
original_heartbeat = original_row[0]
|
|
|
|
# Wait a moment and update the heartbeat
|
|
time.sleep(0.01)
|
|
result = await update_scheduler_lock_heartbeat(lock_db)
|
|
assert result is True
|
|
|
|
# Verify the heartbeat was updated
|
|
cursor = await lock_db.execute("SELECT heartbeat_at FROM scheduler_lock WHERE id = 1")
|
|
new_row = await cursor.fetchone()
|
|
new_heartbeat = new_row[0]
|
|
assert new_heartbeat > original_heartbeat
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_update_scheduler_lock_heartbeat_fails_if_not_held(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test that heartbeat update fails if we don't hold the lock."""
|
|
result = await update_scheduler_lock_heartbeat(lock_db)
|
|
assert result is False
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_get_scheduler_lock_info_returns_details(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test that lock info includes all relevant fields."""
|
|
await acquire_scheduler_lock(lock_db)
|
|
|
|
info = await get_scheduler_lock_info(lock_db)
|
|
assert info is not None
|
|
assert "pid" in info
|
|
assert "hostname" in info
|
|
assert "created_at" in info
|
|
assert "heartbeat_at" in info
|
|
assert info["pid"] == os.getpid()
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_get_scheduler_lock_info_returns_none_when_empty(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test that lock info returns None when no lock is held."""
|
|
info = await get_scheduler_lock_info(lock_db)
|
|
assert info is None
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_scheduler_lock_full_lifecycle(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test the full lifecycle: acquire, update heartbeat, release."""
|
|
# Initially no lock
|
|
info = await get_scheduler_lock_info(lock_db)
|
|
assert info is None
|
|
|
|
# Acquire the lock
|
|
result = await acquire_scheduler_lock(lock_db)
|
|
assert result is True
|
|
info = await get_scheduler_lock_info(lock_db)
|
|
assert info is not None
|
|
initial_heartbeat = info["heartbeat_at"]
|
|
|
|
# Update heartbeat multiple times
|
|
time.sleep(0.01)
|
|
result = await update_scheduler_lock_heartbeat(lock_db)
|
|
assert result is True
|
|
info = await get_scheduler_lock_info(lock_db)
|
|
updated_heartbeat = info["heartbeat_at"]
|
|
assert updated_heartbeat > initial_heartbeat
|
|
|
|
# Release the lock
|
|
await release_scheduler_lock(lock_db)
|
|
info = await get_scheduler_lock_info(lock_db)
|
|
assert info is None
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_scheduler_lock_heartbeat_interval_sanity(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Verify heartbeat interval is less than TTL to prevent premature expiry.
|
|
|
|
With a 5-second heartbeat interval and 60-second TTL, the lock can survive
|
|
~12 missed heartbeats before expiring. This provides robust protection against
|
|
temporary delays or high load that could cause a single missed heartbeat.
|
|
"""
|
|
# Verify the configuration ratio is safe (interval < TTL)
|
|
assert SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS < SCHEDULER_LOCK_TTL_SECONDS
|
|
|
|
# With this ratio, the lock can survive at least 12 missed heartbeats
|
|
# (60s TTL / 5s interval = 12 intervals between heartbeats before expiry)
|
|
safe_ratio = SCHEDULER_LOCK_TTL_SECONDS / SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS
|
|
assert safe_ratio >= 12, (
|
|
f"Heartbeat interval too long: lock can only survive {safe_ratio:.1f} missed heartbeats. "
|
|
f"Should be at least 12 for safety."
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_scheduler_lock_race_condition_prevention(
|
|
lock_db: aiosqlite.Connection,
|
|
) -> None:
|
|
"""Test that the lock prevents concurrent execution (race condition).
|
|
|
|
Scenario: Process A acquires the lock and starts working. Process B starts
|
|
up and tries to acquire the lock. Even if Process A's heartbeat fails
|
|
momentarily, Process B should not acquire the lock immediately.
|
|
|
|
This test verifies:
|
|
1. Only one process can hold the lock at a time
|
|
2. The lock cannot be stolen while being actively maintained (via heartbeat)
|
|
3. Stale locks are only cleaned after TTL expires
|
|
"""
|
|
# Process A acquires the lock
|
|
result_a = await acquire_scheduler_lock(lock_db)
|
|
assert result_a is True
|
|
|
|
# Get the lock info
|
|
info_a = await get_scheduler_lock_info(lock_db)
|
|
assert info_a is not None
|
|
lock_heartbeat_a = info_a["heartbeat_at"]
|
|
|
|
# Process B tries to acquire — should fail
|
|
result_b = await acquire_scheduler_lock(lock_db)
|
|
assert result_b is False
|
|
|
|
# Process A updates its heartbeat (simulating ongoing work)
|
|
time.sleep(0.01)
|
|
result_heartbeat = await update_scheduler_lock_heartbeat(lock_db)
|
|
assert result_heartbeat is True
|
|
|
|
# Verify heartbeat was updated
|
|
info_a_updated = await get_scheduler_lock_info(lock_db)
|
|
assert info_a_updated is not None
|
|
assert info_a_updated["heartbeat_at"] > lock_heartbeat_a
|
|
|
|
# Process B still cannot acquire the lock (it's active and well-maintained)
|
|
result_b_retry = await acquire_scheduler_lock(lock_db)
|
|
assert result_b_retry is False
|
|
|
|
# Process A releases the lock
|
|
await release_scheduler_lock(lock_db)
|
|
|
|
# Now Process B can acquire it
|
|
result_b_final = await acquire_scheduler_lock(lock_db)
|
|
assert result_b_final is True
|