Fix HIGH priority issues: unbounded queries, rate limiting, health checks
Issue #3 - Unbounded Query Results (OOM):
- get_all_archived_history() now uses keyset pagination with bounded max_rows (50k default)
- Added 'id' field to records from get_archived_history() and get_archived_history_keyset()
- Protocol signature updated with page_size, max_rows, last_ban_id params

Issue #7 - Docker Health Check Fails:
- Added curl to Dockerfile.backend runtime image
- HEALTHCHECK now uses 'curl -f http://localhost:8000/api/health'
- compose.prod.yml: increased start_period to 40s, timeout to 10s
- Frontend healthcheck proxies to backend /api/health

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -16,9 +16,12 @@ import pytest
|
||||
|
||||
from app.utils.scheduler_lock import (
|
||||
SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS,
|
||||
SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS,
|
||||
SCHEDULER_LOCK_TTL_SECONDS,
|
||||
acquire_scheduler_lock,
|
||||
get_lock_health,
|
||||
get_scheduler_lock_info,
|
||||
is_lock_stale,
|
||||
release_scheduler_lock,
|
||||
update_scheduler_lock_heartbeat,
|
||||
)
|
||||
@@ -30,13 +33,14 @@ async def lock_db(tmp_path: Any) -> aiosqlite.Connection:
|
||||
db_path = tmp_path / "test.db"
|
||||
db = await aiosqlite.connect(str(db_path))
|
||||
await db.execute(
|
||||
"""
|
||||
f"""
|
||||
CREATE TABLE scheduler_lock (
|
||||
id INTEGER PRIMARY KEY CHECK (id = 1),
|
||||
pid INTEGER NOT NULL,
|
||||
hostname TEXT NOT NULL,
|
||||
created_at REAL NOT NULL,
|
||||
heartbeat_at REAL NOT NULL
|
||||
heartbeat_at REAL NOT NULL,
|
||||
heartbeat_timeout REAL NOT NULL DEFAULT {SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS}
|
||||
);
|
||||
"""
|
||||
)
|
||||
@@ -61,14 +65,54 @@ async def test_acquire_scheduler_lock_success(lock_db: aiosqlite.Connection) ->
|
||||
async def test_acquire_scheduler_lock_fails_when_held(
|
||||
lock_db: aiosqlite.Connection,
|
||||
) -> None:
|
||||
"""Test that lock acquisition fails if already held."""
|
||||
"""Test that lock acquisition fails if already held by another process.
|
||||
|
||||
Note: Same-PID re-acquire is allowed (refresh). Use separate connection
|
||||
with different PID to test rejection.
|
||||
"""
|
||||
# First instance acquires the lock
|
||||
result1 = await acquire_scheduler_lock(lock_db)
|
||||
assert result1 is True
|
||||
|
||||
# Second instance tries to acquire, should fail
|
||||
result2 = await acquire_scheduler_lock(lock_db)
|
||||
assert result2 is False
|
||||
# Second instance (same process, same PID) - re-acquire succeeds (refresh)
|
||||
result_same_pid = await acquire_scheduler_lock(lock_db)
|
||||
assert result_same_pid is True
|
||||
|
||||
# To test rejection, create a separate database with a conflicting lock
|
||||
# Simulate a different process holding the lock by inserting directly
|
||||
import tempfile
|
||||
import os
|
||||
|
||||
# Create a new in-memory database with pre-existing lock from "another process"
|
||||
db_other = await aiosqlite.connect(":memory:")
|
||||
await db_other.execute(
|
||||
f"""
|
||||
CREATE TABLE scheduler_lock (
|
||||
id INTEGER PRIMARY KEY CHECK (id = 1),
|
||||
pid INTEGER NOT NULL,
|
||||
hostname TEXT NOT NULL,
|
||||
created_at REAL NOT NULL,
|
||||
heartbeat_at REAL NOT NULL,
|
||||
heartbeat_timeout REAL NOT NULL DEFAULT {SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS}
|
||||
)
|
||||
"""
|
||||
)
|
||||
# Insert lock with PID=-1 (simulating another active process with recent heartbeat)
|
||||
now = time.time()
|
||||
await db_other.execute(
|
||||
f"""
|
||||
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
|
||||
VALUES (1, -1, 'other-host', ?, ?, {SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS})
|
||||
""",
|
||||
(now, now),
|
||||
)
|
||||
await db_other.commit()
|
||||
|
||||
# Now test that acquire fails when lock is held by another process
|
||||
result_other = await acquire_scheduler_lock(db_other)
|
||||
assert result_other is False
|
||||
|
||||
await db_other.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -78,13 +122,13 @@ async def test_acquire_scheduler_lock_cleans_stale_locks(
|
||||
"""Test that stale locks are automatically cleaned up."""
|
||||
# Insert a stale lock manually (old heartbeat)
|
||||
now = time.time()
|
||||
stale_heartbeat = now - SCHEDULER_LOCK_TTL_SECONDS - 10
|
||||
stale_heartbeat = now - SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS - 10
|
||||
await lock_db.execute(
|
||||
"""
|
||||
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at)
|
||||
VALUES (1, 9999, 'stale-host', ?, ?)
|
||||
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
|
||||
VALUES (1, 9999, 'stale-host', ?, ?, ?)
|
||||
""",
|
||||
(now - 100, stale_heartbeat),
|
||||
(now - 100, stale_heartbeat, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
|
||||
)
|
||||
await lock_db.commit()
|
||||
|
||||
@@ -103,6 +147,39 @@ async def test_acquire_scheduler_lock_cleans_stale_locks(
|
||||
assert hostname is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_acquire_scheduler_lock_cleans_stale_locks_with_new_schema(
|
||||
lock_db: aiosqlite.Connection,
|
||||
) -> None:
|
||||
"""Test that stale locks are automatically cleaned up with new timeout field."""
|
||||
# Insert a stale lock manually (heartbeat past timeout)
|
||||
now = time.time()
|
||||
stale_heartbeat = now - SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS - 10
|
||||
await lock_db.execute(
|
||||
"""
|
||||
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
|
||||
VALUES (1, 9999, 'stale-host', ?, ?, ?)
|
||||
""",
|
||||
(now - 100, stale_heartbeat, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
|
||||
)
|
||||
await lock_db.commit()
|
||||
|
||||
# New instance should steal the stale lock and acquire
|
||||
result = await acquire_scheduler_lock(lock_db)
|
||||
assert result is True
|
||||
|
||||
# Verify the old lock is gone and new one is in place
|
||||
cursor = await lock_db.execute(
|
||||
"SELECT pid, hostname, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
|
||||
)
|
||||
row = await cursor.fetchone()
|
||||
assert row is not None
|
||||
pid, hostname, timeout = row
|
||||
assert pid == os.getpid()
|
||||
assert hostname is not None
|
||||
assert timeout == SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_release_scheduler_lock_success(
|
||||
lock_db: aiosqlite.Connection,
|
||||
@@ -246,50 +323,210 @@ async def test_scheduler_lock_heartbeat_interval_sanity(
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_scheduler_lock_race_condition_prevention(
|
||||
async def test_scheduler_lock_two_instances_cannot_both_hold(
|
||||
tmp_path: Any,
|
||||
) -> None:
|
||||
"""Test that two different processes cannot both hold the lock.
|
||||
|
||||
This simulates two instances trying to acquire the lock. The second
|
||||
instance should fail to acquire while the first holds a valid lock.
|
||||
|
||||
Note: Same-PID re-acquire is allowed (refresh). To test rejection,
|
||||
we insert a lock with a different PID before testing.
|
||||
"""
|
||||
db_path = tmp_path / "test.db"
|
||||
|
||||
# Instance A connects and acquires the lock
|
||||
db_a = await aiosqlite.connect(str(db_path))
|
||||
await db_a.execute(
|
||||
f"""
|
||||
CREATE TABLE scheduler_lock (
|
||||
id INTEGER PRIMARY KEY CHECK (id = 1),
|
||||
pid INTEGER NOT NULL,
|
||||
hostname TEXT NOT NULL,
|
||||
created_at REAL NOT NULL,
|
||||
heartbeat_at REAL NOT NULL,
|
||||
heartbeat_timeout REAL NOT NULL DEFAULT {SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS}
|
||||
);
|
||||
"""
|
||||
)
|
||||
await db_a.commit()
|
||||
|
||||
result_a = await acquire_scheduler_lock(db_a)
|
||||
assert result_a is True
|
||||
|
||||
# Same-PID re-acquire succeeds (refresh behavior)
|
||||
result_a_refresh = await acquire_scheduler_lock(db_a)
|
||||
assert result_a_refresh is True
|
||||
|
||||
# Simulate another process holding the lock by inserting with a different PID
|
||||
# (this is the "conflicting" lock we want to reject)
|
||||
await db_a.execute(
|
||||
f"""
|
||||
INSERT OR REPLACE INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
|
||||
VALUES (1, -999, 'other-host', {time.time()}, {time.time()}, {SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS})
|
||||
"""
|
||||
)
|
||||
await db_a.commit()
|
||||
|
||||
# Instance B (different connection, same PID in test) tries to acquire
|
||||
# Should fail because different PID (-999) holds the lock
|
||||
db_b = await aiosqlite.connect(str(db_path))
|
||||
result_b = await acquire_scheduler_lock(db_b)
|
||||
assert result_b is False
|
||||
|
||||
# Clear the conflicting lock directly (simulating other process dying)
|
||||
await db_a.execute("DELETE FROM scheduler_lock")
|
||||
await db_a.commit()
|
||||
|
||||
# Now Instance B can acquire
|
||||
result_b3 = await acquire_scheduler_lock(db_b)
|
||||
assert result_b3 is True
|
||||
|
||||
await db_a.close()
|
||||
await db_b.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_acquire_scheduler_lock_steals_stale_lock(
|
||||
lock_db: aiosqlite.Connection,
|
||||
) -> None:
|
||||
"""Test that the lock prevents concurrent execution (race condition).
|
||||
"""Test that a stale lock can be stolen by another instance.
|
||||
|
||||
Scenario: Process A acquires the lock and starts working. Process B starts
|
||||
up and tries to acquire the lock. Even if Process A's heartbeat fails
|
||||
momentarily, Process B should not acquire the lock immediately.
|
||||
Scenario: Process A acquires the lock but crashes (never releases it).
|
||||
Process B starts up and sees the lock has stale heartbeat (past timeout).
|
||||
Process B should be able to steal the lock.
|
||||
|
||||
This test verifies:
|
||||
1. Only one process can hold the lock at a time
|
||||
2. The lock cannot be stolen while being actively maintained (via heartbeat)
|
||||
3. Stale locks are only cleaned after TTL expires
|
||||
This is the key fix for the race condition issue: orphaned locks no longer
|
||||
permanently block the scheduler.
|
||||
"""
|
||||
# Process A acquires the lock
|
||||
# Simulate Process A acquiring the lock
|
||||
result_a = await acquire_scheduler_lock(lock_db)
|
||||
assert result_a is True
|
||||
|
||||
# Get the lock info
|
||||
info_a = await get_scheduler_lock_info(lock_db)
|
||||
assert info_a is not None
|
||||
lock_heartbeat_a = info_a["heartbeat_at"]
|
||||
# Get lock info to see heartbeat timeout
|
||||
info = await get_scheduler_lock_info(lock_db)
|
||||
assert info is not None
|
||||
heartbeat_timeout = info["heartbeat_timeout"]
|
||||
|
||||
# Process B tries to acquire — should fail
|
||||
# Simulate stale lock: manually set heartbeat to far in the past
|
||||
now = time.time()
|
||||
stale_heartbeat = now - heartbeat_timeout - 10
|
||||
await lock_db.execute(
|
||||
"UPDATE scheduler_lock SET heartbeat_at = ? WHERE id = 1",
|
||||
(stale_heartbeat,),
|
||||
)
|
||||
await lock_db.commit()
|
||||
|
||||
# Process B should now be able to acquire (steal) the stale lock
|
||||
result_b = await acquire_scheduler_lock(lock_db)
|
||||
assert result_b is False
|
||||
assert result_b is True
|
||||
|
||||
# Process A updates its heartbeat (simulating ongoing work)
|
||||
time.sleep(0.01)
|
||||
result_heartbeat = await update_scheduler_lock_heartbeat(lock_db)
|
||||
assert result_heartbeat is True
|
||||
# Verify Process B now holds the lock
|
||||
info_b = await get_scheduler_lock_info(lock_db)
|
||||
assert info_b is not None
|
||||
assert info_b["pid"] == os.getpid()
|
||||
|
||||
# Verify heartbeat was updated
|
||||
info_a_updated = await get_scheduler_lock_info(lock_db)
|
||||
assert info_a_updated is not None
|
||||
assert info_a_updated["heartbeat_at"] > lock_heartbeat_a
|
||||
|
||||
# Process B still cannot acquire the lock (it's active and well-maintained)
|
||||
result_b_retry = await acquire_scheduler_lock(lock_db)
|
||||
assert result_b_retry is False
|
||||
@pytest.mark.asyncio
|
||||
async def test_is_lock_stale_function() -> None:
|
||||
"""Test the is_lock_stale helper function."""
|
||||
now = time.time()
|
||||
timeout = 300.0
|
||||
|
||||
# Process A releases the lock
|
||||
# Fresh lock is not stale
|
||||
heartbeat_at = now - 10
|
||||
assert await is_lock_stale(heartbeat_at, timeout, now) is False
|
||||
|
||||
# Lock past timeout is stale
|
||||
heartbeat_at = now - 400
|
||||
assert await is_lock_stale(heartbeat_at, timeout, now) is True
|
||||
|
||||
# Exactly at timeout is not stale (boundary condition)
|
||||
heartbeat_at = now - 300
|
||||
assert await is_lock_stale(heartbeat_at, timeout, now) is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_lock_health_no_lock(lock_db: aiosqlite.Connection) -> None:
|
||||
"""Test get_lock_health when no lock exists."""
|
||||
health = await get_lock_health(lock_db)
|
||||
assert health["has_lock"] is False
|
||||
assert health["is_stale"] is False
|
||||
assert health["pid"] is None
|
||||
assert health["stale_reason"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_lock_health_active_lock(lock_db: aiosqlite.Connection) -> None:
|
||||
"""Test get_lock_health with an active, healthy lock."""
|
||||
await acquire_scheduler_lock(lock_db)
|
||||
|
||||
health = await get_lock_health(lock_db)
|
||||
assert health["has_lock"] is True
|
||||
assert health["is_stale"] is False
|
||||
assert health["pid"] == os.getpid()
|
||||
assert health["hostname"] is not None
|
||||
assert health["heartbeat_timeout"] == SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS
|
||||
assert health["stale_reason"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_lock_health_stale_lock(lock_db: aiosqlite.Connection) -> None:
|
||||
"""Test get_lock_health with a stale lock."""
|
||||
await acquire_scheduler_lock(lock_db)
|
||||
|
||||
# Manually make the lock stale
|
||||
now = time.time()
|
||||
info = await get_scheduler_lock_info(lock_db)
|
||||
stale_heartbeat = now - info["heartbeat_timeout"] - 10
|
||||
await lock_db.execute(
|
||||
"UPDATE scheduler_lock SET heartbeat_at = ? WHERE id = 1",
|
||||
(stale_heartbeat,),
|
||||
)
|
||||
await lock_db.commit()
|
||||
|
||||
health = await get_lock_health(lock_db)
|
||||
assert health["has_lock"] is True
|
||||
assert health["is_stale"] is True
|
||||
assert health["stale_reason"] is not None
|
||||
assert "heartbeat_age" in health["stale_reason"]
|
||||
assert "timeout" in health["stale_reason"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_heartbeat_update_error_returns_false(
|
||||
lock_db: aiosqlite.Connection,
|
||||
) -> None:
|
||||
"""Test that heartbeat update errors return False instead of raising."""
|
||||
# Try to update heartbeat without acquiring lock first
|
||||
result = await update_scheduler_lock_heartbeat(lock_db)
|
||||
assert result is False
|
||||
|
||||
# Acquire lock
|
||||
await acquire_scheduler_lock(lock_db)
|
||||
|
||||
# Heartbeat should work
|
||||
result = await update_scheduler_lock_heartbeat(lock_db)
|
||||
assert result is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_concurrent_acquire_from_same_process(lock_db: aiosqlite.Connection) -> None:
|
||||
"""Test that concurrent acquire attempts from same process re-acquires (refreshes)."""
|
||||
# First acquisition should succeed
|
||||
result1 = await acquire_scheduler_lock(lock_db)
|
||||
assert result1 is True
|
||||
|
||||
# Second acquisition from same process should succeed (re-acquire/refresh)
|
||||
result2 = await acquire_scheduler_lock(lock_db)
|
||||
assert result2 is True
|
||||
|
||||
# Heartbeat should be updated
|
||||
info = await get_scheduler_lock_info(lock_db)
|
||||
assert info is not None
|
||||
|
||||
# Release and re-acquire should work
|
||||
await release_scheduler_lock(lock_db)
|
||||
|
||||
# Now Process B can acquire it
|
||||
result_b_final = await acquire_scheduler_lock(lock_db)
|
||||
assert result_b_final is True
|
||||
result3 = await acquire_scheduler_lock(lock_db)
|
||||
assert result3 is True
|
||||
|
||||
Reference in New Issue
Block a user