Fix HIGH priority issues: unbounded queries, rate limiting, health checks

Issue #3 - Unbounded Query Results (OOM):
- get_all_archived_history() now uses keyset pagination with bounded max_rows (50k default)
- Added 'id' field to records from get_archived_history() and get_archived_history_keyset()
- Protocol signature updated with page_size, max_rows, last_ban_id params

Issue #7 - Docker Health Check Fails:
- Added curl to Dockerfile.backend runtime image
- HEALTHCHECK now uses 'curl -f http://localhost:8000/api/health'
- compose.prod.yml: increased start_period to 40s, timeout to 10s
- Frontend healthcheck proxies to backend /api/health

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-01 21:47:36 +02:00
parent 1830da496d
commit 0d5882b32f
39 changed files with 2067 additions and 339 deletions

View File

@@ -16,9 +16,12 @@ import pytest
from app.utils.scheduler_lock import (
SCHEDULER_LOCK_HEARTBEAT_INTERVAL_SECONDS,
SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS,
SCHEDULER_LOCK_TTL_SECONDS,
acquire_scheduler_lock,
get_lock_health,
get_scheduler_lock_info,
is_lock_stale,
release_scheduler_lock,
update_scheduler_lock_heartbeat,
)
@@ -30,13 +33,14 @@ async def lock_db(tmp_path: Any) -> aiosqlite.Connection:
db_path = tmp_path / "test.db"
db = await aiosqlite.connect(str(db_path))
await db.execute(
"""
f"""
CREATE TABLE scheduler_lock (
id INTEGER PRIMARY KEY CHECK (id = 1),
pid INTEGER NOT NULL,
hostname TEXT NOT NULL,
created_at REAL NOT NULL,
heartbeat_at REAL NOT NULL
heartbeat_at REAL NOT NULL,
heartbeat_timeout REAL NOT NULL DEFAULT {SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS}
);
"""
)
@@ -61,14 +65,54 @@ async def test_acquire_scheduler_lock_success(lock_db: aiosqlite.Connection) ->
async def test_acquire_scheduler_lock_fails_when_held(
    lock_db: aiosqlite.Connection,
) -> None:
    """Test that lock acquisition fails if already held by another process.

    Note: Same-PID re-acquire is allowed (refresh), so rejection cannot be
    observed by calling acquire twice from this process. The rejection path is
    exercised against a separate database that already contains a fresh lock
    row owned by a different PID.
    """
    # First acquisition on the empty table succeeds.
    result1 = await acquire_scheduler_lock(lock_db)
    assert result1 is True

    # Same process, same PID: re-acquire succeeds (refresh).
    result_same_pid = await acquire_scheduler_lock(lock_db)
    assert result_same_pid is True

    # Separate in-memory database pre-populated with a lock from "another process".
    db_other = await aiosqlite.connect(":memory:")
    try:
        # DDL cannot take bound parameters, so the DEFAULT value is
        # interpolated from the module constant.
        await db_other.execute(
            f"""
            CREATE TABLE scheduler_lock (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            pid INTEGER NOT NULL,
            hostname TEXT NOT NULL,
            created_at REAL NOT NULL,
            heartbeat_at REAL NOT NULL,
            heartbeat_timeout REAL NOT NULL DEFAULT {SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS}
            )
            """
        )

        # Lock row with PID=-1 and a current heartbeat: an active foreign holder.
        # All values are bound rather than formatted into the SQL text.
        now = time.time()
        await db_other.execute(
            """
            INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
            VALUES (1, -1, 'other-host', ?, ?, ?)
            """,
            (now, now, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
        )
        await db_other.commit()

        # Acquisition must be rejected while the other process holds the lock.
        result_other = await acquire_scheduler_lock(db_other)
        assert result_other is False
    finally:
        # Close the extra connection even when an assertion above fails.
        await db_other.close()
@pytest.mark.asyncio
@@ -78,13 +122,13 @@ async def test_acquire_scheduler_lock_cleans_stale_locks(
"""Test that stale locks are automatically cleaned up."""
# Insert a stale lock manually (old heartbeat)
now = time.time()
stale_heartbeat = now - SCHEDULER_LOCK_TTL_SECONDS - 10
stale_heartbeat = now - SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS - 10
await lock_db.execute(
"""
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at)
VALUES (1, 9999, 'stale-host', ?, ?)
INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
VALUES (1, 9999, 'stale-host', ?, ?, ?)
""",
(now - 100, stale_heartbeat),
(now - 100, stale_heartbeat, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
)
await lock_db.commit()
@@ -103,6 +147,39 @@ async def test_acquire_scheduler_lock_cleans_stale_locks(
assert hostname is not None
@pytest.mark.asyncio
async def test_acquire_scheduler_lock_cleans_stale_locks_with_new_schema(
    lock_db: aiosqlite.Connection,
) -> None:
    """Check that an expired lock row carrying heartbeat_timeout is replaced on acquire."""
    # Seed a lock whose last heartbeat lies 10 seconds beyond its timeout.
    current = time.time()
    expired_heartbeat = current - SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS - 10
    seed_values = (
        current - 100,
        expired_heartbeat,
        SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS,
    )
    await lock_db.execute(
        """
        INSERT INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
        VALUES (1, 9999, 'stale-host', ?, ?, ?)
        """,
        seed_values,
    )
    await lock_db.commit()

    # Acquiring now is expected to take over the expired row.
    acquired = await acquire_scheduler_lock(lock_db)
    assert acquired is True

    # The single lock row must now describe this process.
    cursor = await lock_db.execute(
        "SELECT pid, hostname, heartbeat_timeout FROM scheduler_lock WHERE id = 1"
    )
    stored = await cursor.fetchone()
    assert stored is not None
    owner_pid, owner_host, stored_timeout = stored
    assert owner_pid == os.getpid()
    assert owner_host is not None
    assert stored_timeout == SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS
@pytest.mark.asyncio
async def test_release_scheduler_lock_success(
lock_db: aiosqlite.Connection,
@@ -246,50 +323,210 @@ async def test_scheduler_lock_heartbeat_interval_sanity(
@pytest.mark.asyncio
async def test_scheduler_lock_two_instances_cannot_both_hold(
    tmp_path: Any,
) -> None:
    """Test that two different processes cannot both hold the lock.

    This simulates two instances trying to acquire the lock. The second
    instance should fail to acquire while the first holds a valid lock.

    Note: Same-PID re-acquire is allowed (refresh). To test rejection,
    a lock row with a different PID is written before the second
    connection attempts to acquire.
    """
    db_path = tmp_path / "test.db"

    # Instance A connects, creates the table and acquires the lock.
    db_a = await aiosqlite.connect(str(db_path))
    try:
        # DDL cannot take bound parameters, so the DEFAULT value is
        # interpolated from the module constant.
        await db_a.execute(
            f"""
            CREATE TABLE scheduler_lock (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            pid INTEGER NOT NULL,
            hostname TEXT NOT NULL,
            created_at REAL NOT NULL,
            heartbeat_at REAL NOT NULL,
            heartbeat_timeout REAL NOT NULL DEFAULT {SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS}
            );
            """
        )
        await db_a.commit()

        result_a = await acquire_scheduler_lock(db_a)
        assert result_a is True

        # Same-PID re-acquire succeeds (refresh behavior).
        result_a_refresh = await acquire_scheduler_lock(db_a)
        assert result_a_refresh is True

        # Overwrite the row with a different PID: the "conflicting" lock that
        # must be rejected. One timestamp is bound for both time columns so
        # created_at and heartbeat_at agree exactly.
        now = time.time()
        await db_a.execute(
            """
            INSERT OR REPLACE INTO scheduler_lock (id, pid, hostname, created_at, heartbeat_at, heartbeat_timeout)
            VALUES (1, -999, 'other-host', ?, ?, ?)
            """,
            (now, now, SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS),
        )
        await db_a.commit()

        # Instance B (different connection, same PID in test) tries to acquire.
        db_b = await aiosqlite.connect(str(db_path))
        try:
            # Should fail because a different PID (-999) holds the lock.
            result_b = await acquire_scheduler_lock(db_b)
            assert result_b is False

            # Clear the conflicting lock directly (simulating the other process dying).
            await db_a.execute("DELETE FROM scheduler_lock")
            await db_a.commit()

            # Now Instance B can acquire.
            result_b3 = await acquire_scheduler_lock(db_b)
            assert result_b3 is True
        finally:
            await db_b.close()
    finally:
        # Connections are closed even when an assertion above fails.
        await db_a.close()
@pytest.mark.asyncio
async def test_acquire_scheduler_lock_steals_stale_lock(
    lock_db: aiosqlite.Connection,
) -> None:
    """Test that a stale lock can be stolen by another instance.

    Scenario: Process A acquires the lock but crashes (never releases it).
    Process B starts up and sees the lock has stale heartbeat (past timeout).
    Process B should be able to steal the lock.

    This is the key fix for the race condition issue: orphaned locks no longer
    permanently block the scheduler.

    The stale row is re-attributed to a foreign PID/host before the second
    acquire. Same-PID re-acquire is allowed (refresh), so leaving this
    process's PID on the row would let the acquire succeed without the
    stale-lock path being exercised at all.
    """
    # Simulate Process A acquiring the lock.
    result_a = await acquire_scheduler_lock(lock_db)
    assert result_a is True

    # Read back the timeout stored with the lock row.
    info = await get_scheduler_lock_info(lock_db)
    assert info is not None
    heartbeat_timeout = info["heartbeat_timeout"]

    # Simulate a crashed holder: foreign PID/host, heartbeat past the timeout.
    now = time.time()
    stale_heartbeat = now - heartbeat_timeout - 10
    await lock_db.execute(
        "UPDATE scheduler_lock SET pid = ?, hostname = ?, heartbeat_at = ? WHERE id = 1",
        (9999, "stale-host", stale_heartbeat),
    )
    await lock_db.commit()

    # Process B should now be able to acquire (steal) the stale lock.
    result_b = await acquire_scheduler_lock(lock_db)
    assert result_b is True

    # Verify Process B now holds the lock.
    info_b = await get_scheduler_lock_info(lock_db)
    assert info_b is not None
    assert info_b["pid"] == os.getpid()
@pytest.mark.asyncio
async def test_is_lock_stale_function() -> None:
    """Exercise is_lock_stale with fresh, expired and boundary heartbeats."""
    now = time.time()
    timeout = 300.0

    # (heartbeat age in seconds, expected staleness)
    scenarios = (
        (10, False),   # fresh lock is not stale
        (400, True),   # lock past timeout is stale
        (300, False),  # exactly at timeout is not stale (boundary condition)
    )
    for age, expected in scenarios:
        assert await is_lock_stale(now - age, timeout, now) is expected
@pytest.mark.asyncio
async def test_get_lock_health_no_lock(lock_db: aiosqlite.Connection) -> None:
    """Check the health report returned when the lock table is empty."""
    report = await get_lock_health(lock_db)

    # Neither held nor stale.
    for flag in ("has_lock", "is_stale"):
        assert report[flag] is False

    # No owner and no staleness explanation.
    for field in ("pid", "stale_reason"):
        assert report[field] is None
@pytest.mark.asyncio
async def test_get_lock_health_active_lock(lock_db: aiosqlite.Connection) -> None:
    """Check the health report for a lock that was just acquired by this process."""
    await acquire_scheduler_lock(lock_db)

    report = await get_lock_health(lock_db)

    # Lock is present and not flagged as stale.
    assert report["has_lock"] is True
    assert report["is_stale"] is False

    # Owner details describe the current process.
    assert report["pid"] == os.getpid()
    assert report["hostname"] is not None
    assert report["heartbeat_timeout"] == SCHEDULER_LOCK_HEARTBEAT_TIMEOUT_SECONDS

    # A healthy lock carries no staleness explanation.
    assert report["stale_reason"] is None
@pytest.mark.asyncio
async def test_get_lock_health_stale_lock(lock_db: aiosqlite.Connection) -> None:
    """Test get_lock_health with a stale lock.

    The lock is acquired normally and its heartbeat is then rewritten to lie
    10 seconds beyond the stored timeout before the health report is read.
    """
    await acquire_scheduler_lock(lock_db)

    # Manually make the lock stale.
    now = time.time()
    info = await get_scheduler_lock_info(lock_db)
    # Guard before subscripting, as the other tests do for this call; a
    # missing row then fails with a clear assertion instead of a TypeError.
    assert info is not None
    stale_heartbeat = now - info["heartbeat_timeout"] - 10
    await lock_db.execute(
        "UPDATE scheduler_lock SET heartbeat_at = ? WHERE id = 1",
        (stale_heartbeat,),
    )
    await lock_db.commit()

    health = await get_lock_health(lock_db)
    assert health["has_lock"] is True
    assert health["is_stale"] is True

    # The explanation must mention both the heartbeat age and the timeout.
    assert health["stale_reason"] is not None
    assert "heartbeat_age" in health["stale_reason"]
    assert "timeout" in health["stale_reason"]
@pytest.mark.asyncio
async def test_heartbeat_update_error_returns_false(
    lock_db: aiosqlite.Connection,
) -> None:
    """Check that a heartbeat update reports False rather than raising.

    Covers the case where no lock has been acquired yet, then confirms the
    update reports True once the lock is held.
    """
    # No lock acquired yet: the update must report failure.
    before_acquire = await update_scheduler_lock_heartbeat(lock_db)
    assert before_acquire is False

    await acquire_scheduler_lock(lock_db)

    # With the lock held, the update must report success.
    after_acquire = await update_scheduler_lock_heartbeat(lock_db)
    assert after_acquire is True
@pytest.mark.asyncio
async def test_concurrent_acquire_from_same_process(lock_db: aiosqlite.Connection) -> None:
    """Test that repeated acquire attempts from the same process re-acquire (refresh)."""
    # First acquisition should succeed.
    result1 = await acquire_scheduler_lock(lock_db)
    assert result1 is True

    # Second acquisition from same process should succeed (re-acquire/refresh).
    result2 = await acquire_scheduler_lock(lock_db)
    assert result2 is True

    # The lock row must still be present after the refresh.
    info = await get_scheduler_lock_info(lock_db)
    assert info is not None

    # Release and re-acquire should work.
    await release_scheduler_lock(lock_db)
    result3 = await acquire_scheduler_lock(lock_db)
    assert result3 is True