Files
BanGUI/backend/tests/test_repositories/test_geo_cache_repo.py
Lukas e2560f5db0 TASK-032: Implement geo_cache retention policy and cleanup
Add automatic cleanup of stale geolocation cache entries to prevent
unbounded database growth. Resolves the issue where unique IP addresses
accumulated indefinitely in the geo_cache table, degrading query performance.

## Changes

### Database Schema (Migration 3)
- Add 'last_seen' column to geo_cache table tracking last reference time
- Existing entries default to current timestamp

### Repository Layer (geo_cache_repo.py)
- Update upsert_entry() to set/refresh last_seen on insert/update
- Update upsert_neg_entry() to set/refresh last_seen on negative cache hits
- Update bulk_upsert_entries() to set/refresh last_seen in batch operations
- Add delete_stale_entries(db, cutoff_iso) -> int for purging old entries

### Background Task (geo_cache_cleanup.py)
- New APScheduler task that runs nightly (24-hour interval)
- Calculates cutoff as 90 days ago from current time (UTC)
- Deletes all entries with last_seen older than cutoff
- Logs operation results (info when deleted > 0, debug when 0 deleted)
- Configurable retention period via GEO_CACHE_RETENTION_DAYS constant

### Application Startup (startup.py)
- Register geo_cache_cleanup task in scheduler during app startup
- Placed after geo_cache_flush in task registration order

### Tests
- Add delete_stale_entries test cases covering:
  * Removal of old entries beyond cutoff
  * No deletion when all entries are recent
  * Empty table edge case
- Update existing test fixtures to include last_seen column
- Add full test suite for cleanup task registration and execution

### Documentation
- Architekture.md: Document cleanup task, update schema/diagram
- Backend-Development.md: Add retention policy documentation

## Behavior

When an IP is accessed, its last_seen is refreshed. After 90 days of no
access, an IP is purged by the nightly cleanup. On next encounter, the IP
is re-resolved from MaxMind MMDB or ip-api.com (if configured).

This is acceptable because:
1. Stale geolocation data may become inaccurate over time
2. Re-resolution cost is minimal compared to unbounded storage growth
3. Active IPs maintain fresh data through their last_seen updates

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-26 19:24:34 +02:00

266 lines
8.8 KiB
Python

"""Tests for the geo cache repository."""
from pathlib import Path
import aiosqlite
import pytest
from app.repositories import geo_cache_repo
async def _create_geo_cache_table(db: aiosqlite.Connection) -> None:
    """Create the ``geo_cache`` table on *db* if it is missing, then commit.

    The schema carries both ``cached_at`` and ``last_seen``; each defaults to
    the current time as formatted by SQLite's ``strftime``.
    """
    schema_sql = """
CREATE TABLE IF NOT EXISTS geo_cache (
ip TEXT PRIMARY KEY,
country_code TEXT,
country_name TEXT,
asn TEXT,
org TEXT,
cached_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')),
last_seen TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now'))
)
"""
    await db.execute(schema_sql)
    await db.commit()
@pytest.mark.asyncio
async def test_get_unresolved_ips_returns_empty_when_none_exist(tmp_path: Path) -> None:
    """A row that has a country code is not reported as unresolved."""
    database_file = str(tmp_path / "geo_cache.db")
    resolved_row = ("1.1.1.1", "DE", "Germany", "AS123", "Test")

    # Seed the table through one connection ...
    async with aiosqlite.connect(database_file) as conn:
        await _create_geo_cache_table(conn)
        await conn.execute(
            "INSERT INTO geo_cache (ip, country_code, country_name, asn, org) VALUES (?, ?, ?, ?, ?)",
            resolved_row,
        )
        await conn.commit()

    # ... and query it through a second one.
    async with aiosqlite.connect(database_file) as conn:
        pending = await geo_cache_repo.get_unresolved_ips(conn)

    assert pending == []
@pytest.mark.asyncio
async def test_get_unresolved_ips_returns_pending_ips(tmp_path: Path) -> None:
    """Only IPs stored with a NULL country code are reported as unresolved."""
    database_file = str(tmp_path / "geo_cache.db")
    seeded_rows = [
        ("2.2.2.2", None),
        ("3.3.3.3", None),
        ("4.4.4.4", "US"),
    ]

    async with aiosqlite.connect(database_file) as conn:
        await _create_geo_cache_table(conn)
        await conn.executemany(
            "INSERT INTO geo_cache (ip, country_code) VALUES (?, ?)",
            seeded_rows,
        )
        await conn.commit()

    async with aiosqlite.connect(database_file) as conn:
        pending = await geo_cache_repo.get_unresolved_ips(conn)

    # Result order is not part of the expectation, so compare sorted.
    assert sorted(pending) == ["2.2.2.2", "3.3.3.3"]
@pytest.mark.asyncio
async def test_load_all_and_count_unresolved(tmp_path: Path) -> None:
    """``count_unresolved`` counts the NULL-country row; ``load_all`` returns the resolved one."""
    database_file = str(tmp_path / "geo_cache.db")
    seeded_rows = [
        ("5.5.5.5", None, None, None, None),
        ("6.6.6.6", "FR", "France", "AS456", "TestOrg"),
    ]

    async with aiosqlite.connect(database_file) as conn:
        await _create_geo_cache_table(conn)
        await conn.executemany(
            "INSERT INTO geo_cache (ip, country_code, country_name, asn, org) VALUES (?, ?, ?, ?, ?)",
            seeded_rows,
        )
        await conn.commit()

    async with aiosqlite.connect(database_file) as conn:
        loaded = await geo_cache_repo.load_all(conn)
        pending_total = await geo_cache_repo.count_unresolved(conn)

    assert pending_total == 1
    french_rows = [
        entry for entry in loaded if entry["ip"] == "6.6.6.6" and entry["country_code"] == "FR"
    ]
    assert len(french_rows) > 0
@pytest.mark.asyncio
async def test_upsert_entry_and_neg_entry(tmp_path: Path) -> None:
    """``upsert_entry`` stores the country code; ``upsert_neg_entry`` stores a NULL one."""
    database_file = str(tmp_path / "geo_cache.db")

    async with aiosqlite.connect(database_file) as conn:
        await _create_geo_cache_table(conn)

        async def country_code_row(ip):
            # Fetch the single-column row for *ip*, or None when it is absent.
            async with conn.execute("SELECT country_code FROM geo_cache WHERE ip = ?", (ip,)) as cursor:
                return await cursor.fetchone()

        await geo_cache_repo.upsert_entry(
            conn,
            "7.7.7.7",
            "GB",
            "United Kingdom",
            "AS789",
            "TestOrg",
        )
        await conn.commit()

        await geo_cache_repo.upsert_neg_entry(conn, "8.8.8.8")
        await conn.commit()

        # The positive entry must be present with its country code.
        positive = await country_code_row("7.7.7.7")
        assert positive is not None
        assert positive[0] == "GB"

        # The negative entry must exist, with a NULL country_code.
        negative = await country_code_row("8.8.8.8")
        assert negative is not None
        assert negative[0] is None
@pytest.mark.asyncio
async def test_upsert_entry_and_commit_commits_transaction(tmp_path: Path) -> None:
    """``upsert_entry_and_commit`` must persist the row without a separate commit.

    The row is read back through a fresh connection opened after the writing
    connection has been closed. A connection always sees its own uncommitted
    writes, so reading on the writing connection would pass even if the
    function never committed.
    """
    db_path = str(tmp_path / "geo_cache.db")

    async with aiosqlite.connect(db_path) as db:
        await _create_geo_cache_table(db)
        await geo_cache_repo.upsert_entry_and_commit(
            db,
            "13.13.13.13",
            "NL",
            "Netherlands",
            "AS1313",
            "TestOrg",
        )
        # Deliberately no db.commit() here: the function under test owns it.

    # Only committed data survives the close above and is visible here.
    async with (
        aiosqlite.connect(db_path) as db,
        db.execute("SELECT country_code FROM geo_cache WHERE ip = ?", ("13.13.13.13",)) as cur,
    ):
        row = await cur.fetchone()

    assert row is not None
    assert row[0] == "NL"
@pytest.mark.asyncio
async def test_bulk_upsert_entries_and_neg_entries_and_commit_commits_once(tmp_path: Path) -> None:
    """The combined bulk upsert reports both counts and persists both rows.

    The row count is read through a fresh connection opened after the writing
    connection has been closed, so it only passes if the function committed.
    Reading on the writing connection would also see uncommitted rows. This
    test checks that the data was committed; it does not count how many
    commits were issued.
    """
    db_path = str(tmp_path / "geo_cache.db")

    async with aiosqlite.connect(db_path) as db:
        await _create_geo_cache_table(db)
        rows = [
            ("14.14.14.14", "BE", "Belgium", "AS1414", "Test"),
        ]
        count, neg_count = await geo_cache_repo.bulk_upsert_entries_and_neg_entries_and_commit(
            db,
            rows,
            ["15.15.15.15"],
        )
        # Deliberately no db.commit() here: the function under test owns it.

    assert count == 1
    assert neg_count == 1

    # Only committed rows survive the close above and are counted here.
    async with aiosqlite.connect(db_path) as db, db.execute("SELECT COUNT(*) FROM geo_cache") as cur:
        row = await cur.fetchone()

    assert row is not None
    assert int(row[0]) == 2
@pytest.mark.asyncio
async def test_bulk_upsert_entries_and_neg_entries(tmp_path: Path) -> None:
    """Both bulk upserts return the number of rows given, and all four rows are stored."""
    database_file = str(tmp_path / "geo_cache.db")
    positive_rows = [
        ("9.9.9.9", "NL", "Netherlands", "AS101", "Test"),
        ("10.10.10.10", "JP", "Japan", "AS102", "Test"),
    ]
    negative_ips = ["11.11.11.11", "12.12.12.12"]

    async with aiosqlite.connect(database_file) as conn:
        await _create_geo_cache_table(conn)

        inserted = await geo_cache_repo.bulk_upsert_entries(conn, positive_rows)
        assert inserted == 2

        inserted_negative = await geo_cache_repo.bulk_upsert_neg_entries(conn, negative_ips)
        assert inserted_negative == 2

        await conn.commit()

        async with conn.execute("SELECT COUNT(*) FROM geo_cache") as cursor:
            count_row = await cursor.fetchone()

        assert count_row is not None
        assert int(count_row[0]) == 4
@pytest.mark.asyncio
async def test_delete_stale_entries_removes_old_entries(tmp_path: Path) -> None:
    """Only the entry whose ``last_seen`` predates the cutoff is deleted."""
    database_file = str(tmp_path / "geo_cache.db")
    insert_sql = "INSERT INTO geo_cache (ip, country_code, last_seen) VALUES (?, ?, ?)"
    # One entry well before the cutoff, two after it.
    seeded_rows = [
        ("1.1.1.1", "US", "2020-01-01T00:00:00Z"),
        ("2.2.2.2", "DE", "2024-12-01T00:00:00Z"),
        ("3.3.3.3", "FR", "2025-01-01T00:00:00Z"),
    ]

    async with aiosqlite.connect(database_file) as conn:
        await _create_geo_cache_table(conn)
        for seeded_row in seeded_rows:
            await conn.execute(insert_sql, seeded_row)
        await conn.commit()

    # Purge everything last seen before 2024-06-01.
    async with aiosqlite.connect(database_file) as conn:
        removed = await geo_cache_repo.delete_stale_entries(conn, "2024-06-01T00:00:00Z")
        await conn.commit()

    assert removed == 1

    # Check on a fresh connection that exactly the two newer entries survive.
    async with aiosqlite.connect(database_file) as conn:
        async with conn.execute("SELECT ip FROM geo_cache ORDER BY ip") as cursor:
            survivors = await cursor.fetchall()

    remaining_ips = sorted(record[0] for record in survivors)
    assert remaining_ips == ["2.2.2.2", "3.3.3.3"]
@pytest.mark.asyncio
async def test_delete_stale_entries_returns_zero_when_none_stale(tmp_path: Path) -> None:
    """With every ``last_seen`` newer than the cutoff, nothing is deleted."""
    database_file = str(tmp_path / "geo_cache.db")
    insert_sql = "INSERT INTO geo_cache (ip, country_code, last_seen) VALUES (?, ?, ?)"
    # Both entries are newer than the cutoff used below.
    seeded_rows = [
        ("1.1.1.1", "US", "2025-01-01T00:00:00Z"),
        ("2.2.2.2", "DE", "2025-01-02T00:00:00Z"),
    ]

    async with aiosqlite.connect(database_file) as conn:
        await _create_geo_cache_table(conn)
        for seeded_row in seeded_rows:
            await conn.execute(insert_sql, seeded_row)
        await conn.commit()

    # Cutoff of 2020-01-01 lies before every stored last_seen value.
    async with aiosqlite.connect(database_file) as conn:
        removed = await geo_cache_repo.delete_stale_entries(conn, "2020-01-01T00:00:00Z")
        await conn.commit()

    assert removed == 0

    # Both rows must still be present.
    async with aiosqlite.connect(database_file) as conn:
        async with conn.execute("SELECT COUNT(*) FROM geo_cache") as cursor:
            count_row = await cursor.fetchone()

    assert count_row is not None
    assert int(count_row[0]) == 2
@pytest.mark.asyncio
async def test_delete_stale_entries_with_empty_table(tmp_path: Path) -> None:
    """Calling ``delete_stale_entries`` on an empty table reports zero deletions."""
    database_file = str(tmp_path / "geo_cache.db")

    # Create the schema only; no rows are inserted.
    async with aiosqlite.connect(database_file) as conn:
        await _create_geo_cache_table(conn)

    async with aiosqlite.connect(database_file) as conn:
        removed = await geo_cache_repo.delete_stale_entries(conn, "2024-01-01T00:00:00Z")
        await conn.commit()

    assert removed == 0