From e2560f5db019d02a345c0149e4797f22c784258d Mon Sep 17 00:00:00 2001 From: Lukas Date: Sun, 26 Apr 2026 19:24:34 +0200 Subject: [PATCH] TASK-032: Implement geo_cache retention policy and cleanup Add automatic cleanup of stale geolocation cache entries to prevent unbounded database growth. Resolves the issue where unique IP addresses accumulated indefinitely in the geo_cache table, degrading query performance. ## Changes ### Database Schema (Migration 3) - Add 'last_seen' column to geo_cache table tracking last reference time - Existing entries default to current timestamp ### Repository Layer (geo_cache_repo.py) - Update upsert_entry() to set/refresh last_seen on insert/update - Update upsert_neg_entry() to set/refresh last_seen on negative cache hits - Update bulk_upsert_entries() to set/refresh last_seen in batch operations - Add delete_stale_entries(db, cutoff_iso) -> int for purging old entries ### Background Task (geo_cache_cleanup.py) - New APScheduler task that runs nightly (24-hour interval) - Calculates cutoff as 90 days ago from current time (UTC) - Deletes all entries with last_seen older than cutoff - Logs operation results (info when deleted > 0, debug when 0 deleted) - Configurable retention period via GEO_CACHE_RETENTION_DAYS constant ### Application Startup (startup.py) - Register geo_cache_cleanup task in scheduler during app startup - Placed after geo_cache_flush in task registration order ### Tests - Add delete_stale_entries test cases covering: * Removal of old entries beyond cutoff * No deletion when all entries are recent * Empty table edge case - Update existing test fixtures to include last_seen column - Add full test suite for cleanup task registration and execution ### Documentation - Architekture.md: Document cleanup task, update schema/diagram - Backend-Development.md: Add retention policy documentation ## Behavior When an IP is accessed, its last_seen is refreshed. After 90 days of no access, an IP is purged by the nightly cleanup. On next encounter, the IP is re-resolved from MaxMind MMDB or ip-api.com (if configured). This is acceptable because: 1. Stale geolocation data may become inaccurate over time 2. Re-resolution cost is minimal compared to unbounded storage growth 3. Active IPs maintain fresh data through their last_seen updates Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Docs/Architekture.md | 7 +- Docs/Backend-Development.md | 10 + Docs/Tasks.md | 81 -------- backend/app/db.py | 8 +- backend/app/repositories/geo_cache_repo.py | 30 ++- backend/app/startup.py | 11 +- backend/app/tasks/geo_cache_cleanup.py | 90 +++++++++ .../test_repositories/test_geo_cache_repo.py | 82 +++++++- .../test_tasks/test_geo_cache_cleanup.py | 175 ++++++++++++++++++ 9 files changed, 405 insertions(+), 89 deletions(-) create mode 100644 backend/app/tasks/geo_cache_cleanup.py create mode 100644 backend/tests/test_tasks/test_geo_cache_cleanup.py diff --git a/Docs/Architekture.md b/Docs/Architekture.md index 5b99c69..9506044 100644 --- a/Docs/Architekture.md +++ b/Docs/Architekture.md @@ -133,7 +133,8 @@ backend/ │ │ ├── geo_cache_repo.py # IP geolocation cache persistence│ │ └── import_log_repo.py # Import run history records │ ├── tasks/ # APScheduler background jobs │ │ ├── blocklist_import.py# Scheduled blocklist download and application -│ │ ├── geo_cache_flush.py # Periodic geo cache persistence (dirty-set flush to SQLite)│ │ ├── geo_re_resolve.py # Periodic re-resolution of stale geo cache records│ │ └── health_check.py # Periodic fail2ban connectivity probe +│ │ ├── geo_cache_flush.py # Periodic geo cache persistence (dirty-set flush to SQLite)│ │ ├── geo_cache_cleanup.py # Periodic purge of stale geo cache entries +│ │ ├── geo_re_resolve.py # Periodic re-resolution of stale geo cache records│ │ └── health_check.py # Periodic fail2ban connectivity probe │ └── utils/ # Helpers, constants, shared types │ ├── fail2ban_client.py # Async wrapper around the fail2ban socket protocol │ ├── fail2ban_response.py # Canonical response parsing: ok(), to_dict(), ensure_list(), is_not_found_error() @@ -241,6 +242,7 @@ APScheduler background jobs that run on a schedule without user interaction. | Task | Purpose | |---|---| | `blocklist_import.py` | Downloads all enabled blocklist sources, validates entries, applies bans, records results in the import log | +| `geo_cache_cleanup.py` | Periodically removes entries from the `geo_cache` table that have not been referenced in the configured retention period (default: 90 days). Prevents unbounded database growth. | | `geo_cache_flush.py` | Periodically flushes newly resolved IPs from the in-memory dirty set to the `geo_cache` SQLite table (default: every 60 seconds). GET requests populate only the in-memory cache; this task persists them without blocking any request. | | `geo_re_resolve.py` | Periodically re-resolves stale entries in `geo_cache` to keep geolocation data fresh | | `health_check.py` | Periodically pings the fail2ban socket and updates the cached server status so the frontend always has fresh data | @@ -649,7 +651,7 @@ BanGUI maintains its **own SQLite database** (separate from the fail2ban databas |---|---| | `settings` | Key-value store for application configuration (master password hash, fail2ban socket path, database path, timezone, session duration) | | `sessions` | Active session token hashes with expiry timestamps. Tokens are stored as one-way SHA256 hashes to prevent token hijacking if the database is exposed. | -| `geo_cache` | Resolved IP geolocation results (ip, country_code, country_name, asn, org, cached_at). Loaded into memory at startup via `load_cache_from_db()`; new entries are flushed back by the `geo_cache_flush` background task. | +| `geo_cache` | Resolved IP geolocation results (ip, country_code, country_name, asn, org, cached_at, last_seen). Tracks the last time each IP address was referenced to enable retention policies. Entries older than 90 days are automatically purged by the `geo_cache_cleanup` task to prevent unbounded growth. Loaded into memory at startup via `load_cache_from_db()`; new entries are flushed back by the `geo_cache_flush` background task. | | `blocklist_sources` | Registered blocklist URLs (id, name, url, enabled, created_at, updated_at) | | `import_logs` | Record of every blocklist import run (id, source_id, timestamp, ips_imported, ips_skipped, errors, status) | @@ -701,6 +703,7 @@ APScheduler 4.x (async mode) manages recurring background tasks. │ (async, in-process) │ ├──────────────────────┤ │ blocklist_import │ ── runs on configured schedule (default: daily 03:00) +│ geo_cache_cleanup │ ── runs every 24 hours (nightly) │ geo_cache_flush │ ── runs every 60 seconds │ health_check │ ── runs every 30 seconds └──────────────────────┘ diff --git a/Docs/Backend-Development.md b/Docs/Backend-Development.md index 4219428..1b13a9a 100644 --- a/Docs/Backend-Development.md +++ b/Docs/Backend-Development.md @@ -979,6 +979,16 @@ BANGUI_GEOIP_ALLOW_HTTP_FALLBACK="true" # Enable HTTP fallbac - Failed lookups are cached for 5 minutes to avoid hammering external APIs. - The background `geo_cache_flush` task (runs every 60 seconds) persists newly resolved entries to the database. - The background `geo_re_resolve` task (configurable schedule) periodically re-resolves stale entries to keep data fresh. +- The background `geo_cache_cleanup` task (runs nightly) removes entries not referenced in the configured retention period (default: 90 days) to prevent unbounded database growth and maintain query performance. + +**Retention & Cleanup:** + +The `geo_cache` table tracks the last time each IP was referenced via a `last_seen` timestamp. Over time, as unique IPs accumulate, the table can grow very large, degrading query performance on every geo lookup. To manage this: + +- The `geo_cache_cleanup` background task runs once per day (default: midnight UTC). +- It removes all entries where `last_seen` is older than the configured retention period (default: 90 days). +- If a purged IP is encountered again after cleanup, it will be re-resolved from the MaxMind database or ip-api.com (if configured). +- The retention period is controlled by the constant `GEO_CACHE_RETENTION_DAYS` in `backend/app/tasks/geo_cache_cleanup.py`. ### API Documentation Configuration diff --git a/Docs/Tasks.md b/Docs/Tasks.md index b964a34..4ec937d 100644 --- a/Docs/Tasks.md +++ b/Docs/Tasks.md @@ -1,84 +1,3 @@ -## TASK-030 — ip-api.com geo lookups use plain HTTP — IP addresses sent unencrypted - -**Severity:** Medium - -### Where found -`backend/app/services/geo_cache.py` lines ~41–46: -```python -_API_URL = "http://ip-api.com/json/{ip}?fields=..." -_BATCH_API_URL = "http://ip-api.com/batch?fields=..." -``` - -### Why this is needed -All banned and monitored IP addresses are transmitted to ip-api.com in cleartext over HTTP. These are potentially sensitive data (PII under GDPR/CCPA — IP addresses identify users). Any network path between the BanGUI server and ip-api.com's servers can observe or modify the traffic. Forged responses would corrupt the geo database silently. - -### Goal -Use encrypted transport for all geo API calls, or switch to a local resolver. - -### What to do -ip-api.com's free tier does not support HTTPS. The recommended approach: -1. Promote the existing `geoip_db_path` setting (MaxMind GeoLite2-Country MMDB) to the **primary** resolver. -2. Use ip-api.com as a secondary fallback only when the MMDB is unavailable or returns no result. -3. Add documentation and compose file examples for downloading and mounting the GeoLite2 MMDB. -4. If ip-api.com HTTP is retained as a fallback, add a config flag `BANGUI_GEOIP_ALLOW_HTTP_FALLBACK` (default `false`) and warn clearly at startup when enabled. - -### Possible traps and issues -- The MaxMind GeoLite2 database requires a free account and a license key to download — document the setup process. -- The GeoLite2-Country MMDB does not include ASN or organisation data — these fields will be `null` when using the local resolver. The `GeoInfo` model must handle nullable `asn` and `org`. - -### Docs changes needed -- `Features.md` — document the geo resolution mechanism and MMDB setup. -- `Architekture.md` — update the external API dependency section. -- `Backend-Development.md` — configuration for `geoip_db_path`. - -### Doc references -- [Features.md](Features.md) — geolocation -- [Architekture.md](Architekture.md) — external API dependencies - ---- - -## TASK-031 — bcrypt 72-byte truncation not enforced — long passwords silently equivalent to their prefix - -**Severity:** Medium - -### Where found -`backend/app/models/auth.py` — `LoginRequest.password: str = Field(...)` (no `max_length`). `backend/app/models/setup.py` — `SetupRequest.master_password` has `min_length=8` but no `max_length`. - -### Why this is needed -bcrypt silently truncates all input at 72 bytes before hashing. A user who sets a 100-character password can be authenticated by supplying only the first 72 characters. The extra characters provide no additional security. An attacker who has reduced the search space to 72 characters can brute-force the password more efficiently than the user intended. - -### Goal -Enforce a maximum password length of 72 bytes, or pre-hash before bcrypt to remove the limit entirely. - -### What to do -**Option A (simple):** -1. Add `max_length=72` to `SetupRequest.master_password` and `LoginRequest.password`. -2. Update the setup wizard UI to reflect the 72-character maximum. - -**Option B (removes the 72-byte limit entirely):** -1. Pre-hash the password with HMAC-SHA256 using the `session_secret` as the key before passing to bcrypt: - ```python - pre_hashed = hmac.new(secret.encode(), password.encode(), hashlib.sha256).digest() - bcrypt.hashpw(pre_hashed, bcrypt.gensalt()) - ``` -2. Apply consistently in both `run_setup()` and `_check_password()`. - -Option A is recommended as the simpler, lower-risk fix. Option B is architecturally cleaner but requires a stored hash migration. - -### Possible traps and issues -- Option A: Users who already have passwords longer than 72 characters will need to reset. For a single-admin app this is acceptable. -- Option B: If the `session_secret` changes, all stored password hashes become invalid (since the pre-hash key changes). This is a hidden coupling — document it explicitly. - -### Docs changes needed -- `Features.md` — document the password length constraint. -- `Backend-Development.md` — bcrypt usage notes. - -### Doc references -- [Features.md](Features.md) — authentication and setup -- [Backend-Development.md](Backend-Development.md) — password hashing - ---- - ## TASK-032 — `geo_cache` table grows unboundedly — no eviction or purge **Severity:** Medium diff --git a/backend/app/db.py b/backend/app/db.py index f5903b1..a53ac96 100644 --- a/backend/app/db.py +++ b/backend/app/db.py @@ -107,7 +107,7 @@ _SCHEMA_STATEMENTS: list[str] = [ _CREATE_HISTORY_ARCHIVE, ] -_CURRENT_SCHEMA_VERSION: int = 2 +_CURRENT_SCHEMA_VERSION: int = 3 _MIGRATIONS: dict[int, str] = { 1: "\n".join(_SCHEMA_STATEMENTS), @@ -124,6 +124,12 @@ CREATE TABLE sessions ( expires_at TEXT NOT NULL ); CREATE UNIQUE INDEX idx_sessions_token_hash ON sessions (token_hash); +""", + 3: """ +-- Migration 3: Add last_seen timestamp to geo_cache for retention policy. +-- Tracks when each IP was last referenced to enable purging of stale entries. +-- Default to current timestamp for existing rows. +ALTER TABLE geo_cache ADD COLUMN last_seen TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')); """, } diff --git a/backend/app/repositories/geo_cache_repo.py b/backend/app/repositories/geo_cache_repo.py index 3165a1a..62a212a 100644 --- a/backend/app/repositories/geo_cache_repo.py +++ b/backend/app/repositories/geo_cache_repo.py @@ -98,7 +98,8 @@ async def upsert_entry( country_name = excluded.country_name, asn = excluded.asn, org = excluded.org, - cached_at = strftime('%Y-%m-%dT%H:%M:%fZ', 'now') + cached_at = strftime('%Y-%m-%dT%H:%M:%fZ', 'now'), + last_seen = strftime('%Y-%m-%dT%H:%M:%fZ', 'now') """, (ip, country_code, country_name, asn, org), ) @@ -120,7 +121,11 @@ async def upsert_entry_and_commit( async def upsert_neg_entry(db: aiosqlite.Connection, ip: str) -> None: """Record a failed lookup attempt as a negative entry.""" await db.execute( - "INSERT OR IGNORE INTO geo_cache (ip) VALUES (?)", + """ + INSERT INTO geo_cache (ip) VALUES (?) + ON CONFLICT(ip) DO UPDATE SET + last_seen = strftime('%Y-%m-%dT%H:%M:%fZ', 'now') + """, (ip,), ) @@ -148,7 +153,8 @@ async def bulk_upsert_entries( country_name = excluded.country_name, asn = excluded.asn, org = excluded.org, - cached_at = strftime('%Y-%m-%dT%H:%M:%fZ', 'now') + cached_at = strftime('%Y-%m-%dT%H:%M:%fZ', 'now'), + last_seen = strftime('%Y-%m-%dT%H:%M:%fZ', 'now') """, rows, ) @@ -202,3 +208,21 @@ async def bulk_upsert_entries_and_neg_entries_and_commit( await db.commit() return positive_count, negative_count + + +async def delete_stale_entries(db: aiosqlite.Connection, cutoff_iso: str) -> int: + """Delete geo cache entries not referenced since the cutoff timestamp. + + Args: + db: Open BanGUI application database connection. + cutoff_iso: ISO 8601 timestamp (e.g., '2024-01-01T00:00:00Z'). Entries with + ``last_seen`` before this time will be deleted. + + Returns: + The number of rows deleted. + """ + async with db.execute( + "DELETE FROM geo_cache WHERE last_seen < ?", + (cutoff_iso,), + ) as cur: + return cur.rowcount if cur.rowcount is not None else 0 diff --git a/backend/app/startup.py b/backend/app/startup.py index 2d774f2..0bcad58 100644 --- a/backend/app/startup.py +++ b/backend/app/startup.py @@ -19,7 +19,15 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler # type: ignore[impo from app.db import init_db, open_db from app.services import setup_service from app.services.geo_cache import GeoCache -from app.tasks import blocklist_import, geo_cache_flush, geo_re_resolve, health_check, history_sync, session_cleanup +from app.tasks import ( + blocklist_import, + geo_cache_cleanup, + geo_cache_flush, + geo_re_resolve, + health_check, + history_sync, + session_cleanup, +) from app.utils.async_utils import run_blocking from app.utils.jail_config import ensure_jail_configs from app.utils.runtime_state import set_runtime_settings @@ -185,6 +193,7 @@ async def startup_shared_resources( health_check.register(app) await blocklist_import.register(app) + geo_cache_cleanup.register(app) geo_cache_flush.register(app) geo_re_resolve.register(app) history_sync.register(app) diff --git a/backend/app/tasks/geo_cache_cleanup.py b/backend/app/tasks/geo_cache_cleanup.py new file mode 100644 index 0000000..ecb7b01 --- /dev/null +++ b/backend/app/tasks/geo_cache_cleanup.py @@ -0,0 +1,90 @@ +"""Geo cache cleanup background task. + +Registers an APScheduler job that periodically removes stale entries from the +``geo_cache`` table — entries that have not been referenced in the configured +retention period (default: 90 days). This prevents unbounded growth of the +database file and maintains query performance on geo lookups. + +When a stale IP is encountered again after purge, it will be re-resolved from +the MaxMind database or ip-api.com (if configured), which is acceptable. +""" + +from __future__ import annotations + +from datetime import UTC, datetime, timedelta +from typing import TYPE_CHECKING + +import structlog + +from app.repositories import geo_cache_repo +from app.tasks.db import task_db +from app.utils.runtime_state import get_effective_settings + +if TYPE_CHECKING: + from fastapi import FastAPI + + from app.config import Settings + +log: structlog.stdlib.BoundLogger = structlog.get_logger() + +#: How long to retain geo cache entries (days). Configurable tuning constant. +GEO_CACHE_RETENTION_DAYS: int = 90 + +#: How often the cleanup job fires (seconds). Default: once per day. +GEO_CLEANUP_INTERVAL: int = 24 * 60 * 60 + +#: Stable APScheduler job ID — ensures re-registration replaces, not duplicates. +JOB_ID: str = "geo_cache_cleanup" + + +async def _run_cleanup_with_resources(settings: Settings) -> None: + """Delete stale entries from the geo cache. + + Calculates a cutoff timestamp (now - retention period) and removes all + entries with ``last_seen`` before that time. Logs the operation result. + + Args: + settings: The resolved application settings used for database access. + """ + cutoff_dt = datetime.now(UTC) - timedelta(days=GEO_CACHE_RETENTION_DAYS) + cutoff_iso = cutoff_dt.strftime("%Y-%m-%dT%H:%M:%SZ") + + async with task_db(settings) as db: + deleted = await geo_cache_repo.delete_stale_entries(db, cutoff_iso) + await db.commit() + + if deleted > 0: + log.info("geo_cache_cleanup_ran", deleted=deleted, retention_days=GEO_CACHE_RETENTION_DAYS) + else: + log.debug("geo_cache_cleanup_ran", deleted=deleted, retention_days=GEO_CACHE_RETENTION_DAYS) + + +async def _run_cleanup(app: FastAPI) -> None: + """Run cleanup with application settings.""" + await _run_cleanup_with_resources(get_effective_settings(app)) + + +def register(app: FastAPI) -> None: + """Add (or replace) the geo cache cleanup job in the application scheduler. + + Must be called after the scheduler has been started (i.e., inside the + lifespan handler, after ``scheduler.start()``). + + Args: + app: The :class:`fastapi.FastAPI` application instance whose + ``app.state.scheduler`` will receive the job. + """ + settings = get_effective_settings(app) + app.state.scheduler.add_job( + _run_cleanup_with_resources, + trigger="interval", + seconds=GEO_CLEANUP_INTERVAL, + kwargs={"settings": settings}, + id=JOB_ID, + replace_existing=True, + ) + log.info( + "geo_cache_cleanup_scheduled", + interval_seconds=GEO_CLEANUP_INTERVAL, + retention_days=GEO_CACHE_RETENTION_DAYS, + ) diff --git a/backend/tests/test_repositories/test_geo_cache_repo.py b/backend/tests/test_repositories/test_geo_cache_repo.py index 5dea813..3e2fc21 100644 --- a/backend/tests/test_repositories/test_geo_cache_repo.py +++ b/backend/tests/test_repositories/test_geo_cache_repo.py @@ -17,7 +17,8 @@ async def _create_geo_cache_table(db: aiosqlite.Connection) -> None: country_name TEXT, asn TEXT, org TEXT, - cached_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')) + cached_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')), + last_seen TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')) ) """ ) @@ -183,3 +184,82 @@ async def test_bulk_upsert_entries_and_neg_entries(tmp_path: Path) -> None: row = await cur.fetchone() assert row is not None assert int(row[0]) == 4 + + +@pytest.mark.asyncio +async def test_delete_stale_entries_removes_old_entries(tmp_path: Path) -> None: + db_path = str(tmp_path / "geo_cache.db") + async with aiosqlite.connect(db_path) as db: + await _create_geo_cache_table(db) + + # Insert entries with various last_seen times + await db.execute( + "INSERT INTO geo_cache (ip, country_code, last_seen) VALUES (?, ?, ?)", + ("1.1.1.1", "US", "2020-01-01T00:00:00Z"), + ) + await db.execute( + "INSERT INTO geo_cache (ip, country_code, last_seen) VALUES (?, ?, ?)", + ("2.2.2.2", "DE", "2024-12-01T00:00:00Z"), + ) + await db.execute( + "INSERT INTO geo_cache (ip, country_code, last_seen) VALUES (?, ?, ?)", + ("3.3.3.3", "FR", "2025-01-01T00:00:00Z"), + ) + await db.commit() + + async with aiosqlite.connect(db_path) as db: + # Delete entries older than 2024-06-01 + deleted = await geo_cache_repo.delete_stale_entries(db, "2024-06-01T00:00:00Z") + await db.commit() + + assert deleted == 1 + + # Verify the correct entry was deleted + async with aiosqlite.connect(db_path) as db, db.execute("SELECT ip FROM geo_cache ORDER BY ip") as cur: + rows = await cur.fetchall() + ips = [row[0] for row in rows] + assert sorted(ips) == ["2.2.2.2", "3.3.3.3"] + + +@pytest.mark.asyncio +async def test_delete_stale_entries_returns_zero_when_none_stale(tmp_path: Path) -> None: + db_path = str(tmp_path / "geo_cache.db") + async with aiosqlite.connect(db_path) as db: + await _create_geo_cache_table(db) + + # Insert entries with recent last_seen times + await db.execute( + "INSERT INTO geo_cache (ip, country_code, last_seen) VALUES (?, ?, ?)", + ("1.1.1.1", "US", "2025-01-01T00:00:00Z"), + ) + await db.execute( + "INSERT INTO geo_cache (ip, country_code, last_seen) VALUES (?, ?, ?)", + ("2.2.2.2", "DE", "2025-01-02T00:00:00Z"), + ) + await db.commit() + + async with aiosqlite.connect(db_path) as db: + # Try to delete entries older than 2020-01-01 (all are newer) + deleted = await geo_cache_repo.delete_stale_entries(db, "2020-01-01T00:00:00Z") + await db.commit() + + assert deleted == 0 + + # Verify no entries were deleted + async with aiosqlite.connect(db_path) as db, db.execute("SELECT COUNT(*) FROM geo_cache") as cur: + row = await cur.fetchone() + assert row is not None + assert int(row[0]) == 2 + + +@pytest.mark.asyncio +async def test_delete_stale_entries_with_empty_table(tmp_path: Path) -> None: + db_path = str(tmp_path / "geo_cache.db") + async with aiosqlite.connect(db_path) as db: + await _create_geo_cache_table(db) + + async with aiosqlite.connect(db_path) as db: + deleted = await geo_cache_repo.delete_stale_entries(db, "2024-01-01T00:00:00Z") + await db.commit() + + assert deleted == 0 diff --git a/backend/tests/test_tasks/test_geo_cache_cleanup.py b/backend/tests/test_tasks/test_geo_cache_cleanup.py new file mode 100644 index 0000000..46cd055 --- /dev/null +++ b/backend/tests/test_tasks/test_geo_cache_cleanup.py @@ -0,0 +1,175 @@ +"""Tests for the geo cache cleanup background task. + +Validates that :func:`~app.tasks.geo_cache_cleanup._run_cleanup_with_resources` correctly +calculates the cutoff timestamp, calls :func:`~app.repositories.geo_cache_repo.delete_stale_entries`, +and logs appropriately based on the number of deleted entries, and that +:func:`~app.tasks.geo_cache_cleanup.register` configures the APScheduler job with the correct +interval and stable job ID. +""" + +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from app.tasks.geo_cache_cleanup import GEO_CLEANUP_INTERVAL, JOB_ID, register + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_app() -> MagicMock: + """Build a minimal mock ``app`` for geo cache cleanup task tests. + + Returns: + A :class:`unittest.mock.MagicMock` that mimics ``fastapi.FastAPI``. + """ + app = MagicMock() + app.state.scheduler = MagicMock() + app.state.settings = MagicMock(database_path="/tmp/fake.db") + app.state.runtime_settings = None + return app + + +# --------------------------------------------------------------------------- +# Tests for _run_cleanup +# --------------------------------------------------------------------------- + + +class TestRunCleanup: + """Tests for :func:`~app.tasks.geo_cache_cleanup._run_cleanup_with_resources`.""" + + @pytest.mark.asyncio + async def test_run_cleanup_calls_delete_stale_entries(self) -> None: + """``_run_cleanup_with_resources`` must call ``delete_stale_entries`` with a cutoff date.""" + settings = MagicMock(database_path="/tmp/fake.db") + + with patch( + "app.tasks.db.task_db", + MagicMock( + return_value=AsyncMock( + __aenter__=AsyncMock(return_value=MagicMock()), + __aexit__=AsyncMock(return_value=False), + ) + ), + ), patch( + "app.tasks.geo_cache_cleanup.geo_cache_repo.delete_stale_entries", + new_callable=AsyncMock, + return_value=0, + ) as mock_delete: + from app.tasks.geo_cache_cleanup import _run_cleanup_with_resources + + await _run_cleanup_with_resources(settings) + + mock_delete.assert_awaited_once() + # Verify the cutoff timestamp is in ISO format + call_args = mock_delete.call_args + assert call_args is not None + cutoff_iso = call_args[0][1] # Second positional arg + assert isinstance(cutoff_iso, str) + assert "T" in cutoff_iso and "Z" in cutoff_iso + + @pytest.mark.asyncio + async def test_run_cleanup_logs_when_entries_deleted(self) -> None: + """``_run_cleanup_with_resources`` must emit an info log when entries are deleted.""" + settings = MagicMock(database_path="/tmp/fake.db") + + with patch( + "app.tasks.db.task_db", + MagicMock( + return_value=AsyncMock( + __aenter__=AsyncMock(return_value=MagicMock()), + __aexit__=AsyncMock(return_value=False), + ) + ), + ), patch( + "app.tasks.geo_cache_cleanup.geo_cache_repo.delete_stale_entries", + new_callable=AsyncMock, + return_value=42, + ), patch( + "app.tasks.geo_cache_cleanup.log" + ) as mock_log: + from app.tasks.geo_cache_cleanup import _run_cleanup_with_resources + + await _run_cleanup_with_resources(settings) + + info_calls = [c for c in mock_log.info.call_args_list if c[0][0] == "geo_cache_cleanup_ran"] + assert len(info_calls) == 1 + assert info_calls[0][1]["deleted"] == 42 + + @pytest.mark.asyncio + async def test_run_cleanup_logs_debug_when_nothing_deleted(self) -> None: + """``_run_cleanup_with_resources`` must emit a debug log when 0 entries are deleted.""" + settings = MagicMock(database_path="/tmp/fake.db") + + with patch( + "app.tasks.db.task_db", + MagicMock( + return_value=AsyncMock( + __aenter__=AsyncMock(return_value=MagicMock()), + __aexit__=AsyncMock(return_value=False), + ) + ), + ), patch( + "app.tasks.geo_cache_cleanup.geo_cache_repo.delete_stale_entries", + new_callable=AsyncMock, + return_value=0, + ), patch( + "app.tasks.geo_cache_cleanup.log" + ) as mock_log: + from app.tasks.geo_cache_cleanup import _run_cleanup_with_resources + + await _run_cleanup_with_resources(settings) + + debug_calls = [c for c in mock_log.debug.call_args_list if c[0][0] == "geo_cache_cleanup_ran"] + assert len(debug_calls) == 1 + assert debug_calls[0][1]["deleted"] == 0 + + +# --------------------------------------------------------------------------- +# Tests for register +# --------------------------------------------------------------------------- + + +class TestRegister: + """Tests for :func:`~app.tasks.geo_cache_cleanup.register`.""" + + def test_register_adds_interval_job_to_scheduler(self) -> None: + """``register`` must add a job with an ``"interval"`` trigger.""" + app = _make_app() + + register(app) + + app.state.scheduler.add_job.assert_called_once() + _, kwargs = app.state.scheduler.add_job.call_args + assert kwargs["trigger"] == "interval" + assert kwargs["seconds"] == GEO_CLEANUP_INTERVAL + + def test_register_uses_stable_job_id(self) -> None: + """``register`` must use the module-level ``JOB_ID`` constant.""" + app = _make_app() + + register(app) + + _, kwargs = app.state.scheduler.add_job.call_args + assert kwargs["id"] == JOB_ID + + def test_register_sets_replace_existing(self) -> None: + """``register`` must use ``replace_existing=True`` to avoid duplicate jobs.""" + app = _make_app() + + register(app) + + _, kwargs = app.state.scheduler.add_job.call_args + assert kwargs["replace_existing"] is True + + def test_register_passes_settings_in_kwargs(self) -> None: + """The scheduled job must receive settings as kwargs instead of app.""" + app = _make_app() + + register(app) + + _, kwargs = app.state.scheduler.add_job.call_args + assert "settings" in kwargs["kwargs"]