diff --git a/Docs/DATABASE_MIGRATIONS.md b/Docs/DATABASE_MIGRATIONS.md index 7f5a400..34f0737 100644 --- a/Docs/DATABASE_MIGRATIONS.md +++ b/Docs/DATABASE_MIGRATIONS.md @@ -121,6 +121,7 @@ rm bangui.db bangui.db-wal bangui.db-shm | 6 | Add import_runs table for idempotent imports | | 7 | Add indexes to import_log | | 8 | Migrate import_log.timestamp TEXT→INTEGER UNIX | +| 9 | Change import_log.source_id FK to ON DELETE RESTRICT | ## Adding New Migrations diff --git a/Docs/Features.md b/Docs/Features.md index ce9d9df..20040f8 100644 --- a/Docs/Features.md +++ b/Docs/Features.md @@ -358,6 +358,13 @@ Automated downloading and applying of external IP blocklists to block known mali - Display the import log in the web interface, filterable by source and date range. - Show a warning badge in the navigation if the most recent import encountered errors. +### Data Retention & Deletion + +- Import logs are retained for audit and troubleshooting purposes. +- A blocklist source **cannot be deleted** while it has associated import logs (foreign key RESTRICT constraint). +- Before deleting a source, delete all its import logs first via the API. +- Attempting to delete a source with logs returns **HTTP 409 Conflict** with error code `blocklist_source_has_logs`. + ### Error Handling - If a blocklist URL is unreachable, log the error and continue with remaining sources. diff --git a/Docs/Tasks.md b/Docs/Tasks.md index c970645..97aa80e 100644 --- a/Docs/Tasks.md +++ b/Docs/Tasks.md @@ -1,99 +1,3 @@ -### Issue #10: HIGH - Database Type Inconsistency (Timestamps Mixed Across Tables) - -**Where found**: -- `backend/app/db.py` (lines 68-75) -- `import_log` table uses TEXT ISO 8601 format -- `history_archive` table uses INTEGER UNIX timestamp -- Frontend receives both formats - -**Why this is needed**: -Frontend parsing code must handle multiple formats: -- Parser might parse one format incorrectly -- Inconsistent type representation makes bugs harder to track -- Aggregation queries mixing both formats require conversions - -**Goal**: -Standardize all timestamps on UNIX timestamps (INTEGER seconds since epoch) throughout entire database. - -**What to do**: -1. Migrate `import_log.timestamp` from TEXT to INTEGER: - ```sql - ALTER TABLE import_log ADD COLUMN timestamp_unix INTEGER; - UPDATE import_log SET timestamp_unix = strftime('%s', timestamp); - ALTER TABLE import_log DROP COLUMN timestamp; - ALTER TABLE import_log RENAME COLUMN timestamp_unix TO timestamp; - ``` -2. Update all code to use UNIX timestamps -3. Add validation in repositories that timestamps are UNIX format -4. Update frontend parsing to handle UNIX timestamps -5. Write migration tests with various date formats - -**Possible traps and issues**: -- Existing data with invalid timestamps causes migration to fail -- Timezone issues if code assumes local time -- Backward compatibility breaks for old APIs -- Performance impact of migration on large tables - -**Docs changes needed**: -- Document timestamp format requirement in API docs -- Add to backend development guide -- Create migration runbook - -**Doc references**: -- DATABASE_API_DEPLOYMENT_ISSUES.md - Issue "1.2 Data Type Inconsistency" - ---- - -### Issue #11: HIGH - Foreign Key ON DELETE Semantics Problem (Data Loss) - -**Where found**: -- `backend/app/db.py` (lines 61-70) -- `import_log.source_id` uses `ON DELETE SET NULL` -- `import_log.source_url` remains populated -- Orphaned log records with NULL source_id but populated URL - -**Why this is needed**: -When a blocklist source is deleted: -- Import logs become orphaned and meaningless -- UI can't link logs back to source -- Data becomes inconsistent - -**Goal**: -Fix foreign key cascade strategy to maintain data integrity. - -**What to do**: -1. Decide cascade strategy: - - Option A: `ON DELETE CASCADE` - delete all logs when source deleted (data loss) - - Option B: `ON DELETE RESTRICT` - prevent source deletion if logs exist (prevent data loss) - - Option C: Keep source_id for history without URL (reference counting) -2. Recommendation: Use Option B with proper deletion workflow: - ```sql - ALTER TABLE import_log - DROP CONSTRAINT fk_source_id, - ADD CONSTRAINT fk_source_id - FOREIGN KEY(source_id) REFERENCES blocklist_sources(id) - ON DELETE RESTRICT; - ``` -3. If deleting source, first archive and delete old logs -4. Update deletion API to handle RESTRICT error -5. Document deletion procedures - -**Possible traps and issues**: -- Existing schema already has ON DELETE SET NULL - migration needed -- User tries to delete source with logs - must handle RESTRICT error -- Cascading deletes can cause unexpected data loss -- Historical logs might be valuable for audit - -**Docs changes needed**: -- Add data retention policy to `Docs/Features.md` -- Document deletion constraints in API docs -- Add to admin guide - -**Doc references**: -- DATABASE_API_DEPLOYMENT_ISSUES.md - Issue "1.3 Foreign Key ON DELETE" - ---- - ### Issue #12: HIGH - Race Condition in Concurrent Writes (Import Runs Duplication) **Where found**: diff --git a/backend/app/db.py b/backend/app/db.py index 1e15bf5..55d295d 100644 --- a/backend/app/db.py +++ b/backend/app/db.py @@ -59,9 +59,9 @@ CREATE TABLE IF NOT EXISTS blocklist_sources ( _CREATE_IMPORT_LOG: str = """ CREATE TABLE IF NOT EXISTS import_log ( id INTEGER PRIMARY KEY AUTOINCREMENT, - source_id INTEGER REFERENCES blocklist_sources(id) ON DELETE SET NULL, + source_id INTEGER REFERENCES blocklist_sources(id) ON DELETE RESTRICT, source_url TEXT NOT NULL, - timestamp TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')), + timestamp INTEGER NOT NULL, ips_imported INTEGER NOT NULL DEFAULT 0, ips_skipped INTEGER NOT NULL DEFAULT 0, errors TEXT @@ -111,7 +111,7 @@ _SCHEMA_STATEMENTS: list[str] = [ _CREATE_HISTORY_ARCHIVE, ] -_CURRENT_SCHEMA_VERSION: int = 8 +_CURRENT_SCHEMA_VERSION: int = 9 _MIGRATIONS: dict[int, str] = { 1: "\n".join(_SCHEMA_STATEMENTS), @@ -216,6 +216,31 @@ ALTER TABLE import_log ADD COLUMN timestamp_unix INTEGER; UPDATE import_log SET timestamp_unix = strftime('%s', timestamp); ALTER TABLE import_log DROP COLUMN timestamp; ALTER TABLE import_log RENAME COLUMN timestamp_unix TO timestamp; +""", + 9: """ +-- Migration 9: Change import_log.source_id foreign key to ON DELETE RESTRICT. +-- Previously, deleting a blocklist source set source_id to NULL, leaving orphaned +-- log records with populated URL but NULL source_id (meaningless/useless data). +-- Now, RESTRICT prevents source deletion if import logs exist, preserving data +-- integrity. Admin must delete logs before deleting source. +-- See Issue #11: Foreign Key ON DELETE Semantics Problem. +DROP INDEX IF EXISTS idx_import_log_source_id_desc; +DROP TABLE IF EXISTS _import_log_backup; +CREATE TABLE _import_log_backup ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_id INTEGER REFERENCES blocklist_sources(id) ON DELETE RESTRICT, + source_url TEXT NOT NULL, + timestamp INTEGER NOT NULL, + ips_imported INTEGER NOT NULL DEFAULT 0, + ips_skipped INTEGER NOT NULL DEFAULT 0, + errors TEXT +); +INSERT INTO _import_log_backup (id, source_id, source_url, timestamp, ips_imported, ips_skipped, errors) +SELECT id, source_id, source_url, timestamp, ips_imported, ips_skipped, errors FROM import_log; +DROP TABLE import_log; +ALTER TABLE _import_log_backup RENAME TO import_log; +CREATE INDEX IF NOT EXISTS idx_import_log_source_id_desc + ON import_log (source_id, id DESC); """, } diff --git a/backend/app/exceptions.py b/backend/app/exceptions.py index 79bd8e2..e0909be 100644 --- a/backend/app/exceptions.py +++ b/backend/app/exceptions.py @@ -482,6 +482,22 @@ class BlocklistSourceNotFoundError(NotFoundError): return {"source_id": self.source_id} +class BlocklistSourceHasLogsError(ConflictError): + """Raised when attempting to delete a blocklist source that has import logs.""" + + error_code: str = "blocklist_source_has_logs" + + def __init__(self, source_id: int) -> None: + self.source_id = source_id + super().__init__( + f"Blocklist source {source_id} cannot be deleted because it has import logs. " + "Delete the import logs first." + ) + + def get_error_metadata(self) -> dict[str, str | int | float | bool | None]: + return {"source_id": self.source_id} + + class HistoryNotFoundError(NotFoundError): """Raised when no history is found for the given IP.""" diff --git a/backend/app/repositories/import_run_repo.py b/backend/app/repositories/import_run_repo.py index 6e0a4df..2859c3e 100644 --- a/backend/app/repositories/import_run_repo.py +++ b/backend/app/repositories/import_run_repo.py @@ -61,16 +61,17 @@ async def get_by_source_and_hash( ) -async def create_pending( +async def upsert_pending( db: aiosqlite.Connection, source_id: int, content_hash: str, ) -> int: - """Create a pending import run entry. + """Atomically insert or reset a pending import run entry. - Wraps the insert in an explicit transaction to ensure atomicity and enable - proper error handling if a UNIQUE(source_id, content_hash) constraint - violation occurs due to concurrent requests. + Uses ``INSERT ... ON CONFLICT`` to make the operation fully atomic — + no window between check and insert where a concurrent request can create + a duplicate row. If a row for ``(source_id, content_hash)`` already exists, + its status is reset to ``pending`` and its ID is returned. Args: db: Active aiosqlite connection. @@ -78,27 +79,21 @@ async def create_pending( content_hash: SHA256 hash of the downloaded blocklist content. Returns: - Primary key of the inserted row. - - Raises: - aiosqlite.IntegrityError: If a row with this (source_id, content_hash) - already exists (constraint violation). The caller should catch this - and retry the lookup to get the existing run's ID. + Primary key of the inserted or updated row. """ - try: - await db.execute("BEGIN IMMEDIATE") - cursor = await db.execute( - """ - INSERT INTO import_runs (source_id, content_hash, status) - VALUES (?, ?, 'pending') - """, - (source_id, content_hash), - ) - await db.commit() - return int(cursor.lastrowid) # type: ignore[arg-type] - except Exception: - await db.rollback() - raise + cursor = await db.execute( + """ + INSERT INTO import_runs (source_id, content_hash, status) + VALUES (?, ?, 'pending') + ON CONFLICT(source_id, content_hash) DO UPDATE SET + status = 'pending', + updated_at = strftime('%Y-%m-%dT%H:%M:%fZ', 'now') + RETURNING id; + """, + (source_id, content_hash), + ) + row = await cursor.fetchone() + return int(row[0]) # type: ignore[arg-type] async def mark_completed( diff --git a/backend/app/repositories/protocols.py b/backend/app/repositories/protocols.py index f6eb1e3..7462f43 100644 --- a/backend/app/repositories/protocols.py +++ b/backend/app/repositories/protocols.py @@ -154,13 +154,13 @@ class ImportRunRepository(Protocol): """Check if a specific import (by source and content hash) has been completed.""" ... - async def create_pending( + async def upsert_pending( self, db: aiosqlite.Connection, source_id: int, content_hash: str, ) -> int: - """Create a pending import run entry. Returns the id.""" + """Atomically insert or reset a pending import run entry. Returns the id.""" ... async def mark_completed( diff --git a/backend/app/services/blocklist_import_workflow.py b/backend/app/services/blocklist_import_workflow.py index 00709d2..ae6dd43 100644 --- a/backend/app/services/blocklist_import_workflow.py +++ b/backend/app/services/blocklist_import_workflow.py @@ -190,41 +190,19 @@ class BlocklistImportWorkflow: # --- Create or update pending import run entry --- if existing_run is None: - try: - run_id = await import_run_repo.create_pending( - db, - source.id, - content_hash, - ) - log.info( - "blocklist_import_tracking_created", - source_id=source.id, - run_id=run_id, - content_hash=content_hash[:8], - ) - except aiosqlite.IntegrityError as e: - # Race condition: another request created the same import between - # our check and this insert. Fetch the existing run and use its ID. - existing_run = await import_run_repo.get_by_source_and_hash( - db, - source.id, - content_hash, - ) - if existing_run is None: - # Unexpected: the constraint error indicates a row exists, but - # we can't find it. This should not happen in normal operation. - raise RuntimeError( - f"Integrity error indicates import exists, " - f"but lookup failed for source_id={source.id}, " - f"content_hash={content_hash[:8]}" - ) from e - run_id = existing_run.id - log.info( - "blocklist_import_lost_race", - source_id=source.id, - run_id=run_id, - content_hash=content_hash[:8], - ) + run_id = await import_run_repo.upsert_pending( + db, + source.id, + content_hash, + ) + # Commit the implicit transaction opened by RETURNING. + await db.commit() + log.info( + "blocklist_import_tracking_created", + source_id=source.id, + run_id=run_id, + content_hash=content_hash[:8], + ) else: # Retry case: existing run is pending or failed, try again run_id = existing_run.id diff --git a/backend/app/services/blocklist_service.py b/backend/app/services/blocklist_service.py index eaa8dea..f36a732 100644 --- a/backend/app/services/blocklist_service.py +++ b/backend/app/services/blocklist_service.py @@ -18,8 +18,10 @@ import json from typing import TYPE_CHECKING import aiohttp +import aiosqlite import structlog +from app.exceptions import BlocklistSourceHasLogsError from app.models.blocklist import ( BlocklistSource, ImportLogEntry, @@ -40,7 +42,6 @@ from app.utils.pagination import create_pagination_metadata if TYPE_CHECKING: from collections.abc import Awaitable, Callable - import aiosqlite from apscheduler.schedulers.asyncio import AsyncIOScheduler from app.config import Settings @@ -196,8 +197,17 @@ async def delete_source(db: aiosqlite.Connection, source_id: int) -> bool: Returns: ``True`` if the source was found and deleted, ``False`` otherwise. + + Raises: + BlocklistSourceHasLogsError: If the source has associated import logs + and cannot be deleted due to RESTRICT foreign key constraint. """ - deleted = await blocklist_repo.delete_source(db, source_id) + try: + deleted = await blocklist_repo.delete_source(db, source_id) + except aiosqlite.IntegrityError as e: + if "FOREIGN KEY constraint failed" in str(e): + raise BlocklistSourceHasLogsError(source_id) from e + raise if deleted: log.info("blocklist_source_deleted", id=source_id) return deleted diff --git a/backend/tests/test_services/test_blocklist_components.py b/backend/tests/test_services/test_blocklist_components.py index 65ba302..a86c485 100644 --- a/backend/tests/test_services/test_blocklist_components.py +++ b/backend/tests/test_services/test_blocklist_components.py @@ -259,6 +259,8 @@ class TestBlocklistImportWorkflow: @pytest.mark.asyncio async def test_import_source_success(self) -> None: """Test successful import workflow.""" + from app.repositories import import_run_repo + http_session = MagicMock(spec=aiohttp.ClientSession) response = AsyncMock() response.status = 200 @@ -280,7 +282,27 @@ class TestBlocklistImportWorkflow: ) db = MagicMock() - result = await workflow.import_source(source, "/var/run/fail2ban/fail2ban.sock", db) + db.commit = AsyncMock() + + # Mock get_by_source_and_hash to return None (no existing run) + mock_existing_cursor = MagicMock() + mock_existing_cursor.fetchone = AsyncMock(return_value=None) + + async def mock_existing_aenter(self): + return mock_existing_cursor + + async def mock_existing_aexit(self, *args): + return None + + mock_existing_cursor.__aenter__ = mock_existing_aenter + mock_existing_cursor.__aexit__ = mock_existing_aexit + + # Patch upsert_pending to return a run_id without touching real DB + with patch.object(import_run_repo, "upsert_pending", new=AsyncMock(return_value=42)): + with patch.object(import_run_repo, "mark_completed", new=AsyncMock()): + # Mock get_by_source_and_hash to return None (no existing run) + with patch.object(import_run_repo, "get_by_source_and_hash", new=AsyncMock(return_value=None)): + result = await workflow.import_source(source, "/var/run/fail2ban/fail2ban.sock", db) assert result.source_id == 1 assert result.ips_imported == 2 diff --git a/backend/tests/test_services/test_blocklist_service.py b/backend/tests/test_services/test_blocklist_service.py index c71908c..b1e7c5b 100644 --- a/backend/tests/test_services/test_blocklist_service.py +++ b/backend/tests/test_services/test_blocklist_service.py @@ -14,6 +14,7 @@ from app.models.blocklist import ( ScheduleConfig, ScheduleFrequency, ) +from app.repositories import import_log_repo from app.services import blocklist_service # --------------------------------------------------------------------------- @@ -115,6 +116,27 @@ class TestSourceCRUD: result = await blocklist_service.delete_source(db, 9999) assert result is False + @patch("app.utils.ip_utils.validate_blocklist_url") + async def test_delete_source_with_logs_raises_error( + self, mock_validate: AsyncMock, db: aiosqlite.Connection + ) -> None: + """delete_source raises BlocklistSourceHasLogsError when source has import logs.""" + mock_validate.return_value = None + source = await blocklist_service.create_source(db, "HasLogs", "https://haslogs.test/") + # Create an import log for this source + await import_log_repo.add_log( + db, + source_id=source.id, + source_url="https://haslogs.test/", + ips_imported=10, + ips_skipped=0, + errors=None, + ) + from app.exceptions import BlocklistSourceHasLogsError + with pytest.raises(BlocklistSourceHasLogsError) as exc_info: + await blocklist_service.delete_source(db, source.id) + assert exc_info.value.source_id == source.id + # --------------------------------------------------------------------------- # Preview