Refactor pagination with cursor-based support and standardized response format

- Implement cursor-based pagination in pagination.py
- Update response models to standardize pagination structure
- Add cursor pagination utilities for repositories
- Update HistoryArchiveRepository and ImportLogRepository with new pagination
- Add comprehensive tests for cursor pagination
- Update documentation for backend development and task tracking

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-01 17:54:05 +02:00
parent be974b9b0d
commit 67b26a3ef7
8 changed files with 613 additions and 51 deletions
--- a/backend/app/db.py
+++ b/backend/app/db.py
@@ -107,7 +107,7 @@ _SCHEMA_STATEMENTS: list[str] = [
    _CREATE_HISTORY_ARCHIVE,
 ]

-_CURRENT_SCHEMA_VERSION: int = 6
+_CURRENT_SCHEMA_VERSION: int = 7

 _MIGRATIONS: dict[int, str] = {
    1: "\n".join(_SCHEMA_STATEMENTS),
@@ -187,10 +187,25 @@ CREATE TABLE IF NOT EXISTS import_runs (
 -- Index for looking up completed imports by source
 CREATE INDEX IF NOT EXISTS idx_import_runs_source_status
    ON import_runs (source_id, status);
+""",
+    7: """
+-- Migration 7: Add indexes to import_log table for cursor-based pagination.
+-- The import_log table is paginated by id (newest first) and filtered by source_id.
+-- These indexes accelerate pagination queries and maintain consistent ordering.
+-- See Docs/Backend-Development.md § Database Performance for details.
+
+-- Index for ordering by id DESC for cursor-based pagination (newest first)
+CREATE INDEX IF NOT EXISTS idx_import_log_id_desc
+    ON import_log (id DESC);
+
+-- Composite index for source_id + id DESC ordering (filtered pagination)
+CREATE INDEX IF NOT EXISTS idx_import_log_source_id_desc
+    ON import_log (source_id, id DESC);
 """,
 }


+
 # ---------------------------------------------------------------------------
 # Public API
 # ---------------------------------------------------------------------------
--- a/backend/app/models/response.py
+++ b/backend/app/models/response.py
@@ -125,16 +125,22 @@ class PaginationMetadata(BanGuiBaseModel):
    """Pagination metadata embedded in paginated list responses.

    Contains page information and computed fields to support frontend pagination controls.
+    Supports both offset-based and cursor-based pagination modes.

    Fields:
-        page: Current page number (1-based).
+        page: Current page number (1-based). Set to 1 for cursor pagination.
        page_size: Number of items per page.
        total: Total number of items matching the query (across all pages).
+               For cursor pagination, this is -1 (unknown without full scan).
        total_pages: Computed total number of pages.
+                     For cursor pagination, this is -1 (unknown without full scan).
        has_next_page: Whether there is a next page after this one.
        has_prev_page: Whether there is a previous page before this one.
+                       Always False for cursor pagination (cannot navigate backward without storing history).
+        cursor: Opaque cursor token for fetching the next page (cursor pagination only).
+                None for offset pagination or when there are no more pages.

-    Example:
+    Example (offset pagination):
        ```python
        pagination = PaginationMetadata(
            page=2,
@@ -142,17 +148,36 @@ class PaginationMetadata(BanGuiBaseModel):
            total=150,
            total_pages=3,
            has_next_page=True,
-            has_prev_page=True
+            has_prev_page=True,
+            cursor=None
+        )
+        ```
+
+    Example (cursor pagination):
+        ```python
+        pagination = PaginationMetadata(
+            page=1,
+            page_size=50,
+            total=-1,
+            total_pages=-1,
+            has_next_page=True,
+            has_prev_page=False,
+            cursor="eyJpZCI6IDQyN30="
        )
        ```
    """

-    page: int = Field(..., ge=1, description="Current page number (1-based).")
+    page: int = Field(..., ge=1, description="Current page number (1-based). Set to 1 for cursor pagination.")
    page_size: int = Field(..., ge=1, description="Number of items per page.")
-    total: int = Field(..., ge=0, description="Total number of items matching the query.")
-    total_pages: int = Field(..., ge=1, description="Computed total number of pages.")
+    total: int = Field(..., description="Total number of items matching the query. -1 if unknown (cursor pagination).")
+    total_pages: int = Field(..., description="Computed total number of pages. -1 if unknown (cursor pagination).")
    has_next_page: bool = Field(..., description="Whether there is a next page after this one.")
    has_prev_page: bool = Field(..., description="Whether there is a previous page before this one.")
+    cursor: str | None = Field(
+        default=None,
+        description="Opaque cursor token for fetching the next page (cursor pagination only).",
+    )
+


 class PaginatedListResponse(BanGuiBaseModel, Generic[T]):
--- a/backend/app/repositories/history_archive_repo.py
+++ b/backend/app/repositories/history_archive_repo.py
@@ -2,6 +2,14 @@

 Provides persistence APIs for the BanGUI archival history table in the
 application database.
+
+Supports both offset-based and cursor-based pagination:
+
+- **Offset pagination** (legacy): ``get_archived_history(page=2, page_size=100)``
+  - convenient for small datasets but degrades on large offsets.
+
+- **Cursor pagination** (recommended): ``get_archived_history_keyset(page_size=100, last_ban_id=None)``
+  - constant-time performance regardless of dataset size.
 """

 from __future__ import annotations
@@ -164,3 +172,110 @@ async def purge_archived_history(db: aiosqlite.Connection, age_seconds: int) ->
        deleted = cursor.rowcount
    await db.commit()
    return deleted
+
+
+async def get_archived_history_keyset(
+    db: aiosqlite.Connection,
+    since: int | None = None,
+    jail: str | None = None,
+    ip_filter: str | list[str] | None = None,
+    origin: BanOrigin | None = None,
+    action: str | None = None,
+    page_size: int = 100,
+    last_ban_id: int | None = None,
+) -> tuple[list[dict[str, Any]], bool]:
+    """Return one page of archived history using keyset (cursor) pagination.
+
+    The page boundary is expressed as ``id < last_ban_id`` and rows are
+    ordered by ``id DESC``, so fetching a page does not require skipping over
+    all previously returned rows the way ``OFFSET`` does.
+
+    Ordering is strictly by ``id`` (the same column the cursor compares
+    against); rows are *not* re-sorted by ``timeofban``. Because ``id`` is
+    the sole sort key, pagination is stable and deterministic.
+
+    Args:
+        db: Active aiosqlite connection.
+        since: If given, keep only events with ``timeofban >= since``.
+        jail: If given, keep only events for this jail.
+        ip_filter: If a list, keep only events whose IP is in the list (an
+            empty list short-circuits to an empty result). If a string, keep
+            only events whose IP starts with that prefix (``LIKE`` with the
+            prefix escaped).
+        origin: ``'blocklist'`` keeps only the blocklist jail, ``'selfblock'``
+            keeps everything except the blocklist jail; any other value adds
+            no filter.
+        action: If given, keep only events with this action.
+        page_size: Maximum number of records to return. One extra row is
+            fetched internally to detect whether another page exists; it is
+            never returned. Must be >= 1.
+        last_ban_id: ``id`` of the last record of the previous page, or
+            ``None`` for the first page.
+
+    Returns:
+        A 2-tuple ``(records, has_more)`` where:
+        - *records* is a list of at most ``page_size`` dicts with the keys
+          ``id``, ``jail``, ``ip``, ``timeofban``, ``bancount``, ``data`` and
+          ``action``. Pass ``records[-1]["id"]`` as ``last_ban_id`` to fetch
+          the following page.
+        - *has_more* is True if at least one further row matches.
+
+    Raises:
+        ValueError: If ``page_size`` is less than 1.
+    """
+    # A non-positive page_size would either return an empty page that claims
+    # to have more rows, or produce a negative LIMIT, which SQLite treats as
+    # "no upper bound".
+    if page_size < 1:
+        raise ValueError(f"page_size must be >= 1, got {page_size}")
+
+    # An explicit empty IP list can never match; also avoids emitting "IN ()".
+    if isinstance(ip_filter, list) and not ip_filter:
+        return [], False
+
+    wheres: list[str] = []
+    params: list[object] = []
+
+    if since is not None:
+        wheres.append("timeofban >= ?")
+        params.append(since)
+
+    if jail is not None:
+        wheres.append("jail = ?")
+        params.append(jail)
+
+    if ip_filter is not None:
+        if isinstance(ip_filter, list):
+            placeholder = ", ".join("?" for _ in ip_filter)
+            wheres.append(f"ip IN ({placeholder})")
+            params.extend(ip_filter)
+        else:
+            wheres.append("ip LIKE ? ESCAPE '\\'")
+            params.append(f"{escape_like(ip_filter)}%")
+
+    if origin == "blocklist":
+        wheres.append("jail = ?")
+        params.append(BLOCKLIST_JAIL)
+    elif origin == "selfblock":
+        wheres.append("jail != ?")
+        params.append(BLOCKLIST_JAIL)
+
+    if action is not None:
+        wheres.append("action = ?")
+        params.append(action)
+
+    # Keyset predicate: continue strictly after the last row already seen.
+    if last_ban_id is not None:
+        wheres.append("id < ?")
+        params.append(last_ban_id)
+
+    where_sql = "WHERE " + " AND ".join(wheres) if wheres else ""
+
+    # Fetch one sentinel row beyond the page to learn whether more rows exist.
+    params.append(page_size + 1)
+
+    async with db.execute(
+        "SELECT id, jail, ip, timeofban, bancount, data, action "
+        "FROM history_archive "
+        f"{where_sql} "
+        "ORDER BY id DESC "
+        "LIMIT ?",  # noqa: S608
+        params,
+    ) as cur:
+        rows = list(await cur.fetchall())
+
+    records = [
+        {
+            # Exposed so the caller can build the cursor for the next page.
+            "id": int(r[0]),
+            "jail": str(r[1]),
+            "ip": str(r[2]),
+            "timeofban": int(r[3]),
+            "bancount": int(r[4]),
+            "data": str(r[5]),
+            "action": str(r[6]),
+        }
+        for r in rows[:page_size]
+    ]
+    has_more = len(rows) > page_size
+
+    return records, has_more
+
--- a/backend/app/repositories/import_log_repo.py
+++ b/backend/app/repositories/import_log_repo.py
@@ -3,6 +3,14 @@
 Persists and queries blocklist import run records in the ``import_log``
 table.  All methods are plain async functions that accept a
 :class:`aiosqlite.Connection`.
+
+Supports both offset-based and cursor-based pagination:
+
+- **Offset pagination** (legacy): ``list_logs(page=2, page_size=50)`` - query-efficient
+  but degrades on large offsets.
+
+- **Cursor pagination** (recommended): ``list_logs_keyset(page_size=50, last_log_id=None)``
+  - constant-time performance regardless of dataset size.
 """

 from __future__ import annotations
@@ -17,7 +25,6 @@ if TYPE_CHECKING:

 from app.models.blocklist import ImportLogEntry

-
 # Alias for backward compatibility with protocols
 ImportLogRow = ImportLogEntry
 async def add_log(
@@ -144,6 +151,66 @@ def compute_total_pages(total: int, page_size: int) -> int:
    return math.ceil(total / page_size)


+async def list_logs_keyset(
+    db: aiosqlite.Connection,
+    *,
+    source_id: int | None = None,
+    page_size: int = 50,
+    last_log_id: int | None = None,
+) -> tuple[list[ImportLogRow], bool]:
+    """Return one page of import log entries using keyset (cursor) pagination.
+
+    The page boundary is expressed as ``id < last_log_id`` and rows are
+    ordered by ``id DESC``, so fetching a page does not require skipping over
+    all previously returned rows the way ``OFFSET`` does.
+
+    Args:
+        db: Active aiosqlite connection.
+        source_id: If given, filter to logs for this source only.
+        page_size: Maximum number of items to return. One extra row is
+            fetched internally to detect whether another page exists; it is
+            never returned. Must be >= 1.
+        last_log_id: ``id`` of the last item of the previous page, or
+            ``None`` for the first page.
+
+    Returns:
+        A 2-tuple ``(items, has_more)`` where:
+        - *items* is a list of at most ``page_size`` ImportLogEntry objects
+        - *has_more* is True if at least one further row matches
+
+    Raises:
+        ValueError: If ``page_size`` is less than 1.
+    """
+    # A non-positive page_size would either return an empty page that claims
+    # to have more rows, or produce a negative LIMIT, which SQLite treats as
+    # "no upper bound".
+    if page_size < 1:
+        raise ValueError(f"page_size must be >= 1, got {page_size}")
+
+    conditions: list[str] = []
+    params: list[object] = []
+
+    if source_id is not None:
+        conditions.append("source_id = ?")
+        params.append(source_id)
+
+    # Keyset predicate: continue strictly after the last row already seen.
+    if last_log_id is not None:
+        conditions.append("id < ?")
+        params.append(last_log_id)
+
+    where = " WHERE " + " AND ".join(conditions) if conditions else ""
+
+    # Fetch one sentinel row beyond the page to learn whether more rows exist.
+    params.append(page_size + 1)
+
+    async with db.execute(
+        f"""
+        SELECT id, source_id, source_url, timestamp, ips_imported, ips_skipped, errors
+        FROM import_log{where}
+        ORDER BY id DESC
+        LIMIT ?
+        """,  # noqa: S608
+        params,
+    ) as cursor:
+        rows = list(await cursor.fetchall())
+
+    items = [_row_to_dict(r) for r in rows[:page_size]]
+    has_more = len(rows) > page_size
+
+    return items, has_more
+
+
 # ---------------------------------------------------------------------------
 # Internal helpers
 # ---------------------------------------------------------------------------
@@ -158,5 +225,6 @@ def _row_to_dict(row: object) -> ImportLogRow:
    Returns:
        ImportLogEntry Pydantic model instance.
    """
-    mapping = cast("Mapping[str, object]", row)
-    return ImportLogEntry(**mapping)
+    from typing import Any as AnyType
+    mapping = cast("Mapping[str, AnyType]", row)
+    return ImportLogEntry.model_validate(dict(mapping))
--- a/backend/app/utils/pagination.py
+++ b/backend/app/utils/pagination.py
@@ -4,11 +4,21 @@ This module provides reusable utilities for implementing consistent pagination
 across all endpoints. All paginated endpoints should use these utilities to
 ensure a uniform API contract.

-Standard Pagination Contract:
-  Query parameters: page (1-based), page_size (1-500)
-  Response: PaginatedListResponse[T] with items and pagination metadata
+Supported Pagination Modes:

-Usage in routers:
+1. **Offset-Based (Legacy)** — Uses page number + page_size.
+   Query parameters: page (1-based), page_size (1-500)
+   ⚠️  Performance degrades on large offsets (OFFSET requires scanning N rows).
+   Use for: Small datasets, where performance is not critical.
+
+2. **Cursor-Based (Recommended for large tables)** — Uses keyset pagination.
+   Query parameters: cursor (opaque token for the next page; forward-only), page_size
+   ✓ Constant-time performance regardless of dataset size.
+   Use for: Large tables (>100K rows), paginated lists with sorting.
+
+Usage Examples:
+
+**Offset pagination (legacy):**
  ```python
  from app.utils.pagination import PAGINATION_DEFAULTS, create_pagination_metadata

@@ -26,14 +36,50 @@ Usage in routers:
      pagination = create_pagination_metadata(total, page, page_size)
      return MyListResponse(items=items, pagination=pagination)
  ```
+
+**Cursor pagination (recommended):**
+  ```python
+  from app.utils.pagination import (
+      PAGINATION_DEFAULTS,
+      create_keyset_pagination_metadata,
+      decode_cursor,
+      encode_cursor,
+  )
+
+  @router.get("/items")
+  async def get_items(
+      cursor: str | None = Query(None),
+      page_size: int = Query(
+          default=PAGINATION_DEFAULTS["page_size"],
+          ge=1,
+          le=PAGINATION_DEFAULTS["max_page_size"],
+      ),
+  ):
+      # Decode cursor to get last_row_id
+      last_row_id = decode_cursor(cursor) if cursor else None
+
+      # Fetch items using keyset pagination (WHERE id < last_row_id, ORDER BY id DESC)
+      items, has_more = await repo.get_items_keyset(page_size, last_row_id)
+
+      # Encode cursor for next page (last item's ID)
+      next_cursor = encode_cursor(items[-1]["id"]) if items and has_more else None
+
+      pagination = create_keyset_pagination_metadata(items, next_cursor, page_size)
+      return MyListResponse(items=items, pagination=pagination)
+  ```
 """

+import base64
+import json
 from typing import TYPE_CHECKING, Final

 if TYPE_CHECKING:
    from app.models.response import PaginationMetadata

-__all__ = ["PAGINATION_DEFAULTS", "get_offset", "compute_total_pages", "create_pagination_metadata"]
+__all__ = [
+    "PAGINATION_DEFAULTS",
+    "get_offset",
+    "compute_total_pages",
+    "create_pagination_metadata",
+    "encode_cursor",
+    "decode_cursor",
+    "create_keyset_pagination_metadata",
+]

 # Standardized pagination defaults
 PAGINATION_DEFAULTS: Final[dict[str, int]] = {
@@ -148,3 +194,112 @@ def create_pagination_metadata(total: int, page: int, page_size: int) -> "Pagina
        has_prev_page=has_prev_page,
    )

+
+# ---------------------------------------------------------------------------
+# Cursor-Based Pagination Functions
+# ---------------------------------------------------------------------------
+
+
+def encode_cursor(row_id: int) -> str:
+    """Turn a row ID into an opaque cursor token.
+
+    The token is the base64 encoding of a compact JSON object of the form
+    ``{"id": <row_id>}``. Clients should treat it as opaque and hand it back
+    unchanged.
+
+    Args:
+        row_id: The database row ID to embed in the token.
+
+    Returns:
+        ASCII cursor string accepted by decode_cursor().
+
+    Raises:
+        ValueError: If row_id is less than 1.
+
+    Example:
+        ```python
+        token = encode_cursor(42)
+        assert token == "eyJpZCI6NDJ9"
+        ```
+    """
+    if row_id < 1:
+        raise ValueError(f"row_id must be >= 1, got {row_id}")
+
+    # Compact separators keep the token free of insignificant whitespace.
+    payload = json.dumps({"id": row_id}, separators=(",", ":")).encode()
+    return base64.b64encode(payload).decode("ascii")
+
+
+def decode_cursor(cursor: str) -> int:
+    """Decode an opaque cursor token to retrieve the row ID.
+
+    Expects the base64 encoding of a JSON object with an integer ``id``
+    field, i.e. the format produced by encode_cursor(). The token normally
+    arrives from the client, so every malformed shape is reported as a
+    ValueError rather than leaking other exception types.
+
+    Args:
+        cursor: Cursor string produced by encode_cursor().
+
+    Returns:
+        The row ID stored in the cursor.
+
+    Raises:
+        ValueError: If the cursor is not valid base64, is not UTF-8 JSON,
+            is not a JSON object, or its ``id`` field is not an integer >= 1.
+
+    Example:
+        ```python
+        assert decode_cursor("eyJpZCI6NDJ9") == 42
+        ```
+    """
+    try:
+        # validate=True rejects characters outside the base64 alphabet
+        # instead of silently discarding them.
+        raw = base64.b64decode(cursor.encode("ascii"), validate=True)
+        payload = json.loads(raw.decode("utf-8"))
+    except (ValueError, TypeError, AttributeError) as e:
+        # binascii.Error, UnicodeError and json.JSONDecodeError are all
+        # ValueError subclasses; AttributeError/TypeError cover non-str input.
+        raise ValueError(f"Invalid cursor format: {e}") from e
+
+    # Valid JSON need not be an object (e.g. "[1]" or "42"); calling .get()
+    # on those would raise AttributeError.
+    if not isinstance(payload, dict):
+        raise ValueError(
+            f"Invalid cursor format: expected a JSON object, got {type(payload).__name__}"
+        )
+
+    row_id = payload.get("id")
+    # bool is a subclass of int, so JSON true/false must be excluded explicitly.
+    if isinstance(row_id, bool) or not isinstance(row_id, int) or row_id < 1:
+        raise ValueError(
+            f"Invalid cursor format: 'id' field must be an integer >= 1, got {row_id!r}"
+        )
+    return row_id
+
+
+def create_keyset_pagination_metadata(
+    items: list[dict[str, object]] | list[object],
+    next_cursor: str | None,
+    page_size: int,
+) -> "PaginationMetadata":
+    """Build pagination metadata for a keyset (cursor-based) page.
+
+    No total row count is required. Whether another page exists is derived
+    solely from ``next_cursor``: a non-None cursor means there is a next page.
+
+    Args:
+        items: The items of the current page. Accepted for call-site symmetry
+            but not inspected by this function.
+        next_cursor: Cursor for fetching the next page, or None if no more pages.
+        page_size: The requested page size.
+
+    Returns:
+        :class:`~app.models.response.PaginationMetadata` with ``page`` fixed
+        to 1, ``total`` and ``total_pages`` set to -1 (unknown),
+        ``has_prev_page`` always False, and ``cursor`` set to ``next_cursor``.
+
+    Example:
+        ```python
+        items, has_more = await repo.get_items_keyset(page_size=10, last_row_id=None)
+        next_cursor = encode_cursor(items[-1]["id"]) if items and has_more else None
+        metadata = create_keyset_pagination_metadata(items, next_cursor, page_size=10)
+        assert metadata.total == -1  # Unknown in cursor pagination
+        assert metadata.has_next_page == (next_cursor is not None)
+        ```
+    """
+    # Imported lazily; at module level the model is only imported under TYPE_CHECKING.
+    from app.models.response import PaginationMetadata
+
+    return PaginationMetadata(
+        page=1,
+        page_size=page_size,
+        total=-1,
+        total_pages=-1,
+        has_next_page=next_cursor is not None,
+        has_prev_page=False,
+        cursor=next_cursor,
+    )