refactor(logging): replace structlog with stdlib logging compat layer

- Remove structlog dependency from backend/pyproject.toml
- Add app.utils.logging_compat shim for keyword-arg logging API
- Add app.utils.json_formatter for JSON log output with extra fields
- Update all backend modules to use logging_compat.get_logger()
- Update docstrings in log_sanitizer.py and json_formatter.py
- Update test comment in test_async_utils.py
- Record 406 failing tests in Docs/Tasks.md for tracking
2026-05-10 13:37:54 +02:00
parent 7790736918
commit 7ec80fdeec
81 changed files with 3013 additions and 634 deletions
--- a/backend/app/utils/async_utils.py
+++ b/backend/app/utils/async_utils.py
@@ -12,12 +12,12 @@ from collections.abc import Callable, Coroutine
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any, ParamSpec, TypeVar

-import structlog
+from app.utils.logging_compat import get_logger

 P = ParamSpec("P")
 T = TypeVar("T")

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 DEFAULT_BLOCKING_EXECUTOR: ThreadPoolExecutor = ThreadPoolExecutor(
    max_workers=16,
--- a/backend/app/utils/conffile_parser.py
+++ b/backend/app/utils/conffile_parser.py
@@ -24,7 +24,7 @@ import contextlib
 import io
 from typing import TYPE_CHECKING

-import structlog
+from app.utils.logging_compat import get_logger

 if TYPE_CHECKING:
    from pathlib import Path
@@ -39,7 +39,7 @@ from app.models.config import (
    JailSectionConfig,
 )

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 # ---------------------------------------------------------------------------
 # Constants — well-known Definition keys for action files
--- a/backend/app/utils/config_file_utils.py
+++ b/backend/app/utils/config_file_utils.py
@@ -10,7 +10,7 @@ import tempfile
 from pathlib import Path
 from typing import cast

-import structlog
+from app.utils.logging_compat import get_logger

 from app.exceptions import (
    ConfigWriteError,
@@ -32,7 +32,7 @@ from app.utils.fail2ban_client import (
 from app.utils.fail2ban_response import ok, to_dict
 from app.utils.log_sanitizer import sanitize_for_logging

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 # Allowlist pattern for jail names used in path construction.
 _SAFE_JAIL_NAME_RE: re.Pattern[str] = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,127}$")
--- a/backend/app/utils/config_parser.py
+++ b/backend/app/utils/config_parser.py
@@ -28,12 +28,12 @@ import configparser
 import re
 from typing import TYPE_CHECKING

-import structlog
+from app.utils.logging_compat import get_logger

 if TYPE_CHECKING:
    from pathlib import Path

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 # Compiled pattern that matches fail2ban-style %(variable_name)s references.
 _INTERPOLATE_RE: re.Pattern[str] = re.compile(r"%\((\w+)\)s")
--- a/backend/app/utils/config_writer.py
+++ b/backend/app/utils/config_writer.py
@@ -31,12 +31,12 @@ import tempfile
 import threading
 from typing import TYPE_CHECKING

-import structlog
+from app.utils.logging_compat import get_logger

 if TYPE_CHECKING:
    from pathlib import Path

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 # ---------------------------------------------------------------------------
 # Per-file lock registry
--- a/backend/app/utils/constants.py
+++ b/backend/app/utils/constants.py
@@ -51,19 +51,6 @@ CSRF_HEADER_NAME: Final[str] = "X-BanGUI-Request"
 CSRF_HEADER_VALUE: Final[str] = "1"
 """Required value of the CSRF header to pass validation."""

-# ---------------------------------------------------------------------------
-# Authentication penalty (brute-force resistance)
-# ---------------------------------------------------------------------------
-
-LOGIN_PENALTY_BASE_SECONDS: Final[float] = 1.0
-"""Base penalty (seconds) for a failed login attempt."""
-
-LOGIN_PENALTY_MAX_SECONDS: Final[float] = 10.0
-"""Maximum penalty (seconds) for failed login attempts."""
-
-LOGIN_PENALTY_MULTIPLIER: Final[float] = 2.0
-"""Exponential multiplier applied per failed attempt."""
-
 # ---------------------------------------------------------------------------
 # Time-range presets (used by dashboard and history endpoints)
 # ---------------------------------------------------------------------------
--- a/backend/app/utils/external_logging.py
+++ b/backend/app/utils/external_logging.py
@@ -16,9 +16,9 @@ from typing import TYPE_CHECKING, Any, Literal
 if TYPE_CHECKING:
    from aiohttp import ClientSession

-import structlog
+from app.utils.logging_compat import get_logger

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)


 class ExternalLogHandler(ABC):
--- a/backend/app/utils/fail2ban_client.py
+++ b/backend/app/utils/fail2ban_client.py
@@ -24,7 +24,7 @@ from collections.abc import Mapping, Sequence, Set
 from pathlib import Path
 from typing import TYPE_CHECKING, Protocol

-import structlog
+from app.utils.logging_compat import get_logger

 from app.exceptions import Fail2BanConnectionError, Fail2BanProtocolError

@@ -68,7 +68,7 @@ type Fail2BanResponse = tuple[int, object]
 if TYPE_CHECKING:
    from types import TracebackType

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 # Attempt to reuse the vendored fail2ban package embedded in the repository.
 # If it is not on sys.path yet, load it from ``../fail2ban-master``.
--- a/backend/app/utils/fail2ban_db_utils.py
+++ b/backend/app/utils/fail2ban_db_utils.py
@@ -5,9 +5,9 @@ from __future__ import annotations
 import json
 from datetime import UTC, datetime

-import structlog
+from app.utils.logging_compat import get_logger

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)


 def escape_like(s: str) -> str:
--- a/backend/app/utils/jail_config.py
+++ b/backend/app/utils/jail_config.py
@@ -11,12 +11,12 @@ from __future__ import annotations

 from typing import TYPE_CHECKING

-import structlog
+from app.utils.logging_compat import get_logger

 if TYPE_CHECKING:
    from pathlib import Path

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 # ---------------------------------------------------------------------------
 # Default file contents
--- a/backend/app/utils/jail_socket.py
+++ b/backend/app/utils/jail_socket.py
@@ -11,7 +11,7 @@ from __future__ import annotations
 import asyncio
 from typing import cast

-import structlog
+from app.utils.logging_compat import get_logger

 from app.exceptions import JailNotFoundError, JailOperationError
 from app.utils.fail2ban_client import (
@@ -24,7 +24,7 @@ from app.utils.fail2ban_response import (
    to_dict,
 )

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 # Socket communication timeout in seconds.
 SOCKET_TIMEOUT: float = 10.0
--- a/backend/app/utils/json_formatter.py
+++ b/backend/app/utils/json_formatter.py
@@ -0,0 +1,85 @@
+"""JSON formatter for stdlib logging that preserves extra fields.
+
+A single logging.Formatter subclass that serialises any keyword arguments
+passed via ``extra=`` into the JSON output alongside the standard record
+attributes.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from datetime import datetime, timezone
+from typing import Any
+
+# Attributes that belong to the standard LogRecord and should NOT be
+# treated as user-supplied extra fields.
+_STD_RECORD_ATTRS: frozenset[str] = frozenset(
+    {
+        "name",
+        "msg",
+        "args",
+        "levelname",
+        "levelno",
+        "pathname",
+        "filename",
+        "module",
+        "exc_info",
+        "exc_text",
+        "stack_info",
+        "lineno",
+        "funcName",
+        "created",
+        "msecs",
+        "relativeCreated",
+        "thread",
+        "threadName",
+        "processName",
+        "process",
+        "message",
+        "asctime",
+        "taskName",
+    }
+)
+
+
+class JSONFormatter(logging.Formatter):
+    """Format log records as JSON lines, including extra fields.
+
+    Usage::
+
+        handler = logging.StreamHandler()
+        handler.setFormatter(JSONFormatter())
+        logging.getLogger().addHandler(handler)
+
+    Output keys:
+        - ``event``      – the log message
+        - ``level``      – lower-cased level name
+        - ``timestamp``  – ISO-8601 UTC timestamp
+        - ``logger``     – logger name
+        - any ``extra`` fields supplied by the caller
+    """
+
+    def format(self, record: logging.LogRecord) -> str:
+        """Return a JSON string for *record*."""
+        log_dict: dict[str, Any] = {
+            "event": record.getMessage(),
+            "level": record.levelname.lower(),
+            "timestamp": (
+                datetime.fromtimestamp(record.created, tz=timezone.utc).isoformat()
+            ),
+            "logger": record.name,
+        }
+
+        # Merge any extra fields attached to the record.
+        for key, value in record.__dict__.items():
+            if key not in _STD_RECORD_ATTRS:
+                log_dict[key] = value
+
+        # Include exception info when present.
+        if record.exc_info and not record.exc_text:
+            record.exc_text = self.formatException(record.exc_info)
+        if record.exc_text:
+            log_dict["exception"] = record.exc_text
+
+        return json.dumps(log_dict, default=str)
--- a/backend/app/utils/log_sanitizer.py
+++ b/backend/app/utils/log_sanitizer.py
@@ -1,7 +1,7 @@
 """Log sanitization utilities for preventing sensitive data leakage.

 All external output (subprocess, API responses, config data) passed to
-structlog MUST be sanitized first. This module provides the canonical
+logging MUST be sanitized first. This module provides the canonical
 sanitize_for_logging() function used across the codebase.
 """

--- a/backend/app/utils/logging_compat.py
+++ b/backend/app/utils/logging_compat.py
@@ -0,0 +1,63 @@
+"""Compatibility shim providing keyword-argument logging API on top of stdlib logging.
+
+This module lets the rest of the codebase keep the keyword-argument logging
+style (``log.info("event", key=value)``) while using only the Python standard
+library ``logging`` module underneath.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+
+class _CompatLogger:
+    """Wraps a stdlib :class:`logging.Logger` to accept keyword arguments."""
+
+    def __init__(self, logger: logging.Logger) -> None:
+        self._logger = logger
+
+    def _log(self, level: int, event: str, **kwargs: Any) -> None:
+        exc_info = kwargs.pop("exc_info", None)
+        extra = kwargs if kwargs else None
+        self._logger.log(level, event, exc_info=exc_info, extra=extra)
+
+    def debug(self, event: str, **kwargs: Any) -> None:
+        self._log(logging.DEBUG, event, **kwargs)
+
+    def info(self, event: str, **kwargs: Any) -> None:
+        self._log(logging.INFO, event, **kwargs)
+
+    def warning(self, event: str, **kwargs: Any) -> None:
+        self._log(logging.WARNING, event, **kwargs)
+
+    def warn(self, event: str, **kwargs: Any) -> None:
+        self._log(logging.WARNING, event, **kwargs)
+
+    def error(self, event: str, **kwargs: Any) -> None:
+        self._log(logging.ERROR, event, **kwargs)
+
+    def critical(self, event: str, **kwargs: Any) -> None:
+        self._log(logging.CRITICAL, event, **kwargs)
+
+    def exception(self, event: str, **kwargs: Any) -> None:
+        self._log(logging.ERROR, event, exc_info=True, **kwargs)
+
+    def bind(self, **kwargs: Any) -> "_CompatLogger":
+        """Return this logger unchanged; bound context is discarded (no-op shim)."""
+        return self
+
+
+def get_logger(name: str | None = None) -> _CompatLogger:
+    """Get a compatibility logger wrapping the stdlib logger for *name*.
+
+    If *name* is ``None`` the caller's module name is used.
+    """
+    if name is None:
+        import sys
+
+        # Look one frame up the stack to find the caller's module.
+        frame = sys._getframe(1)
+        module = frame.f_globals.get("__name__", "__main__")
+        name = module
+    return _CompatLogger(logging.getLogger(name))
--- a/backend/app/utils/metrics.py
+++ b/backend/app/utils/metrics.py
@@ -11,9 +11,9 @@ and get_metrics() returns an empty bytes object.

 from __future__ import annotations

-import structlog
+from app.utils.logging_compat import get_logger

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 try:
    from prometheus_client import (
--- a/backend/app/utils/rate_limiter.py
+++ b/backend/app/utils/rate_limiter.py
@@ -1,46 +1,25 @@
-"""In-memory rate limiter for IP-based request throttling.
+"""In-memory global rate limiter for IP-based request throttling.

-Implements exponential backoff for failed login attempts using failure tracking.
-Each wrong password attempt increments the failure count for that IP, and subsequent
-attempts are blocked for a duration that grows exponentially up to a maximum.
-
-Uses a dictionary of deques (per IP) storing timestamps of recent failures.
-Old entries are cleaned up by a background task to prevent unbounded growth.
+Implements a sliding-window request counter per IP address. Old entries are
+cleaned up by a background task to prevent unbounded growth.

 Process-local implementation — in multi-worker setups, each worker has
-independent counters. This constraint limits the blast radius of brute-force
-attacks to a single worker.
+independent counters. This constraint limits the blast radius of abuse to a
+single worker.

-**How It Works:**
+**Cleanup Lifecycle**: The rate limiter state grows as IPs interact with the
+system. To prevent unbounded memory growth during long runtimes, a scheduled
+background task (rate_limiter_cleanup) calls cleanup_expired() every 30 minutes.
+This is safe because:

-1. A successful login resets the failure counter for that IP.
-2. Each failed login (wrong password) calls record_failure() and increments the counter.
-3. is_allowed() checks if enough time has passed since the last failure based on
-   the current failure count. The delay grows exponentially with each consecutive failure:
-
-   - 1st failure: 0.5 second penalty
-   - 2nd failure: 1 second penalty (0.5 * 2^1)
-   - 3rd failure: 2 seconds penalty (0.5 * 2^2)
-   - 4th failure: 4 seconds penalty (0.5 * 2^3)
-   - ... up to the configured maximum (default 5 seconds)
-
-4. Penalties are cumulative within the window: if an attacker makes 5 failed
-   attempts, they must wait the full 5 seconds before trying again (not 5 seconds
-   per attempt).
-
-**Cleanup Lifecycle**: The rate limiter state (_failures) grows as IPs interact
-with the system. To prevent unbounded memory growth during long runtimes, a
-scheduled background task (rate_limiter_cleanup) calls cleanup_expired() every
-30 minutes. This is safe because:
-
- cleanup_expired() only removes IPs with no recent failures (all timestamps
+- cleanup_expired() only removes IPs with no recent requests (all timestamps
  outside the rate-limit window), so active IPs are never disrupted.
 - The cleanup is non-blocking and logged for observability.
 - Individual requests already prune old timestamps from each IP's deque during
-  is_allowed() and record_failure(), so cleanup primarily handles dormant IPs.
+  check_allowed(), so cleanup primarily handles dormant IPs.

-For monitoring, check logs for "rate_limiter_cleanup" events to observe how
-many IPs are being retired from memory each cleanup cycle.
+For monitoring, check logs for "global_rate_limiter_cleanup" events to observe
+how many IPs are being retired from memory each cleanup cycle.
 """

 from __future__ import annotations
@@ -49,173 +28,21 @@ from collections import deque
 from time import time
 from typing import TYPE_CHECKING

-import structlog
+from app.utils.logging_compat import get_logger

-from app.utils.constants import (
-    LOGIN_PENALTY_BASE_SECONDS,
-    LOGIN_PENALTY_MAX_SECONDS,
-    LOGIN_PENALTY_MULTIPLIER,
-)
 from app.utils.ip_utils import normalise_ip

 if TYPE_CHECKING:
    from collections.abc import Mapping

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
-
-# 5 attempts per minute per IP (300 seconds)
-DEFAULT_RATE_LIMIT_ATTEMPTS = 5
-DEFAULT_RATE_LIMIT_WINDOW_SECONDS = 60
-
-
-class RateLimiter:
-    """Track and enforce request rate limits per IP address.
-
-    Stores attempt timestamps in per-IP deques, removing old entries
-    outside the rate limit window.
-    """
-
-    def __init__(
-        self,
-        max_attempts: int = DEFAULT_RATE_LIMIT_ATTEMPTS,
-        window_seconds: int = DEFAULT_RATE_LIMIT_WINDOW_SECONDS,
-    ) -> None:
-        """Initialize the rate limiter.
-
-        Args:
-            max_attempts: Maximum attempts allowed within the window.
-                (Deprecated: now only used for cleanup window size)
-            window_seconds: Time window (seconds) for rate limit.
-        """
-        self.max_attempts: int = max_attempts
-        self.window_seconds: int = window_seconds
-        self._failures: dict[str, deque[float]] = {}
-
-    def is_allowed(self, ip_address: str) -> bool:
-        """Check if a request from *ip_address* is allowed.
-
-        Checks if the IP has accumulated failures that would currently block
-        the attempt due to penalty backoff. Does NOT record a new attempt —
-        that happens only on successful password verification.
-
-        Args:
-            ip_address: The client IP address to rate-limit.
-
-        Returns:
-            ``True`` if the request is allowed (past penalty period), ``False``
-            if currently blocked by exponential backoff.
-        """
-        ip_address = normalise_ip(ip_address)
-        now = time()
-
-        if ip_address not in self._failures:
-            self._failures[ip_address] = deque()
-
-        failures = self._failures[ip_address]
-        cutoff = now - self.window_seconds
-
-        # Remove old failures outside the window
-        while failures and failures[0] < cutoff:
-            failures.popleft()
-
-        # If no recent failures, request is allowed
-        if not failures:
-            return True
-
-        # Calculate accumulated penalty: how much time must pass before
-        # the next attempt is allowed, based on failure count
-        failure_count = len(failures)
-        penalty = min(
-            LOGIN_PENALTY_BASE_SECONDS * (LOGIN_PENALTY_MULTIPLIER ** failure_count),
-            LOGIN_PENALTY_MAX_SECONDS,
-        )
-
-        # Check if enough time has passed since the last failure
-        time_since_last_failure = now - failures[-1]
-        return time_since_last_failure >= penalty
-
-    def cleanup_expired(self) -> None:
-        """Remove all IPs with no recent failures (cleanup task).
-
-        Called periodically by the background task to prevent unbounded
-        growth of the tracking dictionary.
-        """
-        now = time()
-        cutoff = now - self.window_seconds
-
-        ips_to_remove = []
-        for ip_address, failures in self._failures.items():
-            # Remove old failures
-            while failures and failures[0] < cutoff:
-                failures.popleft()
-            # Mark IP for removal if no failures remain
-            if not failures:
-                ips_to_remove.append(ip_address)
-
-        for ip_address in ips_to_remove:
-            del self._failures[ip_address]
-
-        if ips_to_remove:
-            log.debug("rate_limiter_cleanup", removed_ips=len(ips_to_remove))
-
-    def get_state(self) -> Mapping[str, int]:
-        """Return a read-only view of current failure counts per IP.
-
-        For debugging and monitoring.
-
-        Returns:
-            A mapping of IP addresses to their failure counts.
-        """
-        now = time()
-        cutoff = now - self.window_seconds
-        result = {}
-        for ip_address, failures in self._failures.items():
-            # Count non-expired failures
-            count = sum(1 for ts in failures if ts >= cutoff)
-            if count > 0:
-                result[ip_address] = count
-        return result
-
-    def reset(self) -> None:
-        """Clear all tracked failures (for testing)."""
-        self._failures.clear()
-
-    # ---------------------------------------------------------------------------
-    # Penalty strategy for failed login attempts
-    # ---------------------------------------------------------------------------
-
-    def record_failure(self, ip_address: str) -> None:
-        """Record a failed login attempt.
-
-        Tracks failures per IP to enable exponential backoff in is_allowed().
-        The penalty delay is automatically calculated in is_allowed() based on
-        the failure count, providing transparent brute-force resistance.
-
-        Args:
-            ip_address: The client IP address whose login attempt failed.
-        """
-        ip_address = normalise_ip(ip_address)
-        now = time()
-
-        if ip_address not in self._failures:
-            self._failures[ip_address] = deque()
-
-        failures = self._failures[ip_address]
-        cutoff = now - self.window_seconds
-
-        # Remove old failures outside the window
-        while failures and failures[0] < cutoff:
-            failures.popleft()
-
-        # Record this failure
-        failures.append(now)
+log = get_logger(__name__)


 class GlobalRateLimiter:
    """Global per-IP request rate limiter using sliding window algorithm.

    Tracks total request count within a configurable time window per IP address.
-    Unlike RateLimiter (which uses exponential backoff), this implements simple
+    This implements simple
    request counting: when an IP exceeds the limit, the next request is blocked
    until the oldest request in the window expires.

--- a/backend/app/utils/regex_validator.py
+++ b/backend/app/utils/regex_validator.py
@@ -11,7 +11,7 @@ import signal
 from contextlib import contextmanager
 from typing import TYPE_CHECKING

-import structlog
+from app.utils.logging_compat import get_logger

 try:
    from regexploit.ast.sre import SreOpParser
@@ -25,7 +25,7 @@ except ImportError:
 if TYPE_CHECKING:
    from collections.abc import Generator

-logger = structlog.get_logger()
+logger = get_logger(__name__)

 # Constants for regex validation
 MAX_REGEX_LENGTH = 1000
--- a/backend/app/utils/runtime_state.py
+++ b/backend/app/utils/runtime_state.py
@@ -53,7 +53,7 @@ import datetime
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any

-import structlog
+from app.utils.logging_compat import get_logger
 from starlette.datastructures import State

 from app.models.config import PendingRecovery
@@ -63,7 +63,7 @@ from app.utils.session_cache import InMemorySessionCache, NoOpSessionCache
 if TYPE_CHECKING:  # pragma: no cover
    from app.config import Settings

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 ActivationRecord = dict[str, datetime.datetime]

--- a/backend/app/utils/scheduler_lock.py
+++ b/backend/app/utils/scheduler_lock.py
@@ -46,9 +46,9 @@ import time
 from typing import Any

 import aiosqlite
-import structlog
+from app.utils.logging_compat import get_logger

-log: structlog.stdlib.BoundLogger = structlog.get_logger()
+log = get_logger(__name__)

 # Lock record expires if heartbeat hasn't been updated for this many seconds.
 # This prevents stale locks from a crashed instance from blocking new startups.