From c4ede71fa6f3afdb078aa9acf984ff631f9a1b14 Mon Sep 17 00:00:00 2001 From: Lukas Date: Thu, 30 Apr 2026 20:54:24 +0200 Subject: [PATCH] Fix: Enforce single-worker deployment for session cache cluster safety Addresses: Backend session cache not cluster-safe (multi-worker issue) Problem: - Session cache is process-local (InMemorySessionCache) - Multi-worker deployments (uvicorn --workers N) create separate processes - Each process has its own independent session cache - Sessions cached in Worker A are invisible to Workers B, C, D - Users randomly logged out when requests land on different workers - Also affects RuntimeState, rate limiter, and background jobs Solution (Option A - Strict single-worker enforcement): - Enhance startup validation with clearer error messages - Update error messages to explain the problem and how to fix it - Document single-worker requirement prominently in Docker configs - Update module docstrings to clarify constraints Changes: 1. app/startup.py: - Enhanced _check_single_worker_mode() error message with troubleshooting - Enhanced _stage_check_worker_mode_and_acquire_lock() error message - Removed unused import 2. app/utils/session_cache.py: - Updated module docstring to explain constraints more clearly - Added references to deployment documentation - Clarified multi-worker solution for future implementation 3. app/utils/runtime_state.py: - Updated module docstring with deployment constraint references - Aligned messaging with session_cache.py 4. Docker/Dockerfile.backend: - Added comprehensive comments about single-worker requirement - Explained impact in multi-worker deployments - Referenced deployment constraints documentation 5. Docker/docker-compose.yml, compose.prod.yml, compose.debug.yml: - Added documentation comments about BANGUI_WORKERS constraint - Explained why single-worker is required 6. backend/tests/test_startup_integration.py: - Fixed test unpacking to match function return signature (3 values, not 2) This ensures multi-worker deployments fail loudly at startup with clear guidance on what went wrong and how to fix it. The database-backed scheduler lock provides defense-in-depth for container orchestration scenarios. For future multi-worker support, implement: - Redis or database-backed session cache - Shared RuntimeState coordination - Distributed APScheduler backend Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Docker/Dockerfile.backend | 15 ++++++++ Docker/compose.debug.yml | 2 ++ Docker/compose.prod.yml | 6 +++- Docker/docker-compose.yml | 2 ++ backend/app/startup.py | 22 ++++++++++-- backend/app/utils/runtime_state.py | 28 +++++++++------ backend/app/utils/session_cache.py | 43 +++++++++++++---------- backend/tests/test_startup_integration.py | 5 +-- 8 files changed, 89 insertions(+), 34 deletions(-) diff --git a/Docker/Dockerfile.backend b/Docker/Dockerfile.backend index 273d1ca..908ad0e 100644 --- a/Docker/Dockerfile.backend +++ b/Docker/Dockerfile.backend @@ -67,4 +67,19 @@ USER bangui HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')" || exit 1 +# ⚠️ IMPORTANT: Single-Worker Requirement +# BanGUI must always run as a single worker process: +# - Do NOT pass --workers or --worker-class to uvicorn +# - Do NOT use gunicorn with -w 4 or similar +# - Do NOT override BANGUI_WORKERS to > 1 +# +# Why? The session cache is process-local. Multiple workers would cause: +# - Random user logouts (sessions not shared between workers) +# - Duplicate background jobs (each worker runs the scheduler) +# - SQLite lock contention and timeouts +# +# For high availability, use container orchestration (Kubernetes, Docker Swarm) +# to run multiple instances, not multiple workers in a single process. +# +# See Docs/Architekture.md § Deployment Constraints for details. CMD ["uvicorn", "app.main:create_app", "--factory", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Docker/compose.debug.yml b/Docker/compose.debug.yml index 9186d05..fa7415e 100644 --- a/Docker/compose.debug.yml +++ b/Docker/compose.debug.yml @@ -65,6 +65,8 @@ services: # Secure=false is intentional for local HTTP development. # In production, Secure=true prevents session cookies over unencrypted HTTP. BANGUI_SESSION_COOKIE_SECURE: "false" + # BANGUI_WORKERS should not be set (defaults to 1). + # Never set it to > 1; the session cache is process-local. volumes: - ../backend/app:/app/app:z - ../fail2ban-master:/app/fail2ban-master:ro,z diff --git a/Docker/compose.prod.yml b/Docker/compose.prod.yml index 73847a5..712cefe 100644 --- a/Docker/compose.prod.yml +++ b/Docker/compose.prod.yml @@ -58,7 +58,11 @@ services: BANGUI_FAIL2BAN_SOCKET: "/var/run/fail2ban/fail2ban.sock" BANGUI_FAIL2BAN_CONFIG_DIR: "/config/fail2ban" BANGUI_LOG_LEVEL: "info" - BANGUI_WORKERS: "1" # APScheduler requires single worker — do not change + # ⚠️ BANGUI_WORKERS MUST be 1 — see session_cache.py docstring for details + # BanGUI uses a process-local session cache. Multiple workers in a single process + # would cause users to be randomly logged out as sessions wouldn't be shared. + # For HA, run multiple BanGUI instances (each with --workers 1) via orchestration. + BANGUI_WORKERS: "1" BANGUI_SESSION_SECRET: "${BANGUI_SESSION_SECRET:?Set BANGUI_SESSION_SECRET}" BANGUI_TIMEZONE: "${BANGUI_TIMEZONE:-UTC}" volumes: diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml index 70ced48..a35d65e 100644 --- a/Docker/docker-compose.yml +++ b/Docker/docker-compose.yml @@ -41,6 +41,8 @@ services: - BANGUI_FAIL2BAN_SOCKET=/var/run/fail2ban/fail2ban.sock - BANGUI_FAIL2BAN_CONFIG_DIR=/config/fail2ban - BANGUI_LOG_LEVEL=info + # ⚠️ BANGUI_WORKERS MUST be 1 — the session cache is process-local + # Multiple workers would cause random logouts and duplicate background jobs - BANGUI_SESSION_SECRET=${BANGUI_SESSION_SECRET:?Set BANGUI_SESSION_SECRET} - BANGUI_TIMEZONE=${BANGUI_TIMEZONE:-UTC} volumes: diff --git a/backend/app/startup.py b/backend/app/startup.py index ee05a3c..bbde0ca 100644 --- a/backend/app/startup.py +++ b/backend/app/startup.py @@ -50,7 +50,6 @@ from app.utils.jail_config import ensure_jail_configs from app.utils.runtime_state import set_runtime_settings from app.utils.scheduler_lock import ( acquire_scheduler_lock, - release_scheduler_lock, ) from app.utils.setup_state import set_setup_complete_cache @@ -84,7 +83,18 @@ def _check_single_worker_mode() -> None: raise RuntimeError( "BanGUI background scheduler cannot run with multiple workers.\n" f"BANGUI_WORKERS is set to {worker_count}. Set it to 1 or remove it.\n" - "See Architekture.md § Deployment Constraints for details." + "\n" + "Why this matters:\n" + " - Session cache is process-local; users may be randomly logged out\n" + " - Background jobs (blocklist imports, history sync) would run N times\n" + " - Database lock contention will cause timeouts\n" + "\n" + "To fix:\n" + " 1. Remove BANGUI_WORKERS=N from your environment\n" + " 2. Don't pass --workers to uvicorn or -w to gunicorn\n" + " 3. Deploy as a single process (use container orchestration for HA)\n" + "\n" + "See Docs/Architekture.md § Deployment Constraints for details." ) except ValueError as e: raise RuntimeError( @@ -275,14 +285,20 @@ async def _stage_check_worker_mode_and_acquire_lock(startup_db: Any) -> None: if not await acquire_scheduler_lock(startup_db): raise RuntimeError( "Could not acquire scheduler lock. Another BanGUI instance is already running the scheduler.\n" + "\n" "This prevents duplicate background jobs (blocklist imports, history sync, etc.).\n" "\n" + "IMPORTANT: This also indicates a possible multi-worker misconfiguration:\n" + " - If BANGUI_WORKERS > 1, multiple workers are trying to acquire the lock\n" + " - If --workers or -w was passed to uvicorn/gunicorn, remove it\n" + " - BanGUI must run with exactly 1 worker process (use HA at container level)\n" + "\n" "To recover from a stale lock (e.g., after a crash):\n" " 1. Verify no other BanGUI instances are running\n" " 2. Inspect the lock: sqlite3 bangui.db 'SELECT * FROM scheduler_lock;'\n" " 3. If stale, clean it: sqlite3 bangui.db 'DELETE FROM scheduler_lock;'\n" "\n" - "See Architekture.md § Deployment Constraints for details." + "See Docs/Architekture.md § Deployment Constraints for details." ) diff --git a/backend/app/utils/runtime_state.py b/backend/app/utils/runtime_state.py index 6009b53..ca395a8 100644 --- a/backend/app/utils/runtime_state.py +++ b/backend/app/utils/runtime_state.py @@ -24,18 +24,26 @@ IMPACT IN MULTI-WORKER DEPLOYMENTS: - fail2ban activation/recovery tracking (pending_recovery, last_activation) is per-worker and unreliable across processes. -MULTI-WORKER SOLUTION: - To deploy BanGUI with multiple workers (e.g., via gunicorn -w 4), you must: - 1. Replace RuntimeState with a shared store (Redis, shared memory, database). - 2. Replace InMemorySessionCache with RedisSessionCache (see session_cache.py). - 3. Ensure all workers use the same backend for coordination. - SINGLE-WORKER ENFORCEMENT: - See TASK-002 in Docs/Tasks.md for deployment configuration that enforces - single-worker mode, preventing this issue entirely. + BanGUI enforces single-worker mode at startup: + 1. Environment variable check: BANGUI_WORKERS must be 1 or unset + 2. Database lock: Only one instance can run the scheduler at a time + 3. Startup validation: Fails loudly if multi-worker scenario is detected -For now, BanGUI is deployed as single-worker only — this constraint is -acceptable and keeps the implementation simple. + See Docs/Architekture.md § Deployment Constraints for full details. + +MULTI-WORKER SOLUTION (Future): + To deploy BanGUI with multiple workers in the future (e.g., via gunicorn -w 4): + 1. Replace RuntimeState with a shared store (Redis, shared memory, database) + 2. Replace InMemorySessionCache with a shared backend (Redis, database) + 3. Replace APScheduler with a distributed scheduler backend + 4. Ensure all workers use the same backend for coordination + +CURRENT STATUS: + For now, BanGUI is deployed as single-worker only. This constraint is + acceptable and keeps the implementation simple. The database-backed scheduler + lock ensures only one instance runs background jobs, even in container + orchestration scenarios where multiple instances may start. """ from __future__ import annotations diff --git a/backend/app/utils/session_cache.py b/backend/app/utils/session_cache.py index 9900085..97d8957 100644 --- a/backend/app/utils/session_cache.py +++ b/backend/app/utils/session_cache.py @@ -19,16 +19,24 @@ IMPACT IN MULTI-WORKER DEPLOYMENTS: - Worker B still has the stale session in its cache → request is accepted. - User appears still logged in (from their perspective). - This is a security issue: logout does not work reliably across workers. + This is a CRITICAL SECURITY ISSUE: logout does not work reliably across workers. -MULTI-WORKER SOLUTION: - To deploy BanGUI with multiple workers (e.g., via gunicorn -w 4), replace - InMemorySessionCache with a shared backend such as: - - RedisSessionCache — backed by Redis (recommended for production). - - DatabaseSessionCache — backed by SQLite or PostgreSQL. - - SharedMemorySessionCache — backed by IPC (for local multi-process). +SINGLE-WORKER ENFORCEMENT: + BanGUI enforces single-worker mode to prevent this issue: + 1. Environment variable check: BANGUI_WORKERS must be 1 or unset + 2. Database lock: Only one instance can run the scheduler at a time + 3. Startup validation: Fails loudly if multi-worker scenario is detected - The SessionCache Protocol is already designed for pluggable backends: + See Docs/Architekture.md § Deployment Constraints for full details. + +MULTI-WORKER SOLUTION (Future): + If multi-worker support is needed in the future, replace InMemorySessionCache + with a shared backend such as: + - RedisSessionCache — backed by Redis (recommended for production) + - DatabaseSessionCache — backed by SQLite or PostgreSQL + - SharedMemorySessionCache — backed by IPC (for local multi-process) + + The SessionCache Protocol is designed for pluggable backends: class SessionCache(Protocol): def get(token: str) -> Session | None: ... def set(token: str, session: Session, ttl_seconds: float) -> None: ... @@ -36,17 +44,16 @@ MULTI-WORKER SOLUTION: def clear() -> None: ... To add Redis support: - 1. Create RedisSessionCache in this module (implements SessionCache). - 2. Update runtime_state.set_runtime_settings() to instantiate RedisSessionCache - when REDIS_URL is configured. - 3. See Backend-Development.md § "Session Cache Pluggability" for details. + 1. Create RedisSessionCache in this module (implements SessionCache) + 2. Update app/main.py _update_session_cache() to instantiate RedisSessionCache + when BANGUI_REDIS_URL is configured + 3. Update Backend-Development.md with multi-worker deployment guidelines -SINGLE-WORKER ENFORCEMENT: - See TASK-002 in Docs/Tasks.md for deployment configuration that enforces - single-worker mode, preventing this issue entirely. - -For now, BanGUI is deployed as single-worker only — this constraint is -acceptable and keeps the implementation simple. +CURRENT STATUS: + For now, BanGUI is deployed as single-worker only. This constraint is + acceptable and keeps the implementation simple. The database-backed scheduler + lock ensures only one instance runs background jobs, even in container + orchestration scenarios where multiple instances may start. """ from __future__ import annotations diff --git a/backend/tests/test_startup_integration.py b/backend/tests/test_startup_integration.py index d51663e..8415348 100644 --- a/backend/tests/test_startup_integration.py +++ b/backend/tests/test_startup_integration.py @@ -83,11 +83,12 @@ async def test_startup_shared_resources_complete_flow() -> None: mock_blocklist_import_register.return_value = None # Call startup_shared_resources - http_session, scheduler = await startup_shared_resources(app, settings) + http_session, scheduler, startup_db = await startup_shared_resources(app, settings) # Verify all stages completed successfully assert http_session is not None assert scheduler is not None + assert startup_db is not None assert scheduler.running # Verify resources were initialized @@ -178,7 +179,7 @@ async def test_startup_shared_resources_scheduler_starts() -> None: mock_geo_cache.init_geoip = MagicMock() mock_geo_cache_class.return_value = mock_geo_cache - http_session, scheduler = await startup_shared_resources(app, settings) + http_session, scheduler, startup_db = await startup_shared_resources(app, settings) # Verify scheduler is running assert scheduler.running