refactoring-backend #3

Merged
lukas.pupkalipinski merged 403 commits from refactoring-backend into main 2026-05-20 20:23:46 +02:00
8 changed files with 89 additions and 34 deletions
Showing only changes of commit c4ede71fa6 - Show all commits

View File

@@ -67,4 +67,19 @@ USER bangui
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')" || exit 1
# ⚠️ IMPORTANT: Single-Worker Requirement
# BanGUI must always run as a single worker process:
# - Do NOT pass --workers or --worker-class to uvicorn
# - Do NOT use gunicorn with -w 4 or similar
# - Do NOT override BANGUI_WORKERS to > 1
#
# Why? The session cache is process-local. Multiple workers would cause:
# - Random user logouts (sessions not shared between workers)
# - Duplicate background jobs (each worker runs the scheduler)
# - SQLite lock contention and timeouts
#
# For high availability, use container orchestration (Kubernetes, Docker Swarm)
# to run multiple instances, not multiple workers in a single process.
#
# See Docs/Architekture.md § Deployment Constraints for details.
CMD ["uvicorn", "app.main:create_app", "--factory", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -65,6 +65,8 @@ services:
# Secure=false is intentional for local HTTP development.
# In production, Secure=true prevents session cookies over unencrypted HTTP.
BANGUI_SESSION_COOKIE_SECURE: "false"
# BANGUI_WORKERS should not be set (defaults to 1).
# Never set it to > 1; the session cache is process-local.
volumes:
- ../backend/app:/app/app:z
- ../fail2ban-master:/app/fail2ban-master:ro,z

View File

@@ -58,7 +58,11 @@ services:
BANGUI_FAIL2BAN_SOCKET: "/var/run/fail2ban/fail2ban.sock"
BANGUI_FAIL2BAN_CONFIG_DIR: "/config/fail2ban"
BANGUI_LOG_LEVEL: "info"
BANGUI_WORKERS: "1" # APScheduler requires single worker — do not change
# ⚠️ BANGUI_WORKERS MUST be 1 — see session_cache.py docstring for details
# BanGUI uses a process-local session cache. Multiple workers in a single process
# would cause users to be randomly logged out as sessions wouldn't be shared.
# For HA, run multiple BanGUI instances (each with --workers 1) via orchestration.
BANGUI_WORKERS: "1"
BANGUI_SESSION_SECRET: "${BANGUI_SESSION_SECRET:?Set BANGUI_SESSION_SECRET}"
BANGUI_TIMEZONE: "${BANGUI_TIMEZONE:-UTC}"
volumes:

View File

@@ -41,6 +41,8 @@ services:
- BANGUI_FAIL2BAN_SOCKET=/var/run/fail2ban/fail2ban.sock
- BANGUI_FAIL2BAN_CONFIG_DIR=/config/fail2ban
- BANGUI_LOG_LEVEL=info
# ⚠️ BANGUI_WORKERS MUST be 1 — the session cache is process-local
# Multiple workers would cause random logouts and duplicate background jobs
- BANGUI_SESSION_SECRET=${BANGUI_SESSION_SECRET:?Set BANGUI_SESSION_SECRET}
- BANGUI_TIMEZONE=${BANGUI_TIMEZONE:-UTC}
volumes:

View File

@@ -50,7 +50,6 @@ from app.utils.jail_config import ensure_jail_configs
from app.utils.runtime_state import set_runtime_settings
from app.utils.scheduler_lock import (
acquire_scheduler_lock,
release_scheduler_lock,
)
from app.utils.setup_state import set_setup_complete_cache
@@ -84,7 +83,18 @@ def _check_single_worker_mode() -> None:
raise RuntimeError(
"BanGUI background scheduler cannot run with multiple workers.\n"
f"BANGUI_WORKERS is set to {worker_count}. Set it to 1 or remove it.\n"
"See Architekture.md § Deployment Constraints for details."
"\n"
"Why this matters:\n"
" - Session cache is process-local; users may be randomly logged out\n"
" - Background jobs (blocklist imports, history sync) would run N times\n"
" - Database lock contention will cause timeouts\n"
"\n"
"To fix:\n"
" 1. Remove BANGUI_WORKERS=N from your environment\n"
" 2. Don't pass --workers to uvicorn or -w to gunicorn\n"
" 3. Deploy as a single process (use container orchestration for HA)\n"
"\n"
"See Docs/Architekture.md § Deployment Constraints for details."
)
except ValueError as e:
raise RuntimeError(
@@ -275,14 +285,20 @@ async def _stage_check_worker_mode_and_acquire_lock(startup_db: Any) -> None:
if not await acquire_scheduler_lock(startup_db):
raise RuntimeError(
"Could not acquire scheduler lock. Another BanGUI instance is already running the scheduler.\n"
"\n"
"This prevents duplicate background jobs (blocklist imports, history sync, etc.).\n"
"\n"
"IMPORTANT: This also indicates a possible multi-worker misconfiguration:\n"
" - If BANGUI_WORKERS > 1, multiple workers are trying to acquire the lock\n"
" - If --workers or -w was passed to uvicorn/gunicorn, remove it\n"
" - BanGUI must run with exactly 1 worker process (use HA at container level)\n"
"\n"
"To recover from a stale lock (e.g., after a crash):\n"
" 1. Verify no other BanGUI instances are running\n"
" 2. Inspect the lock: sqlite3 bangui.db 'SELECT * FROM scheduler_lock;'\n"
" 3. If stale, clean it: sqlite3 bangui.db 'DELETE FROM scheduler_lock;'\n"
"\n"
"See Architekture.md § Deployment Constraints for details."
"See Docs/Architekture.md § Deployment Constraints for details."
)

View File

@@ -24,18 +24,26 @@ IMPACT IN MULTI-WORKER DEPLOYMENTS:
- fail2ban activation/recovery tracking (pending_recovery, last_activation)
is per-worker and unreliable across processes.
MULTI-WORKER SOLUTION:
To deploy BanGUI with multiple workers (e.g., via gunicorn -w 4), you must:
1. Replace RuntimeState with a shared store (Redis, shared memory, database).
2. Replace InMemorySessionCache with RedisSessionCache (see session_cache.py).
3. Ensure all workers use the same backend for coordination.
SINGLE-WORKER ENFORCEMENT:
See TASK-002 in Docs/Tasks.md for deployment configuration that enforces
single-worker mode, preventing this issue entirely.
BanGUI enforces single-worker mode at startup:
1. Environment variable check: BANGUI_WORKERS must be 1 or unset
2. Database lock: Only one instance can run the scheduler at a time
3. Startup validation: Fails loudly if multi-worker scenario is detected
For now, BanGUI is deployed as single-worker only — this constraint is
acceptable and keeps the implementation simple.
See Docs/Architekture.md § Deployment Constraints for full details.
MULTI-WORKER SOLUTION (Future):
To deploy BanGUI with multiple workers in the future (e.g., via gunicorn -w 4):
1. Replace RuntimeState with a shared store (Redis, shared memory, database)
2. Replace InMemorySessionCache with a shared backend (Redis, database)
3. Replace APScheduler with a distributed scheduler backend
4. Ensure all workers use the same backend for coordination
CURRENT STATUS:
For now, BanGUI is deployed as single-worker only. This constraint is
acceptable and keeps the implementation simple. The database-backed scheduler
lock ensures only one instance runs background jobs, even in container
orchestration scenarios where multiple instances may start.
"""
from __future__ import annotations

View File

@@ -19,16 +19,24 @@ IMPACT IN MULTI-WORKER DEPLOYMENTS:
- Worker B still has the stale session in its cache → request is accepted.
- User appears still logged in (from their perspective).
This is a security issue: logout does not work reliably across workers.
This is a CRITICAL SECURITY ISSUE: logout does not work reliably across workers.
MULTI-WORKER SOLUTION:
To deploy BanGUI with multiple workers (e.g., via gunicorn -w 4), replace
InMemorySessionCache with a shared backend such as:
- RedisSessionCache — backed by Redis (recommended for production).
- DatabaseSessionCache — backed by SQLite or PostgreSQL.
- SharedMemorySessionCache — backed by IPC (for local multi-process).
SINGLE-WORKER ENFORCEMENT:
BanGUI enforces single-worker mode to prevent this issue:
1. Environment variable check: BANGUI_WORKERS must be 1 or unset
2. Database lock: Only one instance can run the scheduler at a time
3. Startup validation: Fails loudly if multi-worker scenario is detected
The SessionCache Protocol is already designed for pluggable backends:
See Docs/Architekture.md § Deployment Constraints for full details.
MULTI-WORKER SOLUTION (Future):
If multi-worker support is needed in the future, replace InMemorySessionCache
with a shared backend such as:
- RedisSessionCache — backed by Redis (recommended for production)
- DatabaseSessionCache — backed by SQLite or PostgreSQL
- SharedMemorySessionCache — backed by IPC (for local multi-process)
The SessionCache Protocol is designed for pluggable backends:
class SessionCache(Protocol):
def get(token: str) -> Session | None: ...
def set(token: str, session: Session, ttl_seconds: float) -> None: ...
@@ -36,17 +44,16 @@ MULTI-WORKER SOLUTION:
def clear() -> None: ...
To add Redis support:
1. Create RedisSessionCache in this module (implements SessionCache).
2. Update runtime_state.set_runtime_settings() to instantiate RedisSessionCache
when REDIS_URL is configured.
3. See Backend-Development.md § "Session Cache Pluggability" for details.
1. Create RedisSessionCache in this module (implements SessionCache)
2. Update app/main.py _update_session_cache() to instantiate RedisSessionCache
when BANGUI_REDIS_URL is configured
3. Update Backend-Development.md with multi-worker deployment guidelines
SINGLE-WORKER ENFORCEMENT:
See TASK-002 in Docs/Tasks.md for deployment configuration that enforces
single-worker mode, preventing this issue entirely.
For now, BanGUI is deployed as single-worker only — this constraint is
acceptable and keeps the implementation simple.
CURRENT STATUS:
For now, BanGUI is deployed as single-worker only. This constraint is
acceptable and keeps the implementation simple. The database-backed scheduler
lock ensures only one instance runs background jobs, even in container
orchestration scenarios where multiple instances may start.
"""
from __future__ import annotations

View File

@@ -83,11 +83,12 @@ async def test_startup_shared_resources_complete_flow() -> None:
mock_blocklist_import_register.return_value = None
# Call startup_shared_resources
http_session, scheduler = await startup_shared_resources(app, settings)
http_session, scheduler, startup_db = await startup_shared_resources(app, settings)
# Verify all stages completed successfully
assert http_session is not None
assert scheduler is not None
assert startup_db is not None
assert scheduler.running
# Verify resources were initialized
@@ -178,7 +179,7 @@ async def test_startup_shared_resources_scheduler_starts() -> None:
mock_geo_cache.init_geoip = MagicMock()
mock_geo_cache_class.return_value = mock_geo_cache
http_session, scheduler = await startup_shared_resources(app, settings)
http_session, scheduler, startup_db = await startup_shared_resources(app, settings)
# Verify scheduler is running
assert scheduler.running