diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..a5a263a --- /dev/null +++ b/.editorconfig @@ -0,0 +1,33 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true + +[*.py] +indent_style = space +indent_size = 4 + +[*.{js,ts,tsx,jsx}] +indent_style = space +indent_size = 2 + +[*.md] +indent_style = space +indent_size = 2 + +[Dockerfile] +indent_style = space +indent_size = 4 + +[*.yml] +indent_style = space +indent_size = 2 + +[*.yaml] +indent_style = space +indent_size = 2 + +[Makefile] +indent_style = tab \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..6994e16 --- /dev/null +++ b/.env.example @@ -0,0 +1,60 @@ +# ────────────────────────────────────────────────────────────── +# BanGUI — Environment Variables Template +# Copy this file to .env and fill in the values below +# ────────────────────────────────────────────────────────────── + +# Session Secret (REQUIRED) +# Generate a secure random secret for session tokens. +# WARNING: Do not use the same secret across different environments. +# Generate with: python -c 'import secrets; print(secrets.token_hex(32))' +# Example value (64 hex characters; do not reuse): 0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef +BANGUI_SESSION_SECRET= + +# Previous Session Secret (optional) +# Used during secret rotation to accept tokens signed with the old secret. +# Set this to the previous secret when rotating secrets, then unset it once +# all old tokens have expired. This enables gradual rotation without forcing logout. +# Leave empty unless performing a rotation. 
+BANGUI_SESSION_SECRET_PREVIOUS= + +# Timezone (optional, defaults to UTC) +# Use standard timezone names from the IANA Time Zone Database +# Examples: America/New_York, Europe/London, Asia/Tokyo, UTC +BANGUI_TIMEZONE=UTC + +# Backend port (optional, defaults to 8000) +# When using docker-compose, this is the port on your host machine +BANGUI_BACKEND_PORT=8000 + +# Frontend port (optional, defaults to 5173) +# When using docker-compose, this is the port on your host machine +BANGUI_FRONTEND_PORT=5173 + +# Public port (optional, defaults to 8080) +# When using production compose, this is the public-facing port +BANGUI_PORT=8080 + +# IP Geolocation (optional) +# Path to MaxMind GeoLite2-Country MMDB database file (primary resolver). +# Download from: https://www.maxmind.com/en/geolite2/signup +# If not set, geolocation is disabled (or falls back to HTTP if enabled below). +# Example: /data/GeoLite2-Country.mmdb +BANGUI_GEOIP_DB_PATH= + +# IP Geolocation HTTP Fallback (optional, defaults to false) +# ⚠️ SECURITY WARNING: Only enable if you cannot mount the MaxMind database. +# When enabled, unresolved IP addresses are sent unencrypted to ip-api.com. +# This is a privacy and GDPR/CCPA concern. Do NOT enable in production unless necessary. +# Set to "true" to enable (default is "false" for security). +BANGUI_GEOIP_ALLOW_HTTP_FALLBACK=false + +# CORS Configuration (optional) +# Comma-separated list of allowed origins for cross-origin requests. +# Defaults to common localhost development origins (http://localhost:5173, http://127.0.0.1:5173, etc). +# Set this in production to your frontend domain(s). +# Examples: +# BANGUI_CORS_ALLOWED_ORIGINS=https://example.com,https://app.example.com +# BANGUI_CORS_ALLOWED_ORIGINS= (empty to disable CORS) +# WARNING: Do NOT use wildcard "*" — it defeats CORS security when credentials are enabled. 
+BANGUI_CORS_ALLOWED_ORIGINS= + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..20aa935 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,174 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + backend: + name: Backend Tests + runs-on: ubuntu-latest + defaults: + run: + working-directory: backend + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[dev]" + + - name: Run tests with coverage + run: pytest --cov=app --cov-report=term-missing --cov-fail-under=80 + + - name: Upload coverage report + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: backend/htmlcov/ + retention-days: 7 + + ruff: + name: Lint + runs-on: ubuntu-latest + defaults: + run: + working-directory: backend + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install ruff + + - name: Run ruff + run: ruff check . 
+ + mypy: + name: Type Check + runs-on: ubuntu-latest + defaults: + run: + working-directory: backend + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Run mypy + run: mypy app + + import-linter: + name: Import Boundary + runs-on: ubuntu-latest + defaults: + run: + working-directory: backend + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Run import-linter + run: linter + + openapi-breaking-changes: + name: OpenAPI Breaking Changes + runs-on: ubuntu-latest + defaults: + run: + working-directory: backend + # Only run on PRs — main branch push is covered by the baseline-commit step. + if: github.event_name == 'pull_request' + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Generate current OpenAPI spec + run: python scripts/generate_openapi.py current-openapi.json + + - name: Fetch baseline spec from main + run: | + git fetch origin main:main + git show main:backend/openapi.json > baseline-openapi.json 2>/dev/null || \ + echo "{}" > baseline-openapi.json + + - name: Install openapi-diff + run: npm install -g openapi-diff + + - name: Check for breaking changes + run: | + set +e + openapi-diff baseline-openapi.json current-openapi.json --format stylish 2>&1 + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then + echo "BREAKING CHANGE DETECTED — see output above" + exit 1 + fi + echo "No breaking changes found." + + openapi-baseline-commit: + name: OpenAPI Baseline Commit + runs-on: ubuntu-latest + # Only run on push to main (not PRs). 
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main' + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Generate and commit OpenAPI baseline + run: | + python scripts/generate_openapi.py backend/openapi.json + + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + + git add backend/openapi.json + git diff --cached --quiet && echo "No changes to openapi.json" || \ + git commit -m "chore: update OpenAPI baseline spec [skip ci] + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 6d90837..4b21858 100644 --- a/.gitignore +++ b/.gitignore @@ -95,20 +95,16 @@ Thumbs.db # ── Docker dev config ───────────────────────── # Ignore auto-generated linuxserver/fail2ban config files, # but track our custom filter, jail, and documentation. 
-Docker/fail2ban-dev-config/** -!Docker/fail2ban-dev-config/README.md -!Docker/fail2ban-dev-config/fail2ban/ -!Docker/fail2ban-dev-config/fail2ban/filter.d/ -!Docker/fail2ban-dev-config/fail2ban/filter.d/bangui-sim.conf -!Docker/fail2ban-dev-config/fail2ban/filter.d/bangui-access.conf -!Docker/fail2ban-dev-config/fail2ban/jail.d/ -!Docker/fail2ban-dev-config/fail2ban/jail.d/bangui-sim.conf -!Docker/fail2ban-dev-config/fail2ban/jail.d/bangui-access.conf -!Docker/fail2ban-dev-config/fail2ban/jail.d/blocklist-import.conf -!Docker/fail2ban-dev-config/fail2ban/jail.local +data/* # ── Misc ────────────────────────────────────── *.log *.tmp *.bak *.orig + +# ── E2E test results ─────────────────────────── +e2e/results/ +e2e/Instructions.md + +playwright-log.txt diff --git a/.husky/pre-commit b/.husky/pre-commit new file mode 100644 index 0000000..7904292 --- /dev/null +++ b/.husky/pre-commit @@ -0,0 +1 @@ +cd frontend && npm run validate:types diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..59368f6 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,23 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-merge-conflict + - id: check-added-large-files + + - repo: https://github.com/astral-sh/ruff-pre-commit + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/mirrors-prettier + hooks: + - id: prettier + args: [--check] + name: prettier (frontend) + files: ^frontend/ + entry: prettier --check + language: system \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..12de40d --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,157 @@ +# Contributing to BanGUI + +Welcome! This guide covers everything you need to know to set up your dev environment, understand the codebase, and submit changes. 
+ +--- + +## Dev Setup + +### 1 — Clone and init + +```bash +git clone <repo-url> +cd BanGUI +cp .env.example .env +python -c 'import secrets; print(secrets.token_hex(32))' +# paste output as BANGUI_SESSION_SECRET in .env +``` + +### 2 — Start the stack + +```bash +make up +``` + +Backend: http://127.0.0.1:8000 · Frontend (Vite proxy): http://127.0.0.1:5173 + +### 3 — Editor Setup + +Install the **EditorConfig** plugin for your IDE; it ensures consistent formatting (indent style, line endings) across all editors. + +| IDE | Plugin | +|-----|--------| +| VS Code | EditorConfig (EditorConfig.EditorConfig) | +| PyCharm / IntelliJ | Built-in (enable in Settings → Editor → Code Style) | +| Vim / Neovim | editorconfig-vim | +| Sublime Text | EditorConfig | + +### 4 — Pre-commit hooks + +**Backend** (pre-commit, all languages): + +```bash +pip install pre-commit +pre-commit install +``` + +**Frontend** (husky, TypeScript validation): + +```bash +cd frontend && npm install +npx husky install +``` + +Hooks run automatically on every `git commit`. 
To run manually: + +```bash +pre-commit run --all-files # backend hooks +cd frontend && npm run validate:types # frontend type check +``` + +--- + +## Project Structure + +``` +BanGUI/ +├── backend/ Python FastAPI app +│ └── app/ +│ ├── routers/ HTTP endpoint handlers +│ ├── services/ Business logic +│ ├── repos/ Data access +│ ├── models/ Pydantic request/response/domain models +│ └── utils/ Shared helpers +├── frontend/ React + TypeScript + Fluent UI v9 +│ └── src/ +│ ├── pages/ Route-level page components +│ ├── components/ Reusable UI components +│ ├── hooks/ Custom React hooks +│ └── types/ Shared TypeScript types +├── Docs/ Architecture, design, and feature documentation +└── Docker/ Container compose files +``` + +--- + +## Code Quality + +| Tool | Scope | Command | +|---|---|---| +| `ruff` | Backend linting | `cd backend && ruff check .` | +| `ruff-format` | Backend formatting | `cd backend && ruff format .` | +| `mypy --strict` | Backend type checking | `cd backend && mypy --strict app` | +| `tsc --noEmit` | Frontend type checking | `cd frontend && tsc --noEmit` | +| `eslint` | Frontend linting | `cd frontend && eslint src` | +| `prettier --check` | Frontend formatting | `cd frontend && prettier --check src` | +| `import-linter` | Layer boundary enforcement | `cd backend && linter` | + +**All checks must pass before committing.** CI runs the same suite. + +--- + +## Testing + +```bash +# Backend +cd backend && pytest --cov=app --cov-report=term-missing + +# Coverage threshold: 80%. Build fails if coverage drops below. +``` + +The CI pipeline enforces the same 80% minimum coverage threshold. + +--- + +## Security Rules + +### Never echo raw user input in error messages + +User-supplied values (jail names, filter names, action names, IPs, filenames, etc.) +MUST be sanitized before interpolation into any string that may be rendered in an +HTML context (error messages, admin UI, email notifications). 
+ +Use the `sanitize_for_display()` helper from `app.utils.display_sanitizer`: + +```python +from app.utils.display_sanitizer import sanitize_for_display + +# Good: sanitized before display +super().__init__(f"Jail not found: {sanitize_for_display(name)!r}") + +# Bad: raw user input echoed — XSS vector if rendered as HTML +super().__init__(f"Jail not found: {name!r}") +``` + +This rule applies even when the value has been validated: validation checks the +format, not the rendering context. JSON API responses do NOT need sanitization +(JSON is not HTML); apply it only at HTML render boundaries. + +--- + +## Stack + +| Layer | Stack | +|---|---| +| Backend | Python 3.12+, FastAPI, Pydantic v2, aiosqlite, structlog | +| Frontend | TypeScript, React, Fluent UI v9, Vite | +| Container | Docker Compose (development + production) | + +--- + +## Key Docs + +- [Instructions.md](Docs/Instructions.md) — Agent operating rules +- [Backend-Development.md](Docs/Backend-Development.md) — Backend conventions +- [Web-Development.md](Docs/Web-Development.md) — Frontend conventions +- [Features.md](Docs/Features.md) — Complete feature list +- [Architekture.md](Docs/Architekture.md) — System architecture \ No newline at end of file diff --git a/Docker/Dockerfile.backend b/Docker/Dockerfile.backend index 849e552..665ec13 100644 --- a/Docker/Dockerfile.backend +++ b/Docker/Dockerfile.backend @@ -7,6 +7,11 @@ # Usage: # docker build -t bangui-backend -f Docker/Dockerfile.backend . # podman build -t bangui-backend -f Docker/Dockerfile.backend . 
+# +# Signal handling: +# - STOPSIGNAL defaults to SIGTERM (handled by uvicorn → lifespan shutdown) +# - stop_grace_period in docker-compose.yml controls Docker's kill timeout +# - Python code allows 25s for in-flight tasks to drain before hard kill # ────────────────────────────────────────────────────────────── # ── Stage 1: build dependencies ────────────────────────────── @@ -33,6 +38,11 @@ FROM docker.io/library/python:3.12-slim AS runtime LABEL maintainer="BanGUI" \ description="BanGUI backend — fail2ban web management API" +# Install curl for healthcheck (used by Docker HEALTHCHECK and Compose healthcheck) +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl \ + && rm -rf /var/lib/apt/lists/* + # Non-root user for security RUN groupadd --gid 1000 bangui \ && useradd --uid 1000 --gid bangui --shell /bin/bash --create-home bangui @@ -56,14 +66,32 @@ VOLUME ["/data"] # Default environment values (override at runtime) ENV BANGUI_DATABASE_PATH="/data/bangui.db" \ BANGUI_FAIL2BAN_SOCKET="/var/run/fail2ban/fail2ban.sock" \ - BANGUI_LOG_LEVEL="info" + BANGUI_LOG_LEVEL="info" \ + BANGUI_WORKERS="1" EXPOSE 8000 USER bangui # Health-check using the built-in health endpoint -HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ - CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')" || exit 1 +# Returns exit 0 (success) for HTTP 200 (fail2ban online) +# Returns exit 1 (failure) for HTTP 503 (fail2ban offline) +HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ + CMD curl -f http://localhost:8000/api/health || exit 1 +# ⚠️ IMPORTANT: Single-Worker Requirement +# BanGUI must always run as a single worker process: +# - Do NOT pass --workers or --worker-class to uvicorn +# - Do NOT use gunicorn with -w 4 or similar +# - Do NOT override BANGUI_WORKERS to > 1 +# +# Why? The session cache is process-local. 
Multiple workers would cause: +# - Random user logouts (sessions not shared between workers) +# - Duplicate background jobs (each worker runs the scheduler) +# - SQLite lock contention and timeouts +# +# For high availability, use container orchestration (Kubernetes, Docker Swarm) +# to run multiple instances, not multiple workers in a single process. +# +# See Docs/Architekture.md § Deployment Constraints for details. CMD ["uvicorn", "app.main:create_app", "--factory", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Docker/compose.debug.yml b/Docker/compose.debug.yml index 84c9eb2..9f9b307 100644 --- a/Docker/compose.debug.yml +++ b/Docker/compose.debug.yml @@ -31,10 +31,10 @@ services: PUID: 0 PGID: 0 volumes: - - ./fail2ban-dev-config:/config + - ../data/fail2ban-dev-config:/config - fail2ban-dev-run:/var/run/fail2ban - /var/log:/var/log:ro - - ./logs:/remotelogs/bangui + - ../data/log:/remotelogs/bangui healthcheck: test: ["CMD", "fail2ban-client", "ping"] interval: 15s @@ -58,17 +58,22 @@ services: BANGUI_DATABASE_PATH: "/data/bangui.db" BANGUI_FAIL2BAN_SOCKET: "/var/run/fail2ban/fail2ban.sock" BANGUI_FAIL2BAN_CONFIG_DIR: "/config/fail2ban" + BANGUI_LOG_FILE: "/data/log/bangui.log" BANGUI_LOG_LEVEL: "debug" - BANGUI_SESSION_SECRET: "${BANGUI_SESSION_SECRET:-dev-secret-do-not-use-in-production}" + BANGUI_ENABLE_DOCS: "true" + BANGUI_SESSION_SECRET: "${BANGUI_SESSION_SECRET:?BANGUI_SESSION_SECRET must be set — generate with: python -c 'import secrets; print(secrets.token_hex(32))'}" BANGUI_TIMEZONE: "${BANGUI_TIMEZONE:-UTC}" + # Secure=false is intentional for local HTTP development. + # In production, Secure=true prevents session cookies over unencrypted HTTP. + BANGUI_SESSION_COOKIE_SECURE: "false" + # BANGUI_WORKERS should not be set (defaults to 1). + # Never set it to > 1; the session cache is process-local. 
volumes: - ../backend/app:/app/app:z - ../fail2ban-master:/app/fail2ban-master:ro,z - - bangui-dev-data:/data + - ../data:/data - fail2ban-dev-run:/var/run/fail2ban:ro - - ./fail2ban-dev-config:/config:rw - ports: - - "${BANGUI_BACKEND_PORT:-8000}:8000" + - ../data/fail2ban-dev-config:/config:rw command: [ "uvicorn", "app.main:create_app", "--factory", @@ -76,13 +81,12 @@ services: "--reload", "--reload-dir", "/app/app" ] healthcheck: - test: ["CMD-SHELL", "python -c 'import urllib.request; urllib.request.urlopen(\"http://127.0.0.1:8000/api/health\", timeout=4)'"] + test: ["CMD-SHELL", "python -c 'import urllib.request; urllib.request.urlopen(\"http://127.0.0.1:8000/api/v1/health/live\", timeout=4)'"] interval: 15s timeout: 5s start_period: 45s retries: 5 - networks: - - bangui-dev-net + network_mode: host # ── Frontend (Vite dev server with HMR) ───────────────────── frontend: @@ -92,23 +96,15 @@ services: working_dir: /app environment: NODE_ENV: development + VITE_BACKEND_URL: "http://localhost:8000" volumes: - ../frontend:/app:z - frontend-node-modules:/app/node_modules - ports: - - "${BANGUI_FRONTEND_PORT:-5173}:5173" command: ["sh", "-c", "npm install && npm run dev -- --host 0.0.0.0"] depends_on: backend: condition: service_healthy - healthcheck: - test: ["CMD", "wget", "-qO", "/dev/null", "http://localhost:5173/"] - interval: 15s - timeout: 5s - start_period: 30s - retries: 5 - networks: - - bangui-dev-net + network_mode: host volumes: bangui-dev-data: diff --git a/Docker/compose.prod.yml b/Docker/compose.prod.yml deleted file mode 100644 index 0e1ee72..0000000 --- a/Docker/compose.prod.yml +++ /dev/null @@ -1,109 +0,0 @@ -# ────────────────────────────────────────────────────────────── -# BanGUI — Production Compose -# -# Compatible with: -# docker compose -f Docker/compose.prod.yml up -d -# podman compose -f Docker/compose.prod.yml up -d -# podman-compose -f Docker/compose.prod.yml up -d -# -# Prerequisites: -# Create a .env file at the project root (or 
pass --env-file): -# BANGUI_SESSION_SECRET= -# ────────────────────────────────────────────────────────────── - -name: bangui - -services: - # ── fail2ban ───────────────────────────────────────────────── - fail2ban: - image: lscr.io/linuxserver/fail2ban:latest - container_name: bangui-fail2ban - restart: unless-stopped - cap_add: - - NET_ADMIN - - NET_RAW - network_mode: host - environment: - TZ: "${BANGUI_TIMEZONE:-UTC}" - PUID: 0 - PGID: 0 - volumes: - - fail2ban-config:/config - - fail2ban-run:/var/run/fail2ban - - /var/log:/var/log:ro - healthcheck: - test: ["CMD", "fail2ban-client", "ping"] - interval: 30s - timeout: 5s - start_period: 15s - retries: 3 - # NOTE: The fail2ban-config volume must be pre-populated with the following files: - # • fail2ban/jail.conf (or jail.d/*.conf) with the DEFAULT section containing: - # banaction = iptables-allports[lockingopt="-w 5"] - # This prevents xtables lock contention errors when multiple jails start in parallel. - # See https://fail2ban.readthedocs.io/en/latest/development/environment.html - - # ── Backend (FastAPI + uvicorn) ───────────────────────────── - backend: - build: - context: .. 
- dockerfile: Docker/Dockerfile.backend - container_name: bangui-backend - restart: unless-stopped - depends_on: - fail2ban: - condition: service_healthy - environment: - BANGUI_DATABASE_PATH: "/data/bangui.db" - BANGUI_FAIL2BAN_SOCKET: "/var/run/fail2ban/fail2ban.sock" - BANGUI_FAIL2BAN_CONFIG_DIR: "/config/fail2ban" - BANGUI_LOG_LEVEL: "info" - BANGUI_SESSION_SECRET: "${BANGUI_SESSION_SECRET:?Set BANGUI_SESSION_SECRET}" - BANGUI_TIMEZONE: "${BANGUI_TIMEZONE:-UTC}" - volumes: - - bangui-data:/data - - fail2ban-run:/var/run/fail2ban:ro - - fail2ban-config:/config:rw - expose: - - "8000" - healthcheck: - test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/health')"] - interval: 30s - timeout: 5s - start_period: 10s - retries: 3 - networks: - - bangui-net - - # ── Frontend (nginx serving built SPA + API proxy) ────────── - frontend: - build: - context: .. - dockerfile: Docker/Dockerfile.frontend - container_name: bangui-frontend - restart: unless-stopped - ports: - - "${BANGUI_PORT:-8080}:80" - depends_on: - backend: - condition: service_healthy - healthcheck: - test: ["CMD", "wget", "-qO", "/dev/null", "http://localhost:80/"] - interval: 30s - timeout: 5s - start_period: 5s - retries: 3 - networks: - - bangui-net - -volumes: - bangui-data: - driver: local - fail2ban-config: - driver: local - fail2ban-run: - driver: local - -networks: - bangui-net: - driver: bridge diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml deleted file mode 100644 index 70ced48..0000000 --- a/Docker/docker-compose.yml +++ /dev/null @@ -1,73 +0,0 @@ -version: '3.8' -services: - fail2ban: - image: lscr.io/linuxserver/fail2ban:latest - container_name: fail2ban - cap_add: - - NET_ADMIN - - NET_RAW - network_mode: host - environment: - - PUID=1011 - - PGID=1001 - - TZ=Etc/UTC - - VERBOSITY=-vv #optional - - volumes: - - /server/server_fail2ban/config:/config - - /server/server_fail2ban/fail2ban-run:/var/run/fail2ban - - 
/var/log:/var/log - - /server/server_nextcloud/config/nextcloud.log:/remotelogs/nextcloud/nextcloud.log:ro #optional - - /server/server_nginx/data/logs:/remotelogs/nginx:ro #optional - - /server/server_gitea/log/gitea.log:/remotelogs/gitea/gitea.log:ro #optional - - - #- /path/to/homeassistant/log:/remotelogs/homeassistant:ro #optional - #- /path/to/unificontroller/log:/remotelogs/unificontroller:ro #optional - #- /path/to/vaultwarden/log:/remotelogs/vaultwarden:ro #optional - restart: unless-stopped - - backend: - image: git.lpl-mind.de/lukas.pupkalipinski/bangui/backend:latest - container_name: bangui-backend - restart: unless-stopped - depends_on: - fail2ban: - condition: service_started - environment: - - PUID=1011 - - PGID=1001 - - BANGUI_DATABASE_PATH=/data/bangui.db - - BANGUI_FAIL2BAN_SOCKET=/var/run/fail2ban/fail2ban.sock - - BANGUI_FAIL2BAN_CONFIG_DIR=/config/fail2ban - - BANGUI_LOG_LEVEL=info - - BANGUI_SESSION_SECRET=${BANGUI_SESSION_SECRET:?Set BANGUI_SESSION_SECRET} - - BANGUI_TIMEZONE=${BANGUI_TIMEZONE:-UTC} - volumes: - - /server/server_fail2ban/bangui-data:/data - - /server/server_fail2ban/fail2ban-run:/var/run/fail2ban:ro - - /server/server_fail2ban/config:/config:rw - expose: - - "8000" - networks: - - bangui-net - - # ── Frontend (nginx serving built SPA + API proxy) ────────── - frontend: - image: git.lpl-mind.de/lukas.pupkalipinski/bangui/frontend:latest - container_name: bangui-frontend - restart: unless-stopped - environment: - - PUID=1011 - - PGID=1001 - ports: - - "${BANGUI_PORT:-8080}:80" - depends_on: - backend: - condition: service_started - networks: - - bangui-net - -networks: - bangui-net: - name: bangui-net \ No newline at end of file diff --git a/Docker/fail2ban-dev-config/README.md b/Docker/fail2ban-dev-config/README.md deleted file mode 100644 index 6422e00..0000000 --- a/Docker/fail2ban-dev-config/README.md +++ /dev/null @@ -1,147 +0,0 @@ -# BanGUI — Fail2ban Dev Test Environment - -This directory contains the fail2ban 
configuration and supporting scripts for a -self-contained development test environment. A simulation script writes fake -authentication-failure log lines, fail2ban detects them via the `manual-Jail` -jail, and bans the offending IP — giving a fully reproducible ban/unban cycle -without a real service. - ---- - -## Prerequisites - -- Docker or Podman installed and running. -- `docker compose` (v2) or `podman-compose` available on the `PATH`. -- The repo checked out; all commands run from the **repo root**. - ---- - -## Quick Start - -### 1 — Start the fail2ban container - -```bash -docker compose -f Docker/compose.debug.yml up -d fail2ban -# or: make up (starts the full dev stack) -``` - -Wait ~15 s for the health-check to pass (`docker ps` shows `healthy`). - -### 2 — Run the login-failure simulation - -```bash -bash Docker/simulate_failed_logins.sh -``` - -Default: writes **5** failure lines for IP `192.168.100.99` to -`Docker/logs/auth.log`. -Optional overrides: - -```bash -bash Docker/simulate_failed_logins.sh -# e.g. bash Docker/simulate_failed_logins.sh 10 203.0.113.42 -``` - -### 3 — Verify the IP was banned - -```bash -bash Docker/check_ban_status.sh -``` - -The output shows the current jail counters and the list of banned IPs with their -ban expiry timestamps. - -### 4 — Unban and re-test - -```bash -bash Docker/check_ban_status.sh --unban 192.168.100.99 -``` - -### One-command smoke test (Makefile shortcut) - -```bash -make dev-ban-test -``` - -Chains steps 1–3 automatically with appropriate sleep intervals. 
- ---- - -## Configuration Reference - -| File | Purpose | -|------|---------| -| `fail2ban/filter.d/manual-Jail.conf` | Defines the `failregex` that matches simulation log lines | -| `fail2ban/jail.d/manual-Jail.conf` | Jail settings: `maxretry=3`, `bantime=60s`, `findtime=120s` | -| `Docker/logs/auth.log` | Log file written by the simulation script (host path) | - -Inside the container the log file is mounted at `/remotelogs/bangui/auth.log` -(see `fail2ban/paths-lsio.conf` — `remote_logs_path = /remotelogs`). - -BanGUI also extends fail2ban history retention for archive backfill. In -the development config `fail2ban/fail2ban.conf` the database purge age is -set to `648000` seconds (7.5 days) so the first archive sync can recover a -full 7-day window before fail2ban purges old rows. - -To change sensitivity, edit `fail2ban/jail.d/manual-Jail.conf`: - -```ini -maxretry = 3 # failures before a ban -findtime = 120 # look-back window in seconds -bantime = 60 # ban duration in seconds -``` - ---- - -## Troubleshooting - -### Log file not detected - -The jail uses `backend = polling` for reliability inside Docker containers. -If fail2ban still does not pick up new lines, verify the volume mount in -`Docker/compose.debug.yml`: - -```yaml -- ./logs:/remotelogs/bangui -``` - -and confirm `Docker/logs/auth.log` exists after running the simulation script. - -### Filter regex mismatch - -Test the regex manually: - -```bash -docker exec bangui-fail2ban-dev \ - fail2ban-regex /remotelogs/bangui/auth.log manual-Jail -``` - -The output should show matched lines. If nothing matches, check that the log -lines match the corresponding `failregex` pattern: - -``` -# manual-Jail (auth log): -YYYY-MM-DD HH:MM:SS bangui-auth: authentication failure from -``` - -### iptables / permission errors - -The fail2ban container requires `NET_ADMIN` and `NET_RAW` capabilities and -`network_mode: host`. Both are already set in `Docker/compose.debug.yml`. 
If -you see iptables errors, check that the host kernel has iptables loaded: - -```bash -sudo modprobe ip_tables -``` - -### IP not banned despite enough failures - -Check whether the source IP falls inside the `ignoreip` range defined in -`fail2ban/jail.d/manual-Jail.conf`: - -```ini -ignoreip = 127.0.0.0/8 ::1 172.16.0.0/12 -``` - -The default simulation IP `192.168.100.99` is outside these ranges and will be -banned normally. diff --git a/Docker/fail2ban-dev-config/fail2ban/filter.d/manual-Jail.conf b/Docker/fail2ban-dev-config/fail2ban/filter.d/manual-Jail.conf deleted file mode 100644 index 48019ec..0000000 --- a/Docker/fail2ban-dev-config/fail2ban/filter.d/manual-Jail.conf +++ /dev/null @@ -1,13 +0,0 @@ -# ────────────────────────────────────────────────────────────── -# BanGUI — Simulated authentication failure filter -# -# Matches lines written by Docker/simulate_failed_logins.sh -# Format: bangui-auth: authentication failure from -# Jail: manual-Jail -# ────────────────────────────────────────────────────────────── - -[Definition] - -failregex = ^.* bangui-auth: authentication failure from \s*$ - -ignoreregex = diff --git a/Docker/fail2ban-dev-config/fail2ban/jail.d/blocklist-import.conf b/Docker/fail2ban-dev-config/fail2ban/jail.d/blocklist-import.conf deleted file mode 100644 index 0bae8b3..0000000 --- a/Docker/fail2ban-dev-config/fail2ban/jail.d/blocklist-import.conf +++ /dev/null @@ -1,25 +0,0 @@ -# ────────────────────────────────────────────────────────────── -# BanGUI — Blocklist-import jail -# -# Dedicated jail for IPs banned via the BanGUI blocklist import -# feature. This is a manual-ban jail: it does not watch any log -# file. All bans are injected programmatically via -# fail2ban-client set blocklist-import banip -# which the BanGUI backend uses through its fail2ban socket -# client. 
-# ────────────────────────────────────────────────────────────── - -[blocklist-import] - -enabled = true -# No log-based detection — only manual banip commands are used. -filter = -logpath = /dev/null -backend = auto -maxretry = 1 -findtime = 1d -# Block imported IPs for 24 hours. -bantime = 86400 - -# Never ban the Docker bridge network or localhost. -ignoreip = 127.0.0.0/8 ::1 172.16.0.0/12 diff --git a/Docker/fail2ban-dev-config/fail2ban/jail.d/manual-Jail.conf b/Docker/fail2ban-dev-config/fail2ban/jail.d/manual-Jail.conf deleted file mode 100644 index 00a9a82..0000000 --- a/Docker/fail2ban-dev-config/fail2ban/jail.d/manual-Jail.conf +++ /dev/null @@ -1,19 +0,0 @@ -# ────────────────────────────────────────────────────────────── -# BanGUI — Simulated authentication failure jail -# -# Watches Docker/logs/auth.log (mounted at /remotelogs/bangui) -# for lines produced by Docker/simulate_failed_logins.sh. -# ────────────────────────────────────────────────────────────── - -[manual-Jail] - -enabled = true -filter = manual-Jail -logpath = /remotelogs/bangui/auth.log -backend = polling -maxretry = 3 -findtime = 120 -bantime = 60 - -# Never ban localhost, the Docker bridge network, or the host machine. -ignoreip = 127.0.0.0/8 ::1 172.16.0.0/12 diff --git a/Docker/fail2ban-dev-config/fail2ban/jail.local b/Docker/fail2ban-dev-config/fail2ban/jail.local deleted file mode 100644 index f66226e..0000000 --- a/Docker/fail2ban-dev-config/fail2ban/jail.local +++ /dev/null @@ -1,6 +0,0 @@ -# Local overrides — not overwritten by the container init script. -# Provides banaction so all jails can resolve %(action_)s interpolation. 
- -[DEFAULT] -banaction = iptables-multiport -banaction_allports = iptables-allports diff --git a/Docker/nginx.conf b/Docker/nginx.conf index 5910ccf..5efa829 100644 --- a/Docker/nginx.conf +++ b/Docker/nginx.conf @@ -10,6 +10,15 @@ server { gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss text/javascript image/svg+xml; gzip_min_length 256; + # ── Security headers ───────────────────────────────────── + add_header Content-Security-Policy "default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' data:; font-src 'self'; connect-src 'self'; frame-ancestors 'none';" always; + add_header X-Frame-Options "DENY" always; + add_header X-Content-Type-Options "nosniff" always; + add_header Referrer-Policy "no-referrer" always; + add_header Permissions-Policy "geolocation=(), microphone=(), camera=()" always; + # Uncomment when HTTPS is fully configured: + # add_header Strict-Transport-Security "max-age=63072000; includeSubDomains; preload" always; + # ── API reverse proxy → backend container ───────────────── location /api/ { proxy_pass http://backend:8000; diff --git a/Docker/simulate_failed_logins.sh b/Docker/simulate_failed_logins.sh index a0ad9ac..dfe534e 100644 --- a/Docker/simulate_failed_logins.sh +++ b/Docker/simulate_failed_logins.sh @@ -11,7 +11,7 @@ # Defaults: # COUNT : 5 # SOURCE_IP: 192.168.100.99 -# LOG_FILE : Docker/logs/auth.log (relative to repo root) +# LOG_FILE : data/log/auth.log (relative to repo root) # # Log line format (must match manual-Jail failregex exactly): # YYYY-MM-DD HH:MM:SS bangui-auth: authentication failure from @@ -25,7 +25,7 @@ readonly DEFAULT_IP="192.168.100.99" # Resolve script location so defaults work regardless of cwd. 
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -readonly DEFAULT_LOG_FILE="${SCRIPT_DIR}/logs/auth.log" +readonly DEFAULT_LOG_FILE="${SCRIPT_DIR}/../data/log/auth.log" # ── Arguments ───────────────────────────────────────────────── COUNT="${1:-${DEFAULT_COUNT}}" diff --git a/Docs/API-Reference.md b/Docs/API-Reference.md new file mode 100644 index 0000000..4898d94 --- /dev/null +++ b/Docs/API-Reference.md @@ -0,0 +1,1338 @@ +# BanGUI API Reference + +Complete reference for the BanGUI REST API. All endpoints require authentication unless noted as public. + +Base URL: `http://{host}:8000` + +**Authentication** — All protected endpoints require a valid session cookie (`bangui_session`) or `Authorization: Bearer <token>` header. + +--- + +## Public Endpoints + +### `GET /api/v1/health` + +Health check. No auth required. + +**Response `200`** +```json +{ + "status": "ok", + "fail2ban": "online", + "database": "ok", + "scheduler": "running", + "cache": "initialised", + "components": [] +} +``` + +| Field | Description | +|---|---| +| `status` | `ok`, `degraded`, or `unavailable` | +| `fail2ban` | `online` or `offline` | +| `database` | `ok` or `error` | +| `scheduler` | `running`, `stopped`, or `unknown` | +| `cache` | `initialised` or `uninitialised` | +| `components` | List of unhealthy components (empty when `ok`) | + +**Response `503`** — fail2ban offline. + +--- + +### `GET /api/v1/setup` + +Check whether initial setup has been completed. + +**Response `200`** +```json +{ "completed": true } +``` + +--- + +### `POST /api/v1/setup` + +Run the first-run setup wizard. 
+ +**Request** +```json +{ + "master_password": "Hallo123!", + "database_path": "/var/lib/fail2ban/fail2ban.sqlite3", + "fail2ban_socket": "/var/run/fail2ban/fail2ban.sock", + "timezone": "Europe/Berlin", + "session_duration_minutes": 480 +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `master_password` | string | Yes | Min 8 chars, uppercase + number + special (`!@#$%^&*()`) | +| `database_path` | string | No | Path to fail2ban DB (default: `/var/lib/fail2ban/fail2ban.sqlite3`) | +| `fail2ban_socket` | string | No | Path to fail2ban socket (default: `/var/run/fail2ban/fail2ban.sock`) | +| `timezone` | string | No | IANA timezone (default: `UTC`) | +| `session_duration_minutes` | int | No | Session TTL in minutes (default: `480`) | + +**Response `201`** — Setup completed. + +**Response `409`** — Setup already completed. + +--- + +### `GET /api/v1/setup/timezone` + +Returns the configured IANA timezone. + +**Response `200`** +```json +{ "timezone": "Europe/Berlin" } +``` + +--- + +### `GET /metrics` + +Prometheus metrics endpoint. No auth required. Returns OpenMetrics text format. + +--- + +## Auth + +### `POST /api/v1/auth/login` + +Authenticate with the master password. + +**Request** +```json +{ "password": "Hallo123!" } +``` + +> Note: The frontend SHA256-hashes the password before sending. The backend expects the already-hashed value. + +**Response `200`** — Sets `bangui_session` cookie. +```json +{ "expires_at": "2024-12-25T10:00:00Z" } +``` + +**Response `401`** — Invalid password. + +**Response `429`** — Too many login attempts (exponential backoff delay). Includes `Retry-After` header. + +**Response `503`** — Setup not complete. + +--- + +### `GET /api/v1/auth/session` + +Validate the current session. + +**Response `200`** +```json +{ "valid": true } +``` + +**Response `401`** — Session missing, expired, or invalid. + +--- + +### `POST /api/v1/auth/logout` + +Revoke the current session. 
+ +**Response `200`** +```json +{} +``` + +Session cookie is cleared. Idempotent — returns `200` even if no session present. + +--- + +## Dashboard + +### `GET /api/v1/dashboard/status` + +Returns the cached fail2ban server health snapshot (refreshed every 30 seconds). + +**Response `200`** +```json +{ + "status": { + "version": "0.12.1", + "online": true, + "uptime": 86400, + "jail_count": 3 + } +} +``` + +| Field | Type | Description | +|---|---|---| +| `version` | string | fail2ban server version | +| `online` | bool | Whether fail2ban daemon is reachable | +| `uptime` | int | Daemon uptime in seconds | +| `jail_count` | int | Number of configured jails | + +**Response `401`** — Not authenticated. + +**Response `502`** — fail2ban unreachable. + +--- + +### `GET /api/v1/dashboard/bans` + +Paginated list of recent bans with geo enrichment. + +**Query Parameters** + +| Param | Type | Default | Description | +|---|---|---|---| +| `range` | `TimeRange` | `24h` | Time window: `24h`, `7d`, `30d`, `365d` | +| `source` | string | `fail2ban` | Data source: `fail2ban` or `archive` | +| `page` | int | `1` | 1-based page number | +| `page_size` | int | `100` | Items per page (max 500) | +| `origin` | string | null | Filter: `blocklist` or `selfblock` | + +**Response `200`** +```json +{ + "items": [ + { + "ip": "1.2.3.4", + "jail": "sshd", + "banned_at": "2024-12-25T08:00:00Z", + "expires_at": "2024-12-26T08:00:00Z", + "country": "US", + "asn": "AS15169", + "org": "Google LLC" + } + ], + "total": 150, + "page": 1, + "page_size": 100 +} +``` + +--- + +### `GET /api/v1/dashboard/bans/by-country` + +Ban counts aggregated by country. + +**Query Parameters** — Same as `/bans` plus optional `country_code` filter. + +**Response `200`** +```json +{ + "countries": { "US": 45, "CN": 32, "BR": 18 }, + "total": 150, + "items": [...] +} +``` + +--- + +### `GET /api/v1/dashboard/bans/trend` + +Ban counts grouped into time buckets for charts. 
+ +**Bucket sizes:** +- `24h` → 1-hour buckets (24 total) +- `7d` → 6-hour buckets (28 total) +- `30d` → 1-day buckets (30 total) +- `365d` → 7-day buckets (~53 total) + +**Query Parameters** — Same as `/bans`. + +**Response `200`** +```json +{ + "buckets": [ + { "ts": "2024-12-24T00:00:00Z", "count": 12 }, + { "ts": "2024-12-24T01:00:00Z", "count": 8 } + ], + "bucket_size": "1h", + "total": 150 +} +``` + +--- + +### `GET /api/v1/dashboard/bans/by-jail` + +Ban counts grouped by jail. + +**Query Parameters** — Same as `/bans`. + +**Response `200`** +```json +{ + "jails": [ + { "jail": "sshd", "count": 120 }, + { "jail": "nginx-http-auth", "count": 30 } + ], + "total": 150 +} +``` + +--- + +## Bans + +### `GET /api/v1/bans/active` + +List all currently banned IPs across all jails. + +**Response `200`** +```json +{ + "items": [ + { + "ip": "1.2.3.4", + "jail": "sshd", + "banned_at": "2024-12-25T08:00:00Z", + "expires_at": "2024-12-26T08:00:00Z", + "country": "US" + } + ], + "total": 42 +} +``` + +**Response `401`** — Not authenticated. + +**Response `502`** — fail2ban unreachable. + +--- + +### `POST /api/v1/bans` + +Ban an IP address in a specific jail. + +**Request** +```json +{ "jail": "sshd", "ip": "5.6.7.8" } +``` + +**Response `201`** +```json +{ "message": "IP '5.6.7.8' banned in jail 'sshd'.", "jail": "sshd" } +``` + +**Response `400`** — Invalid IP address. + +**Response `404`** — Jail not found. + +**Response `409`** — Ban command failed in fail2ban. + +**Response `429`** — Rate limit exceeded (10 ban requests/minute per IP). + +**Response `502`** — fail2ban unreachable. + +--- + +### `DELETE /api/v1/bans` + +Unban an IP from one or all jails. 
+ +**Request** +```json +{ "ip": "5.6.7.8", "jail": "sshd", "unban_all": false } +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `ip` | string | Yes | IP address to unban | +| `jail` | string | No | Specific jail to unban from | +| `unban_all` | bool | No | `true` = unban from all jails (default: `false` if `jail` omitted) | + +**Response `200`** +```json +{ "message": "IP '5.6.7.8' unbanned from jail 'sshd'.", "jail": "sshd" } +``` + +**Response `404`** — Jail not found. + +**Response `429`** — Rate limit exceeded (10 unban requests/minute per IP). + +--- + +### `DELETE /api/v1/bans/all` + +Unban every currently banned IP across all jails. + +**Response `200`** +```json +{ "message": "All bans cleared. 42 IP addresses unbanned.", "count": 42 } +``` + +--- + +## History + +### `GET /api/v1/history` + +Paginated historical ban records. + +**Query Parameters** + +| Param | Type | Default | Description | +|---|---|---|---| +| `range` | `TimeRange` | null | Time filter: `24h`, `7d`, `30d`, `365d` (null = all-time) | +| `jail` | string | null | Filter by jail name (exact match) | +| `ip` | string | null | Filter by IP prefix | +| `origin` | string | null | Filter: `blocklist` or `selfblock` | +| `source` | string | `fail2ban` | `fail2ban` or `archive` | +| `page` | int | `1` | 1-based page number | +| `page_size` | int | `100` | Items per page (max 500) | + +**Response `200`** +```json +{ + "items": [ + { + "ip": "1.2.3.4", + "jail": "sshd", + "banned_at": "2024-12-25T08:00:00Z", + "unbanned_at": "2024-12-26T08:00:00Z", + "origin": "selfblock", + "country": "US" + } + ], + "total": 500, + "page": 1, + "page_size": 100 +} +``` + +--- + +### `GET /api/v1/history/archive` + +Same as `/history` but reads from the archive database. + +**Query Parameters** — Same as `/history` (no `origin` filter). + +--- + +### `GET /api/v1/history/{ip}` + +Complete ban timeline for a single IP. 
+ +**Response `200`** +```json +{ + "ip": "1.2.3.4", + "country": "US", + "total_bans": 5, + "timeline": [ + { + "jail": "sshd", + "banned_at": "2024-12-25T08:00:00Z", + "unbanned_at": "2024-12-26T08:00:00Z", + "origin": "selfblock" + } + ] +} +``` + +**Response `404`** — No history found for this IP. + +--- + +## Jails + +### `GET /api/v1/jails` + +List all active fail2ban jails. + +**Response `200`** +```json +{ + "items": [ + { + "name": "sshd", + "enabled": true, + "currently_banned": 12, + "total_bans": 150, + "failed_attempts": 320, + "find_time": 600, + "ban_time": 86400, + "max_retries": 5, + "backend": "polling", + "idle": false + } + ], + "total": 3 +} +``` + +--- + +### `GET /api/v1/jails/{name}` + +Full detail for a single jail. + +**Response `200`** +```json +{ + "name": "sshd", + "enabled": true, + "log_paths": ["/var/log/auth.log"], + "fail_regex": ["^%(__prefix_line)sFailed publickey forInvalid user"], + "ignore_regex": [], + "date_pattern": null, + "log_encoding": "UTF-8", + "actions": ["iptables"], + "find_time": 600, + "ban_time": 86400, + "max_retries": 5, + "ignore_list": ["192.168.1.1"], + "ignore_self": true, + "currently_banned": 12, + "total_bans": 150, + "failed_attempts": 320 +} +``` + +**Response `404`** — Jail not found. + +--- + +### `POST /api/v1/jails/{name}/start` + +Start a stopped jail. + +**Response `200`** +```json +{ "message": "Jail 'sshd' started.", "jail": "sshd" } +``` + +--- + +### `POST /api/v1/jails/{name}/stop` + +Stop a running jail. + +**Response `200`** +```json +{ "message": "Jail 'sshd' stopped.", "jail": "sshd" } +``` + +--- + +### `POST /api/v1/jails/{name}/idle` + +Toggle jail idle mode. + +**Request body** +```json +{ "on": true } +``` + +**Response `200`** +```json +{ "message": "Jail 'sshd' idle mode turned on.", "jail": "sshd" } +``` + +--- + +### `POST /api/v1/jails/{name}/reload` + +Reload a single jail. 
+ +**Response `200`** +```json +{ "message": "Jail 'sshd' reloaded.", "jail": "sshd" } +``` + +--- + +### `POST /api/v1/jails/reload-all` + +Reload all fail2ban jails. + +**Response `200`** +```json +{ "message": "All jails reloaded successfully.", "jail": "*" } +``` + +--- + +### `GET /api/v1/jails/{name}/ignoreip` + +Get the ignore (whitelist) list for a jail. + +**Response `200`** +```json +{ "items": ["192.168.1.0/24", "10.0.0.1"], "total": 2 } +``` + +--- + +### `POST /api/v1/jails/{name}/ignoreip` + +Add an IP or CIDR to the ignore list. + +**Request** +```json +{ "ip": "192.168.1.100" } +``` + +**Response `201`** +```json +{ "message": "IP '192.168.1.100' added to ignore list of jail 'sshd'.", "jail": "sshd" } +``` + +**Response `400`** — Invalid IP or network. + +--- + +### `DELETE /api/v1/jails/{name}/ignoreip` + +Remove an IP or CIDR from the ignore list. + +**Request** +```json +{ "ip": "192.168.1.100" } +``` + +**Response `200`** +```json +{ "message": "IP '192.168.1.100' removed from ignore list of jail 'sshd'.", "jail": "sshd" } +``` + +--- + +### `POST /api/v1/jails/{name}/ignoreself` + +Toggle the `ignoreself` option (when on, the server's own IP addresses are never banned). + +**Request** +```json +{ "on": true } +``` + +**Response `200`** +```json +{ "message": "ignoreself enabled for jail 'sshd'.", "jail": "sshd" } +``` + +--- + +### `GET /api/v1/jails/{name}/banned` + +Paginated currently-banned IPs for a specific jail. 
+ +**Query Parameters** + +| Param | Type | Default | Description | +|---|---|---|---| +| `page` | int | `1` | 1-based page number | +| `page_size` | int | `100` | Items per page (max 100) | +| `search` | string | null | Case-insensitive substring filter on IP | + +**Response `200`** +```json +{ + "items": [ + { + "ip": "1.2.3.4", + "banned_at": "2024-12-25T08:00:00Z", + "expires_at": "2024-12-26T08:00:00Z", + "country": "US", + "asn": "AS15169", + "org": "Google LLC" + } + ], + "total": 12, + "page": 1, + "page_size": 100 +} +``` + +--- + +## Config + +### `GET /api/v1/config/global` + +Get global fail2ban settings. + +**Response `200`** +```json +{ + "loglevel": "INFO", + "logtarget": "/var/log/fail2ban.log", + "syslog_socket": "auto", + "db_file": "/var/lib/fail2ban/fail2ban.sqlite3", + "db_purge_age": 86400 +} +``` + +--- + +### `PUT /api/v1/config/global` + +Update global fail2ban settings. + +**Request** — All fields optional (only non-null fields written): +```json +{ + "loglevel": "DEBUG", + "logtarget": "/var/log/fail2ban.log", + "db_purge_age": 604800 +} +``` + +| Field | Type | Description | +|---|---|---| +| `loglevel` | string | `CRITICAL`, `ERROR`, `WARNING`, `NOTICE`, `INFO`, `DEBUG` | +| `logtarget` | string | `STDOUT`, `STDERR`, `SYSLOG`, or a file path | +| `db_purge_age` | int | Seconds before old ban records are purged | + +**Response `204`** — Updated. + +**Response `400`** — `logtarget` invalid (not in allowed directories). + +**Response `429`** — Rate limit exceeded (10 updates/minute per IP). + +--- + +### `POST /api/v1/config/reload` + +Trigger a full fail2ban reload. + +**Response `204`** + +--- + +### `POST /api/v1/config/restart` + +Restart the fail2ban service (stop + start). + +**Response `204`** + +**Response `503`** — fail2ban did not come back online within 10 seconds. + +--- + +### `POST /api/v1/config/regex-test` + +Test a fail regex pattern against a sample log line (stateless, no fail2ban call). 
+ +**Request** +```json +{ "pattern": "^%(__prefix_line)sFailed publickey", "sample": "Dec 25 08:00:01 server sshd[123]: Failed publickey for user admin from 1.2.3.4" } +``` + +**Response `200`** +```json +{ "matched": true, "groups": ["Dec 25 08:00:01", "server", "123", "admin", "1.2.3.4"] } +``` + +--- + +### `POST /api/v1/config/preview-log` + +Read a log file and test a regex against each line. + +**Request** +```json +{ + "path": "/var/log/auth.log", + "pattern": "^Failed publickey", + "lines": 50 +} +``` + +**Response `200`** +```json +{ + "lines": [ + { "line": "Dec 25 08:00:01 server sshd[123]: Failed publickey...", "matched": true, "groups": [...] }, + { "line": "Dec 25 08:00:02 server sshd[456]: Accepted publickey...", "matched": false } + ] +} +``` + +--- + +### `GET /api/v1/config/map-color-thresholds` + +Get map color threshold configuration. + +**Response `200`** +```json +{ + "thresholds": [ + { "count": 0, "color": "#4ade80" }, + { "count": 10, "color": "#facc15" }, + { "count": 50, "color": "#f97316" }, + { "count": 200, "color": "#ef4444" } + ] +} +``` + +--- + +### `PUT /api/v1/config/map-color-thresholds` + +Update map color thresholds. + +**Request** +```json +{ + "thresholds": [ + { "count": 0, "color": "#4ade80" }, + { "count": 100, "color": "#facc15" } + ] +} +``` + +> Thresholds must be strictly ascending by `count`. + +**Response `200`** — Updated thresholds. + +**Response `400`** — Thresholds not properly ordered. + +--- + +### `GET /api/v1/config/fail2ban-log` + +Read the tail of the fail2ban daemon log file. 
+ +**Query Parameters** + +| Param | Type | Default | Description | +|---|---|---|---| +| `lines` | int | `200` | Number of tail lines (1–2000) | +| `filter` | string | null | Plain-text substring filter | + +**Response `200`** +```json +{ + "lines": ["2024-12-25 08:00:01,000 INFO ...", "2024-12-25 08:00:02,000 WARNING ..."], + "count": 2 +} +``` + +--- + +### `GET /api/v1/config/service-status` + +Fail2ban service health with log configuration. + +**Response `200`** +```json +{ + "online": true, + "version": "0.12.1", + "loglevel": "INFO", + "logtarget": "/var/log/fail2ban.log" +} +``` + +--- + +## Filters + +### `GET /api/v1/config/filters` + +List all available filters with active/inactive status. + +**Response `200`** +```json +{ + "items": [ + { + "name": "sshd", + "active": true, + "used_by_jails": ["sshd"], + "source_file": "/etc/fail2ban/filter.d/sshd.conf", + "has_local_override": false, + "failregex": ["^%(__prefix_line)sFailed publickey"], + "ignoreregex": [], + "date_pattern": null, + "journalmatch": null + } + ], + "total": 12 +} +``` + +Active filters (used by running jails) are listed first, sorted alphabetically. Inactive filters follow. + +--- + +### `GET /api/v1/config/filters/{name}` + +Full detail for a single filter. + +**Response `200`** — FilterConfig object (same shape as list item). + +**Response `404`** — Filter not found in `filter.d/`. + +--- + +### `POST /api/v1/config/filters` + +Create a new user-defined filter. + +**Request** +```json +{ + "name": "nginx-404", + "failregex": ["^\\s*\\S+ \\S+ \\S+ GET /nonexistent"], + "ignoreregex": null, + "date_pattern": null, + "journalmatch": null +} +``` + +**Response `201`** — Created FilterConfig object. + +**Response `409`** — Filter with this name already exists. + +**Response `422`** — Regex failed to compile. + +**Response `429`** — Rate limit exceeded (5 creates/minute per IP). + +--- + +### `PUT /api/v1/config/filters/{name}` + +Update a filter's `.local` override. 
Only non-null fields are written. + +**Request** +```json +{ + "failregex": ["^new pattern here"], + "ignoreregex": null +} +``` + +**Query Parameter** — `reload` (bool, default `false`) — trigger fail2ban reload after writing. + +**Response `200`** — Updated FilterConfig object. + +**Response `422`** — Regex failed to compile. + +**Response `429`** — Rate limit exceeded (10 updates/minute per IP). + +--- + +### `DELETE /api/v1/config/filters/{name}` + +Delete a user-created filter's `.local` file. Shipped `.conf`-only filters cannot be deleted. + +**Response `204`** + +**Response `409`** — Filter is a shipped default (conf-only). + +--- + +## Actions + +### `GET /api/v1/config/actions` + +List all available actions with active/inactive status. + +**Response `200`** +```json +{ + "actions": [ + { + "name": "iptables", + "active": true, + "used_by_jails": ["sshd", "nginx-http-auth"], + "source_file": "/etc/fail2ban/action.d/iptables.conf", + "has_local_override": false, + "start_command": "iptables -N f2b-sshd...", + "stop_command": "iptables -X f2b-sshd...", + "check_command": "iptables -L f2b-sshd -n", + "ban_action": "iptables -I f2b-sshd...", + "unban_action": "iptables -D f2b-sshd..." + } + ], + "total": 8 +} +``` + +--- + +### `GET /api/v1/config/actions/{name}` + +Full detail for a single action. + +**Response `200`** — ActionConfig object (same shape as list item). + +--- + +### `POST /api/v1/config/actions` + +Create a new user-defined action. + +**Request** +```json +{ + "name": "my-custom-action", + "start_command": "echo 'starting'", + "stop_command": "echo 'stopping'", + "check_command": "echo 'checking'", + "ban_action": "echo 'banning'", + "unban_action": "echo 'unbanning'" +} +``` + +**Response `201`** — Created ActionConfig object. + +**Response `409`** — Action with this name already exists. + +--- + +### `PUT /api/v1/config/actions/{name}` + +Update an action's `.local` override. 
+ +**Request** — All fields optional: +```json +{ "ban_action": "new ban command here" } +``` + +**Query Parameter** — `reload` (bool, default `false`). + +**Response `200`** — Updated ActionConfig object. + +--- + +### `DELETE /api/v1/config/actions/{name}` + +Delete a user-created action's `.local` file. + +**Response `204`** + +**Response `409`** — Action is a shipped default (conf-only). + +--- + +## Geo + +### `GET /api/v1/geo/lookup/{ip}` + +Ban status and geo info for an IP. + +**Response `200`** +```json +{ + "ip": "1.2.3.4", + "banned": true, + "jails": ["sshd", "nginx-http-auth"], + "country": "US", + "country_name": "United States", + "region": "North America", + "city": "Mountain View", + "isp": "Google LLC", + "asn": "AS15169", + "org": "Google LLC", + "last_ban": "2024-12-25T08:00:00Z", + "total_bans": 3 +} +``` + +**Response `400`** — Invalid IP address. + +--- + +### `GET /api/v1/geo/stats` + +Geo cache diagnostic counters. + +**Response `200`** +```json +{ + "total": 1500, + "resolved": 1480, + "failed": 20, + "cache_size": 1480 +} +``` + +--- + +### `POST /api/v1/geo/re-resolve` + +Re-resolve all IPs with failed geo lookups. + +**Response `200`** +```json +{ + "total": 20, + "resolved": 18, + "failed": 2 +} +``` + +--- + +## Blocklists + +### `GET /api/v1/blocklists` + +List all blocklist sources. + +**Response `200`** +```json +{ + "sources": [ + { + "id": 1, + "name": "Country Block List", + "url": "https://example.com/blocklist.txt", + "enabled": true, + "last_import_at": "2024-12-25T08:00:00Z", + "last_import_succeeded": true, + "last_import_ban_count": 45 + } + ], + "total": 1 +} +``` + +--- + +### `POST /api/v1/blocklists` + +Add a new blocklist source. + +**Request** +```json +{ "name": "Spamhaus DROP", "url": "https://www.spamhaus.org/drop/drop.txt", "enabled": true } +``` + +**Response `201`** — Created BlocklistSource object. + +**Response `400`** — URL validation failed. 
+ +--- + +### `GET /api/v1/blocklists/{source_id}` + +Get a single blocklist source. + +--- + +### `PUT /api/v1/blocklists/{source_id}` + +Update a blocklist source. + +**Request** — All fields optional: +```json +{ "name": "New Name", "enabled": false } +``` + +--- + +### `DELETE /api/v1/blocklists/{source_id}` + +Delete a blocklist source. + +**Response `204`** + +--- + +### `POST /api/v1/blocklists/import` + +Trigger an immediate import of all enabled blocklist sources. + +**Response `200`** +```json +{ + "started_at": "2024-12-25T10:00:00Z", + "sources": [ + { + "id": 1, + "name": "Spamhaus DROP", + "url": "https://www.spamhaus.org/drop/drop.txt", + "imported": 45, + "skipped": 3, + "failed": false, + "error": null + } + ], + "total_imported": 45, + "total_skipped": 3, + "total_failed": 0 +} +``` + +**Response `429`** — Rate limit exceeded (1 import/hour per IP). + +--- + +### `GET /api/v1/blocklists/schedule` + +Get the current import schedule. + +**Response `200`** +```json +{ + "enabled": true, + "interval_hours": 24, + "next_run_at": "2024-12-26T08:00:00Z" +} +``` + +--- + +### `PUT /api/v1/blocklists/schedule` + +Update the import schedule. + +**Request** +```json +{ "enabled": true, "interval_hours": 12 } +``` + +**Response `200`** — Updated ScheduleInfo. + +--- + +### `GET /api/v1/blocklists/log` + +Paginated import log. 
+ +**Query Parameters** + +| Param | Type | Default | Description | +|---|---|---|---| +| `source_id` | int | null | Filter by source | +| `page` | int | `1` | 1-based page | +| `page_size` | int | `100` | Items per page (max 500) | + +**Response `200`** +```json +{ + "items": [ + { + "id": 1, + "source_id": 1, + "source_name": "Spamhaus DROP", + "started_at": "2024-12-25T08:00:00Z", + "completed_at": "2024-12-25T08:01:23Z", + "imported": 45, + "skipped": 3, + "failed": false, + "error": null + } + ], + "total": 50, + "page": 1, + "page_size": 100 +} +``` + +--- + +### `GET /api/v1/blocklists/{source_id}/preview` + +Preview the contents of a blocklist source (downloads and samples first ~20 lines). + +**Response `200`** +```json +{ + "url": "https://example.com/blocklist.txt", + "validated_lines": ["1.2.3.4", "5.6.7.8"], + "invalid_lines": ["not-an-ip"], + "total_valid": 2, + "total_invalid": 1 +} +``` + +**Response `502`** — URL could not be reached. + +--- + +## Server + +### `GET /api/v1/server/settings` + +Get fail2ban server-level settings. + +**Response `200`** +```json +{ + "loglevel": "INFO", + "logtarget": "/var/log/fail2ban.log", + "syslog_socket": "auto", + "db_file": "/var/lib/fail2ban/fail2ban.sqlite3", + "db_purge_age": 86400, + "max_matches": 100 +} +``` + +--- + +### `PUT /api/v1/server/settings` + +Update fail2ban server-level settings. + +**Request** — All fields optional: +```json +{ "loglevel": "DEBUG", "max_matches": 200 } +``` + +**Response `204`** + +**Response `400`** — fail2ban rejected a setting. + +--- + +### `POST /api/v1/server/flush-logs` + +Flush and re-open fail2ban log files (after log rotation). 
+ +**Response `200`** +```json +{ "message": "Success: 1 log(s) flushed" } +``` + +--- + +## Common Types + +### `TimeRange` + +``` +"24h" | "7d" | "30d" | "365d" +``` + +### `BanOrigin` + +``` +"blocklist" | "selfblock" +``` + +### `Source` + +``` +"fail2ban" | "archive" +``` + +--- + +## Status Codes + +| Code | Meaning | +|---|---| +| `200` | OK | +| `201` | Created | +| `204` | No Content | +| `400` | Bad Request — invalid input | +| `401` | Unauthorized — session missing, expired, or invalid | +| `404` | Not Found | +| `409` | Conflict | +| `422` | Unprocessable Entity — validation failed | +| `429` | Too Many Requests — rate limit exceeded | +| `502` | Bad Gateway — fail2ban unreachable | +| `503` | Service Unavailable | diff --git a/Docs/API_STATUS_CODES.md b/Docs/API_STATUS_CODES.md new file mode 100644 index 0000000..5f7e91c --- /dev/null +++ b/Docs/API_STATUS_CODES.md @@ -0,0 +1,730 @@ +# API Status Codes Reference + +Complete reference of all HTTP status codes returned by the BanGUI API v1. +Use this document to handle every possible response from every endpoint. 
+ +--- + +## Status Code Taxonomy + +| Code | Meaning | When Used | +|------|---------|-----------| +| **200** | OK | Successful GET, PUT, POST (no creation) | +| **201** | Created | Successful POST that created a resource | +| **204** | No Content | Successful DELETE or PUT with no response body | +| **400** | Bad Request | Invalid input, validation failure, bad IP, URL validation | +| **401** | Unauthorized | Missing, expired, or invalid session | +| **404** | Not Found | Entity does not exist | +| **409** | Conflict | State conflict (already exists, already done, operation failed) | +| **422** | Unprocessable Entity | Request body validation failed (Pydantic) | +| **429** | Too Many Requests | Rate limit exceeded | +| **500** | Internal Server Error | Unexpected server failure | +| **502** | Bad Gateway | fail2ban socket unreachable | +| **503** | Service Unavailable | Setup incomplete or component degraded | + +--- + +## /api/v1/auth + +### POST /api/v1/auth/login +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Login successful | `LoginResponse` | +| 401 | Invalid password | Error body | +| 422 | Validation error — invalid request body | Error body | +| 429 | Too many login attempts, retry after delay | Error body | +| 503 | Setup not complete | Error body | + +### GET /api/v1/auth/session +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Session valid | `SessionValidResponse` | +| 401 | Session missing, expired, or invalid | Error body | + +### POST /api/v1/auth/logout +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Logout successful | `LogoutResponse` | +| 401 | Session missing or invalid (silently successful) | Error body | + +--- + +## /api/v1/setup + +### GET /api/v1/setup +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Setup status returned | `SetupStatusResponse` | + +### 
POST /api/v1/setup +| Status | Description | Response Model | +|--------|-------------|----------------| +| 201 | Setup completed successfully | `SetupResponse` | +| 400 | Validation error in request body | Error body | +| 409 | Setup already completed | Error body | + +### GET /api/v1/setup/timezone +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Timezone returned | `SetupTimezoneResponse` | + +--- + +## /api/v1/health + +### GET /api/v1/health +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | All components healthy | `HealthResponse` | +| 503 | fail2ban offline or component degraded | `HealthResponse` | + +--- + +## /api/v1/dashboard + +### GET /api/v1/dashboard/status +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Server status returned | `ServerStatusResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/dashboard/bans +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Ban list returned | `DashboardBanListResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/dashboard/bans/by-country +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Ban counts by country returned | `BansByCountryResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/dashboard/bans/trend +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Ban trend data returned | `BanTrendResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/dashboard/bans/by-jail +| Status | Description | Response Model | 
+|--------|-------------|----------------| +| 200 | Ban counts by jail returned | `BansByJailResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +--- + +## /api/v1/bans + +### GET /api/v1/bans/active +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Active ban list returned | `ActiveBanListResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/bans +| Status | Description | Response Model | +|--------|-------------|----------------| +| 201 | IP banned successfully | `JailCommandResponse` | +| 400 | Invalid IP address | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 409 | Ban command failed in fail2ban | Error body | +| 429 | Rate limit exceeded for ban operations | Error body | +| 502 | fail2ban unreachable | Error body | + +### DELETE /api/v1/bans +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | IP unbanned successfully | `JailCommandResponse` | +| 400 | Invalid IP address | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 409 | Unban command failed in fail2ban | Error body | +| 429 | Rate limit exceeded for unban operations | Error body | +| 502 | fail2ban unreachable | Error body | + +### DELETE /api/v1/bans/all +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | All bans cleared | `UnbanAllResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +--- + +## /api/v1/jails + +### GET /api/v1/jails +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Jails list returned | `JailListResponse` | +| 401 | Session missing, expired, or invalid | Error 
body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/jails/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Jail detail returned | `JailDetailResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/jails/reload-all +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | All jails reloaded | `JailCommandResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 409 | fail2ban reports operation failed | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/jails/{name}/start +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Jail started | `JailCommandResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 409 | fail2ban reports operation failed | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/jails/{name}/stop +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Jail stopped | `JailCommandResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 409 | fail2ban reports operation failed | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/jails/{name}/idle +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Idle mode toggled | `JailCommandResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 409 | fail2ban reports operation failed | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/jails/{name}/reload +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Jail reloaded | `JailCommandResponse` | +| 401 | Session 
missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 409 | fail2ban reports operation failed | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/jails/{name}/ignoreip +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Ignore list returned | `IgnoreListResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/jails/{name}/ignoreip +| Status | Description | Response Model | +|--------|-------------|----------------| +| 201 | IP added to ignore list | `JailCommandResponse` | +| 400 | IP or network invalid | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 409 | fail2ban reports operation failed | Error body | +| 502 | fail2ban unreachable | Error body | + +### DELETE /api/v1/jails/{name}/ignoreip +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | IP removed from ignore list | `JailCommandResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 409 | fail2ban reports operation failed | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/jails/{name}/ignoreself +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | ignoreself toggled | `JailCommandResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 409 | fail2ban reports operation failed | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/jails/{name}/banned +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Banned IPs returned | `JailBannedIpsResponse` | +| 400 | page or page_size out of range | Error body | +| 401 | Session missing, 
expired, or invalid | Error body | +| 404 | Jail not found | Error body | +| 502 | fail2ban unreachable | Error body | + +--- + +## /api/v1/history + +### GET /api/v1/history +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | History list returned | `HistoryListResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/history/archive +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Archived history list returned | `HistoryListResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/history/{ip} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | IP history detail returned | `IpDetailResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | No history found for this IP | Error body | +| 502 | fail2ban unreachable | Error body | + +--- + +## /api/v1/geo + +### GET /api/v1/geo/lookup/{ip} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | IP lookup result returned | `IpLookupResponse` | +| 400 | Invalid IP address | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/geo/stats +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Geo cache stats returned | `GeoCacheStatsResponse` | +| 401 | Session missing, expired, or invalid | Error body | + +### POST /api/v1/geo/re-resolve +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Re-resolve result | `GeoReResolveResponse` | +| 401 | Session missing, expired, or invalid | Error body | + +--- + +## /api/v1/server + +### GET /api/v1/server/settings +| Status | Description | Response Model | 
+|--------|-------------|----------------| +| 200 | Server settings returned | `ServerSettingsResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### PUT /api/v1/server/settings +| Status | Description | Response Model | +|--------|-------------|----------------| +| 204 | Settings updated successfully | No body | +| 400 | Set command rejected by fail2ban | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/server/flush-logs +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Logs flushed successfully | `FlushLogsResponse` | +| 400 | Command rejected by fail2ban | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +--- + +## /api/v1/config + +### GET /api/v1/config/global +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Global config returned | `GlobalConfigResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### PUT /api/v1/config/global +| Status | Description | Response Model | +|--------|-------------|----------------| +| 204 | Global config updated successfully | No body | +| 400 | Set command rejected or log_target invalid | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 429 | Rate limit exceeded for config update operations | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/config/reload +| Status | Description | Response Model | +|--------|-------------|----------------| +| 204 | Fail2ban reloaded successfully | No body | +| 401 | Session missing, expired, or invalid | Error body | +| 409 | Reload command failed in fail2ban | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/config/restart +| 
Status | Description | Response Model | +|--------|-------------|----------------| +| 204 | Fail2ban restarted successfully | No body | +| 401 | Session missing, expired, or invalid | Error body | +| 409 | Stop command failed in fail2ban | Error body | +| 502 | fail2ban unreachable for stop command | Error body | +| 503 | fail2ban did not come back online within 10s | Error body | + +### POST /api/v1/config/regex-test +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Regex test result | `RegexTestResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 422 | Invalid regex pattern | Error body | + +### POST /api/v1/config/preview-log +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Log preview result | `LogPreviewResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 422 | Invalid regex pattern | Error body | + +### GET /api/v1/config/map-color-thresholds +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Color thresholds returned | `MapColorThresholdsResponse` | +| 401 | Session missing, expired, or invalid | Error body | + +### PUT /api/v1/config/map-color-thresholds +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Color thresholds updated | `MapColorThresholdsResponse` | +| 400 | Validation error (thresholds not properly ordered) | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 429 | Rate limit exceeded for config update operations | Error body | + +### GET /api/v1/config/fail2ban-log +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Log file lines returned | `Fail2BanLogResponse` | +| 400 | Log target not a file or path outside allowed directory | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### 
GET /api/v1/config/service-status +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Service status returned | `ServiceStatusResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +--- + +## /api/v1/config/jails (jail_config router) + +### GET /api/v1/config/jails +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Jails config list returned | `JailConfigListResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/config/jails/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Jail config detail returned | `JailConfigDetailResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found in config | Error body | +| 502 | fail2ban unreachable | Error body | + +### PUT /api/v1/config/jails/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Jail config updated | `JailConfigDetailResponse` | +| 400 | Invalid value for a property | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found in config | Error body | +| 422 | Validation error | Error body | +| 429 | Rate limit exceeded for jail config operations | Error body | +| 502 | fail2ban unreachable | Error body | + +### POST /api/v1/config/jails/{name}/commit +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Changes committed successfully | `JailConfigDetailResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found in config | Error body | +| 409 | Commit failed (fail2ban rejected the new config) | Error body | +| 429 | Rate limit exceeded for jail config operations | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET 
/api/v1/config/jails/{name}/rollback +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Rollback successful | `JailConfigDetailResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found in config | Error body | +| 502 | fail2ban unreachable | Error body | + +### DELETE /api/v1/config/jails/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 204 | Jail deleted successfully | No body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found in config | Error body | +| 409 | Jail is a shipped default (conf-only) | Error body | +| 429 | Rate limit exceeded for jail config operations | Error body | + +### POST /api/v1/config/jails +| Status | Description | Response Model | +|--------|-------------|----------------| +| 201 | Jail created | `JailConfigDetailResponse` | +| 400 | Invalid jail name | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 409 | Jail already exists | Error body | +| 429 | Rate limit exceeded for jail config operations | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/config/jails/{name}/files +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Config files returned | `ConfigFileListResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Jail not found in config | Error body | + +--- + +## /api/v1/config/filters (filter_config router) + +### GET /api/v1/config/filters +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Filter list returned | `FilterListResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/config/filters/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Filter config returned | 
`FilterConfig` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Filter not found in filter.d/ | Error body | +| 502 | fail2ban unreachable | Error body | + +### PUT /api/v1/config/filters/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Filter updated | `FilterConfig` | +| 400 | Invalid filter name | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Filter not found | Error body | +| 422 | Regex pattern failed to compile | Error body | +| 429 | Rate limit exceeded for filter update operations | Error body | +| 500 | Failed to write .local file | Error body | + +### POST /api/v1/config/filters +| Status | Description | Response Model | +|--------|-------------|----------------| +| 201 | Filter created | `FilterConfig` | +| 400 | Invalid filter name or regex too long | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 409 | Filter already exists | Error body | +| 422 | Regex pattern failed to compile | Error body | +| 429 | Rate limit exceeded for filter create operations | Error body | +| 500 | Failed to write .local file | Error body | + +### DELETE /api/v1/config/filters/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 204 | Filter deleted successfully | No body | +| 400 | Invalid filter name | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Filter not found | Error body | +| 409 | Filter is a shipped default (conf-only) | Error body | +| 429 | Rate limit exceeded for filter delete operations | Error body | +| 500 | Failed to delete .local file | Error body | + +--- + +## /api/v1/config/actions (action_config router) + +### GET /api/v1/config/actions +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Action list returned | `ActionListResponse` | +| 401 | Session missing, expired, or invalid | Error body | 
+| 502 | fail2ban unreachable | Error body | + +### GET /api/v1/config/actions/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Action config returned | `ActionConfig` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Action not found in action.d/ | Error body | +| 502 | fail2ban unreachable | Error body | + +### PUT /api/v1/config/actions/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Action updated | `ActionConfig` | +| 400 | Invalid action name | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Action not found | Error body | +| 429 | Rate limit exceeded for action update operations | Error body | +| 500 | Failed to write .local file | Error body | + +### POST /api/v1/config/actions +| Status | Description | Response Model | +|--------|-------------|----------------| +| 201 | Action created | `ActionConfig` | +| 400 | Invalid action name | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 409 | Action already exists | Error body | +| 429 | Rate limit exceeded for action create operations | Error body | +| 500 | Failed to write .local file | Error body | + +### DELETE /api/v1/config/actions/{name} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 204 | Action deleted successfully | No body | +| 400 | Invalid action name | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Action not found | Error body | +| 409 | Action is a shipped default (conf-only) | Error body | +| 429 | Rate limit exceeded for action delete operations | Error body | +| 500 | Failed to delete .local file | Error body | + +--- + +## /api/v1/blocklists + +### GET /api/v1/blocklists +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Blocklist sources returned | `BlocklistListResponse` | +| 401 | 
Session missing, expired, or invalid | Error body | + +### POST /api/v1/blocklists +| Status | Description | Response Model | +|--------|-------------|----------------| +| 201 | Blocklist source created | `BlocklistSource` | +| 400 | URL validation failed | Error body | +| 401 | Session missing, expired, or invalid | Error body | + +### POST /api/v1/blocklists/import +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Import completed | `ImportRunResult` | +| 401 | Session missing, expired, or invalid | Error body | +| 429 | Rate limit exceeded for blocklist import | Error body | + +### GET /api/v1/blocklists/schedule +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Schedule info returned | `ScheduleInfo` | +| 401 | Session missing, expired, or invalid | Error body | + +### PUT /api/v1/blocklists/schedule +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Schedule updated | `ScheduleInfo` | +| 401 | Session missing, expired, or invalid | Error body | + +### GET /api/v1/blocklists/log +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Import log returned | `ImportLogListResponse` | +| 401 | Session missing, expired, or invalid | Error body | + +### GET /api/v1/blocklists/{source_id} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Blocklist source returned | `BlocklistSource` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Blocklist source not found | Error body | + +### PUT /api/v1/blocklists/{source_id} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Blocklist source updated | `BlocklistSource` | +| 400 | URL validation failed | Error body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Blocklist source not found | Error body | + +### DELETE 
/api/v1/blocklists/{source_id} +| Status | Description | Response Model | +|--------|-------------|----------------| +| 204 | Blocklist source deleted successfully | No body | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Blocklist source not found | Error body | + +### GET /api/v1/blocklists/{source_id}/preview +| Status | Description | Response Model | +|--------|-------------|----------------| +| 200 | Blocklist preview returned | `PreviewResponse` | +| 401 | Session missing, expired, or invalid | Error body | +| 404 | Blocklist source not found | Error body | +| 502 | URL could not be reached | Error body | + +--- + +## Error Response Format + +All error responses follow this structure: + +```json +{ + "code": "error_code_string", + "detail": "Human-readable error message", + "metadata": { + "key": "value" + } +} +``` + +### Common error_code values + +| code | Meaning | +|------|---------| +| `not_found` | Requested entity does not exist | +| `invalid_input` | Validation failure or bad parameters | +| `conflict` | State conflict (already exists, already done) | +| `authentication_required` | Session missing or invalid | +| `rate_limit_exceeded` | Rate limit hit — check `retry_after_seconds` in metadata | +| `fail2ban_unreachable` | fail2ban socket cannot be reached | +| `config_validation_failed` | Config value rejected | +| `config_file_not_found` | Config file does not exist | +| `jail_not_found` | Jail does not exist | +| `filter_not_found` | Filter does not exist | +| `action_not_found` | Action does not exist | +| `blocklist_source_not_found` | Blocklist source does not exist | +| `setup_already_complete` | Setup has already been run | + +--- + +## Status Code Decision Guide + +**Frontend gets 400 — what's wrong?** +- Has `code: "invalid_input"` → validation failure, check `detail` +- Has `code: "jail_not_found"` → jail doesn't exist +- Has `code: "config_validation_failed"` → config value rejected + +**Frontend gets 502 — what's 
wrong?** +- fail2ban is down or socket path wrong +- Check `code: "fail2ban_unreachable"` + +**Frontend gets 503 — what's wrong?** +- Setup not complete (`code: "setup_already_complete"`) +- Health check: fail2ban offline or component degraded + +**Frontend gets 409 — what's wrong?** +- Already done: jail already active/inactive, setup already complete +- Operation failed: fail2ban rejected the command +- Conflict: resource already exists + +**Frontend gets 429 — what's wrong?** +- Rate limit exceeded +- `metadata.retry_after_seconds` tells you how long to wait \ No newline at end of file diff --git a/Docs/API_VERSIONING.md b/Docs/API_VERSIONING.md new file mode 100644 index 0000000..c00d8bc --- /dev/null +++ b/Docs/API_VERSIONING.md @@ -0,0 +1,166 @@ +# API Versioning Strategy + +**Status:** Active — Current version: **v1** + +All BanGUI API endpoints are versioned using URI path versioning (e.g., `/api/v1/`). +This document explains when and how to version endpoints, how deprecation works, and what guarantees consumers can rely on. + +--- + +## 1. Version Lifecycle + +| Stage | Meaning | +|-------|---------| +| **Current** | Active, receiving new features and bug fixes. | +| **Deprecated** | Still functional but marked for removal. Clients receive `Deprecation: true` and `Sunset: <date>` response headers. | +| **Removed** | Endpoint no longer exists. Clients must migrate to a newer version. | + +--- + +## 2. URL Structure + +``` +/api/v{major}/<resource>/<path> +``` + +- **v1** — current version (released 2026-05-02) +- **v2** — reserved; skeleton router deployed at `/api/v2/jails` but **not yet active** for production traffic +- **Minor** versions (v1.1, v1.2) are **not** used; only **major** version bumps indicate breaking changes +- The OpenAPI schema is always available at `/api/openapi.json` regardless of version + +--- + +## 3. 
What Triggers a Version Bump + +A new major version is required when a **breaking change** must be introduced, including: + +- Removing or renaming a field in a response model +- Changing the type of a request or response field +- Removing an endpoint entirely +- Changing authentication/authorization semantics +- Modifying the semantics of an existing operation + +**Non-breaking changes** (backward-compatible): + +- Adding new optional request fields +- Adding new response fields +- Adding new endpoints +- Fixing bugs that caused incorrect behavior + +These do **not** require a version bump. + +--- + +## 4. Deprecation Policy + +When an endpoint is deprecated: + +1. The endpoint **remains functional** until its `Sunset` date, which is set a minimum of **6 months** after the deprecation is announced +2. Response headers are added to every 2xx response: + ``` + Deprecation: true + Sunset: <HTTP-date> + Link: <successor-url>; rel="successor-version" + ``` +3. The endpoint is registered in the deprecation middleware (``app/middleware/deprecation.py``) +4. The OpenAPI schema marks the endpoint with `deprecated: true` +5. Documentation is updated to show the endpoint as deprecated + +### Implementing Deprecation Headers + +The ``DeprecationHeaderMiddleware`` (``app/middleware/deprecation.py``) automatically injects +the correct headers for any registered deprecated endpoint. To schedule an endpoint for removal: + +```python +from datetime import datetime, timezone, timedelta +from app.middleware.deprecation import register_deprecated_endpoint + +# Example: deprecate /api/v1/jails on 2026-11-03 (6 months from v2 release) +register_deprecated_endpoint( + path_prefix="/api/v1/jails", + sunset_date=datetime(2026, 11, 3, tzinfo=timezone.utc), + successor_url="/api/v2/jails", +) +``` + +The middleware runs on every response; if the request path matches a registered deprecated prefix, +the appropriate headers are appended before the response is returned. + +--- + +## 5. 
Backend Development: Adding Versioned Endpoints + +### New endpoints + +All new endpoints are added to the **current** version (`/api/v1/`). Prefix your router: + +```python +router = APIRouter(prefix="/api/v1/my-resource", tags=["My Resource"]) +``` + +### Breaking changes requiring v2 + +1. Create a new router file (e.g., `routers/my_resource_v2.py`) with the v2 prefix: + ```python + router = APIRouter(prefix="/api/v2/my-resource", tags=["My Resource (v2)"]) + ``` +2. Copy or adapt the v1 handler logic as needed. Extract shared business logic into + a **service layer function** so both routers call the same underlying code. +3. Register the new router in `app/main.py`: + ```python + app.include_router(my_resource_v2.router) + ``` +4. Register the v1 endpoint for deprecation headers (see §4 above) +5. Update this document to reflect the new version lifecycle + +### Keeping routers DRY + +Routers should only contain HTTP concerns (parameters, responses, status codes). Business logic +belongs in the service layer. Both v1 and v2 handlers can call the same service function. + +--- + +## 6. Frontend Development + +The frontend always uses the current version's base URL: + +```typescript +const BASE_URL: string = import.meta.env.VITE_API_URL ?? "/api/v1"; +``` + +All endpoint paths in `frontend/src/api/endpoints.ts` are defined as relative paths (e.g., `/bans`, `/jails`) and are appended to `BASE_URL` at runtime. + +When v2 is released, update ``VITE_API_URL`` in the environment configuration to point to `/api/v2`. + +--- + +## 7. OpenAPI / Documentation + +- Swagger UI: `/api/docs` +- ReDoc: `/api/redoc` +- OpenAPI schema: `/api/openapi.json` +- Docs are **not** versioned; they always reflect the **current** (latest) API version + +--- + +## 8. 
CI Breaking-Change Checks + +A GitHub Actions job runs on every pull request to detect breaking OpenAPI changes: + +- ``openapi-breaking-changes`` job (PR only): generates the current OpenAPI spec and + compares it against the baseline committed on the last push to `main`. If any breaking + changes are found, the job fails and the PR cannot be merged. +- ``openapi-baseline-commit`` job (main push only): generates and commits the current + OpenAPI spec as the new baseline for future PR comparisons. + +To trigger the baseline update, push to main after merging a version bump or any change +that legitimately alters the OpenAPI surface. + +--- + +## 9. Version History + +| Version | Status | Released | Sunset Date | Notes | +|---------|--------|---------|-------------|-------| +| v1 | **Current** | 2026-05-02 | — | Initial versioning; all endpoints moved from `/api/` to `/api/v1/` | +| v2 | **Reserved — skeleton active, endpoints not yet available** | — | — | Router skeleton at `app/routers/jails_v2.py`; real endpoints will be added before activation | \ No newline at end of file diff --git a/Docs/Architekture.md b/Docs/Architekture.md index af79b86..5112287 100644 --- a/Docs/Architekture.md +++ b/Docs/Architekture.md @@ -20,14 +20,14 @@ BanGUI is a two-tier web application with a clear separation between frontend an ┌─────────────────────────────┼────────────────────────────────────┐ │ Server │ │ ┌──────────────────────────┴─────────────────────────────────┐ │ -│ │ Backend (FastAPI) │ │ +│ │ Backend (FastAPI) │ │ │ │ Python 3.12+ · Async · Pydantic v2 · structlog │ │ │ └─────┬──────────────┬──────────────┬────────────────────────┘ │ -│ │ │ │ │ -│ ┌─────┴─────┐ ┌─────┴─────┐ ┌────┴─────┐ │ -│ │ SQLite │ │ fail2ban │ │ External │ │ -│ │ (App DB) │ │ (Socket) │ │ APIs │ │ -│ └───────────┘ └───────────┘ └──────────┘ │ +│ │ │ │ │ +│ ┌─────┴─────┐ ┌─────┴─────┐ ┌────┴─────┐ │ +│ │ SQLite │ │ fail2ban │ │ External │ │ +│ │ (App DB) │ │ (Socket) │ │ APIs │ │ +│ └───────────┘ 
└───────────┘ └──────────┘ │ └──────────────────────────────────────────────────────────────────┘ ``` @@ -39,7 +39,8 @@ BanGUI is a two-tier web application with a clear separation between frontend an | **Backend** | Python 3.12+, FastAPI, Pydantic v2, aiosqlite | Business logic, data persistence, fail2ban communication, scheduling | | **Application Database** | SQLite (via aiosqlite) | Stores BanGUI's own data: configuration, session state, blocklist sources, import logs | | **fail2ban** | Unix domain socket | The monitored service — BanGUI reads status, issues commands, and reads the fail2ban database | -| **External APIs** | HTTP (via aiohttp) | IP geolocation, ASN/RIR lookups, blocklist downloads | +| **MaxMind GeoLite2** | Offline MMDB file (mounted into container) | IP geolocation (primary resolver) — local, encrypted | +| **External APIs** | HTTP (via aiohttp) | Blocklist downloads; IP geolocation fallback (only if MMDB unavailable and HTTP fallback enabled) | --- @@ -85,7 +86,7 @@ backend/ │ ├── `main.py` # FastAPI app factory, lifespan, exception handlers │ ├── `config.py` # Pydantic settings (env vars, .env loading) │ ├── `db.py` # Database connection and initialization -│ ├── `exceptions.py` # Shared domain exception classes +│ ├── `exceptions.py` # Shared domain exception classes; all services and routers import from here │ ├── `dependencies.py` # FastAPI Depends() providers (DB, services, auth) │ ├── `models/` # Pydantic schemas │ │ ├── auth.py # Login request/response, session models @@ -119,8 +120,13 @@ backend/ │ │ ├── filter_config_service.py # filter config lifecycle management │ │ ├── action_config_service.py # action config lifecycle management │ │ ├── log_service.py # Log preview and regex test operations +│ │ ├── fail2ban_metadata_service.py # Resolve and cache the fail2ban SQLite DB path via the fail2ban socket │ │ ├── history_service.py # Historical ban queries, per-IP timeline -│ │ ├── blocklist_service.py # Download, validate, apply 
blocklists +│ │ ├── blocklist_service.py # Orchestration: source CRUD, scheduling, import triggers +│ │ ├── blocklist_downloader.py # HTTP download with retry logic +│ │ ├── blocklist_parser.py # Parse and validate IP addresses +│ │ ├── blocklist_ban_executor.py # Ban execution with error handling +│ │ ├── blocklist_import_workflow.py # Import orchestration (coordinates components) │ │ ├── geo_service.py # IP-to-country resolution, ASN/RIR lookup │ │ ├── server_service.py # Server settings, log management, DB purge │ │ └── health_service.py # fail2ban connectivity checks, version detection @@ -131,14 +137,20 @@ backend/ │ │ ├── geo_cache_repo.py # IP geolocation cache persistence│ │ └── import_log_repo.py # Import run history records │ ├── tasks/ # APScheduler background jobs │ │ ├── blocklist_import.py# Scheduled blocklist download and application -│ │ ├── geo_cache_flush.py # Periodic geo cache persistence (dirty-set flush to SQLite)│ │ ├── geo_re_resolve.py # Periodic re-resolution of stale geo cache records│ │ └── health_check.py # Periodic fail2ban connectivity probe +│ │ ├── geo_cache_flush.py # Periodic geo cache persistence (dirty-set flush to SQLite)│ │ ├── geo_cache_cleanup.py # Periodic purge of stale geo cache entries +│ │ ├── geo_re_resolve.py # Periodic re-resolution of stale geo cache records│ │ └── health_check.py # Periodic fail2ban connectivity probe │ └── utils/ # Helpers, constants, shared types │ ├── fail2ban_client.py # Async wrapper around the fail2ban socket protocol +│ ├── fail2ban_response.py # Canonical response parsing: ok(), to_dict(), ensure_list(), is_not_found_error() +│ ├── fail2ban_db_utils.py # fail2ban database query helpers │ ├── ip_utils.py # IP/CIDR validation and normalisation -│ ├── time_utils.py # Timezone-aware datetime helpers│ ├── jail_config.py # Jail config parser/serializer helper -│ ├── conffile_parser.py # Fail2ban config file parser/serializer +│ ├── time_utils.py # Timezone-aware datetime helpers +│ ├── 
config_file_utils.py # fail2ban config file I/O +│ ├── conffile_parser.py # fail2ban config file parser/serializer │ ├── config_parser.py # Structured config object parser -│ ├── config_writer.py # Atomic config file write operations│ └── constants.py # Shared constants (default paths, limits, etc.) +│ ├── config_writer.py # Atomic config file write operations +│ ├── jail_config.py # Jail config helper +│ └── constants.py # Shared constants (default paths, limits, etc.) ├── tests/ │ ├── conftest.py # Shared fixtures (test app, client, mock DB) │ ├── test_routers/ # One test file per router @@ -173,6 +185,43 @@ The HTTP interface layer. Each router maps URL paths to handler functions. Route The business logic layer. Services orchestrate operations, enforce rules, and coordinate between repositories, the fail2ban client, and external APIs. Each service covers a single domain. +**Service Layer Responsibilities:** + +Services **must be independent of HTTP concerns**. They work with domain models (DTOs), not response models. This ensures: +- Domain logic can evolve without affecting API shape +- Services are reusable across different frontends +- Testing is simpler (no mocking HTTP response types) +- Changes to endpoint responses don't require service changes + +**Domain Models and Response Mapping:** + +Services return **domain models** (e.g., `DomainActiveBanList`, `DomainBansByCountry`) that represent pure business logic. Response models (e.g., `ActiveBanListResponse`, `BansByCountryResponse`) are defined in `app/models/` and used only by routers. + +Conversion happens at the **router boundary**: +1. Router calls service → receives domain model +2. Router calls mapper function to convert domain model → response model +3. Router returns response model to HTTP client + +Example: +```python +# In ban_service.py +async def get_active_bans(...) -> DomainActiveBanList: + """Service returns domain model (not HTTP-aware).""" + ... 
+ +# In routers/bans.py (router boundary) +domain_result = await ban_service.get_active_bans(...) +return map_domain_active_ban_list_to_response(domain_result) +``` + +Mapper functions live in `app/mappers/` and are thin, mechanical translations between structures. + +**Motivation:** +- The Fail2ban domain doesn't care about field names like `country_code` (snake_case) vs `countryCode` (camelCase) +- If the API needs pagination metadata added to the response, only the mapper changes +- If repositories change their output schema, only services need updating (routers are unaffected) +- Services can be tested with simple dataclasses; no need for Pydantic serialization overhead + | Service | Purpose | |---|---| | `auth_service.py` | Hashes and verifies the master password, creates and validates session tokens, enforces session expiry | @@ -186,13 +235,230 @@ The business logic layer. Services orchestrate operations, enforce rules, and co | `action_config_service.py` | Discovers available actions by scanning action.d/; reads, creates, updates, and deletes action definitions; assigns actions to jails | | `config_file_service.py` | Shared utilities for configuration parsing and manipulation: parses config files, validates names/IPs, manages atomic file writes, probes fail2ban socket | | `raw_config_io_service.py` | Low-level file I/O for raw fail2ban config files | +| `fail2ban_metadata_service.py` | Resolves the fail2ban SQLite database path by querying the fail2ban socket and caches the result for reuse across services | | `log_service.py` | Log preview and regex test operations (extracted from config_service) | -| `history_service.py` | Queries the fail2ban database for historical ban records, builds per-IP timelines, computes ban counts and repeat-offender flags | -| `blocklist_service.py` | Downloads blocklists via aiohttp, validates IPs/CIDRs, applies bans through fail2ban or iptables, logs import results | -| `geo_service.py` | Resolves IP addresses to country, 
ASN, and RIR using external APIs or a local database, caches results | +| `history_service.py` | Queries the fail2ban database for historical ban records, builds per-IP timelines, computes ban counts and repeat-offender flags, and syncs new records into BanGUI's archive table | +| `blocklist_service.py` | Orchestration layer for blocklist imports. Delegates to focused components: `BlocklistDownloader` (HTTP download with retry), `BlocklistParser` (IP validation), `BanExecutor` (fail2ban integration), and `BlocklistImportWorkflow` (orchestrates the flow). Maintains public API for source CRUD, preview, scheduling, and import triggers. | +| `geo_cache.py` | **GeoCache** class that encapsulates all IP geolocation caching: resolves IP addresses to country, ASN, and organization using a primary local MaxMind GeoLite2-Country database (if available) with optional HTTP fallback to ip-api.com (disabled by default for security). Maintains in-memory and persistent caches with negative cache support, and manages background re-resolution. Instantiated once at startup with allow_http_fallback flag and stored on `app.state.geo_cache` | +| `geo_service.py` | (Deprecated) Backward-compatibility wrappers that delegate to the `GeoCache` instance. Kept for compatibility with existing code. 
New code should use `GeoCache` directly or via dependency injection | | `server_service.py` | Reads and writes fail2ban server-level settings (log level, log target, syslog socket, DB location, purge age) | | `health_service.py` | Probes fail2ban socket connectivity, retrieves server version and global stats, reports online/offline status | +##### Blocklist Import Architecture + +The blocklist import flow has been refactored to separate concerns into focused components: + +``` +blocklist_service.py (Public API) + │ + ├─ import_source() ──┐ + │ │ + └─ import_all() ├──> BlocklistImportWorkflow (Orchestrator) + │ │ + │ ├──> BlocklistDownloader + │ │ • HTTP GET with retry logic + │ │ • Exponential backoff (429, 5xx) + │ │ • Timeout handling + │ │ + │ ├──> BlocklistParser + │ │ • Parse text to IP lines + │ │ • Validate IPv4/IPv6 addresses + │ │ • Skip CIDRs and malformed entries + │ │ + │ ├──> BanExecutor + │ │ • Ban each IP via fail2ban socket + │ │ • Abort on JailNotFoundError + │ │ • Continue on individual ban failures + │ │ + │ └──> Geo pre-warming + │ (optional batch lookup for newly banned IPs) + │ + └──> Result logging (import_log_repo) +``` + +**Component Responsibilities:** + +- **BlocklistDownloader**: Handles HTTP transport concerns (retries, timeouts, backoff) +- **BlocklistParser**: Handles parsing and validation logic (clean, testable, no I/O) +- **BanExecutor**: Handles fail2ban integration with error aggregation +- **BlocklistImportWorkflow**: Coordinates the flow, handles result aggregation and geo pre-warming +- **blocklist_service.py**: Maintains public API (source CRUD, scheduling, import triggers) + +**Benefits of This Architecture:** + +- Each component is independently testable with mock dependencies +- Error handling is clear: JailNotFoundError stops processing, JailOperationError continues +- Components can be evolved independently (e.g., replace HTTP client, add batch validation) +- Logging is contextual and tied to the appropriate layer +- 
Retry logic and transient error handling are isolated + +#### DNS-Rebinding Protection + +**The Vulnerability:** + +A DNS-rebinding attack exploits a time-of-check-to-time-of-use (TOCTOU) window between when a blocklist URL is validated and when it is actually fetched: + +1. User adds blocklist URL `http://attacker.com/blocklist.txt` +2. `blocklist_service.create_source()` calls `validate_blocklist_url()` which performs DNS resolution +3. `attacker.com` resolves to a public IP (attacker's real server) — validation passes ✓ +4. Later, when `BlocklistDownloader` fetches the URL, the attacker's DNS server responds with `192.168.1.1` +5. The HTTP client connects to the private IP, potentially accessing internal services + +**The Protection:** + +BanGUI closes this window by adding a second DNS-rebinding check at **connection time**: + +1. **Create-time validation** (`app/utils/ip_utils.py:validate_blocklist_url`): Confirms the URL resolves to a public IP when created +2. **Connection-time validation** (`app/services/dns_validated_connector.py`): Validates that all resolved IPs are public when the actual HTTP connection is made + +The HTTP session is created with a custom **socket factory** that intercepts DNS resolution results before socket creation. If any resolved IP is private or reserved, the connection is rejected with a clear error. 
+ +**Implementation:** + +- `app/services/dns_validated_connector.py`: Provides `create_dns_validated_socket_factory()` which returns a socket factory that validates IPs using `is_private_ip()` +- `app/startup.py:_create_http_session()`: Passes the socket factory to `aiohttp.TCPConnector`, protecting all HTTP requests globally +- All blocklist imports automatically inherit this protection through the shared session + +**Protected IP Ranges:** + +The validation blocks all RFC 1918 private ranges, loopback, link-local, ULA, multicast, and reserved addresses: +- IPv4: `10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`, `127.0.0.0/8`, `224.0.0.0/4`, `240.0.0.0/4`, `255.255.255.255/32` +- IPv6: `::1/128`, `fe80::/10`, `fc00::/7`, `ff00::/8`, and others (via `ipaddress.IPv6Address.is_private`, etc.) + +**Reference:** + +- [OWASP SSRF Prevention Cheat Sheet](https://cheatsheetseries.owasp.org/cheatsheets/Server_Side_Request_Forgery_Prevention_Cheat_Sheet.html) +- Tests: `backend/tests/test_services/test_dns_validated_connector.py` + +#### Startup DAG (`app/startup_dag.py`, `app/startup.py`) + +The startup process is orchestrated by an explicit **Directed Acyclic Graph (DAG)** that defines all resource initialization stages, their dependencies, health checks, and rollback strategy. This replaces implicit ordering with explicit, documented prerequisites. + +**Why This Exists:** + +Previously, startup resources were created in a procedural sequence without documented dependencies. If a stage was reordered or a prerequisite was missed, initialization could fail in non-obvious ways. Partial failures could leave stale resources (open database connections, HTTP sessions, running schedulers) that prevented clean rollback. + +**Startup Stages (in order):** + +``` +1. WORKER_MODE + └─ Validates that BANGUI_WORKERS=1 (scheduler cannot run in multiple workers) + +2. 
DATABASE + ├─ Prerequisite: WORKER_MODE + ├─ Creates database directory + ├─ Initializes database schema + ├─ Caches setup completion state + └─ Loads persisted runtime settings + +3. GEO_CACHE + ├─ Prerequisite: DATABASE + ├─ Loads IP geolocation cache from database + ├─ Counts unresolved IPs + ├─ Initializes MaxMind GeoLite2 database + └─ Configures HTTP fallback (if enabled) + +4. HTTP_SESSION + ├─ Prerequisite: GEO_CACHE + ├─ Creates aiohttp.ClientSession + └─ Configures timeouts and connection limits + +5. SCHEDULER + ├─ Prerequisite: HTTP_SESSION + ├─ Creates APScheduler AsyncIOScheduler + └─ Starts the scheduler + +6. TASKS + ├─ Prerequisite: SCHEDULER + ├─ Registers health_check task (fail2ban connectivity probe) + ├─ Registers blocklist_import task (scheduled imports) + ├─ Registers geo_cache_cleanup task (stale entry purge) + ├─ Registers geo_cache_flush task (periodic persistence) + ├─ Registers geo_re_resolve task (stale record re-resolution) + ├─ Registers history_sync task (ban history sync) + └─ Registers session_cleanup task (expired session purge) +``` + +**Failure Mode & Rollback:** + +If any stage fails: + +1. All completed stages are rolled back **in reverse order** (TASKS → SCHEDULER → HTTP_SESSION → GEO_CACHE → DATABASE → WORKER_MODE) +2. Each rollback suppresses exceptions to ensure all resources are cleaned up +3. Database connections are closed +4. HTTP sessions are closed +5. The scheduler is shut down +6. 
The application startup fails with a clear error message + +**Health Checks:** + +After all stages complete, a final health check verifies: +- All resources have initialized successfully +- Resources pass their individual health_check() methods +- No failures occurred during any stage + +**Implementation:** + +- **StartupDAG**: Orchestrates the entire flow, manages prerequisites, and handles failures +- **StartupStage**: Enum defining the 6 startup stages +- **StageDependency**: Defines stage metadata (description, prerequisites, rollback policy) +- **StartupContext**: Tracks registered resources, completed stages, and failure state +- **startup_shared_resources()**: Main entry point that builds and executes the DAG +- **_stage_*()**: Functions that implement each stage's initialization logic + +**Example Usage in Tests:** + +```python +# Test that a stage with missing prerequisites fails +dag = StartupDAG() +dag.register_stage(StartupStage.HTTP_SESSION, "Create HTTP session", + prerequisites=frozenset([StartupStage.DATABASE])) +dag.register_stage(StartupStage.SCHEDULER, "Create scheduler") + +async def http_session_func(): + return aiohttp.ClientSession() + +# This will raise RuntimeError because DATABASE hasn't completed +await dag.execute_stage(StartupStage.HTTP_SESSION, http_session_func) +``` + +#### Mappers (`app/mappers/`) + +The response mapping layer. Mappers convert domain models (returned by services) to response models (consumed by HTTP routers). This layer enforces the separation between business logic and API shape. 
+ +**Location:** `app/mappers/` + +**Responsibilities:** +- Convert service domain models to API response models +- Mechanical, thin translation — no business logic +- Used exclusively at the router boundary + +**Pattern:** + +Each domain model has a corresponding mapper function: + +```python +# Domain model (from service) +DomainActiveBan → map_domain_active_ban_to_response() → ActiveBan (response) + +# Service returns domain models: +async def get_active_bans(...) -> DomainActiveBanList + +# Router converts at the boundary: +domain_result = await ban_service.get_active_bans(...) +return map_domain_active_ban_list_to_response(domain_result) +``` + +**Why separate?** + +When API requirements change (e.g., new field added, field renamed), the impact is confined to the response model and its mapper: +1. Response model in `app/models/` changes +2. Mapper function in `app/mappers/` updates +3. Routers stay the same +4. Services don't change + +Without this layer, changes to API shape would require modifying services and their tests. + #### Repositories (`app/repositories/`) The data access layer. Repositories execute raw SQL queries against the application SQLite database. They return plain data or domain models — they never raise HTTP exceptions or contain business logic. @@ -206,6 +472,8 @@ The data access layer. Repositories execute raw SQL queries against the applicat | `geo_cache_repo.py` | Persist and query IP geo resolution cache | | `import_log_repo.py` | Record import run results (timestamp, source, IPs imported, errors) for the import log view | +Every repository in `app/repositories/` has a corresponding protocol in `app/repositories/protocols.py`, including `settings_repo.py` and `history_archive_repo.py`. + #### Models (`app/models/`) Pydantic schemas that define data shapes and validation. Models are split into three categories per domain. @@ -223,6 +491,20 @@ Pydantic schemas that define data shapes and validation. 
Models are split into t | `server.py` | Server status and settings models | | `setup.py` | First-run setup wizard models | +**Model Layering Rules:** Models are pure data classes (leaf nodes) in the dependency graph. They must not import from application-layer modules (`app.services`, `app.config`, `app.utils`). Models may import from: +- Standard library and third-party packages (Pydantic, typing) +- Other models in `app.models/` (sibling models) +- `app.models.response` (response envelopes) + +**Critical Constraint — No I/O or Side Effects:** Pydantic validators, field defaults, and computed fields must be **pure functions with no side effects**: +- ❌ NO imports from `app.config`, `app.services`, `app.utils`, or `app.routers` (these are application-layer modules) +- ❌ NO calls to `get_settings()`, file I/O, database queries, network calls, or any runtime-dependent functions +- ❌ NO `default_factory` that calls app-layer functions + +These constraints ensure that **importing a model file does not trigger application initialization** and prevent hidden circular dependencies. + +**Validation that requires access to app-level state** (e.g., allowed log directories, settings, database) must be moved to the **router or service layer**, not in model validators. Validation occurs at the boundary — where settings and services are already available. + #### Tasks (`app/tasks/`) APScheduler background jobs that run on a schedule without user interaction. @@ -230,9 +512,12 @@ APScheduler background jobs that run on a schedule without user interaction. | Task | Purpose | |---|---| | `blocklist_import.py` | Downloads all enabled blocklist sources, validates entries, applies bans, records results in the import log | +| `geo_cache_cleanup.py` | Periodically removes entries from the `geo_cache` table that have not been referenced in the configured retention period (default: 90 days). Prevents unbounded database growth. 
| | `geo_cache_flush.py` | Periodically flushes newly resolved IPs from the in-memory dirty set to the `geo_cache` SQLite table (default: every 60 seconds). GET requests populate only the in-memory cache; this task persists them without blocking any request. | | `geo_re_resolve.py` | Periodically re-resolves stale entries in `geo_cache` to keep geolocation data fresh | | `health_check.py` | Periodically pings the fail2ban socket and updates the cached server status so the frontend always has fresh data | +| `history_sync.py` | Periodically copies new records from the fail2ban SQLite database into BanGUI's `history_archive` table; delegates the sync algorithm to `history_service.py` | +| `session_cleanup.py` | Periodically removes expired sessions from the `sessions` SQLite table (default: every 6 hours). Without this cleanup, the table grows unbounded and degrades query performance. | #### Utils (`app/utils/`) @@ -241,6 +526,7 @@ Pure helper modules with no framework dependencies. | Module | Purpose | |---|---| | `fail2ban_client.py` | Async client that communicates with fail2ban via its Unix domain socket — sends commands and parses responses using the fail2ban protocol. Modelled after [`./fail2ban-master/fail2ban/client/csocket.py`](../fail2ban-master/fail2ban/client/csocket.py) and [`./fail2ban-master/fail2ban/client/fail2banclient.py`](../fail2ban-master/fail2ban/client/fail2banclient.py). | +| `jail_socket.py` | Low-level jail reload operations (`reload_all`) extracted to break service dependencies. Used by `jail_service`, `jail_config_service`, `action_config_service`, and `filter_config_service` to avoid circular imports between sibling services. 
| | `ip_utils.py` | Validates IPv4/IPv6 addresses and CIDR ranges using the `ipaddress` stdlib module, normalises formats | | `jail_utils.py` | Jail helper functions for configuration and status inference | | `jail_config.py` | Jail config parser and serializer for fail2ban config manipulation | @@ -252,7 +538,7 @@ Pure helper modules with no framework dependencies. | `config_file_utils.py` | Common file-level config utility helpers | | `fail2ban_db_utils.py` | Fail2ban DB path discovery and ban-history parsing helpers | | `setup_utils.py` | Setup wizard helper utilities | -| `constants.py` | Shared constants: default socket path, default database path, time-range presets, limits | +| `constants.py` | Shared constants: default socket path, default database path, time-range presets, parser truthy values, limits | #### Configuration (`app/config.py`) @@ -269,8 +555,209 @@ The FastAPI app factory. Responsibilities: - Creates the `FastAPI` instance with metadata (title, version, docs URL) - Registers the **lifespan** context manager (startup: open DB, create aiohttp session, start scheduler; shutdown: close all) - Mounts all routers -- Registers global exception handlers that map domain exceptions to HTTP status codes -- Applies the setup-redirect middleware (redirects all requests to `/api/setup` when no configuration exists) +- Registers global exception handlers that map domain exceptions to HTTP status codes with a hierarchical fallback chain +- Applies the setup-redirect middleware (returns `423 Locked` for all API requests when no configuration exists, except for `/api/setup` and `/api/health`) + +**Exception Handler Hierarchy:** + +Exception handlers are registered in order of specificity to ensure each exception type is caught by the most appropriate handler: + +1. **Specific network errors** (Fail2BanConnectionError, Fail2BanProtocolError) → HTTP 502 Bad Gateway +2. 
**Specific auth/rate errors** (AuthenticationError, RateLimitError) → HTTP 401 Unauthorized / 429 Too Many Requests +3. **Category handlers** (NotFoundError, BadRequestError, ConflictError, OperationError, ServiceUnavailableError) → HTTP 404/400/409/500/503 +4. **DomainError catch-all** → HTTP 500 (catches any unregistered DomainError subclass, ensuring proper error_code and metadata are returned) +5. **HTTPException** → HTTP status from exception (FastAPI built-in validation and routing errors) +6. **ValueError** → HTTP 400 Bad Request (Pydantic validation errors) +7. **Exception catch-all** → HTTP 500 Internal Server Error (absolute fallback for unexpected errors) + +The DomainError catch-all handler (step 4) is critical: it ensures that any new DomainError subclass automatically gets the correct HTTP status (500), error_code, and metadata through its inherited `error_code` attribute and `get_error_metadata()` method, even if the developer forgot to create an explicit handler for it. This prevents silent failures where an unhandled exception would return a generic "internal_error" code instead of the specific error code defined by the exception class. + +### 2.3 Dependency Wiring and Service Composition + +BanGUI uses a **lightweight dependency injection (DI) pattern** based on FastAPI's `Depends()` framework. There is no heavy container library — the composition root is implicit and managed through simple provider functions in `app/dependencies.py`. + +#### The DI Pattern + +Every injectable dependency follows this structure: + +1. **Provider Function** — An async function in `app/dependencies.py` that creates and returns a dependency: + ```python + async def get_settings(app_context: ...) -> Settings: + """Provide application settings.""" + return app_context.runtime_settings or app_context.settings + ``` + +2. 
**Type Alias** — An `Annotated` alias that decorates the provider for use in route signatures: + ```python + SettingsDep = Annotated[Settings, Depends(get_settings)] + ``` + +3. **Injection Point** — Routers declare their dependencies using the type alias: + ```python + async def my_route(settings: SettingsDep) -> Response: + # FastAPI automatically calls get_settings() and injects the result + ... + ``` + +**Module-Level Imports:** + +All repository and service modules are imported at module level in `app/dependencies.py`. These imports are safe at the top because no circular dependencies exist — repositories and services do not import from `dependencies.py`. This follows the principle of importing dependencies early and consistently: + +```python +# app/dependencies.py (top of file) +from app.repositories import ( + blocklist_repo, + fail2ban_db_repo, + session_repo, + # ... other repository modules +) +from app.services import auth_service, health_service +from app.services.fail2ban_metadata_service import default_fail2ban_metadata_service + +# Provider functions simply return the module +async def get_session_repo() -> SessionRepository: + return session_repo +``` + +**Exception**: The `from app.db import open_db` import remains local to `get_db()` because it is used only within that function, and keeping it local avoids the module load overhead at import time. + +#### Service Composition Root + +Services are **not instantiated by a container**. Instead, they are **composed by routers and tasks through explicit parameter passing**. This keeps dependencies visible and avoids implicit side effects. 
+ +**Example: How `ban_service.get_active_bans()` is wired:** + +```python +# Step 1: Router declares what it needs (dependencies.py) +async def get_ban_service_context( + db: Annotated[aiosqlite.Connection, Depends(get_db)], + fail2ban_db_repo: Annotated[Fail2BanDbRepository, Depends(get_fail2ban_db_repo)], +) -> BanServiceContext: + """Combine database connection and repository.""" + return BanServiceContext(db=db, fail2ban_db_repo=fail2ban_db_repo) + +BanServiceContextDep = Annotated[BanServiceContext, Depends(get_ban_service_context)] + +# Step 2: Router uses the context and calls the service +@router.get("/active") +async def get_active_bans( + ban_ctx: BanServiceContextDep, + socket_path: Fail2BanSocketDep, + geo_cache: GeoCacheDep, +) -> ActiveBanListResponse: + # Router explicitly passes everything the service needs + domain_result = await ban_service.get_active_bans( + socket_path, + geo_cache=geo_cache, + app_db=ban_ctx.db, # ← Explicit, no magic + ) + return map_domain_active_ban_list_to_response(domain_result) + +# Step 3: Service function accepts dependencies as parameters +async def get_active_bans( + socket_path: str, + geo_cache: GeoCache, + app_db: aiosqlite.Connection, +) -> DomainActiveBanList: + """Retrieve active bans. All dependencies are explicit parameters.""" + # Service logic here + ... +``` + +**Why this pattern?** +- **Explicit**: No hidden coupling. Every dependency is visible in function signatures. +- **Testable**: Easy to mock dependencies by passing test doubles. +- **Lightweight**: No heavyweight DI container library needed. FastAPI's `Depends()` is sufficient. +- **Debuggable**: Stack traces and type checkers understand the full dependency chain. + +#### Service Context Dependencies + +For convenience, related repositories and the database connection are bundled into **context objects**. These prevent routers from depending on the raw database connection (which violates the repository boundary). 
+ +**Available Service Contexts:** + +| Context | Includes | Used By | +|---------|----------|---------| +| `SessionServiceContext` | `db`, `session_repo` | auth router | +| `BlocklistServiceContext` | `db`, `blocklist_repo`, `import_log_repo`, `settings_repo` | blocklist router | +| `SettingsServiceContext` | `db`, `settings_repo` | server settings router | +| `BanServiceContext` | `db`, `fail2ban_db_repo` | ban router | +| `HistoryServiceContext` | `db`, `fail2ban_db_repo`, `history_archive_repo` | history router | + +Each context is created by a provider function: +```python +async def get_ban_service_context( + db: Annotated[aiosqlite.Connection, Depends(get_db)], + fail2ban_db_repo: Annotated[Fail2BanDbRepository, Depends(get_fail2ban_db_repo)], +) -> BanServiceContext: + return BanServiceContext(db=db, fail2ban_db_repo=fail2ban_db_repo) +``` + +#### Adding a New Service + +Follow this checklist when creating a new service: + +1. **Create the service module** — `app/services/my_service.py` +2. **Define the service functions** — Each function takes its dependencies as explicit parameters (no imports of other services at the same layer) +3. **Export key functions** — Only the public API functions are called by routers +4. **If database access is needed:** + - Routers depend on the appropriate `ServiceContextDep` (e.g., `BanServiceContextDep`) + - Pass `context.db` and `context.repository` to the service function +5. **If a new context is needed:** + - Create a `@dataclass` in `app/dependencies.py` to hold the related resources + - Create a provider function `get_<name>_context()` that combines them + - Create a type alias `<Name>ContextDep` for router injection +6. 
**Register the service** — No registration step; FastAPI discovers it via `Depends()` + +**Example: Adding a new service that needs blocklist and settings repos:** + +```python +# app/services/my_new_service.py +async def do_something( + db: aiosqlite.Connection, + blocklist_repo: BlocklistRepository, + settings_repo: SettingsRepository, +) -> MyResult: + """Do something with blocklist and settings data.""" + sources = await blocklist_repo.list_sources(db) + settings = await settings_repo.load(db) + # Business logic + return ... + +# app/routers/my_router.py +from app.dependencies import BlocklistServiceContextDep +from app.services import my_new_service + +@router.get("/something") +async def my_endpoint( + ctx: BlocklistServiceContextDep, # ← Already has db, blocklist_repo, settings_repo +) -> MyResponse: + result = await my_new_service.do_something( + db=ctx.db, + blocklist_repo=ctx.blocklist_repo, + settings_repo=ctx.settings_repo, + ) + return MyResponse(...) +``` + +#### The Repository Boundary + +Services **must not** execute SQL on raw database connections themselves; they only pass the connection through to repositories. The repository boundary is enforced by **not exporting `DbDep` to routers**. Instead: + +- Routers declare a `ServiceContextDep` which includes both the `db` and the needed repositories +- Services receive the `db` connection and repositories as parameters +- Repositories are the **only modules** that execute SQL; services never call SQL directly + +This ensures: +- Queries are centralized and testable +- Changes to the database layer don't leak into business logic +- Repositories can be mocked independently for testing + +#### Lifecycle and Scope + +- **Request-scoped**: Database connections are created fresh for each request and closed after the response is sent. This prevents contention and locking issues with SQLite. +- **Application-scoped**: Shared resources like `aiohttp.ClientSession`, the scheduler, and the `GeoCache` are created at startup and reused across all requests. 
+- **Singleton**: Some services (e.g., `Fail2BanMetadataService`) are instantiated once and cached in `app.state` or imported as module-level instances. --- @@ -462,11 +949,57 @@ Shared TypeScript interfaces and type aliases. Purely declarative — no runtime React context providers for application-wide concerns. +**Provider Ordering and Compile-Time Validation** + +Provider nesting is **order-sensitive**, and the order is enforced at compile time through TypeScript discriminated unions. The required order (outermost to innermost) is: + +1. `ThemeProvider` — must be outermost; provides theme context to `AppContents` +2. `FluentProvider` — supplies Fluent UI theme and design tokens to all Fluent UI consumers +3. `NotificationProvider` — provides notification service; must wrap error boundaries +4. `ErrorBoundary` — catches catastrophic errors at the top level +5. `BrowserRouter` — enables client-side routing +6. `NavigationCancellationProvider` — manages route-aware request cancellation using `useLocation()` +7. `AuthProvider` — validates session on mount; must be inside BrowserRouter (uses `useNavigate()`) +8. `TimezoneProvider` — fetches timezone after auth; wraps protected routes only + +**Compile-Time Validation:** + +A type-safe builder pattern (`ProviderCompositionBuilder`) in `providerComposition.tsx` enforces this order using TypeScript's discriminated unions. The builder prevents adding providers out of order at compile-time: + +```tsx +const tree = createProviderComposition() + .withTheme({ children }) + .withFluent(theme) // ✓ Must come after withTheme + .withNotification() // ✓ Must come after withFluent + .withErrorBoundary() // ✓ Correct order enforced + .withBrowserRouter() + .withNavigationCancellation() + .withAuth() + .build(routes); +``` + +Attempting to add providers out of order results in TypeScript errors (no runtime overhead). 
+ +**Runtime Validation (Development):** + +A runtime validator (`providerOrderValidator.tsx`) provides fallback validation for development: + +- `validateProviderPosition()` — checks if a provider is correctly nested +- `validateProvidersExist()` — ensures required providers are in the tree +- `hasProvider()` — queries provider presence +- `useProviderValidation()` — development-only hook that warns if required providers are missing + +See `src/providers/PROVIDER_ORDER.md` for detailed dependency rationale. + +**Provider Reference:** + | Provider | Purpose | |---|---| -| `AuthProvider` | Holds authentication state; exposes `isAuthenticated`, `login()`, and `logout()` via `useAuth()` | +| `AuthProvider` | Holds authentication state; exposes `isAuthenticated`, `login()`, and `logout()` via `useAuth()`. Synchronizes logout events across browser tabs in real-time using the BroadcastChannel API (with storage event fallback for older browsers). When a user logs out in any tab, all other open tabs immediately reflect the logout state without requiring a page refresh. 
| | `TimezoneProvider` | Reads the configured IANA timezone from the backend and supplies it to all children via `useTimezone()` | | `ThemeProvider` | Manages light/dark theme selection, supplies the active Fluent UI theme to `FluentProvider` | +| `NotificationProvider` | Provides notification service via `useNotification()` hook; must wrap error boundaries so they can display error notifications | +| `NavigationCancellationProvider` | Detects route changes and automatically aborts pending API requests; call `useNavigationAbortSignal()` to get an `AbortSignal` that lives for the current route | #### Theme (`src/theme/`) @@ -634,8 +1167,8 @@ BanGUI maintains its **own SQLite database** (separate from the fail2ban databas | Table | Purpose | |---|---| | `settings` | Key-value store for application configuration (master password hash, fail2ban socket path, database path, timezone, session duration) | -| `sessions` | Active session tokens with expiry timestamps | -| `geo_cache` | Resolved IP geolocation results (ip, country_code, country_name, asn, org, cached_at). Loaded into memory at startup via `load_cache_from_db()`; new entries are flushed back by the `geo_cache_flush` background task. | +| `sessions` | Active session token hashes with expiry timestamps. Tokens are stored as one-way SHA256 hashes to prevent token hijacking if the database is exposed. | +| `geo_cache` | Resolved IP geolocation results (ip, country_code, country_name, asn, org, cached_at, last_seen). Tracks the last time each IP address was referenced to enable retention policies. Entries older than 90 days are automatically purged by the `geo_cache_cleanup` task to prevent unbounded growth. Loaded into memory at startup via `load_cache_from_db()`; new entries are flushed back by the `geo_cache_flush` background task. 
| | `blocklist_sources` | Registered blocklist URLs (id, name, url, enabled, created_at, updated_at) | | `import_logs` | Record of every blocklist import run (id, source_id, timestamp, ips_imported, ips_skipped, errors, status) | @@ -648,20 +1181,79 @@ BanGUI maintains its **own SQLite database** (separate from the fail2ban databas --- -## 6. Authentication & Session Management +## 6. Setup & Configuration Persistence -- **Single-user model** — one master password, no usernames. -- Password is hashed with a strong algorithm (e.g., bcrypt or argon2) and stored in the application database during setup. -- Sessions are token-based, stored server-side in the `sessions` table, and delivered to the browser as HTTP-only secure cookies. -- Session expiry is configurable (set during setup, stored in `settings`). -- The frontend `AuthProvider` checks session validity on mount and redirects to `/login` if invalid. -- The backend `dependencies.py` provides an `authenticated` dependency that validates the session cookie on every protected endpoint. -- **Session validation cache** — validated session tokens are cached in memory for 10 seconds (`_session_cache` dict in `dependencies.py`) to avoid a SQLite round-trip on every request from the same browser. The cache is invalidated immediately on logout. -- **Setup-completion flag** — once `is_setup_complete()` returns `True`, the result is stored in `app.state._setup_complete_cached`. The `SetupRedirectMiddleware` skips the DB query on all subsequent requests, removing 1 SQL query per request for the common post-setup case. 
+### 6.1 Initial Setup Wizard & One-Time Configuration + +The setup wizard (`POST /api/setup`) runs once during first-time startup to configure: +- Master password (bcrypt-hashed) +- Runtime database path (where BanGUI stores operational state) +- fail2ban Unix socket path +- IANA timezone +- Session duration (in minutes) +- Map color thresholds for geolocation visualization + +**Atomicity & Crash-Safety:** + +Setup is implemented with explicit transaction boundaries across two SQLite databases (bootstrap config DB and runtime app DB) to ensure atomicity: + +1. **Phase 1 (Bootstrap DB transaction)**: Set `setup_state = "in_progress"` and persist `database_path`. On commit, this is the first checkpoint — if process crashes here, the next setup attempt will detect and clean up. + +2. **Phase 2 (Filesystem + Runtime DB)**: Initialize runtime database schema outside a transaction (idempotent via `CREATE TABLE IF NOT EXISTS`). + +3. **Phase 3 (Runtime DB transaction)**: Batch-write all runtime settings (password hash, paths, config) atomically in a single `BEGIN IMMEDIATE ... COMMIT` transaction. Either all settings are persisted or none are. + +4. **Phase 4 (Bootstrap DB transaction)**: Set `setup_state = "complete"` and `setup_completed = "1"`. This is the final commit point — only when this succeeds is setup considered complete. + +**Password Hash Idempotency:** + +The bcrypt password hash is computed early (before any DB writes) to ensure that if setup is retried after a crash, the same hash is used throughout all retry attempts. This prevents divergent hashes due to bcrypt's random salt generation. 
+ +**State Machine:** + +| State | Meaning | Recovery | +|-------|---------|----------| +| `null` | Setup not started | Normal flow: begin setup | +| `"in_progress"` | Bootstrap DB marked, runtime DB being initialized | Retry from beginning (runtime DB may be partial) | +| `"complete"` | All settings persisted, setup finished | Skip setup (already done) | + +If the next startup finds `setup_state` still set to `"in_progress"`, the previous setup attempt crashed midway; cleanup logic can then either retry or remove the partial runtime database before retrying. + +**Backward Compatibility:** + +The `setup_completed = "1"` key is still written for backward compatibility with cache detection. Modern code checks `setup_state = "complete"` for clearer semantics. --- -## 7. Scheduling +## 8. Authentication & Session Management + +- **Single-user model** — one master password, no usernames. +- Password is hashed with bcrypt and stored in the application database during setup. +- Sessions are token-based, stored server-side in the `sessions` table as one-way SHA256 hashes, and delivered to the browser as HTTP-only secure cookies. +- **Session token hashing** — Session tokens are hashed before storage to prevent token hijacking if the database file is exposed. Only the hash (`token_hash`) is stored in the database; the raw token is never persisted. When validating a session, the incoming token is hashed before the database lookup. This ensures the database alone is not sufficient to hijack a session — an attacker would also need knowledge of the original token value. +- Session expiry is configurable (set during setup, stored in `settings`). +- The frontend `AuthProvider` checks session validity on mount and redirects to `/login` if invalid. +- The backend `dependencies.py` provides an `authenticated` dependency that validates the session cookie on every protected endpoint.
+- **Session validation cache** (`InMemorySessionCache` in `app.utils.session_cache`) — validated session tokens are cached in memory for 10 seconds (configurable via `session_cache_ttl_seconds`) to avoid a SQLite round-trip on every request from the same browser. The cache is invalidated immediately on logout. **⚠️ This cache is process-local and not safe for multi-worker or distributed deployments.** In single-worker mode (enforced by TASK-002), this is safe and improves performance. For multi-worker deployments, replace `InMemorySessionCache` with a shared backend (Redis, database, shared memory) implementing the `SessionCache` protocol. See `app/utils/session_cache.py` module docstring for implementation details. +- **GeoCache** — `GeoCache` instance is created at startup with a configurable `allow_http_fallback` flag and stored on `app.state.geo_cache`. It implements a primary + fallback resolution strategy: (1) try local MaxMind GeoLite2-Country MMDB database (primary, encrypted, no network traffic), (2) if unavailable/no result and allowed, fall back to ip-api.com HTTP API (unencrypted, disabled by default for security). Encapsulates in-memory lookup cache, negative cache for unresolvable IPs (5-minute TTL), dirty set for persistence, and thread-safe async locking. Cache is loaded from the `geo_cache` SQLite table on startup. New resolutions are accumulated in memory and periodically flushed to the database by the `geo_cache_flush` background task. Stale entries are re-resolved by the `geo_re_resolve` task. Injected into routes and tasks via FastAPI's dependency system. See Backend-Development.md § IP Geolocation Resolution for setup and security details. 
+- **Runtime state** (`RuntimeState` in `app.utils.runtime_state`) — stores mutable application state: `server_status` (fail2ban online/offline), `last_activation` (jail activation tracking), `pending_recovery` (crash detection), `runtime_settings` (effective configuration), and service-specific state holders like `jail_service_state` (`JailServiceState` for jail capability detection cache). RuntimeState fields are managed through dedicated functions (e.g., `record_activation()`, `clear_pending_recovery()`) and via dependency injection to services. Service-specific state (like `JailServiceState`) is nested within `RuntimeState` to keep all mutable state in one controlled location. **⚠️ RuntimeState is process-local and only safe when BanGUI runs as a single asyncio worker.** Mutations must not span `await` points (cooperative scheduling within a single event loop is safe). In multi-worker deployments, each process has its own copy — logouts from worker A don't affect worker B's cache, health status updates are per-worker, and activation tracking is unreliable. BanGUI enforces single-worker mode (TASK-002) to prevent this issue. For future multi-worker support, replace RuntimeState with a shared coordination backend (Redis, shared memory, database). See `app/utils/runtime_state.py` module docstring for details. +- **Setup-completion flag** — once `is_setup_complete()` returns `True`, the result is stored in `app.state._setup_complete_cached`. The `SetupRedirectMiddleware` skips the DB query on all subsequent requests, removing 1 SQL query per request for the common post-setup case. The completion flag is only written after the runtime database is successfully initialized and all initial setup settings are persisted, preventing a failed setup from permanently bypassing the setup wizard. 
+### 8.1 CSRF Protection + +State-mutating endpoints (POST, PUT, DELETE, PATCH) that use cookie-based authentication are protected against Cross-Site Request Forgery (CSRF) attacks via a **custom header check middleware**. + +**Design:** +- For requests authenticated via the session cookie (not Bearer token), the `CsrfMiddleware` requires the custom header `X-BanGUI-Request: 1` to be present. +- The frontend API client automatically includes this header on all requests. +- Cross-site `fetch()` calls cannot set custom headers without triggering a CORS preflight, which the backend rejects for non-allowed origins, providing defense-in-depth. +- Safe HTTP methods (GET, HEAD, OPTIONS) bypass the check. +- Bearer token authentication (via `Authorization: Bearer` header) bypasses the check because tokens are not CSRF-vulnerable (they are not automatically sent on cross-origin requests). +- Requests missing the CSRF header receive a `403 Forbidden` response with detail: `"CSRF validation failed. Request rejected."`. + +This mechanism complements the existing `SameSite=Lax` cookie policy, which blocks traditional `<form>
` POST requests but does not protect against JavaScript-initiated requests on a subdomain or same-origin XSS injection. + +--- +## 9. Scheduling APScheduler 4.x (async mode) manages recurring background tasks. @@ -671,6 +1263,7 @@ APScheduler 4.x (async mode) manages recurring background tasks. │ (async, in-process) │ ├──────────────────────┤ │ blocklist_import │ ── runs on configured schedule (default: daily 03:00) +│ geo_cache_cleanup │ ── runs every 24 hours (nightly) │ geo_cache_flush │ ── runs every 60 seconds │ health_check │ ── runs every 30 seconds └──────────────────────┘ @@ -683,18 +1276,27 @@ APScheduler 4.x (async mode) manages recurring background tasks. --- -## 8. API Design +## 10.1 Background Tasks and Database Access -### 8.1 Conventions +- APScheduler jobs run outside FastAPI request/response scope and therefore cannot rely on ``Depends(get_db)``. +- Background tasks must open their own application database connection via ``app.db.open_db`` and close it when the work completes. +- Use a shared task helper (``app.tasks.db.task_db``) so every task follows the same async context manager pattern and avoids connection leaks. +- This pattern is intentional: task code is structurally separate from request-handling dependencies and should not attempt to reuse request-scoped DB connections. + +--- + +## 9. API Design + +### 9.1 Conventions - All endpoints are grouped under `/api/` prefix. - JSON request and response bodies, validated by Pydantic models. - Authentication via session cookie on all endpoints except `/api/setup` and `/api/auth/login`. -- Setup-redirect middleware: while no configuration exists, all endpoints return `303 See Other` → `/api/setup`. -- Standard HTTP status codes: `200` success, `201` created, `204` no content, `400` bad request, `401` unauthorized, `404` not found, `422` validation error, `500` server error. 
+- Setup-redirect middleware: while no configuration exists, all API endpoints (except `/api/setup` and `/api/health`) return `423 Locked` with `{"detail": "Setup not complete.", "setup_required": true}`. This ensures API consumers can detect setup as a distinct condition rather than transparently following redirects. +- Standard HTTP status codes: `200` success, `201` created, `204` no content, `400` bad request, `401` unauthorized, `404` not found, `422` validation error, `423` locked, `500` server error. - Error responses follow a consistent shape: `{ "detail": "Human-readable message" }`. -### 8.2 Endpoint Groups +### 9.2 Endpoint Groups | Group | Endpoints | Description | |---|---|---| @@ -745,13 +1347,356 @@ APScheduler 4.x (async mode) manages recurring background tasks. --- -## 10. Design Principles +## 10.2 nginx Routing Rules + +The reverse proxy (nginx) must route requests correctly to prevent frontend SPA fallback rules from hiding backend 404 errors. The following location blocks ensure proper behavior: + +### Location Block Priority + +nginx uses **longest-prefix matching** to determine which location block handles a request: +1. Exact matches (`location =`) — highest priority +2. Regular expression matches (`location ~`) — second priority +3. Prefix matches (`location /prefix`) — matched in order of specificity (longest first) +4. Catch-all (`location /`) — lowest priority + +### Routing Configuration + +| Location Block | Rule | Purpose | +|---|---|---| +| `location /api/` | `proxy_pass http://backend:8000;` — **no `try_files`** | Proxy all API requests to FastAPI backend. Any unmatched API route (typos, invalid paths) returns 404 from the backend. | +| `location /assets/` | `try_files $uri =404;` | Serve static assets with long-term caching. Return 404 if file doesn't exist. | +| `location /` | `try_files $uri $uri/ /index.html;` | SPA fallback: serve `index.html` for all unmatched routes (client-side routing). 
| + +### Routing Behavior + +``` +Request → /api/some-endpoint + ↓ + nginx matches location /api/ (longest prefix) + ↓ + proxy_pass → backend:8000 + ↓ + Backend returns 404 if endpoint doesn't exist (✓ correct) + Client sees 404, not SPA HTML + +Request → /some-page + ↓ + nginx matches location / (catch-all) + ↓ + try_files looks for file, then directory, then /index.html + ↓ + Serves /index.html (React Router handles client-side routing) + ↓ + Client sees 200 with HTML (✓ correct for SPA) + +Request → /api/typos + ↓ + nginx matches location /api/ (longest prefix, NOT catch-all) + ↓ + proxy_pass → backend:8000 + ↓ + FastAPI returns 404 (✓ correct, not caught by SPA fallback) +``` + +### Critical Implementation Notes + +- **Never add `try_files` to the `/api/` location block** — this would hide backend 404s. +- **The `/api/` location takes precedence over the `/` catch-all** regardless of where it appears in the config, because nginx selects the longest matching prefix among prefix locations. +- **No inherited `try_files` rules** — the `/api/` location has no global `try_files` that could affect it. +- **Backend 404 responses pass through nginx unchanged** — nginx does not rewrite 404 responses from the backend. + +--- + +## 9.2a nginx Security Headers + +nginx adds the following OWASP-recommended security headers to all responses: + +| Header | Value | Purpose | +|---|---|---| +| **Content-Security-Policy** | `default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' data:; font-src 'self'; connect-src 'self'; frame-ancestors 'none';` | Prevents XSS attacks by restricting script execution to same-origin. `style-src 'unsafe-inline'` is required for Fluent UI v9's inline styles. | +| **X-Frame-Options** | `DENY` | Prevents clickjacking by disallowing iframe embedding. | +| **X-Content-Type-Options** | `nosniff` | Prevents MIME-sniffing; browsers must respect the declared Content-Type.
| +| **Referrer-Policy** | `no-referrer` | Prevents leaking internal URLs in the `Referer` header to third-party resources. | +| **Permissions-Policy** | `geolocation=(), microphone=(), camera=()` | Disables access to browser APIs not needed by the application. | +| **Strict-Transport-Security** | *Commented out* | Must only be enabled after HTTPS is fully configured. Uncomment when TLS termination is production-ready. | + +All headers use the `always` directive, ensuring they are included in error responses (4xx, 5xx) as well. + +### CSP and Fluent UI + +Fluent UI v9 applies styles via inline `style` attributes on DOM elements. To support this, `style-src 'unsafe-inline'` is required. A stricter CSP using nonces would require server-side rendering of the HTML shell, which is outside the current architecture. + +--- + +## 9.3 Deployment Constraints + +### Single-Executor Scheduler Requirement + +**BanGUI's background scheduler must run with exactly one executor process.** + +The application uses APScheduler's `AsyncIOScheduler`, which is bound to a single asyncio event loop and cannot be safely shared across multiple worker processes. If the app is deployed with `--workers N` (where N > 1), the following failures occur: + +- Each worker process creates its own independent scheduler instance. +- All background jobs execute **N times simultaneously** (once per worker). +- Results: + - **Duplicate blocklist imports** — the same IP ranges are banned N times. + - **Duplicate history entries** — the same historical events are recorded N times. + - **Duplicate ban operations** — bans are executed multiple times, with potential state conflicts. + - **SQLite lock contention** — concurrent writes to the same database from N workers cause lock timeouts. + +### Enforcement Mechanism + +BanGUI enforces single-executor safety through a **database-backed lock** that works reliably in container orchestration environments: + +1. 
**Fast check (env var):** On startup, the `BANGUI_WORKERS` environment variable is checked (if set). If explicitly set to a value > 1, startup fails immediately with a clear error. + +2. **Authoritative check (database lock):** During startup, BanGUI acquires an atomic database lock in the `scheduler_lock` table. This lock: + - Uses a singleton row (id=1) to prevent race conditions across simultaneously starting instances + - Stores the PID, hostname, creation timestamp, and heartbeat timestamp of the lock holder + - Is considered stale if the heartbeat hasn't been updated for 60 seconds + - Is automatically cleaned up on stale instance detection, allowing failover in rolling deployments + +3. **Lock acquisition (startup):** + - Clean up any stale locks (heartbeat older than 60 seconds) + - Attempt to insert a new lock row with this instance's PID and hostname + - If the INSERT fails (row already exists), reject startup with a clear error + - If the INSERT succeeds, this instance holds the lock and will start the scheduler + +4. **Lock maintenance (runtime):** A periodic background task (`scheduler_lock_heartbeat`) updates the lock's heartbeat timestamp every 10 seconds, keeping it alive and preventing false positives from temporary load spikes. + +5. **Lock release (shutdown):** On graceful shutdown, the lock is released, allowing other instances to acquire it. + +**Why database-backed instead of filesystem?** + +Database-backed locking is more reliable in container orchestration because: +- **Atomicity:** SQLite transactions are atomic — no race condition window between checking and inserting +- **Container-safe:** Works across containers with shared database volumes (no NFS/SMB edge cases) +- **Stale detection:** Heartbeat-based TTL is simpler and more reliable than PID-based checks (PID reuse is common in containers) +- **No false positives:** Timestamp-based expiration eliminates issues with PID reuse + +### Startup Sequence with Scheduler Lock + +``` +1. 
DATABASE stage + └─ Initialize SQLite schema (including scheduler_lock table) + +2. WORKER_MODE stage (formerly first, now depends on DATABASE) + ├─ Fast check: Verify BANGUI_WORKERS env var if explicitly set + └─ Authoritative check: Acquire scheduler lock in database + → If lock held by another instance: Fail with clear error + → If lock acquired: Continue to GEO_CACHE stage + +3. (rest of startup continues as normal) +``` + +### Troubleshooting + +**Problem:** Startup fails with "Could not acquire scheduler lock" + +**Solution:** +1. Verify no other BanGUI instances are running +2. Inspect the lock: `sqlite3 bangui.db "SELECT * FROM scheduler_lock;"` +3. Check who holds the lock (hostname, PID, heartbeat time) +4. If stale (heartbeat older than 60 seconds), clean it: + ```bash + sqlite3 bangui.db "DELETE FROM scheduler_lock WHERE (strftime('%s', 'now') - heartbeat_at) > 60;" + ``` +5. Restart the failed instance + +**Problem:** Stale lock after instance crash + +BanGUI handles this automatically: +- The next instance to start will detect the stale lock (heartbeat older than 60 seconds) +- It will clean it up and acquire the lock +- The new instance starts the scheduler as normal + +No manual intervention is required.
+ +### Environment Variables + +- **`BANGUI_WORKERS`** (optional, default: unset) + - If set to `1` or unset: Normal operation (any number of instances may start, but only one holds the lock) + - If set to > `1`: Startup fails immediately with an error (fast check) + - Reason: Legacy env var for explicitly forbidding multi-worker deployments + +### Container Orchestration Examples + +**Docker Compose:** +- Single service instance (no scaling) — scheduler runs normally + +**Kubernetes:** +- Single Pod replica — scheduler runs normally +- Multiple Pod replicas (during rolling update) — old Pod releases lock on shutdown, new Pod acquires it + - No duplicate jobs, no startup failures + - Health check should allow 30-60 seconds for lock handoff + +**systemd / process manager:** +- Single process — scheduler runs normally +- Accidental multi-process restart — lock prevents duplicate jobs, other processes fail to start scheduler + +### Future Multi-Worker Support + +To safely support multiple workers in the future: + +1. **External job store:** Move APScheduler from in-memory to a persistent store (e.g., SQLAlchemy-backed job store with PostgreSQL or Redis). +2. **Distributed locking:** Use a distributed lock (Redis, etcd) instead of database lock for better performance. +3. **Process coordination:** Implement a process-to-worker pool communication mechanism so the scheduler runs only on one designated worker. + +Currently, the single-executor approach is simple, maintainable, and sufficient for BanGUI's operational requirements. The database lock provides reliable enforcement across all deployment scenarios. + +--- + +## 10. Observability & Distributed Tracing + +BanGUI implements **distributed tracing** via **correlation IDs** to correlate errors and requests across frontend and backend systems. 
+ +### Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Frontend (React + TypeScript) │ +├─────────────────────────────────────────────────────────────┤ +│ • API Client generates session-scoped UUID4 (correlation ID)│ +│ • Telemetry service records structured events │ +│ • Error boundaries catch render errors │ +│ • All telemetry events include correlation ID for tracing │ +└────────────────────┬────────────────────────────────────────┘ + │ + ├─ Every request includes + │ X-Correlation-ID header + │ +┌────────────────────┴────────────────────────────────────────┐ +│ Backend (Python + FastAPI + structlog) │ +├─────────────────────────────────────────────────────────────┤ +│ • CorrelationIdMiddleware extracts/generates correlation ID │ +│ • All logs automatically include correlation ID │ +│ • Error responses include correlation_id field │ +│ • structlog outputs JSON with correlation ID in all events │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Correlation ID Flow + +1. **Frontend → Backend:** + - API client generates/retrieves session-scoped UUID4 + - UUID4 sent in `X-Correlation-ID` request header + - All requests use same session UUID (set once, reused) + +2. **Backend Processing:** + - CorrelationIdMiddleware extracts/generates correlation ID + - ID stored in structlog contextvars + - All structured log entries include correlation ID automatically + - Error responses include `correlation_id` field in JSON + +3. **Backend → Frontend:** + - Response includes `X-Correlation-ID` header + - Error responses include `correlation_id` in response body + - Frontend error handlers extract correlation ID + +4. 
**Frontend Error Logging:** + - Error handlers extract correlation ID from API response + - Telemetry service logs error with correlation ID + - Browser console and telemetry backends receive linked events + +### Example: Correlating an Error Across Systems + +**Scenario:** User clicks "Ban IP" button → API returns 500 error → error logged and displayed + +**Frontend telemetry event:** +```json +{ + "event": "api_error", + "severity": "error", + "message": "Server error banning IP", + "correlation_id": "550e8400-e29b-41d4-a716-446655440000", + "context": { + "status": 500, + "endpoint": "/api/bans" + }, + "timestamp": "2025-04-30T18:30:00.000Z" +} +``` + +**Backend structured log:** +```json +{ + "event": "ban_service_error", + "severity": "error", + "message": "Failed to ban IP", + "correlation_id": "550e8400-e29b-41d4-a716-446655440000", + "context": { + "ip": "192.168.1.1", + "jail": "sshd", + "error": "fail2ban socket error" + }, + "timestamp": "2025-04-30T18:30:00.000Z" +} +``` + +**Troubleshooting:** Engineer searches logs for correlation ID `550e8400-e29b-41d4-a716-446655440000` and finds all related events (request received, jail lookup, fail2ban call, error response) in order. 
+ +### Implementation Details + +**Backend:** +- Middleware: `app/middleware/correlation.py` + - Generates UUID4 if `X-Correlation-ID` header missing + - Stores in structlog contextvars for automatic inclusion in all logs + - Adds correlation ID to response header and error responses +- All error handlers include `correlation_id` in `ErrorResponse` +- See `backend/app/models/response.py` for `ErrorResponse.correlation_id` field + +**Frontend:** +- API client: `frontend/src/api/client.ts` + - Generates session-scoped UUID4 on first use + - Includes in `X-Correlation-ID` header for all requests + - Extracts from response headers and stores in `ApiError` +- Telemetry service: `frontend/src/utils/telemetry.ts` + - Structured event logging with correlation ID support + - Redaction utilities for privacy/security + - Handlers for custom backends (console logger by default) +- Error handlers: `frontend/src/utils/fetchError.ts` + - Extract correlation ID from API errors + - Log with telemetry for distributed tracing +- Error boundaries: `frontend/src/components/{Error,Page,Section}ErrorBoundary.tsx` + - Catch render-time exceptions + - Log with telemetry for observability + - **Note:** `ErrorBoundary.componentDidCatch()` accesses `errorInfo.componentStack` which is not part of the public React.ErrorInfo type definition. This is a React DevTools implementation detail accessed via type casting (`as any`). It captures the React component hierarchy for debugging but may change in future React versions. See [React issue #3623](https://github.com/facebook/react/issues/3623) for context. 
+ +### Privacy & Security + +- **No sensitive data logged:** + - Passwords, tokens, session IDs never logged + - PII (names, emails, IPs) logged only with explicit intent and redaction + - Redaction utilities: `telemetry.redact()`, `telemetry.redactObject()` + +- **Backend:** Correlation IDs use opaque UUID4 (no user data embedded) +- **Frontend:** Same session UUID for all requests (safe to expose in logs) + +### Future Enhancements + +1. **Backend error telemetry aggregation:** + - Send structured logs to observability platform (DataDog, Grafana Loki, etc.) + - Query by correlation ID to trace entire request flow + +2. **Frontend error reporting:** + - Send frontend telemetry to backend `/api/telemetry` endpoint + - Store alongside backend logs for unified view + +3. **Metrics & dashboards:** + - Error rates by endpoint, severity, error type + - Latency percentiles and distribution + - Request success/failure trends + +--- + +## 11. Design Principles These principles govern all architectural decisions in BanGUI. | Principle | Application | |---|---| | **Separation of Concerns** | Frontend and backend are independent. Backend layers (router → service → repository) never mix responsibilities. | +| **Service Independence** | Services must not import other services at the same layer (e.g., `jail_config_service` must not import `jail_service`). Shared logic belongs in the utils layer (`app/utils/`). This prevents circular dependencies, improves testability, and keeps each service focused on its domain. | | **Single Responsibility** | Each module, service, and component has one well-defined job. | | **Dependency Inversion** | Services depend on abstractions (protocols), not concrete implementations. FastAPI `Depends()` wires everything. | | **Async Everything** | All I/O is non-blocking. No synchronous database, HTTP, or socket calls anywhere in the backend. 
| diff --git a/Docs/Backend-Development.md b/Docs/Backend-Development.md index e2932dc..d95f651 100644 --- a/Docs/Backend-Development.md +++ b/Docs/Backend-Development.md @@ -69,6 +69,239 @@ from fail2ban.client.csocket import CSSocket # noqa: E402 - `print()` for logging — use `structlog`. - `json.loads` / `json.dumps` on Pydantic models — use `.model_dump()` / `.model_validate()`. +### Timestamp Handling + +Timestamp consistency is critical for accurate ban history queries across the dashboard and history endpoints. Follow these rules: + +**Rule 1: Use consistent UTC timestamps** +- All timestamps in the database are stored as Unix epochs (seconds since 1970-01-01 UTC). +- fail2ban stores timestamps using `time.time()`, which is always UTC epoch seconds. +- When querying fail2ban's SQLite database by timestamp, use `app.utils.time_utils.since_unix()` (not manual datetime calculations). + +**Rule 2: Time-range windows include a 60-second slack** +- The `since_unix()` function includes a 60-second slack window (`TIME_RANGE_SLACK_SECONDS` in `app.utils.constants`). +- This slack accommodates: + - Clock drift between the local system and fail2ban. + - Test seeding delays when timestamps are manually set to exact boundaries. +- The slack ensures that dashboard and history queries return consistent row counts for the same time range. + +**Rule 3: Never duplicate timestamp calculation logic** +- All services that query by time range must import and use `since_unix()`. +- Do not recalculate timestamps locally using `datetime` or `time` modules in service code. +- If you need a timestamp for a time range, use `since_unix()`. 
+ +**Example:** +```python +from app.utils.time_utils import since_unix + +# Get all bans from the last 24 hours (with 60-second slack) +since_ts: int = since_unix("24h") +rows = await db.execute( + "SELECT * FROM bans WHERE timeofban >= ?", + (since_ts,) +) +``` + +### Database Performance & Indexing + +Large archive datasets can degrade query performance without proper indexing. The `history_archive` table supports multiple filter patterns: + +**Query Patterns (Indexed for Performance):** + +1. **MAX(timeofban)** — `history_sync_task` queries for the latest timestamp to know where to resume syncing from fail2ban. This is a covering index lookup. + +2. **Jail filter with time ordering** — Dashboard and API endpoints filter by `jail` and sort by `timeofban DESC` for pagination. This is accelerated by `idx_history_archive_jail_timeofban`. + +3. **Time-range filter** — Queries filter by `timeofban >= since` to fetch recent records. This uses the composite index `idx_history_archive_timeofban_jail_action` which includes `timeofban` as the leading column for efficient range scans. + +4. **IP filter** — Users can search by exact IP or IP prefix (using `LIKE ip%`). The `idx_history_archive_ip` index accelerates these searches. + +5. **Action filter** — Queries may filter by action ('ban' or 'unban'). The `idx_history_archive_action` index supports this. + +6. **Purge old entries** — Background tasks delete entries older than a threshold (`timeofban < cutoff`). This uses `idx_history_archive_timeofban_jail_action`. + +**Current Indexes (defined in `backend/app/db.py` Migration 5):** + +- `idx_history_archive_jail_timeofban(jail, timeofban DESC)` — Composite index for jail-filtered queries. +- `idx_history_archive_timeofban_jail_action(timeofban DESC, jail, action)` — Covering index for time-range queries and MAX lookups. +- `idx_history_archive_ip(ip)` — Single-column index for IP searches. 
+- `idx_history_archive_action(action)` — Single-column index for action filtering. + +**Benchmark Results:** + +Query benchmarks (see `backend/tests/test_repositories/test_history_archive_indexing.py`) verify that common operations complete within expected thresholds on datasets with 10,000+ records: + +| Operation | Time Budget | Actual (with indexes) | +|-----------|-------------|----------------------| +| MAX(timeofban) | <0.01s | ✓ Uses covering index | +| Count with jail filter | <0.10s | ✓ Covering index scan | +| List with jail + order | <0.05s | ✓ Index fully utilized | +| Time-range filter | <0.05s | ✓ Range scan on timeofban | +| Combined filters | <0.05s | ✓ Composite indexes used | + +**Adding New Indexes:** + +If you add new query patterns to `history_archive_repo.py`: + +1. **Analyze the WHERE and ORDER BY clauses** — Identify which columns are filtered and sorted. +2. **Check EXPLAIN QUERY PLAN** in a local test: + ```python + async with db.execute("EXPLAIN QUERY PLAN SELECT ...") as cur: + rows = await cur.fetchall() + for row in rows: print(row[3]) # Print the plan text + ``` +3. **If the plan shows a full table scan, add an index** that matches the filter columns in order. +4. **Create a migration** in `backend/app/db.py` following the pattern from Migration 5. +5. **Add a benchmark test** to verify the new index improves query performance. + +**Index Tradeoffs:** + +- **Pros**: Faster SELECT queries, reduced CPU during queries. +- **Cons**: Slower INSERT/UPDATE/DELETE (indexes must be maintained), larger database file size. + +--- + +## 7.5 Cursor-Based Pagination for Large Result Sets + +**Problem:** Offset-based pagination (`LIMIT ? OFFSET ?`) scans and discards N rows before fetching the next N. On a 10M-row table, fetching the last page takes 15+ seconds because SQLite must evaluate all previous rows. + +**Solution:** Use keyset pagination (cursor-based) with `WHERE id > last_id` instead of OFFSET. 
This leverages indexes to jump directly to the next page in O(log N) time. + +### Offset vs. Cursor Pagination + +| Aspect | Offset (`LIMIT ? OFFSET ?`) | Cursor (`WHERE id > ?`) | +|--------|-----|-----| +| Performance | O(N) — scans N rows to fetch | O(log N) — index jump | +| Last page on 10M rows | 15+ seconds ⚠️ | <50ms ✓ | +| API Contract | `page`, `page_size` | `cursor`, `page_size` | +| Backward nav | Stateless (any page any time) | Stateless (cursor is opaque) | +| Count query | Required (slow on large tables) | Not required | + +### When to Use Cursor Pagination + +- ✓ **Use cursor pagination** for large tables (>100K rows) with frequent pagination queries +- ✓ **Use cursor pagination** for real-time feeds where rows are constantly added/modified +- ✓ **Use cursor pagination** if your API already exposes cursor tokens to clients +- ✗ **Skip cursor pagination (offset is fine)** for small datasets or administrative interfaces where performance is not critical + +### Implementation Pattern + +**1. Add indexes on sort columns:** + +Cursor queries use `WHERE id > :cursor ORDER BY id ASC LIMIT :page_size`. Ensure the sort column is indexed or part of a composite index. + +**2. Use cursor pagination utilities:** + +```python +from app.utils.pagination import encode_cursor, decode_cursor + +# Fetch next page using cursor +last_row_id = decode_cursor(cursor) if cursor else None +items, has_more = await repo.get_items_keyset( + page_size=50, + last_row_id=last_row_id, +) + +# Encode cursor for next page +next_cursor = encode_cursor(items[-1]["id"]) if items and has_more else None +``` + +**3. Return cursor in pagination metadata:** + +The response includes `cursor` (for cursor pagination) in addition to `page`, `page_size`, and `has_next_page`. Because no COUNT query is run, `total` and `total_pages` are reported as `-1` (unknown): + +```json +{ + "items": [...], + "pagination": { + "page": 1, + "page_size": 50, + "total": -1, + "total_pages": -1, + "has_next_page": true, + "has_prev_page": false, + "cursor": "eyJpZCI6IDQyN30=" + } +} +``` + +**4.
Repositories supporting cursor pagination:** + +- `import_log_repo.list_logs_keyset()` — Import log with cursor pagination +- `history_archive_repo.get_archived_history_keyset()` — Archived bans with cursor pagination + +Both functions return `(items, has_more)` instead of `(items, total)` to avoid expensive COUNT queries. + +### Cursor Format & Security + +Cursors are **opaque base64-encoded JSON** objects. Clients must not decode or modify them: + +```python +# Cursor structure (internal only — never expose raw JSON to client) +{"id": 12345} + +# Base64-encoded cursor sent to client: +# eyJpZCI6IDEyMzQ1fQ== + +# Decode with decode_cursor() which validates the format +last_id = decode_cursor(cursor) +``` + +Benefits: +- ✓ **Opaque to client** — Format can evolve without breaking API compatibility +- ✓ **Deterministic** — Same row ID always produces the same cursor +- ✓ **Validated on decode** — Invalid/malformed cursors are rejected with clear errors (cursors are not signed, so treat the decoded ID as untrusted input) + + +For `history_archive`, the read-heavy workload justifies the indexes described in "Database Performance & Indexing" above because: +- Inserts are batched during sync (one batch per minute), not per-request. +- Deletes happen once per day during purge. +- SELECT queries run on every API request to the history endpoint. + +--- + +## 7.6 Never Load Unbounded Result Sets + +**Problem:** Loading large result sets entirely into Python memory causes: +- Memory spikes that crash containers +- Slow dashboard performance +- Unbounded database file growth + +**Rule:** Never load unbounded result sets. Always use SQL aggregation or pagination. + +**Anti-patterns:** + +```python +# BAD — loads all rows into memory +all_rows = await history_archive_repo.get_all_archived_history(db=db, ...) + +# GOOD — SQL aggregation returns lightweight counts +ip_counts = await history_archive_repo.get_ip_ban_counts(db=db, ...)
+``` + +**SQL aggregation patterns for common operations:** + +| Operation | SQL Pattern | Repository Function | +|-----------|-------------|---------------------| +| Count by IP | `SELECT ip, COUNT(*) FROM bans GROUP BY ip` | `get_ip_ban_counts()` | +| Count by jail | `SELECT jail, COUNT(*) FROM bans GROUP BY jail` | `get_jail_ban_counts()` | +| Count by time bucket | `SELECT CAST((timeofban - ?) / ? AS INTEGER) AS bucket_idx, COUNT(*) ... GROUP BY bucket_idx` | `get_ban_counts_by_bucket()` | +| Paginated rows | `WHERE id < ? ORDER BY id DESC LIMIT ?` | `get_archived_history_keyset()` | + +**When to use SQL aggregation:** +- Computing totals, counts, or aggregations for display +- Building country/jail/geo maps from large datasets +- Any endpoint that needs only a summary, not full row data + +**When to use pagination:** +- Endpoints that return individual records for display (ban lists, history) +- Any endpoint where clients need access to specific rows + +**Memory budgets for reference:** +- 1M ban records ≈ 200-400 MB if fully materialized as Python dicts +- SQL aggregation returns lightweight results: {ip, count} pairs = a few KB for the same 1M records +- Keyset pagination returns only the page size (typically 50-200 rows) + --- ## 3. Project Structure @@ -100,6 +333,85 @@ backend/ - **Repositories** handle raw database queries — nothing else. - Never put business logic inside routers or repositories. +### Service Dependencies and Injection + +Services should **never** directly import other services: doing so creates hidden coupling and makes testing harder. Instead: + +1. **Define clear service interfaces** using Protocol classes in `app/services/protocols.py`. +2. **Make dependencies explicit** by passing them as function parameters with optional defaults. +3. **Use lazy imports** for fallback singletons (not at module level). +4. **Inject services via FastAPI dependencies** when called from routers.
+ +**Example:** The `history_service` depends on `Fail2BanMetadataService` to resolve the fail2ban database path: + +```python +# Good — dependency passed as parameter +async def list_history( + socket_path: str, + fail2ban_metadata_service: Fail2BanMetadataService | None = None, +) -> HistoryListResponse: + if fail2ban_metadata_service is None: + # Lazy import fallback for backward compatibility + from app.services.fail2ban_metadata_service import default_fail2ban_metadata_service + fail2ban_metadata_service = default_fail2ban_metadata_service + ... +``` + +Routers inject the service dependency explicitly: + +```python +from app.dependencies import Fail2BanMetadataServiceDep + +@router.get("/api/history") +async def get_history( + fail2ban_metadata_service: Fail2BanMetadataServiceDep, +) -> HistoryListResponse: + return await history_service.list_history( + socket_path, + fail2ban_metadata_service=fail2ban_metadata_service, + ) +``` + +This pattern prevents circular imports, makes services testable, and allows easy mocking in tests. + +### Mutable Runtime State + +All mutable runtime state (state that changes during the application's lifetime) **must** be stored in `RuntimeState` defined in `app/utils/runtime_state.py`. This centralizes state management, prevents accidental global mutable variables, and makes state management testable and synchronization-safe. + +**Allowed locations for mutable state:** + +1. **RuntimeState fields** — Core application state (e.g., `server_status`, `last_activation`, `pending_recovery`, `runtime_settings`). Managed through dedicated functions (e.g., `record_activation()`, `clear_pending_recovery()`). +2. **Nested service state** — Service-specific mutable state (e.g., `JailServiceState` for jail capability detection cache) is nested within `RuntimeState` as a field. Services receive their state via dependency injection. +3. **Controlled via dependencies** — State is injected into services and routers using FastAPI `Depends()`. 
This ensures single-source-of-truth and testability. + +**Example — jail_service state management:** + +```python +# Define service-specific state (in app/utils/runtime_state.py) +@dataclass +class JailServiceState: + backend_cmd_supported: bool | None = None + backend_cmd_lock: asyncio.Lock | None = None + +# Nested in RuntimeState +@dataclass +class RuntimeState: + jail_service_state: JailServiceState = field(default_factory=JailServiceState) + ... + +# Injected into services via dependency +async def list_jails(socket_path: str, state: JailServiceState) -> JailListResponse: + backend_cmd_is_supported = await _check_backend_cmd_supported(client, name, state) + ... + +# Routers inject state through FastAPI dependencies +@router.get("/api/jails") +async def get_jails(state: JailServiceStateDep) -> JailListResponse: + return await jail_service.list_jails(socket_path, state) +``` + +**Why:** Centralizing mutable state prevents race conditions, makes concurrency boundaries explicit, simplifies testing (each test gets a fresh state object), and prepares for multi-worker deployments (shared state would need to be extracted to Redis, database, or shared memory). + --- ## 4. FastAPI Conventions @@ -110,17 +422,40 @@ backend/ - Use **Depends()** for dependency injection (database sessions, services, auth). - Group endpoints into routers by feature domain (`routers/jails.py`, `routers/bans.py`, …). - Use appropriate HTTP status codes: `201` for creation, `204` for deletion with no body, `404` for not found, etc. +- Protected endpoints should return `401 Unauthorized` or `403 Forbidden` when the session is invalid or expired; the frontend treats these responses as a session-expiry event and redirects the user to `/login`. - Use **HTTPException** or custom exception handlers — never return error dicts manually. +- All successful responses must use a standardized Pydantic response model. 
List and collection endpoints should wrap data in `items`, `total`, and optional pagination metadata. Detail endpoints must expose a single domain object under a named field (for example `jail`, `status`, or `settings`). Command endpoints must use a `CommandResponse`-style wrapper with `message` and `success`. - **GET endpoints are read-only — never call `db.commit()` or execute INSERT/UPDATE/DELETE inside a GET handler.** If a GET path produces side-effects (e.g., caching resolved data), that write belongs in a background task, a scheduled flush, or a separate POST endpoint. Users and HTTP caches assume GET is idempotent and non-mutating. ```python # Good — pass db=None on GET so geo_service never commits result = await geo_service.lookup_batch(ips, http_session, db=None) - # Bad — triggers INSERT + COMMIT per IP inside a GET handler + # Bad — triggers INSERT + COMMIT per GET inside a GET handler result = await geo_service.lookup_batch(ips, http_session, db=app_db) ``` +### OpenAPI Schema Synchronization + +**Critical:** The OpenAPI schema is the single source of truth for frontend types. When you add, modify, or remove endpoints or response models: + +1. **FastAPI automatically updates the schema** based on your Pydantic models and endpoint definitions. +2. **The frontend regenerates types** from the schema on every build: `npm run generate:types`. +3. **Ensure your Pydantic models are accurate** — they are directly serialized into the schema and used to generate frontend types. +4. **Test type generation locally** before committing: + ```bash + cd frontend + npm run generate:types # Generates src/types/generated.ts + npm run build # Build should succeed if types match + ``` +5. **The backend must be running** for type generation to work (the tool fetches `/api/openapi.json`). +6. **Commit generated types** alongside backend changes — they must always be in sync. 
+ +**Never:** +- Manually edit `src/types/generated.ts` on the frontend — regenerate from the schema instead. +- Commit backend changes without ensuring the frontend can regenerate types. +- Assume the OpenAPI schema is correct — validate your Pydantic model's `field` descriptions and types are as intended. + ```python from fastapi import APIRouter, Depends, HTTPException, status from app.models.jail import JailResponse, JailListResponse @@ -134,22 +469,455 @@ async def list_jails(service: JailService = Depends()) -> JailListResponse: return JailListResponse(jails=jails) ``` + +### Dependency Layering: Enforcing the Repository Boundary + +The **repository boundary** separates database-aware code from application logic. This is enforced through dependency injection. + +For a complete overview of BanGUI's DI pattern, including the composition root, service wiring, and lifecycle management, see [Architekture.md § 2.3 Dependency Wiring and Service Composition](Architekture.md#23-dependency-wiring-and-service-composition). + +| Layer | Responsibilities | Dependencies | +|---|---|---| +| **Routers** | Receive requests, validate input, return responses. | Service context dependencies (SessionServiceContextDep, BlocklistServiceContextDep), settings, auth. Never raw database connections. | +| **Services** | Contain business logic, orchestrate operations. | Other services, repositories. May receive `aiosqlite.Connection` for repository operations. | +| **Repositories** | Execute all SQL queries. All database knowledge lives here. | `aiosqlite.Connection` (from callers). | + +**Rule: Routers must NOT depend on `DbDep` (raw database connections).** + +Instead, routers should: +1. Depend on **service context dependencies** like `SessionServiceContextDep`, `BlocklistServiceContextDep`, etc. +2. These context dependencies combine the database connection and related repositories. +3. Pass the context to services, which internally orchestrate database operations. 
+ +**Service Context Dependencies Available:** +- `SessionServiceContextDep` — Contains `db` and `session_repo` for session operations. +- `BlocklistServiceContextDep` — Contains `db`, `blocklist_repo`, `import_log_repo`, `settings_repo`. +- `SettingsServiceContextDep` — Contains `db` and `settings_repo`. +- `BanServiceContextDep` — Contains `db` and `fail2ban_db_repo`. +- `HistoryServiceContextDep` — Contains `db`, `fail2ban_db_repo`, `history_archive_repo`. + +**Why:** +- **Enforcement**: Not exporting `DbDep` from the dependencies module makes it impossible for routers to accidentally bypass repositories. +- **Clarity**: Service context dependencies explicitly declare which database operations a router needs. +- **Testability**: Services and routers are easier to test when they depend on repositories (which can be mocked) rather than raw connections. + +**Example:** +```python +# ✅ GOOD — router depends on service context +@router.post("/login") +async def login( + body: LoginRequest, + response: Response, + session_ctx: SessionServiceContextDep, # Contains db + session_repo + _auth: AuthDep, +) -> LoginResponse: + return await auth_service.login( + session_ctx.db, + password=body.password, + session_repo=session_ctx.session_repo, + ... + ) + +# ❌ BAD — router depends on raw db (DbDep is not exported for this reason) +@router.post("/login") +async def login( + body: LoginRequest, + db: DbDep, # ← Cannot import DbDep in routers + _auth: AuthDep, +) -> LoginResponse: + return await auth_service.login(db, password=body.password, ...) +``` + +**DEPRECATED: DbDep** +- The `DbDep` type alias is provided for backward compatibility only. +- DO NOT use in new code. Use service context dependencies instead. +- See `backend/app/dependencies.py` for available service contexts. + +--- + +## 4.1 API Response Envelope Policy + +All API responses must follow a **consistent wrapper pattern**. 
This standardization reduces frontend branching logic, prevents integration bugs, and makes the API easier to document and maintain. + +### Response Patterns + +#### Pattern 1: Paginated Lists + +Use `PaginatedListResponse[T]` for endpoints that return paginated collections: + +```python +from app.models.response import PaginatedListResponse + +class JailListResponse(PaginatedListResponse[JailSummary]): + """Response for ``GET /api/jails``.""" + pass + +# Returns: +{ + "items": [...], # T[] + "pagination": { + "page": 2, # int: current page (1-based) + "page_size": 20, # int: items per page + "total": 100, # int: total items across all pages + "total_pages": 5, # int: computed total number of pages + "has_next_page": true, # bool: whether more pages exist + "has_prev_page": true # bool: whether previous pages exist + } +} +``` + +**When to use:** Endpoints that support pagination parameters (`page`, `page_size`, `limit`, `offset`). + +#### Pattern 2: Non-Paginated Collections + +Use `CollectionResponse[T]` for endpoints that return a complete collection without pagination: + +```python +from app.models.response import CollectionResponse + +class JailConfigListResponse(CollectionResponse[JailConfig]): + """Response for ``GET /api/config/jails``.""" + pass + +# Returns: +{ + "items": [...], # T[] + "total": 42 # int: total items +} +``` + +**When to use:** Endpoints that return a complete collection (not paginated). The frontend can render all items without worrying about paging. + +#### Pattern 3: Single-Item Detail Responses + +Use **domain-specific field names** (not generic wrappers) for detail endpoints: + +```python +class JailDetailResponse(BaseModel): + """Response for ``GET /api/jails/{name}``.""" + jail: Jail + ignore_list: list[str] + ignore_self: bool + +# Returns: +{ + "jail": { ... }, # Jail object + "ignore_list": [...], # Additional context + "ignore_self": true +} +``` + +**When to use:** Endpoints that fetch a single entity. 
Use the entity name as the field (jail, status, settings, etc.). + +**Field naming:** +- Primary entity uses its own name: `jail`, `status`, `settings`, etc. +- Related or supplementary data uses descriptive names: `ignore_list`, `warnings`, `metadata`, etc. + +#### Pattern 4: Command/Action Responses + +Use `CommandResponse` for endpoints that execute commands: + +```python +from app.models.response import CommandResponse + +class JailCommandResponse(CommandResponse): + """Generic response for jail control commands.""" + jail: str # Target identifier (optional) + +# Returns: +{ + "message": "Jail 'sshd' started.", + "success": true, + "jail": "sshd" # Optional: target identifier +} +``` + +**When to use:** POST/PUT/DELETE endpoints that perform operations (start jail, ban IP, update config, etc.). + +**Fields:** +- `message: str` — Human-readable result or error description. +- `success: bool` — Operation succeeded (default: true). Use false for non-exception error handlers. +- Optional domain-specific fields (jail, ip, etc.) to identify the affected resource. + +#### Pattern 5: Aggregation Responses + +Use domain-specific field names for aggregated data: + +```python +class BansByJailResponse(BaseModel): + """Response for ``GET /api/dashboard/bans/by-jail``.""" + jails: list[JailBanCount] # Aggregated per-jail data + total: int # Total count across all jails + +class BansByCountryResponse(BaseModel): + """Response for ``GET /api/dashboard/bans/by-country``.""" + countries: dict[str, int] # Country code → count + country_names: dict[str, str] # Country code → name + bans: list[DashboardBanItem] # Full list for rendering companion table + total: int # Total ban count + +# Returns: +{ + "jails": [ { "jail": "sshd", "count": 42 }, ... ], + "total": 500 +} +``` + +**When to use:** Endpoints that return computed/aggregated data. Use field names that reflect the data (jails, countries, buckets, etc.). 
+ +### Summary Table + +| Pattern | Used for | Field Names | Example | +|---------|----------|---|---| +| **PaginatedListResponse** | Paginated collections | `items`, `pagination` (page, page_size, total, total_pages, has_next_page, has_prev_page) | `GET /api/dashboard/bans` | +| **CollectionResponse** | Complete collections | `items`, `total` | `GET /api/config/jails` | +| **Detail Response** | Single entity + metadata | Entity name + descriptors | `GET /api/jails/{name}` | +| **CommandResponse** | Action results | `message`, `success` + optional identifiers | `POST /api/jails/{name}/start` | +| **Aggregation Response** | Computed data | Domain-specific names | `GET /api/dashboard/bans/by-jail` | + +### Rules + +1. **Always wrap lists in `items` field** — Consistency aids frontend parsing. + - ✅ `{ "items": [...], "total": 100 }` + - ❌ `{ "jails": [...], "total": 100 }` (for list endpoints; OK for aggregations) + +2. **Aggregation responses are exceptions** — They use domain-specific field names because the data represents computed results, not a simple list. + - ✅ `{ "countries": {...}, "jails": [...], "total": 100 }` + +3. **Every response with >1 item must include `total`** — Enables frontend to understand scale. + +4. **Paginated responses must include `page` and `page_size`** — Enables the frontend to render pagination controls. + +5. **No ad-hoc wrapper objects** — Don't invent new response shapes. Use the patterns above. + +### Standardized Pagination Query Parameters + +All paginated endpoints follow a consistent query parameter contract: + +| Parameter | Type | Constraints | Default | Notes | +|---|---|---|---|---| +| `page` | int | ≥ 1 | `1` | 1-based page number (not 0-based offset). | +| `page_size` | int | 1–500 | `100` | Items per page. Clients may request smaller pages for UI reasons. 
| + +**Implementation:** + +```python +from fastapi import Query +from app.utils.constants import DEFAULT_PAGE_SIZE +from app.utils.pagination import create_pagination_metadata + +@router.get("/items") +async def get_items( + page: int = Query(default=1, ge=1, description="1-based page number."), + page_size: int = Query( + default=DEFAULT_PAGE_SIZE, + ge=1, + le=500, + description="Items per page (max 500).", + ), +): + # Compute offset for database query + offset = (page - 1) * page_size + items = await db.fetch("SELECT * FROM items LIMIT ? OFFSET ?", page_size, offset) + total = await db.fetchval("SELECT COUNT(*) FROM items") + + # Create pagination metadata with computed fields + pagination = create_pagination_metadata(total, page, page_size) + + return PaginatedListResponse( + items=items, + pagination=pagination, + ) +``` + +**Helper functions** are available in `app.utils.pagination`: + +```python +from app.utils.pagination import get_offset, compute_total_pages, create_pagination_metadata + +# Calculate database offset from page and page_size +offset = get_offset(page, page_size) # Equivalent to (page - 1) * page_size + +# Calculate total pages for rendering pagination UI (optional) +total_pages = compute_total_pages(total, page_size) + +# Create complete pagination metadata with all computed fields +pagination = create_pagination_metadata(total, page, page_size) +# Returns PaginationMetadata with: page, page_size, total, total_pages, has_next_page, has_prev_page +``` + +**Rules:** + +1. **Use 1-based pages** — Not 0-based offsets. Page 1 is always the first page. +2. **Always provide defaults** — Use `DEFAULT_PAGE_SIZE` (100) and initial page 1. +3. **Cap maximum page_size at 500** — Prevents accidental DoS from enormous requests. +4. **Use `create_pagination_metadata()`** — Factory function computes derived fields (total_pages, has_next_page, has_prev_page) consistently. +5. 
**Respond with `PaginatedListResponse[T]`** — Must include `items` and `pagination` metadata object. + +--- + +## 4.2 Error Response Schema + +All error responses use a consistent machine-readable format that enables frontend code to branch reliably on error conditions without string-parsing error detail text. + +### Error Response Format + +Every non-2xx HTTP response body is a JSON object with this structure: + +```json +{ + "code": "jail_not_found", + "detail": "Jail 'example' not found", + "metadata": { + "jail_name": "example" + }, + "correlation_id": "550e8400-e29b-41d4-a716-446655440000" +} +``` + +**Fields:** +- **`code`** (string, required): Machine-readable error code for client-side branching. Examples: `jail_not_found`, `rate_limit_exceeded`, `authentication_required`, `invalid_input`. +- **`detail`** (string, required): Human-readable error message. Safe for displaying to users. +- **`metadata`** (object, optional): Structured context data relevant to the error. Only includes data safe for client consumption (no sensitive internal state). Examples: offending parameter names, resource identifiers, field error counts, time windows. +- **`correlation_id`** (string | null, optional): Unique request ID for tracing this error across logs and systems. Set by the `CorrelationIdMiddleware`. Use this to correlate client-side errors with server logs for debugging. 
+ +### Exception Hierarchy & Error Codes + +All domain exceptions inherit from `DomainError` (defined in `backend/app/exceptions.py`) and are organized by HTTP status category: + +| HTTP Status | Category Class | Error Codes | Use Case | +|---|---|---|---| +| **404** | `NotFoundError` | `not_found`, `jail_not_found`, `filter_not_found`, `action_not_found`, `config_file_not_found`, `blocklist_source_not_found`, `history_not_found` | Requested resource does not exist | +| **400** | `BadRequestError` | `invalid_input`, `config_validation_failed`, `config_operation_failed`, `jail_name_invalid`, `filter_name_invalid`, `action_name_invalid`, `config_file_name_invalid`, `filter_invalid_regex` | Invalid input, validation failure, malformed request | +| **409** | `ConflictError` | `conflict`, `jail_operation_failed`, `jail_already_active`, `jail_already_inactive`, `jail_not_in_config`, `action_already_exists`, `filter_already_exists`, `config_file_exists` | State conflict, resource already exists, invalid state transition | +| **500** | `OperationError` | `operation_failed`, `config_write_failed`, `config_file_write_failed`, `server_operation_failed`, `fail2ban_protocol_error` | Operation failure, write errors, unexpected failures | +| **503** | `ServiceUnavailableError` | `service_unavailable`, `config_dir_unavailable`, `fail2ban_unreachable` | Infrastructure/external service issues, temporary unavailability | +| **401** | `AuthenticationError` | `authentication_required` | Authentication or authorization failure, invalid/expired credentials | +| **429** | `RateLimitError` | `rate_limit_exceeded` | Rate limit exceeded, too many requests | + +**Note on request validation errors:** Pydantic validation errors (from request body type mismatches, missing required fields, etc.) are automatically caught by the `_request_validation_error_handler` and converted to `ErrorResponse` with `code="invalid_input"`. 
The `metadata` field includes `field_errors` (count of validation failures) and `first_field` (location of the first error field) to help clients debug malformed requests. + +### Implementing Error Handlers + +Every exception category has a corresponding exception handler registered in `backend/app/main.py`. When a domain exception is raised: + +1. FastAPI's exception handling middleware catches it. +2. The registered handler converts it to an `ErrorResponse` with HTTP status code. +3. The response is serialized as JSON with `code`, `detail`, and `metadata` fields. + +**Pattern for service code:** + +```python +from app.exceptions import JailNotFoundError, ConfigValidationError + +async def get_jail(name: str) -> Jail: + """Raises JailNotFoundError if jail not found.""" + jail = await db.fetchone("SELECT * FROM jails WHERE name = ?", (name,)) + if jail is None: + raise JailNotFoundError(name) # HTTP 404, code='jail_not_found' + return jail + +async def apply_config(config: JailConfig) -> None: + """Raises ConfigValidationError if invalid.""" + if not config.filter_name: + raise ConfigValidationError("filter_name is required") # HTTP 400, code='config_validation_failed' +``` + +### Adding New Exception Types + +1. **Choose the appropriate category** based on the HTTP status (NotFoundError for 404, BadRequestError for 400, etc.). +2. **Create a subclass** in `backend/app/exceptions.py`: + +```python +class MySpecificError(BadRequestError): + """Raised when X happens.""" + + error_code: str = "my_specific_error" + + def __init__(self, detail_msg: str, **context) -> None: + self.context = context + super().__init__(detail_msg) + + def get_error_metadata(self) -> dict[str, str | int | float | bool | None]: + """Return only safe, relevant metadata.""" + return {k: v for k, v in self.context.items() if k in ("offending_value", "constraint")} +``` + +3. **Use explicit error codes** — Don't derive them from the class name. 
This makes them self-documenting and prevents breakage on class renames. +4. **Implement `get_error_metadata()`** — Return only data safe for client consumption. Never leak internal state, file paths, or system details. +5. **Raise from service code** — Never from repositories or utils. Exceptions represent business logic violations, not infrastructure errors. + +### Exception Handler Hierarchy + +All domain exceptions are automatically converted to `ErrorResponse` via handlers registered in `backend/app/main.py`. FastAPI/Starlette picks the handler by walking the raised exception's class hierarchy (MRO), so the handler for the most specific registered class always wins, independent of registration order; `main.py` nevertheless registers them in precedence order for readability: + +```python +# Handlers are registered from most specific to least specific: +1. Network errors (Fail2BanConnectionError, etc.) → HTTP 502 +2. Auth/rate errors (AuthenticationError, RateLimitError) → HTTP 401/429 +3. Category handlers (NotFoundError, BadRequestError, ConflictError, etc.) → HTTP 404/400/409/500/503 +4. DomainError catch-all → HTTP 500 # ← Catches unregistered DomainError subclasses +5. HTTPException (FastAPI built-ins) → HTTP varies +6. ValueError (Pydantic validation) → HTTP 400 +7. Exception catch-all → HTTP 500 # ← Absolute last resort +``` + +**Important:** The `DomainError` catch-all handler (step 4) is the safety net. If you add a new `DomainError` subclass without placing it in a category (e.g., `class MyError(DomainError)` instead of `class MyError(BadRequestError)`), it will still get the correct `error_code` and `metadata` via this handler instead of silently falling through to the generic exception handler.
+ +**Critical caveat:** Every new `DomainError` subclass **must**: +- Define an `error_code` class attribute (e.g., `error_code: str = "my_error"`) +- Override `get_error_metadata()` if it needs to return context data +- Inherit from the appropriate category (NotFoundError, BadRequestError, ConflictError, OperationError, ServiceUnavailableError, AuthenticationError, or RateLimitError) + +If you forget to implement `error_code` or `get_error_metadata()`, the fallback to parent class implementations will produce misleading error codes and empty metadata — check your tests! + +**What NOT to do:** +- ❌ Don't raise `HTTPException` from service code (it bypasses the `ErrorResponse` format). +- ❌ Don't put sensitive information in `metadata` (database paths, SQL, internal IDs). +- ❌ Don't derive error codes from class names using regex (fragile and non-self-documenting). +- ❌ Don't create a `DomainError` subclass without a category (always inherit from one of the seven categories). + +### Frontend Error Parsing + +The frontend `ApiError` class parses error responses automatically: + +```typescript +import { api } from "src/api/client"; + +try { + const jail = await api.get("/jails/example"); +} catch (error) { + if (error instanceof ApiError) { + const code = error.errorResponse?.code; + + if (code === "jail_not_found") { + // Handle not found + console.log("Jail does not exist:", error.errorResponse?.metadata?.jail_name); + } else if (code === "rate_limit_exceeded") { + // Handle rate limit + showRateLimitModal(); + } else if (code === "authentication_required") { + // Handle auth — the frontend framework auto-redirects to /login + redirectToLogin(); + } + } +} +``` + +The `errorResponse` field contains the parsed error object with `code`, `detail`, and `metadata` fields, enabling reliable machine-readable branching. + --- ## 5. Pydantic Models -- Every model inherits from `pydantic.BaseModel`. -- Use `model_config = ConfigDict(strict=True)` where appropriate.
-- Field names use **snake_case** in Python, export as **camelCase** to the frontend via alias generators if needed. -- Validate at the boundary — once data enters a Pydantic model it is trusted. -- Use `Field(...)` with descriptions for every field to keep auto-generated docs useful. -- Separate **request models**, **response models**, and **domain (internal) models** — do not reuse one model for all three. +### Base Class + +Every model in `app/models/` **must** inherit from `BanGuiBaseModel` (defined in `app/models/response.py`), not from `pydantic.BaseModel` directly. ```python -from pydantic import BaseModel, Field -from datetime import datetime +from app.models.response import BanGuiBaseModel -class BanResponse(BaseModel): +class BanResponse(BanGuiBaseModel): ip: str = Field(..., description="Banned IP address") jail: str = Field(..., description="Jail that issued the ban") banned_at: datetime = Field(..., description="UTC timestamp of the ban") @@ -157,6 +925,189 @@ class BanResponse(BaseModel): ban_count: int = Field(..., ge=1, description="Number of times this IP was banned") ``` +`BanGuiBaseModel` sets `strict=True` and documents the naming policy. Do **not** override `model_config` on individual models unless you have a specific, documented reason. + +### API Field Naming Policy — snake_case everywhere + +All API field names use **`snake_case`** in Python, in the JSON wire format, and in the corresponding TypeScript interfaces. There is no `alias_generator` that converts to camelCase. + +- ✅ Python field: `active_jails` → JSON key: `"active_jails"` → TypeScript property: `active_jails` +- ❌ Do **not** add a camelCase `alias_generator` to individual models. +- ❌ Do **not** mix field name conventions within a single API response. + +This policy eliminates a whole class of frontend–backend contract bugs. If the naming policy ever needs to change (e.g. to emit camelCase), change `BanGuiBaseModel` once — all models update automatically. 
+ +### Other Model Rules + +- Validate at the boundary — once data enters a Pydantic model it is trusted. +- Use `Field(...)` with descriptions for every field to keep auto-generated docs useful. +- Separate **request models**, **response models**, and **domain (internal) models** — do not reuse one model for all three. +- **Models are leaf nodes**: Models in `app/models/` must not import from application-layer modules (`app.services`, `app.config`, `app.utils`). Models may only import from: + - Standard library and third-party packages (Pydantic, typing) + - Other models in `app/models/` (sibling models) + - `app.models.response` (response envelopes) + - Validation that requires app-level state (e.g., `settings`, allowed directories) must happen at the router or service layer, never in model validators. + +### Using `Literal` Types for Constrained Strings + +When a field should only accept a small set of predefined values, use `Literal` to enforce this at the type level: + +```python +from typing import Literal +from pydantic import BaseModel, Field + +LogLevel = Literal["CRITICAL", "ERROR", "WARNING", "NOTICE", "INFO", "DEBUG"] + +class GlobalConfigUpdate(BaseModel): + log_level: LogLevel | None = Field( + default=None, + description="Log level: CRITICAL, ERROR, WARNING, NOTICE, INFO, or DEBUG.", + ) +``` + +This provides: +- **Type safety** — IDEs and type checkers enforce valid values. +- **API documentation** — OpenAPI docs automatically list all allowed values. +- **Validation** — Pydantic rejects invalid values and provides a clear error message. + +### Field Validators and Validation Placement + +**Critical Constraint — No Import-Time Execution:** + +Module-level code in a model file — including its `import` statements — executes when the module is first imported, and validators and `default_factory` callables then run on every model instantiation, wherever in the application that happens.
For this reason: +- Validators must be **pure functions** with no side effects +- **NEVER** import or call runtime-dependent functions: `get_settings()`, file I/O, database queries, network calls, etc. +- **NEVER** import from `app.config`, `app.utils`, `app.services`, or `app.routers` in model files + +Violating this constraint creates **hidden circular dependencies** that prevent the application from starting. + +**Example of What NOT to Do:** +```python +# ❌ WRONG — the `app.config` import below executes when this module is imported: +from pydantic import Field +from app.config import get_settings + +class ConfigModel(BaseModel): + max_age: int = Field(default_factory=lambda: get_settings().max_log_max_age_days) + # ↑ get_settings() runs on every instantiation, tying the model to runtime configuration! +``` + +Field validators in models should only contain logic that is **stateless and does not depend on application configuration or state**. Validators must not import from `app.config`, `app.utils`, or `app.services`. + +For validation that depends on app-level state (e.g., file paths that must be within allowed directories), perform validation in the router or service layer: + +```python +# ✅ Good: Validation in router (has access to settings) +from fastapi import APIRouter +from app.config import get_settings +from app.utils.path_utils import validate_log_path + +@router.post("/jails/{name}/logpath") +async def add_log_path(name: str, body: AddLogPathRequest) -> None: + # Validate before using + validate_log_path(body.log_path) + await config_service.add_log_path(socket_path, name, body) + +# ❌ Avoid: Importing from app layer in model validators +# Do NOT do this in app/models/config.py: +# from app.config import get_settings +# @field_validator("log_path") +# def validate_log_path_field(cls, value: str) -> str: +# settings = get_settings() # ← Models must not import from app layer +# ...
+``` + +**Common Helper:** For shared path validation logic, use the `validate_log_path()` helper from `app.utils.path_utils` in your router or service, not in model validators. + +```python +from fastapi import HTTPException, status +from app.utils.path_utils import validate_log_path + +@router.delete("/{name}/logpath") +async def delete_log_path( + name: str, + log_path: str = Query(...), +) -> None: + try: + validate_log_path(log_path) + except ValueError as e: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=str(e), + ) from e + # ... rest of handler +``` + +**Key points:** +- Use `mode="after"` in model validators to validate after Pydantic's basic type coercion. +- Raise `ValueError` if validation fails; Pydantic converts it to an HTTP 400 response. +- For query parameters that cannot use Pydantic validators, use the `validate_log_path()` helper and raise HTTP 422. +- **Never use string prefix matching** for path validation (e.g., `path.startswith("/var/log")`). The helper uses `Path.relative_to()` to prevent bypasses like `/var/log_evil/file.log`. +- Symlinks are resolved before validating to prevent symlink-based escapes. + +### Model Type Usage by Layer + +**Pydantic models** are mandatory for all **external-facing** data structures — anything that crosses layer boundaries or is serialized to HTTP responses. **TypedDict** may be used **only** for internal, layer-private data structures where they provide precise typing without runtime overhead. + +**Rules:** + +1. **Routers (HTTP boundary):** All request and response types **must be Pydantic models**. FastAPI uses these for validation, serialization, and OpenAPI documentation. + - Use Pydantic request models for request bodies and query parameters. + - Use Pydantic response models in the `response_model` parameter. 
+ + ```python + # Good — Pydantic models for router layer + class JailStatsRequest(BaseModel): + jail_name: str + + class JailStatsResponse(BaseModel): + jail_name: str + active_bans: int + + @router.post("/stats", response_model=JailStatsResponse) + async def get_stats(req: JailStatsRequest) -> JailStatsResponse: + ... + ``` + +2. **Services (business logic):** Return types should be **Pydantic models** if the result is: + - Returned to a router (likely — they become API responses). + - Used across multiple services (shared interfaces). + - Exposed to external consumers (even indirectly). + + If a service returns a purely internal intermediate result used by a single caller, TypedDict is acceptable but should be rare. + + ```python + # Good — service returns Pydantic (may be used by multiple routers) + async def get_jail_details(name: str) -> JailDetailResponse: + ... + + # Acceptable — purely internal utility result + def _parse_fail2ban_response(raw: str) -> ParsedResponse: + """Internal helper—used only by this service.""" + ... + ``` + +3. **Repositories (data access):** Return types may use **TypedDict** because they represent **raw database rows** that: + - Are layer-private (only called by their own service). + - Do not cross HTTP boundaries directly. + - Benefit from lightweight typing without runtime validation. + + ```python + # Good — TypedDict for raw repository rows + class GeoRow(TypedDict): + ip: str + country_code: str | None + + async def load_all(db: aiosqlite.Connection) -> list[GeoRow]: + ... + ``` + + If a repository result becomes part of a service's public interface (returned to routers or other services), convert it to a Pydantic model. + +4. **Utilities and helpers:** Internal helper results may use TypedDict if they are not part of a public module interface. + +**Migration path:** Existing internal TypedDicts (e.g., `GeoCacheRow`, `ImportLogRow`) may remain as TypedDicts so long as they stay within their layer. 
If a type needs to cross layer boundaries (repo → service → router), convert it to a Pydantic model incrementally as you refactor that data flow. + --- ## 6. Async Rules @@ -205,32 +1156,546 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None]: await app.state.db.close() ``` ---- +### Fire-and-Forget Background Tasks -## 7. Logging +When you need to spawn a background task that runs independently without waiting for the result, use `asyncio.create_task()` with the `logged_task()` helper from `app.utils.async_utils`. This ensures exceptions in background tasks are always logged and never silently discarded. -- Use **structlog** for every log message. -- Bind contextual key-value pairs — never format strings manually. -- Log levels: `debug` for development detail, `info` for operational events, `warning` for recoverable issues, `error` for failures, `critical` for fatal problems. -- Never log sensitive data (passwords, tokens, session IDs). +**Why this matters:** asyncio reports an unhandled exception in a fire-and-forget task only when the task object is garbage-collected, as a "Task exception was never retrieved" message sent to the event loop's exception handler — possibly long after the failure. Without explicit logging, background errors (network failures, database writes, API timeouts) surface late or not at all in structured logs and are extremely hard to debug.
+**Pattern:** ```python +from app.utils.async_utils import logged_task + +# Bad — exceptions are silently discarded +asyncio.create_task(some_background_work()) + +# Good — exceptions are logged +asyncio.create_task( + logged_task(some_background_work(), "task_name"), + name="task_name", +) +``` + +The `logged_task()` wrapper: +- Wraps your coroutine to catch any exception +- Logs the exception with `log.exception()` (structlog automatically captures the traceback) +- Adds `task_name` to the structured log context +- Never re-raises — it's safe to use with `asyncio.create_task()` + +Example: +```python +import asyncio +from app.utils.async_utils import logged_task import structlog -log: structlog.stdlib.BoundLogger = structlog.get_logger() +log = structlog.get_logger() -async def ban_ip(ip: str, jail: str) -> None: - log.info("banning_ip", ip=ip, jail=jail) +async def geo_lookup_batch(ips: list[str]) -> None: + """Look up geolocation data for IPs asynchronously.""" try: - await _execute_ban(ip, jail) - log.info("ip_banned", ip=ip, jail=jail) - except BanError as exc: - log.error("ban_failed", ip=ip, jail=jail, error=str(exc)) + for ip in ips: + # May timeout, fail network call, or fail DB write + location = await lookup_ip_location(ip) + await db.execute(INSERT_GEO_SQL, (ip, location)) + await db.commit() + except Exception: + # All exceptions are automatically logged by logged_task() wrapper raise + +# In your request handler or service: +asyncio.create_task( + logged_task(geo_lookup_batch(uncached_ips), "geo_cache_batch"), + name="geo_cache_batch", +) ``` --- +## 6.1 Database Query Conventions + +### LIKE Queries and Wildcard Escaping + +SQLite's `LIKE` operator treats `%` (any sequence of characters) and `_` (any single character) as wildcards. When querying with user-supplied filters that may contain these characters, you must escape them to prevent unintended matches. 
+ +**The Problem:** +```python +# Bad — ip_filter="10.0.0_" matches "10.0.0.1", "10.0.0.2", etc. +ip_filter = "10.0.0_" +await db.execute( + "SELECT * FROM bans WHERE ip LIKE ?", + (f"{ip_filter}%",) # ← wildcard characters not escaped +) +``` + +**The Solution:** + +Use the `escape_like()` helper from `app.utils.fail2ban_db_utils`: + +```python +from app.utils.fail2ban_db_utils import escape_like + +# Good — wildcard characters are escaped +ip_filter = "10.0.0_" +await db.execute( + "SELECT * FROM bans WHERE ip LIKE ? ESCAPE '\\'", + (f"{escape_like(ip_filter)}%",) # ← underscores escaped to literal +) +``` + +**How `escape_like()` works:** + +The function escapes backslashes first, then `%` and `_` signs: +```python +def escape_like(s: str) -> str: + return s.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") +``` + +**Key rules:** +1. **Backslash escapes first** — to prevent double-escaping when the input contains backslashes. +2. **Add `ESCAPE '\\'` to the SQL** — tells SQLite which character to use for escaping. +3. **Dots are not wildcards** — they do not need escaping; normal IP addresses pass through unchanged. + +**Test example:** +```python +assert escape_like("10.0.0_") == "10.0.0\\_" +assert escape_like("10.0.0%test") == "10.0.0\\%test" +assert escape_like("10.0.0.1") == "10.0.0.1" # Unchanged +``` + +--- + +## 6.2 Database Migrations + +The application database schema is versioned and migrated automatically on startup via `app.db.init_db()`. + +### Migration Design Principles + +**Migrations must be atomic.** All schema changes for a single version (DDL statements) and the `schema_migrations` record insert must be wrapped in a single `BEGIN IMMEDIATE ... COMMIT` transaction. This prevents partial migrations if a process crashes mid-migration. + +If a crash occurs between migration steps, the next startup will: +1. Detect the missing `schema_migrations` record. +2. Re-apply the entire migration in a single transaction (all-or-nothing). +3. 
Avoid data corruption or schema inconsistency. + +### Writing a New Migration + +1. **Add the DDL statements** to `_MIGRATIONS` dict in `app/db.py`: + +```python +_MIGRATIONS: dict[int, str] = { + 1: _CREATE_INITIAL_SCHEMA, + 2: """ +-- Migration 2: Add new_column to users table. +ALTER TABLE users ADD COLUMN new_column TEXT DEFAULT 'default_value'; +CREATE INDEX idx_users_new_column ON users(new_column); +""", +} +``` + +2. **Update `_CURRENT_SCHEMA_VERSION`** to the new version number: + +```python +_CURRENT_SCHEMA_VERSION: int = 2 # was 1 +``` + +3. **Ensure idempotency where possible:** + - Use `CREATE TABLE IF NOT EXISTS` and `CREATE INDEX IF NOT EXISTS`. + - For `ALTER TABLE ADD COLUMN`, check if the column exists first using `PRAGMA table_info()` if re-applying the migration is a concern. + +4. **Verify atomicity in tests:** + +```python +async def test_migration_2_is_atomic(tmp_path: Path) -> None: + """Verify migration 2 rolls back on failure.""" + db = await open_db(str(tmp_path / "test.db")) + try: + await db.execute("CREATE TABLE schema_migrations (version INTEGER PRIMARY KEY);") + await db.commit() + + # Add a test migration that fails mid-way + original = db_module._MIGRATIONS.copy() + db_module._MIGRATIONS[99] = """ + CREATE TABLE test_table (id INTEGER PRIMARY KEY); + INSERT INTO nonexistent_table VALUES (1); + """ + + try: + with pytest.raises(Exception): + await _apply_migration(db, 99) + + # Verify rollback: migration NOT recorded + async with db.execute( + "SELECT version FROM schema_migrations WHERE version = 99;" + ) as cursor: + assert await cursor.fetchone() is None + + # Verify rollback: table NOT created + async with db.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='test_table';" + ) as cursor: + assert await cursor.fetchone() is None + finally: + db_module._MIGRATIONS = original + finally: + await db.close() +``` + +### Common Pitfalls + +- **Non-idempotent statements** — `ALTER TABLE ADD COLUMN` without `IF NOT 
EXISTS` will fail on re-run. SQLite has no `IF NOT EXISTS` clause for `ADD COLUMN`, so guard the statement with a `PRAGMA table_info()` check if re-running is possible. +- **Comments containing semicolons** — the migration parser strips comments correctly, but avoid unusual comment syntax. +- **String literals with semicolons** — the parser handles these; no special escaping needed. +- **Multiple operations in one migration** — keep migrations focused. Combine related DDL but split unrelated changes. + +--- + +## 6.3 Database Transactions + +Database transactions ensure atomicity for multi-step operations and prevent race conditions when concurrent requests interact with the database. BanGUI uses **SQLite with WAL (Write-Ahead Logging)** mode, which enables concurrent readers but serializes writers. + +### When to Use Explicit Transactions + +**Use `BEGIN IMMEDIATE ... COMMIT` for:** + +1. **Multi-step logical operations** — Operations that should succeed or fail as a unit. Example: + ```python + # Not needed here — a single statement plus commit is already atomic + await db.execute("INSERT INTO sessions ...") + await db.commit() + + # Needed once two or more statements must succeed or fail together — separate commits leave a race condition window (see "Transaction Pattern" below) + ``` + +2. **Operations that combine multiple queries with shared state** — When the operation outcome depends on reading and then writing based on that read: + ```python + # Bad — race condition: another request checks between our read and write + existing_run = await import_run_repo.get_by_source_and_hash(db, source_id, content_hash) + if existing_run is None: + run_id = await import_run_repo.create_pending(db, source_id, content_hash) + + # Good — atomic: both operations within same transaction boundary + try: + await db.execute("BEGIN IMMEDIATE") + cursor = await db.execute("INSERT INTO import_runs ...") + await db.commit() + except aiosqlite.IntegrityError: + # Another request won the race; fetch the existing record + existing = await import_run_repo.get_by_source_and_hash(...) + ... + ``` + +3.
**Bulk operations that should be all-or-nothing** — For example, upserting positive and negative geo cache entries: + ```python + try: + await db.execute("BEGIN IMMEDIATE") + await bulk_upsert_entries(db, positive_rows) + await bulk_upsert_neg_entries(db, negative_ips) + await db.commit() + except Exception: + await db.rollback() + raise + ``` + +**Do NOT use explicit transactions for:** + +- Single SQL statements — SQLite guarantees atomic writes for individual statements. No explicit transaction needed. +- Read-only queries — Queries do not modify data and do not need transaction boundaries. + +### Transaction Pattern + +Always use this pattern for wrapped operations: + +```python +try: + await db.execute("BEGIN IMMEDIATE") + # ... perform all operations ... + await db.commit() +except Exception: + await db.rollback() + raise +``` + +- **`BEGIN IMMEDIATE`** — Acquires a write lock immediately, preventing other writers from entering the transaction window. This is critical for crash-safety and consistency. +- **`COMMIT`** — Persists all changes. +- **`ROLLBACK`** — Rolls back on any exception, ensuring the database is left in a consistent state. + +### Handling Race Condition Errors + +When a `UNIQUE` constraint violation occurs due to a race condition (two concurrent requests attempt the same insert), the database raises `aiosqlite.IntegrityError`. **Handle this at the call site** by retrying the lookup: + +```python +try: + run_id = await import_run_repo.create_pending(db, source_id, content_hash) +except aiosqlite.IntegrityError: + # Another concurrent request created it first + existing = await import_run_repo.get_by_source_and_hash(db, source_id, content_hash) + if existing is None: + raise RuntimeError("Constraint error indicates row exists but lookup failed") + run_id = existing.id + log.info("lost_race", run_id=run_id) +``` + +This approach: +1. Lets the database constraint prevent data corruption. +2. 
Gracefully handles the concurrent case in application logic. +3. Avoids unnecessary locking overhead for the common case (no concurrent writers). + +--- + +## 7. Structured Logging Policy + +All logging in BanGUI services and tasks must use **structlog** for consistent, queryable event tracking. This policy defines when and how to log at each level. + +### 7.1 Logging Levels and When to Use Them + +**INFO** — User-facing operations and state changes +- Use for significant operational events that the operator should know about. +- Examples: service startup/completion, resource creation/deletion, state transitions, successful operations with business impact. +- Never excessive — keep volume reasonable to maintain log clarity. +- Include relevant context: resource IDs, counts, configuration changes. + +```python +log.info("jail_activated", jail=name) +log.info("blocklist_source_created", id=new_id, name=name, url=url) +log.info("session_cleanup_ran", deleted_count=count, cutoff_time=now_iso) +``` + +**WARNING** — Recoverable failures, degraded functionality, unexpected but handled conditions +- Use for issues that are not fatal but indicate something is wrong or suboptimal. +- Examples: missing optional config, fallback behavior triggered, non-critical API call failed, parsing errors, missing resources that have workarounds. +- Include error details and context to enable investigation. +- Do NOT use for expected error paths (e.g., wrong password in login attempt). + +```python +log.warning("jail_status_parse_error", jail=name, error=str(exc)) +log.warning("geo_lookup_failed", ip=ip, error=type(exc).__name__) +log.warning("geoip_mmdb_not_found", path=mmdb_path) +``` + +**ERROR** — Fatal/unrecoverable failures within a request or task +- Use for errors that prevent an operation from completing. +- Examples: database write failed, critical resource is missing, state is corrupted. +- Include the full error context to enable debugging. 
+- Pair with exception handling — if you log an error, you've decided the caller should handle it or return a failure. + +```python +log.error("jail_activation_rollback_restore_failed", jail=name, error=str(exc)) +log.error("fail2ban_probe_parse_error", error=str(exc)) +``` + +**EXCEPTION** — Unhandled exceptions in background tasks and scheduled jobs +- Use `log.exception()` (not `log.error()`) in catch-all exception handlers to automatically capture the full traceback. +- `log.exception()` is not a separate level — it emits an ERROR-level event with the traceback attached. Reserve it ONLY for surprises that should never happen in production. +- Use in task callback exception handlers and top-level task runners. + +```python +try: + result = await blocklist_service.import_all(...) +except Exception: + log.exception("blocklist_import_unexpected_error") +``` + +**DEBUG** — Low-level details for development and troubleshooting +- Use for details too verbose for normal operation (e.g., successful lookups, parsed files, fine-grained internal state changes, loop iterations). +- Include data samples and validation results. +- Safe to leave in the code — debug logs are not emitted by default in production. + +```python +log.debug("geo_lookup_success_mmdb", ip=ip, country=result.country_code) +log.debug("action_file_parsed", name=raw.name) +log.debug("backend_cmd_supported_detected") +``` + +### 7.2 Event Naming Convention + +All log event names must follow a consistent pattern for queryability: + +**Pattern:** `{domain}_{entity_or_action}[_{result_or_detail}]` + +- **Domain** — The service or feature area (e.g., `jail`, `blocklist`, `geo`, `auth`, `ban`). +- **Entity or Action** — The noun or verb describing what happened (e.g., `activated`, `created`, `failed`, `synced`). +- **Result or Detail** (optional) — Additional specificity for complex scenarios (e.g., `_restore_failed`, `_non_200`, `_no_fallback`).
+ +**Examples (organized by domain):** + +| Domain | INFO | WARNING | ERROR | DEBUG | +|--------|------|---------|-------|-------| +| **jail** | `jail_activated`, `jail_deactivated`, `jail_reloaded` | `jail_status_parse_error`, `jail_rollback_failed` | `jail_activation_rollback_restore_failed` | `jail_file_parsed`, `backend_cmd_supported_detected` | +| **geo** | `geo_cache_loaded_from_db`, `geo_flush_dirty_complete` | `geo_lookup_failed`, `geo_persist_failed` | - | `geo_lookup_success_mmdb`, `geo_cache_cleanup_ran` | +| **blocklist** | `blocklist_import_starting`, `blocklist_source_created` | `blocklist_schedule_invalid`, `blocklist_preview_failed` | - | `blocklist_ban_failed` | +| **ban** | `active_bans_fetched`, `all_ips_unbanned` | `ban_service_geo_lookup_failed` | `ban_service_geo_lookup_unexpected_error` | `ban_entry_parse_error` | +| **auth** | `bangui_login_success`, `bangui_logout` | `bangui_login_wrong_password`, `bangui_login_no_hash` | - | - | +| **config** | `filter_created`, `action_updated` | `filter_read_error`, `action_d_not_found` | - | `filter_file_parsed` | +| **setup** | `bangui_setup_started`, `bangui_setup_completed` | - | `bangui_setup_failed` | - | + +**Key rules:** +- Use **snake_case** (never camelCase or PascalCase). +- Keep event names short but descriptive — aim for 2–4 words. +- Use consistent terminology across the codebase (e.g., always `_created`, never `_added` or `_new`). +- Prefix task/background job events with the job name (e.g., `blocklist_import_starting`, `session_cleanup_ran`). + +### 7.3 Structured Context and Key-Value Pairs + +Always log with **structured context** — key-value pairs that make logs queryable and analyzable. 
+ +**Essential patterns:** + +```python +# Operation with count +log.info("active_bans_fetched", total=len(bans)) + +# Resource manipulation with ID +log.info("blocklist_source_deleted", id=source_id) + +# State transition with reason or error +log.warning("geo_persist_failed", ip=ip, error=type(exc).__name__) + +# Time-bounded operation +log.info("session_cleanup_ran", deleted_count=count, cutoff_time=now_iso) + +# Config or feature change +log.info("blocklist_schedule_updated", frequency=config.frequency, hour=config.hour) + +# Batch operation with metrics +log.info("blocklist_import_finished", total_imported=result.total_imported, + total_skipped=result.total_skipped, errors=result.errors_count) +``` + +**What to include:** +- Resource IDs (jail names, IP addresses, source IDs). +- Counts and metrics (rows processed, items synced, errors). +- Configuration or decision points (enabled/disabled flags, thresholds). +- Timestamps and durations for long-running operations (optional but useful). +- Error types and short error messages (use `type(exc).__name__` or `str(exc)` depending on context). + +**What to NEVER log:** +- Sensitive data: passwords, tokens, session token values, API keys, password hashes. + - For session correlation without leaking token material, use a one-way hash fragment: `hashlib.sha256(token.encode()).hexdigest()[:12]`. + - Use numeric database IDs for entity correlation instead of raw identifiers: `session_id=session.id` instead of `token=session.token`. +- Full exception tracebacks in INFO/WARNING (use `log.exception()` only in catch-all handlers). +- Redundant system time (structlog adds `timestamp` automatically). +- User PII in most cases (name, email, phone, etc.) — unless directly relevant to debugging and anonymized. + +### 7.4 Background Tasks and Scheduled Jobs + +Every background task (APScheduler job) must follow this pattern: + +1. **On startup:** Log `{job_name}_scheduled` with interval or cron expression. +2.
**On execution:** Log `{job_name}_starting` (INFO) and `{job_name}_finished` or `{job_name}_ran` (INFO) with metrics. +3. **On exception:** Use `log.exception("{job_name}_unexpected_error")` in the top-level try/except. + +```python +# app/tasks/blocklist_import.py +async def _run_import_with_resources(settings: Settings, http_session: ClientSession) -> None: + """APScheduler callback that imports all enabled blocklist sources.""" + log.info("blocklist_import_starting") + try: + result = await blocklist_service.import_all(...) + log.info("blocklist_import_finished", + total_imported=result.total_imported, + total_skipped=result.total_skipped, + errors=result.errors_count) + except Exception: + log.exception("blocklist_import_unexpected_error") + +# Register and log in the lifespan or register function: +log.info("blocklist_import_scheduled", interval_seconds=INTERVAL_SECONDS) +``` + +### 7.5 Service Functions and Methods + +Service functions should log at entry/exit for significant operations, or when errors occur: + +**Entry logging** (optional, use for complex or long-running operations): +```python +async def import_all(...) -> ImportResult: + log.debug("blocklist_import_starting", count=len(sources)) + try: + ... 
+ except SomeError as exc: + log.warning("blocklist_import_partial_failure", imported=count, error=str(exc)) +``` + +**Exit/success logging** (log results with metrics): +```python +async def get_all_jails(socket_path: str) -> list[JailResponse]: + jails = await _fetch_jails(socket_path) + log.info("jail_list_fetched", count=len(jails)) + return jails +``` + +**Error handling** (log with context, let caller decide how to respond): +```python +try: + result = await fetch_external_data(url) +except TimeoutError: + log.warning("external_fetch_timeout", url=url, timeout_seconds=TIMEOUT) + raise +except Exception as exc: + log.error("external_fetch_failed", url=url, error=type(exc).__name__) + raise +``` + +### 7.6 Domain Model Pattern for Services + +Services **return domain models** (frozen dataclasses), not Pydantic response models. Conversion to response models happens at the **router boundary**. + +**Example (correct):** + +```python +# app/services/jail_service.py — returns domain model +async def get_jail(socket_path: str, name: str) -> DomainJailDetail: + ... + return DomainJailDetail(name=name, ...) + +# app/routers/jails.py — converts at boundary +@router.get("/{name}") +async def get_jail(...) -> JailDetailResponse: + domain = await jail_service.get_jail(socket_path, name) + return jail_mappers.map_domain_jail_detail_to_response(domain) +``` + +**When adding a new service:** +1. Define domain model in `app/models/{domain}_domain.py` (frozen dataclass) +2. Add mapper in `app/mappers/{domain}_mappers.py`: `map_domain_X_to_response(domain: DomainX) -> XResponse` +3. Service returns domain model type +4. Router calls mapper before returning + +**Reference:** `ban_service.py` + `ban_mappers.py` is canonical example. See `Docs/DOMAIN_MODELS.md`. + +--- + +### 7.7 Third-Party Library Log Levels + +Application code must use **structlog** for all logging.
Third-party libraries that emit logs through Python's standard `logging` module are configured centrally in `backend/app/main.py::_configure_logging()`. + +**Current overrides:** + +| Library | Logger | Level | Reason | +|---------|--------|-------|--------| +| APScheduler | `apscheduler` | `WARNING` | Routine scheduler polling is too verbose at DEBUG. | +| aiosqlite | `aiosqlite` | `WARNING` | Database operation traces clutter logs. | + +**Adding a new override:** + +```python +# In backend/app/main.py, inside _configure_logging() +logging.getLogger("new_library").setLevel(logging.WARNING) +``` + +- Prefer `WARNING` over `ERROR` so legitimate warnings (e.g., connection retries) are still visible. +- Place the override immediately after `logging.basicConfig()` so it takes effect before any library initializes its own loggers. + +**Disabling suppression:** + +Set `BANGUI_SUPPRESS_THIRD_PARTY_LOGS=false` to allow APScheduler and aiosqlite to emit their normal DEBUG/INFO logs. This is useful when troubleshooting scheduler or database issues in development. + +**Stdlib interception:** + +All stdlib logs are intercepted by `structlog.stdlib.ProcessorFormatter` and rendered as JSON. Even third-party library logs therefore appear as structured JSON in `bangui.log`, not plain text. + +--- + ## 8. Error Handling - Define **custom exception classes** for domain errors (e.g., `JailNotFoundError`, `BanFailedError`). @@ -238,16 +1703,222 @@ async def ban_ip(ip: str, jail: str) -> None: - Map domain exceptions to HTTP status codes via FastAPI **exception handlers** registered on the app. - Always log errors with context before raising. 
-```python -class JailNotFoundError(Exception): - def __init__(self, name: str) -> None: - self.name: str = name - super().__init__(f"Jail '{name}' not found") +### Service Error Contracts -# In main.py -@app.exception_handler(JailNotFoundError) -async def jail_not_found_handler(request: Request, exc: JailNotFoundError) -> JSONResponse: - return JSONResponse(status_code=404, content={"detail": f"Jail '{exc.name}' not found"}) +Each service method must document which error handling pattern it follows. This +lets callers know what to expect without reading the implementation. See +`Docs/Service-Development.md` for the full guide. + +**ABORT_ON_ERROR** — Raise an exception, let the router handle it. Used for: +auth, writes, state changes, any operation where partial success is meaningless. + +**RETURN_DEFAULT** — Return empty result and log warning. Never raises. Used for: +informational reads where infrastructure unavailability should not block the UI. + +**PARTIAL_RESULT** — Return a result that contains both successful items and a +list of errors. Caller decides what to do with each. + +```python +async def get_settings(socket_path: str) -> DomainServerSettingsResult: + """Return current fail2ban server-level settings. + + Error contract: RETURN_DEFAULT. Returns DomainServerSettingsResult with + default values if socket is unreachable. Never raises. + """ + ... + +async def start_jail(socket_path: str, name: str) -> None: + """Start a stopped fail2ban jail. + + Error contract: ABORT_ON_ERROR. Raises JailNotFoundError (404), + JailOperationError (409), Fail2BanConnectionError (503). + """ + ... +``` + +```python +class ServiceErrorContract: + """ + ABORT_ON_ERROR: Raise exception, let router handle + RETURN_DEFAULT: Return empty result, log warning + PARTIAL_RESULT: Return partial success with error list + """ +``` + +The error contract enum and helper are in `app.services.error_handling`. 
+ +### Routers and Exception Propagation + +- **Routers must NOT construct `HTTPException` for domain errors** — let domain exceptions propagate. +- Routers should never have helper functions like `_bad_gateway()`, `_not_found()`, `_conflict()` etc. that convert domain exceptions to `HTTPException`. +- All domain exception types must have corresponding handlers registered in `main.py` via `app.add_exception_handler()`. +- Exception handlers are registered in order from most specific to least specific. This ordering is a readability convention only: FastAPI (via Starlette) selects the handler by walking the raised exception's class hierarchy (MRO), so the most specific registered handler wins regardless of registration order. + +```python +# ❌ BAD — routers constructing HTTPException for domain exceptions +@router.get("/{name}") +async def get_jail(name: str, socket_path: Fail2BanSocketDep) -> JailDetailResponse: + try: + return await jail_service.get_jail(socket_path, name) + except JailNotFoundError: + raise HTTPException(status_code=404, detail=f"Jail not found: {name!r}") from None + +# ✅ GOOD — domain exception propagates to global handler +@router.get("/{name}") +async def get_jail(name: str, socket_path: Fail2BanSocketDep) -> JailDetailResponse: + return await jail_service.get_jail(socket_path, name) +``` + +All domain exceptions raised by services propagate to handlers in `main.py`, ensuring: +1. Consistent error response format across the entire API. +2. No duplicated exception-to-HTTP-status mapping logic. +3. Easy to audit all error codes — they are all in one place. + +### Error Message Hygiene + +HTTP responses must never leak sensitive internal details that aid attackers or expose infrastructure: + +- **Never include system paths** in HTTP error messages (e.g., `/var/run/fail2ban/fail2ban.sock`, `/etc/fail2ban/`). +- **Never include raw exception messages** that expose internal parsing or implementation logic. +- **Log full details server-side only** — exception handlers must log `error=str(exc)` with full exception context, but return generic user-friendly messages in the HTTP response.
+ +```python +# ❌ BAD — leaks socket path and internal details to the client +async def _fail2ban_connection_handler(request: Request, exc: Fail2BanConnectionError) -> JSONResponse: + return JSONResponse( + status_code=503, + content={"detail": f"Cannot reach fail2ban: {exc}"}, # exc includes socket path! + ) + +# ✅ GOOD — generic message in response, full details in server logs +async def _fail2ban_connection_handler(request: Request, exc: Fail2BanConnectionError) -> JSONResponse: + log.warning( + "fail2ban_connection_error", + path=request.url.path, + method=request.method, + error=str(exc), # Full details logged server-side + ) + return JSONResponse( + status_code=503, + content={"detail": "Cannot reach the fail2ban service. Check the server status page."}, + ) +``` + +### Exception Taxonomy and HTTP Mapping + +BanGUI uses a **standardized exception taxonomy** that maps domain errors to HTTP status codes consistently across all services. This allows routers to handle exceptions by category rather than by individual type, reducing code duplication and ensuring consistent client-facing error responses.
+ +#### Exception Categories + +All domain exceptions inherit from one of five base categories defined in `app.exceptions`: + +| Base Exception | HTTP Status | Meaning | Example | +|---|---|---|---| +| `NotFoundError` | 404 | Requested domain entity not found | `JailNotFoundError`, `FilterNotFoundError` | +| `BadRequestError` | 400 | Invalid input, validation failure, or invalid identifier | `ConfigValidationError`, `JailNameError` | +| `ConflictError` | 409 | State conflict or resource constraint violation | `JailAlreadyActiveError`, `FilterAlreadyExistsError` | +| `OperationError` | 500 | Domain operation failure (write, update, delete) | `ConfigWriteError`, `ConfigFileWriteError` | +| `ServiceUnavailableError` | 503 | Infrastructure or external service unreachable | `Fail2BanConnectionError`, `ConfigDirError` | + +#### Service Exception Mapping + +Every service-specific exception inherits from exactly one category. This allows `main.py` to register just **5 exception handlers** instead of 25+: + +```python +# In app/exceptions.py — define each exception once with its category +class JailNotFoundError(NotFoundError): + def __init__(self, name: str) -> None: + self.name = name + super().__init__(f"Jail not found: {name!r}") + +class JailAlreadyActiveError(ConflictError): + def __init__(self, name: str) -> None: + self.name = name + super().__init__(f"Jail is already active: {name!r}") + +# In app/main.py — register category handlers +app.add_exception_handler(NotFoundError, _not_found_handler) +app.add_exception_handler(BadRequestError, _bad_request_handler) +app.add_exception_handler(ConflictError, _conflict_handler) +app.add_exception_handler(OperationError, _domain_error_handler) +app.add_exception_handler(ServiceUnavailableError, _service_unavailable_handler) +``` + +#### Service Exception Reference + +When writing a new service, determine which category each exception belongs to: + +- **Not found**: Always `NotFoundError` (e.g., jail, filter, action, config
file not found) +- **Invalid input**: Always `BadRequestError` (e.g., validation errors, invalid names, regex compile failure) +- **State conflicts**: Always `ConflictError` (e.g., already exists, already active, readonly resource) +- **Operation failures**: Always `OperationError` (e.g., write failed, update failed, command failed) +- **Infrastructure**: Always `ServiceUnavailableError` (e.g., config dir missing, socket unreachable, fail2ban protocol error) + +#### Client Expectations + +Clients should expect the following HTTP status codes and response format for all domain errors: + +```json +HTTP 400 Bad Request +{ + "detail": "Jail name contains invalid characters" +} + +HTTP 404 Not Found +{ + "detail": "Jail not found: 'sshd'" +} + +HTTP 409 Conflict +{ + "detail": "Jail is already active: 'sshd'" +} + +HTTP 500 Internal Server Error +{ + "detail": "Failed to write configuration: permission denied" +} + +HTTP 503 Service Unavailable +{ + "detail": "Cannot reach the fail2ban service. Check the server status page." +} +``` + +The `detail` field always contains the exception's message (from `str(exc)`). Sensitive details (socket paths, file paths, internal error messages) are never included — they are logged server-side only. + + + +- **Network I/O**: `TimeoutError`, `aiohttp.ClientError`, `asyncio.TimeoutError` +- **File I/O**: `OSError` (includes `IOError`, `FileNotFoundError`, `PermissionError`) +- **JSON parsing**: `json.JSONDecodeError`, `ValueError` +- **Database errors**: `aiosqlite.Error` and derivatives (caught as `OSError`) +- **Third-party libraries**: Specific exception classes (e.g., `geoip2.errors.GeoIP2Error`) + +**When catching service-critical exceptions**: +1. Catch the specific exception types for the operation. +2. Log with the exception type and relevant context. +3. Return a safe fallback (empty dict, None, etc.) or re-raise if the service cannot function. 
+ +**When truly unavoidable broad catches are needed** (e.g., retrying transient network failures): +1. Place specific catches first. +2. Add one final `except Exception` **after** specific cases, with `error_type="unexpected"` logged to flag surprises. +3. Document why broad catching is necessary (e.g., "tests use mock objects that may raise arbitrary exceptions"). + +**Example:** +```python +async def lookup_batch(ips: list[str], http_session: aiohttp.ClientSession) -> dict[str, GeoInfo]: + """Resolve multiple IPs, returning empty map on failure.""" + try: + result = await http_session.post(url, json=payload, timeout=timeout) + except (TimeoutError, aiohttp.ClientError) as exc: + # Expected network failures — log and return empty result + log.warning("geo_batch_http_failed", error=type(exc).__name__) + return {} + except Exception as exc: + # Unexpected — log as error for investigation + log.error("geo_batch_unexpected_error", error=type(exc).__name__) + return {} ``` --- @@ -276,15 +1947,237 @@ async def client() -> AsyncClient: @pytest.mark.asyncio async def test_list_jails_returns_200(client: AsyncClient) -> None: - response = await client.get("/api/jails/") + response = await client.get("/api/v1/jails/") assert response.status_code == 200 data: dict = response.json() assert "jails" in data ``` +See [API_VERSIONING.md](API_VERSIONING.md) for the full versioning strategy, deprecation policy, and instructions for adding versioned endpoints. + +--- + +## 9.1 Background Tasks and Scheduler Architecture + +BanGUI uses **APScheduler 4.x** (async mode) to manage background jobs that execute on a schedule without user interaction. This section documents how to write and register background tasks. + +### Task Location and Structure + +All background tasks live in `backend/app/tasks/` as separate modules. Each task: +- Exports a `register(app: FastAPI) -> None` or `async def register(app: FastAPI) -> None` function. 
+- Opens its own database connection using `app.db.open_db()` or the `task_db()` helper. +- Closes connections when work completes (use the async context manager pattern). +- Runs independently of the FastAPI request/response cycle. + +### Example Task + +```python +# backend/app/tasks/my_task.py +import structlog +from fastapi import FastAPI +from apscheduler.schedulers.asyncio import AsyncIOScheduler + +log = structlog.get_logger() + +async def my_background_job(app: FastAPI) -> None: + """Do important work on a schedule.""" + log.info("my_background_job_started") + try: + db = await app.db.open_db(app.state.settings.database_path) + try: + # Do work... + pass + finally: + await db.close() + except Exception: + log.error("my_background_job_failed", exc_info=True) + +def register(app: FastAPI) -> None: + """Register the job with the scheduler.""" + scheduler: AsyncIOScheduler = app.state.scheduler + scheduler.add_job( + my_background_job, + args=(app,), + trigger="interval", + seconds=60, + id="my_task", + name="My Background Job", + ) +``` + +### Accessing Shared Resources in Tasks + +Since tasks do not have access to `Depends(get_db)` (no request scope), they must: +1. **Open their own DB connection** via `app.state.db_factory.open_db(path)`. +2. **Access app-level state** — `app.state.http_session`, `app.state.geo_cache`, `app.state.settings`, etc. +3. **Use structlog** for all logging (never `print()`). + +### Single-Worker Requirement + +**The scheduler is bound to a single asyncio event loop and cannot be shared across multiple worker processes.** BanGUI enforces single-worker mode to prevent duplicate task execution. + +- **Deployment constraint:** Set `BANGUI_WORKERS=1` (default). +- **Startup validation:** `startup_shared_resources()` raises `RuntimeError` if `BANGUI_WORKERS > 1`. +- See [Architekture.md § 9.2](Architekture.md) for full details. 
+ +### Timeout Protection for Background Tasks + +**All background tasks must wrap their async work with timeout protection.** If a task hangs (API unreachable, network partition, database lock), it runs forever — never completes → lock never released → duplicate work starts → resource exhaustion. Timeouts prevent this. + +**Rule:** Every task function must use `run_with_timeout()` from `app.tasks.timeout_utils` to enforce a timeout on its async work. + +```python +from app.tasks.timeout_utils import run_with_timeout + +async def _run_import_with_resources(settings: Settings, http_session: ClientSession) -> None: + """Imports blocklists with timeout protection.""" + async def _do_import() -> None: + # ... your async work ... + result = await blocklist_service.import_all(...) + log.info("import_finished", total=result.total_imported) + + # Wrap with timeout: abort after 300 seconds + await run_with_timeout("blocklist_import", _do_import(), timeout_seconds=300) +``` + +**Why this pattern:** +1. `run_with_timeout()` enforces strict time limits using `asyncio.wait_for()`. +2. If timeout is exceeded, `TimeoutError` is raised and logged with elapsed time. +3. If task approaches timeout (>80% of time budget), a warning is logged for observability. +4. Failures are logged at `warning` level (not `error`) — timeouts are expected sometimes, but worth investigating. + +**Timeout Values by Task:** + +| Task | Timeout | Rationale | +|------|---------|-----------| +| `blocklist_import` | 300s (5 min) | Downloads, validates, applies external lists. Network delays expected. | +| `health_check` | 10s | Socket probe to fail2ban. Should complete quickly or fail2ban is unresponsive. | +| `geo_cache_flush` | 60s | Writes dirty cache entries to DB. Handles contention gracefully. | +| `session_cleanup` | 30s | Deletes expired sessions. DB contention unlikely but possible. | +| `rate_limiter_cleanup` | 5s | In-memory cleanup, no I/O. Should always be instant. 
| +| `geo_cache_cleanup` | 60s | Deletes stale geo entries from DB. May scan large table. | +| `geo_re_resolve` | 120s | Retries failed IP lookups with backoff. API rate-limit delays expected. | +| `history_sync` | 60s | Syncs records from fail2ban DB to archive. May read/write many rows. | +| `scheduler_lock_heartbeat` | 5s | Updates lock timestamp. Must be quick or lock is lost. | + +**Timeout Events Are Logged:** + +On timeout: +``` +task_timeout task_name=blocklist_import timeout_seconds=300 elapsed_seconds=300.45 +``` + +On approaching timeout (>80% of budget used): +``` +task_approaching_timeout task_name=blocklist_import timeout_seconds=300 elapsed_seconds=298.5 usage_percent=99.5 +``` + +The logs include `elapsed_seconds` for observability — if you see tasks consistently near timeout, the value may need adjustment. + +**Testing Timeout Behavior:** + +Tests for timeout scenarios are in `backend/tests/test_tasks/test_timeout_utils.py`: +- Verify timeout is raised and logged. +- Verify approaching-timeout warning is logged. +- Verify task exceptions (not timeout) propagate correctly. + +Add timeout tests to your task test file: +```python +@pytest.mark.asyncio +async def test_task_timeout_is_logged(self) -> None: + """Task must be logged and raise TimeoutError on timeout.""" + with patch("app.tasks.my_task.log") as mock_log: + with pytest.raises(TimeoutError): + await my_task._run_with_resources(settings) # exceeds timeout + + timeout_calls = [ + c for c in mock_log.warning.call_args_list + if c[0][0] == "task_timeout" + ] + assert len(timeout_calls) == 1 +``` + +--- + +### Task Idempotency + +**Background tasks must be idempotent** — retrying after a crash must produce the same result as running once. + +If a task crashes or times out mid-execution, the scheduler may retry. 
Without idempotency, retries cause duplicate work: +- **blocklist_import**: banned IPs appear twice → database corruption +- **geo_cache_flush**: entries written twice → cache inconsistency +- Any multi-step operation: partial state remains + +**Pattern: Content-Hash Idempotency for Blocklist Imports** + +Track imports by source + content hash to detect retries: + +```python +from app.repositories import import_run_repo + +async def import_source(source, db, ...): + # Download content + status, content = await downloader.download(url) + + # Compute hash for idempotency detection + content_hash = hashlib.sha256(content.encode()).hexdigest() + + # Check if this exact import already completed + existing_run = await import_run_repo.get_by_source_and_hash( + db, source.id, content_hash + ) + + if existing_run and existing_run.status == "completed": + # Already done — skip banning, optionally re-warm cache + log.info("blocklist_import_already_completed", ...) + return ImportSourceResult(ips_imported=existing_run.imported_count, ...) + + # First run: create pending record + if not existing_run: + run_id = await import_run_repo.create_pending( + db, source.id, content_hash + ) + else: + run_id = existing_run.id # Retry case + + # Do work (ban IPs, etc.) + imported, errors = await ban_executor.ban_ips(...) + + # Mark as completed or failed (atomically) + if errors: + await import_run_repo.mark_failed(db, run_id, str(errors)) + else: + await import_run_repo.mark_completed(db, run_id, imported, skipped) +``` + +**Key points:** + +1. **Operation ID must be deterministic** — Use content hash, not timestamp + - Same content = same operation ID → retry safe + - Different content = different operation ID → new import run + +2. **Check before doing work** — Query `import_runs` table before banning + - If completed: skip banning (already done) + - If pending: retry was interrupted, try again + - If failed: retry to recover + +3. 
**Atomic state updates** — Mark as completed AFTER all work succeeds + - All-or-nothing: either import succeeded + logged, or failed + retryable + +4. **Test idempotency** — Verify retrying same content doesn't duplicate bans + ```python + # First import: ban 2 IPs + result1 = await import_source(source, content, db) + assert result1.ips_imported == 2 + + # Second import (same content): skip bans + result2 = await import_source(source, content, db) + assert result2.ips_imported == 2 + assert ban_ip.call_count == 2 # One call per IP from the first import only; the retry adds none + ``` + --- -## 10. Code Style & Tooling | Tool | Purpose | |---|---| @@ -300,7 +2193,67 @@ async def test_list_jails_returns_200(client: AsyncClient) -> None: --- -## 11. Configuration & Secrets +## 11. fail2ban Response Utilities + +All services that interact with the fail2ban daemon must use the canonical response parsing utilities from `app.utils.fail2ban_response`. This ensures consistent error handling, type safety, and makes it easy to fix bugs in response handling across the entire codebase. + +### Available Functions + +**`ok(response: object) -> object`** +Extracts the payload from a fail2ban ``(return_code, data)`` response tuple. +- Raises `ValueError` if return code ≠ 0 or response shape is invalid. +- Use this on every response from `Fail2BanClient.send()`. + +**`to_dict(pairs: object) -> dict[str, object]`** +Converts a list of ``(key, value)`` pairs (fail2ban's native response format) to a Python dict. +- Silently ignores malformed entries and non-list/tuple inputs. +- Always returns a dict (empty if input is invalid). + +**`ensure_list(value: object | None) -> list[str]`** +Coerces fail2ban response values (which may be `None`, a single string, or a list) to a normalized list of strings. +- Handles all three cases consistently. +- Returns empty list for `None` or empty strings. + +**`is_not_found_error(exc: Exception) -> bool`** +Checks if an exception indicates a jail does not exist.
+- Checks for multiple error message patterns (case-insensitive). +- Use this to distinguish "jail not found" errors from other failures. + +### Example Usage + +```python +from app.utils.fail2ban_response import ok, to_dict, ensure_list, is_not_found_error +from app.utils.fail2ban_client import Fail2BanClient + +client = Fail2BanClient(socket_path="/var/run/fail2ban/fail2ban.sock") + +try: + # Get jail status + response = await client.send(["status", "sshd", "short"]) + status_dict = to_dict(ok(response)) # Extract payload and convert to dict + + # Get list of banned IPs + ban_response = await client.send(["get", "sshd", "banip"]) + banned_ips = ensure_list(ok(ban_response)) # Normalize to list of strings + +except ValueError as exc: + if is_not_found_error(exc): + raise JailNotFoundError("sshd") from exc + raise +``` + +### Why This Matters + +Before this utility module, every service implemented its own copy of these functions, leading to: +- Code duplication across 7+ service files. +- Subtle inconsistencies in error handling. +- Difficult maintenance — every bug fix required touching multiple files. + +Now, all services import from a single authoritative source, making response handling consistent, maintainable, and type-safe. + +--- + +## 12. Configuration & Secrets - All configuration lives in **environment variables** loaded through **pydantic-settings**. - Secrets (master password hash, session key) are **never** committed to the repository. @@ -320,23 +2273,748 @@ class Settings(BaseSettings): model_config = {"env_prefix": "BANGUI_", "env_file": ".env"} ``` +### Session Secret Configuration + +The `session_secret` is the HMAC key used to sign all session tokens. It must be at least 32 characters (256 bits) to provide sufficient cryptographic strength for HMAC-SHA256. + +**Minimum Length:** 32 characters + +**Why 32 characters?** Session tokens are signed using HMAC-SHA256. 
A secret shorter than 32 bytes (256 bits) significantly weakens the signature, potentially allowing attackers to forge valid tokens. The constraint is enforced at startup — the application will fail to start if `session_secret` is shorter than 32 characters. + +**Generation:** Generate a secure secret using Python: + +```bash +python -c "import secrets; print(secrets.token_hex(32))" +``` + +This produces a 64-character hexadecimal string (256 bits) suitable for production use. + +**Environment Variable:** + +```bash +BANGUI_SESSION_SECRET="your-32-character-minimum-secret-here" +``` + +**Never** commit the actual secret to the repository. Provide a `.env.example` with a placeholder: + +```bash +# .env.example +BANGUI_SESSION_SECRET="set-this-to-a-32-character-minimum-secret" +``` + +### Session Secret Rotation + +**Problem:** If a session secret leaks, all active sessions become compromised and an attacker can forge new tokens. Rotating the secret invalidates forged tokens but may require all users to log out if rotation is done all at once. + +**Solution:** BanGUI supports gradual secret rotation without forcing logout. During rotation: + +1. All new tokens are signed with the current secret +2. Old tokens signed with the previous secret are still accepted +3. Tokens using the previous secret are transparently validated and logged +4. Once all old tokens expire naturally, disable the rotation by unsetting the previous secret + +**Rotation Strategy (Step-by-Step):** + +#### 1. Generate a New Secret + +Before rotation, generate a fresh secret: + +```bash +python -c "import secrets; print(secrets.token_hex(32))" +``` + +#### 2. 
Start Rotation (Without Stopping the Service) + +Update your configuration **simultaneously** on all deployment replicas: + +```bash +# .env (or ConfigMap in Kubernetes) +BANGUI_SESSION_SECRET="<new-secret>" # Current (new) secret +BANGUI_SESSION_SECRET_PREVIOUS="<old-secret>" # Previous (old) secret +``` + +Or in Kubernetes: + +```yaml +env: + - name: BANGUI_SESSION_SECRET + valueFrom: + secretKeyRef: + name: bangui-secrets + key: current-secret + - name: BANGUI_SESSION_SECRET_PREVIOUS + valueFrom: + secretKeyRef: + name: bangui-secrets + key: previous-secret +``` + +**Key Point:** All replicas must know both secrets to accept old tokens. + +#### 3. Monitor Token Rotation + +Tokens signed with the previous secret are automatically validated and logged: + +``` +event=session_token_rotated_in_place session_id=42 old_secret_fragment=abc123 new_secret_fragment=def456 +``` + +These logs let you track how many sessions are still using old tokens. + +#### 4. Wait for Old Tokens to Expire + +Monitor the application logs and wait until: +- No new `session_token_rotated_in_place` events appear (all old tokens have been used or expired) +- Session duration (default: 480 minutes) + grace period has elapsed since the previous secret was enabled +- Example: If sessions last 480 minutes, wait at least 8 hours from enabling the previous secret + +#### 5. Complete Rotation + +Once all old tokens have expired, remove the previous secret: + +```bash +# .env +BANGUI_SESSION_SECRET="<new-secret>" +# BANGUI_SESSION_SECRET_PREVIOUS is now unset or empty +``` + +**Important:** Keep the previous secret configured for at least `session_duration_minutes` (default 480 minutes / 8 hours) to avoid rejecting tokens that are still valid.
+ +**Metrics & Logging:** + +The auth service logs rotation events for observability: + +- `session_token_rotated_in_place` — Logged when a token signed with the previous secret is validated during the rotation window +- `session_token_re_signed_after_rotation` — Logged in `unwrap_session_token_with_rotation()` when the previous secret validates a token +- `old_secret_fragment` / `new_secret_fragment` — First 6 characters of the SHA256 hash of each secret (for non-sensitive correlation without logging actual secrets) +- `session_id` — Database ID of the rotated session + +**Example Rotation Sequence:** + +``` +Time Config Event Logged +──────────────────────────────────────────────────────────────── +T=0 current=old (normal operation) + previous= + +T=5m current=new session_token_rotated_in_place + previous=old (user session S1 validated with old secret) + +T=30m current=new (no more old tokens, all new tokens use current) + previous=old + → Still keep old secret set + +T=500m Enough time passed (old session S1 has expired) + (480 min session + grace) + +T=510m current=new (rotation complete) + previous= +``` + +**Avoiding Common Mistakes:** + +❌ **Don't:** Rotate the secret and immediately unset the previous one → Old tokens will be rejected, forcing logout +✓ **Do:** Keep the previous secret for at least `session_duration_minutes` + +❌ **Don't:** Rotate without updating all replicas → Some replicas reject old tokens, others accept them → Inconsistent behavior +✓ **Do:** Deploy config to all replicas simultaneously (via ConfigMap, Helm, or orchestrator) + +❌ **Don't:** Use the same secret for development and production → Leaked dev secret can compromise prod +✓ **Do:** Generate unique secrets per environment + +### Session Cookie Security + +The `session_cookie_secure` configuration controls the `Secure` flag on the session cookie. This flag prevents browsers from sending the session cookie over unencrypted HTTP. 
+ +**Default:** `true` — Production deployments are secure by default. Cookies are only sent over HTTPS. + +**Local Development:** Set `BANGUI_SESSION_COOKIE_SECURE=false` in your compose file or `.env` to allow cookies over HTTP (required for `localhost:8000`). + +```yaml +# Docker/compose.debug.yml +environment: + BANGUI_SESSION_COOKIE_SECURE: "false" # Allow HTTP during local development +``` + +**Important:** If `Secure=true` is set, browsers will reject the session cookie when the backend is served over HTTP. Ensure your nginx/reverse proxy terminates TLS and passes `X-Forwarded-Proto: https` so FastAPI knows the connection is secure. + +### CSRF Protection Middleware + +State-mutating endpoints (POST, PUT, DELETE, PATCH) authenticated via session cookies are protected by the `CsrfMiddleware`, which enforces a custom header check. + +**How It Works:** + +1. For every request using a mutating HTTP method, the middleware checks: + - Is this request authenticated via session cookie (not Bearer token)? + - If yes, require the custom header `X-BanGUI-Request: 1`. + - If missing or incorrect, return `403 Forbidden`. + +2. **Bearer token requests** (via `Authorization: Bearer` header) bypass the check because tokens are not CSRF-vulnerable — they are never automatically sent on cross-origin requests. + +3. **Safe HTTP methods** (GET, HEAD, OPTIONS) bypass the check. + +4. **Cross-site protection:** Cross-site JavaScript (`fetch()` calls from other origins) cannot set custom headers without CORS preflight, which the backend rejects for non-allowed origins. This provides defense-in-depth against subdomain attacks and XSS injection. + +**Implementation Location:** +- Middleware: `backend/app/middleware/csrf.py` +- Registered in: `backend/app/main.py` via `app.add_middleware(CsrfMiddleware)` + +**Example:** +```python +# ✓ Cookie-authenticated POST with CSRF header — allowed +POST /api/bans +Cookie: bangui_session=... 
+X-BanGUI-Request: 1 + +# ✗ Cookie-authenticated POST without CSRF header — rejected with 403 +POST /api/bans +Cookie: bangui_session=... +(no X-BanGUI-Request header) + +# ✓ Bearer token authentication without CSRF header — allowed +POST /api/bans +Authorization: Bearer <token> +(no X-BanGUI-Request header needed) + +# ✓ Safe GET method without CSRF header — allowed +GET /api/jails +Cookie: bangui_session=... +(no X-BanGUI-Request header needed) +``` + +### Setup Guard Route Policy + +BanGUI requires a one-time setup wizard to be completed before the application is usable. The `SetupRedirectMiddleware` enforces this by redirecting unauthenticated API requests to `/api/setup` until setup is complete. + +**How It Works:** + +1. **Explicit Allowlist:** The middleware maintains two allowlists: + - `_EXACT_ALLOWED`: Exact paths that bypass the guard (e.g., `/api/setup`, `/api/health`, `/api/docs`) + - `_PREFIX_ALLOWED`: Route prefixes that bypass the guard (e.g., `/api/setup/` for nested routes like `/api/setup/timezone`) + +2. **Path Matching Strategy:** The middleware uses **exact matching for exact paths** and **prefix matching with trailing slashes for nested routes**. This prevents fragile prefix-based allowlists (e.g., using `startswith("/api/setup")` would accidentally allow `/api/setup-debug`). + +3. **When Setup is Complete:** Once setup completes, the middleware becomes a no-op and all routes are accessible normally. + +**Allowlisted Paths:** +- `/api/setup` — Setup status check and initialization endpoint +- `/api/setup/timezone` — Timezone configuration (reached via the `/api/setup/` prefix) +- `/api/health` — Health check endpoint (used by monitoring and load balancers) +- `/api/docs` — Swagger UI documentation +- `/api/redoc` — ReDoc documentation +- `/api/openapi.json` — OpenAPI schema (required by docs frontends) + +**Adding New Setup Routes:** + +When adding new routes to the setup flow: +1. 
If the route is an exact path (e.g., `/api/setup/validate`), add it to `_EXACT_ALLOWED` +2. If the route is nested under `/api/setup/` (e.g., `/api/setup/validate/config`), ensure `/api/setup/` is in `_PREFIX_ALLOWED` (it already is) +3. Never use prefix matching without a trailing slash — it leads to security issues with future route additions + +**Implementation Location:** +- Middleware: `backend/app/main.py` — `SetupRedirectMiddleware` class +- Configuration: Lines 584–601 in `backend/app/main.py` — `_EXACT_ALLOWED` and `_PREFIX_ALLOWED` constants +- Guard logic: Lines 638–648 in `backend/app/main.py` — `dispatch()` method + +**Example:** +```python +# If setup is incomplete: +GET /api/jails +→ 307 Temporary Redirect to /api/setup + +# Allowlisted paths are always accessible: +GET /api/setup → 200 OK (setup status) +POST /api/setup → 201 Created (run setup) +GET /api/setup/timezone → 200 OK (get timezone) +GET /api/health → 200 OK (health check) +GET /api/docs → 200 OK (documentation) + +# If setup is complete, all routes are accessible: +GET /api/jails → 200 OK (jail list) +``` + +### fail2ban_start_command Configuration + +The `fail2ban_start_command` setting specifies the shell command used to start the fail2ban daemon during recovery operations (e.g., after a rollback). + +**Format & Parsing:** +- The command is split into arguments using `shlex.split()`, which respects shell quoting rules. +- Paths with spaces must be quoted. Example: `"/opt/my tools/fail2ban-client" start`. +- The command is **not** executed through a shell — no shell variables or globbing are interpreted. + +**Validation:** +- The command is validated at startup using `shlex.split()`. Mismatched quotes will raise a `ValueError` with the problematic command in the error message. 
+ +**Environment Variables:** +```bash +BANGUI_FAIL2BAN_START_COMMAND="fail2ban-client start" # Default +BANGUI_FAIL2BAN_START_COMMAND="systemctl start fail2ban" # systemd +BANGUI_FAIL2BAN_START_COMMAND='"/opt/my tools/fail2ban" start' # Quoted path +``` + +**Common Pitfall:** +Using `.split()` instead of `shlex.split()` would break commands with spaces in paths. Always use quoted strings for paths that contain whitespace. + +### Trusted Proxy Configuration (Reverse Proxy Deployments) + +When BanGUI is deployed behind a reverse proxy (nginx, HAProxy, etc.), the proxy forwards the original client IP via HTTP headers (`X-Forwarded-For`, `X-Real-IP`). To extract the correct client IP for rate limiting and logging, you must configure which proxies are trusted. + +**Why This Is Needed:** + +Rate limiting (`POST /api/auth/login`) relies on accurate client IP detection to prevent brute-force attacks. Without proper proxy configuration: +- Rate limits are applied per **proxy IP** (always the same) instead of per **client IP** — all clients share a single counter, so per-client limits cannot be enforced and one attacker can exhaust the limit and lock out legitimate users. +- Logging shows proxy IPs instead of actual attacker IPs. + +**Trusted Proxies Configuration:** + +```bash +BANGUI_TRUSTED_PROXIES="10.0.0.0/8,172.16.0.0/12,192.168.0.0/16" +``` + +Accepted formats: +- **Single IP:** `BANGUI_TRUSTED_PROXIES="192.168.1.1"` +- **CIDR range:** `BANGUI_TRUSTED_PROXIES="10.0.0.0/8"` (matches any IP in 10.0.0.0 to 10.255.255.255) +- **Multiple entries (comma-separated):** `BANGUI_TRUSTED_PROXIES="192.168.1.1,10.0.0.0/8"` +- **Whitespace is stripped:** `BANGUI_TRUSTED_PROXIES="192.168.1.1 , 10.0.0.0/8"` is valid +- **IPv6 supported:** `BANGUI_TRUSTED_PROXIES="2001:db8::/32"` + +**Default:** Empty list (no proxies trusted). Proxy headers are ignored, and only the direct connection IP is used. + +**Validation:** + +The application validates all entries at startup: +- Each entry must be a valid IP address or CIDR range. 
+- Invalid entries (e.g., `"not-an-ip"`, `"10.0.0.0/33"`) will cause a `ValidationError` at startup. +- The application will not start if any entry is invalid. + +**How It Works:** + +1. When a request arrives, the middleware checks the immediate connection source (e.g., `client.host`). +2. If the immediate connection is **not** in the `trusted_proxies` list, it is used directly as the client IP (proxy headers are ignored). +3. If the immediate connection **is** trusted, the middleware extracts the original client IP from headers in this order: + - `X-Forwarded-For` (leftmost IP in the chain, if present) + - `X-Real-IP` (fallback) + - Immediate connection IP (if no forwarded headers found) + +**Example Docker Compose Configuration:** + +```yaml +version: '3.8' +services: + nginx: + image: nginx:latest + ports: + - "80:80" + volumes: + - ./nginx.conf:/etc/nginx/nginx.conf:ro + networks: + - bangui-net + + backend: + image: bangui:latest + environment: + BANGUI_TRUSTED_PROXIES: "10.0.0.0/8" # Trust Docker internal network + BANGUI_SESSION_COOKIE_SECURE: "false" # HTTP-only example (nginx listens on port 80); keep the default "true" once nginx terminates TLS + networks: + - bangui-net + +networks: + bangui-net: + driver: bridge +``` + +**Example nginx Configuration:** + +```nginx +upstream bangui_backend { + server backend:8000; +} + +server { + listen 80; + server_name bangui.example.com; + + location /api/ { + proxy_pass http://bangui_backend; + + # Forward the original client IP and protocol + proxy_set_header X-Forwarded-For $remote_addr; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-Proto $scheme; + + # Preserve the original Host header for the backend + proxy_set_header Host $host; + } +} +``` + +**Important Security Notes:** + +- **Only trust IPs you control.** Never include untrusted networks or `0.0.0.0/0`. An attacker with network access to a trusted IP can forge `X-Forwarded-For` headers and bypass rate limits. 
+- **Validate proxy IPs carefully.** Use CIDR ranges that match your infrastructure (e.g., `10.0.0.0/8` for Docker, `172.31.0.0/16` for specific Docker networks). +- **Use HTTPS in production.** Ensure your nginx terminates TLS (uses HTTPS) and passes `X-Forwarded-Proto: https` so FastAPI's `Secure` cookie flag works correctly. +- **Beware of Header Spoofing.** `X-Forwarded-For` can contain multiple IPs (client, proxy1, proxy2). The leftmost IP is used as the original client. If an untrusted proxy is between the client and your BanGUI instance, attackers can still spoof headers. Always filter at the network level — only allow traffic from trusted proxies. + +### IP Geolocation Resolution + +BanGUI resolves IP addresses to country codes and network organization information for ban analytics and geomapping. The geolocation system implements a **primary + fallback** resolution strategy to balance security and availability: + +1. **Primary Resolver (MaxMind GeoLite2-Country):** All IP lookups first attempt resolution using a local MaxMind GeoLite2-Country MMDB database file (if available). The MMDB is downloaded offline and mounted into the container — no IP data is sent over the network. +2. **Fallback Resolver (ip-api.com HTTP):** If the MMDB is unavailable or returns no result, the system can fall back to the ip-api.com HTTP API. **This fallback must be explicitly enabled** and only sends unresolved IPs over HTTP. HTTP is disabled by default for security (to avoid sending IP addresses in cleartext). + +**Download & Configure MaxMind GeoLite2:** + +The MaxMind GeoLite2-Country MMDB requires a free account and license key. To set up the database: + +1. **Create a free MaxMind account** at https://www.maxmind.com/en/geolite2/signup and download your license key. +2. **Download the GeoLite2-Country MMDB** using the provided script or manually from the MaxMind downloads page. +3. 
**Mount the MMDB into the BanGUI container** at a known path (e.g., `/data/GeoLite2-Country.mmdb`). +4. **Set `BANGUI_GEOIP_DB_PATH`** to the mounted path in your environment. + +Example Docker Compose configuration: + +```yaml +services: + bangui: + volumes: + - ./GeoLite2-Country.mmdb:/data/GeoLite2-Country.mmdb:ro + environment: + BANGUI_GEOIP_DB_PATH: /data/GeoLite2-Country.mmdb +``` + +**Fallback to HTTP (Not Recommended):** + +If the MMDB cannot be mounted (e.g., in restricted environments), you can enable the HTTP fallback: + +```yaml +services: + bangui: + environment: + BANGUI_GEOIP_ALLOW_HTTP_FALLBACK: "true" +``` + +**⚠️ Security Warning:** Enabling HTTP fallback causes unresolved IP addresses to be sent **unencrypted** to ip-api.com. This is a privacy and GDPR/CCPA concern. Only enable this if the MMDB absolutely cannot be provisioned, and understand the implications. + +**Data Structure:** + +The `GeoInfo` returned by the resolution system includes: +- `country_code` (str | None): ISO 3166-1 alpha-2 country code (e.g., `"US"`, `"DE"`). +- `country_name` (str | None): Human-readable country name (e.g., `"United States"`). +- `asn` (str | None): Autonomous System Number (e.g., `"AS3320"`). Only populated when using the HTTP API; local MMDB lookups return `None`. +- `org` (str | None): Organization name associated with the ASN. Only populated when using the HTTP API; local MMDB lookups return `None`. + +**Environment Variables:** + +```bash +BANGUI_GEOIP_DB_PATH=/data/GeoLite2-Country.mmdb # Path to MaxMind MMDB (primary) +BANGUI_GEOIP_ALLOW_HTTP_FALLBACK="false" # Default: false (MMDB-only) +BANGUI_GEOIP_ALLOW_HTTP_FALLBACK="true" # Enable HTTP fallback (not recommended) +``` + +**Caching & Performance:** + +- Resolved IPs are cached in-memory and persisted to SQLite for fast subsequent lookups. +- Failed lookups are cached for 5 minutes to avoid hammering external APIs. 
+- The background `geo_cache_flush` task (runs every 60 seconds) persists newly resolved entries to the database. +- The background `geo_re_resolve` task (configurable schedule) periodically re-resolves stale entries to keep data fresh. +- The background `geo_cache_cleanup` task (runs nightly) removes entries not referenced in the configured retention period (default: 90 days) to prevent unbounded database growth and maintain query performance. + +**Retention & Cleanup:** + +The `geo_cache` table tracks the last time each IP was referenced via a `last_seen` timestamp. Over time, as unique IPs accumulate, the table can grow very large, degrading query performance on every geo lookup. To manage this: + +- The `geo_cache_cleanup` background task runs once per day (default: midnight UTC). +- It removes all entries where `last_seen` is older than the configured retention period (default: 90 days). +- If a purged IP is encountered again after cleanup, it will be re-resolved from the MaxMind database or ip-api.com (if configured). +- The retention period is controlled by the constant `GEO_CACHE_RETENTION_DAYS` in `backend/app/tasks/geo_cache_cleanup.py`. + +### API Documentation Configuration + +The `enable_docs` setting controls whether FastAPI serves interactive API documentation at `/api/docs` (Swagger UI) and `/api/redoc` (ReDoc). + +**Default:** `false` — API documentation is disabled by default to prevent information disclosure in production. + + +**When to Enable:** +- Set `BANGUI_ENABLE_DOCS=true` in development and debugging environments only. +- Never enable in production. Exposed API documentation reveals all endpoints, request/response schemas, and allows direct API invocation from the browser. 
+ +**Environment Variables:** +```bash +BANGUI_ENABLE_DOCS="true" # Enable docs in development +BANGUI_ENABLE_DOCS="false" # Disable docs (default) +# Unset # Defaults to false (production) +``` + +**Debug Compose File:** +The `Docker/compose.debug.yml` sets `BANGUI_ENABLE_DOCS: "true"` for local development. Production compose files (`Docker/compose.prod.yml`) leave this unset, defaulting to `false`. + +**Middleware Allowlist:** +The `SetupRedirectMiddleware` in `main.py` includes `/api/docs`, `/api/redoc`, and `/api/openapi.json` in its `_EXACT_ALLOWED` paths so documentation can be accessed before setup completes (if enabled). + +### Log Path Validation & Allowlisting + +Authenticated users can instruct fail2ban to monitor additional log files through the API endpoint `POST /api/config/jails/{name}/logpath`. To prevent path-traversal attacks and unauthorized reads of sensitive system files, all requested log paths must resolve to locations within a configurable allowlist of safe directories. + +**Allowed Directories:** +- Configured via the `BANGUI_ALLOWED_LOG_DIRS` environment variable (comma-separated list). +- Defaults to: `["/var/log", "/config/log"]`. + +**Path Validation Rules:** +1. The requested path is resolved to its canonical form using `Path(log_path).resolve()`, which: + - Expands relative paths to absolute paths. + - Resolves symbolic links to their real targets. + - Normalizes `.` and `..` components. +2. The resolved path is checked using `Path.is_relative_to()` against each allowed directory prefix. +3. If the resolved path is not relative to any allowed directory, a `ValueError` is raised with a descriptive error message. + +**Implementation:** +- Validation occurs in the Pydantic model `AddLogPathRequest` using a `@field_validator`. +- The validator runs at request time, before the service layer is invoked. +- Symlinks that escape allowed directories are rejected (see [symlink bypass tests](../../backend/tests/test_models.py)). 
+ +**Important:** Use `is_relative_to()`, not `startswith()` or string prefix matching. The latter is bypassable with paths like `/var/log_evil/file.log`. + +**Environment Variables:** +```bash +BANGUI_ALLOWED_LOG_DIRS="/var/log,/config/log" # Default +BANGUI_ALLOWED_LOG_DIRS="/var/log,/config/log,/home/app/logs" # Custom directory +``` + +### Log Target Validation (fail2ban) + +The `log_target` field on the global config endpoint (`PUT /api/config/global`) is critical for security because fail2ban runs as root. Users can only set log targets to: + +1. **Special values:** `STDOUT`, `STDERR`, `SYSLOG` (case-insensitive) +2. **File paths:** Must resolve to one of the configured allowed directories (same allowlist as log paths) + +**Why This Matters:** +- fail2ban creates/opens files with root privileges. Without validation, an attacker could write to arbitrary system paths (e.g., `/etc/cron.d/malicious_script`). +- Validation occurs at **both** the Pydantic model layer (`GlobalConfigUpdate.validate_log_target()`) **and** the service layer (`update_global_config()`) for defense in depth. +- This prevents both HTTP and non-HTTP attack vectors. + +**Implementation:** +```python +# Model layer: Automatic validation via @field_validator +update = GlobalConfigUpdate(log_target="/etc/passwd") # Raises ValidationError → HTTP 422 + +# Service layer: Defense in depth +await config_service.update_global_config(socket_path, update) # Validates again before sending to fail2ban +``` + +### Global Rate Limiting + +In addition to login-specific rate limiting, all API endpoints are protected by global per-IP rate limiting to prevent resource exhaustion, CPU spikes, and network bandwidth attacks from malicious or misconfigured clients. + +**Design:** +- Uses a `dict[str, deque[float]]` keyed by client IP, storing request timestamps within a time window. 
+- Implements a sliding-window algorithm: when an IP exceeds the limit, subsequent requests are blocked until the oldest request timestamp in the window expires. +- Applied globally via middleware that runs on every request. +- Respects the same IP extraction logic (trusted proxies) as login rate limiting. + +**Rate Limit Rules:** +- **Default limit:** 200 requests per 60 seconds per IP. +- Blocked requests return **HTTP 429 Too Many Requests** with a `Retry-After` header indicating the estimated seconds until the IP can retry. +- The `Retry-After` value is dynamically calculated based on when the oldest request in the window will expire. +- Different endpoints can be configured with different limits by adjusting the global rate limiter settings or using per-endpoint decorators (future enhancement). + +**IP Extraction (Proxy Safety):** +- Same as login rate limiting: reads real client IP from `X-Forwarded-For` or `X-Real-IP` headers when the immediate connection is from a trusted proxy. +- Falls back to direct connection IP when headers cannot be trusted. + +**Process-Local Limitation:** +- The global rate limiter is process-local (in-memory), like the login rate limiter. +- In single-worker deployments (enforced elsewhere), this is not a constraint. +- Each worker in a multi-worker setup maintains independent counters, which is acceptable under the single-worker enforcement model. + +**Memory Management:** +- Old request timestamps outside the rate-limit window are automatically pruned during validation checks. +- A scheduled background task (`rate_limiter_cleanup` in `app.tasks.rate_limiter_cleanup`) runs every 30 minutes to remove dormant IPs from memory, preventing unbounded growth. 
+ +**Implementation:** +- Rate limiter: `app.utils.rate_limiter.GlobalRateLimiter` +- Middleware: `app.middleware.rate_limit.RateLimitMiddleware` +- IP extraction: `app.utils.client_ip.get_client_ip()` +- Cleanup task: `app.tasks.rate_limiter_cleanup` (registered in `app.startup`) +- Initialized in: `app.main.create_app()` and the lifespan handler + + --- -## 12. Git & Workflow +## 12. Authentication Endpoints + +#### Browser SPA (Cookie-Based) + +The **primary** authentication flow for the frontend is **cookie-based** and protects the session token from JavaScript access: + +1. **Login (`POST /api/auth/login`)** + - Accepts `LoginRequest` (password field) + - Returns `LoginResponse` containing **only** `expires_at` (ISO 8601 UTC timestamp) + - **Crucially:** The session token is **not** included in the JSON response body + - Instead, the token is set as an **HttpOnly** `SameSite=Lax` cookie named `bangui_session` + - Frontend automatically includes this cookie in all requests via `credentials: "include"` + +2. **Why not return token in response body?** + - Third-party JavaScript (analytics, ads, XSS injections) can intercept `fetch()` response bodies + - If the token were in the response, malicious code could extract and store it in `localStorage` + - An attacker could then use it via the `Authorization: Bearer ` header, bypassing the HttpOnly cookie protection + - By returning **only** the expiry timestamp, we ensure the token stays exclusively in the HttpOnly cookie + +3. **Session Validation (`GET /api/auth/session`)** + - Frontend calls this on app mount to verify the session is still valid on the server + - Works with both cookie and Bearer token authentication + - Returns `{"valid": true}` if the session exists and is not expired + - Returns **401 Unauthorized** if the session is invalid or expired + +4. 
**Logout (`POST /api/auth/logout`)** + - Revokes the session in the database + - Clears the `bangui_session` cookie via `Set-Cookie` header + - Works with both cookie and Bearer token authentication + - Idempotent — calling without a session returns 200 without error + +#### Programmatic API Clients (Bearer Token) + +For non-browser clients (CLI tools, batch scripts, automation) that cannot use cookies, use the **Bearer token authentication path** by sending: + +```http +Authorization: Bearer <token> +``` + +Currently, these clients obtain the token by reading the `bangui_session` cookie from the login response and sending its signed value as the Bearer token. A dedicated `POST /api/auth/token` endpoint may be added in a future implementation. + +**Note:** Bearer token authentication is not recommended for browser-based clients because: +- Tokens must be stored somewhere (localStorage, sessionStorage, or request body) +- All storage mechanisms are accessible to JavaScript and thus vulnerable to XSS +- HttpOnly cookies provide better protection + +--- + +## 13. Password Hashing + +The master password is hashed using **bcrypt** with an auto-generated salt. All password validation uses the models in `app.models.auth` and `app.models.setup`. + +### The 72-Byte Bcrypt Limitation + +**Important:** bcrypt silently truncates all input at **72 bytes** before hashing. This means: +- A user who sets a 100-character password is actually authenticated by only the first 72 bytes +- Extra characters beyond 72 bytes provide **zero additional security** +- An attacker only has to match the first 72 bytes, so the effective search space is smaller than the user intended + +**Solution:** Both password fields enforce a **maximum length of 72 characters**. Pydantic's `max_length` counts characters, not bytes; the two are equal only for ASCII input (for example, the SHA256 hash the frontend transmits, assuming it is hex- or base64-encoded): +- `LoginRequest.password` — max 72 characters (enforced via Pydantic `Field(max_length=72)`) +- `SetupRequest.master_password` — max 72 characters (enforced via Pydantic `Field(max_length=72)`) + +**Validation flow:** +1. 
Frontend → hashes password with SHA256 using `SubtleCrypto` before transmission +2. Backend receives SHA256 hash, validates length (≤ 72 bytes) +3. Backend → hashes with bcrypt using `run_blocking(bcrypt.hashpw)` to avoid event loop stall +4. Hash stored in SQLite `settings` table + +**If a password exceeds 72 bytes:** +- Pydantic raises `ValidationError` with error code `string_too_long` +- The router returns **HTTP 422 Unprocessable Entity** +- The frontend should inform the user to choose a shorter password + +**Implementation:** +- Models: `app.models.auth.LoginRequest`, `app.models.setup.SetupRequest` +- Service layer: `app.services.auth_service._check_password()`, `app.services.setup_service.run_setup()` + +--- + +## 15. File I/O Conventions + +All file write operations to critical configuration files must be **atomic** to prevent corruption if the process is killed mid-write. + +### Atomic File Writes + +Configuration files (e.g., fail2ban jail configs in `jail.d/`) are essential for system operation. A truncated or corrupt config file can break fail2ban's ability to reload and may disable active protection. + +**Rule: Always use write-to-temp + atomic rename** + +Never use `Path.write_text()` or `file.write()` directly for critical files. Instead: + +1. Create a temporary file in the **same directory** as the target (crucial for atomic `os.replace()`). +2. Write content to the temp file. +3. Atomically rename the temp file to replace the target. +4. Clean up the temp file if an error occurs. 
+ +**Implementation Pattern:** + +```python +import contextlib +import os +import tempfile +from pathlib import Path + +target = Path("/path/to/config/file.conf") + +tmp_name: str | None = None +try: + # Create temp file in target's directory (same filesystem = atomic) + with tempfile.NamedTemporaryFile( + mode="w", + encoding="utf-8", + dir=target.parent, + delete=False, + suffix=".tmp", + ) as tmp: + tmp_name = tmp.name # Record the name first so cleanup still runs if the write fails + tmp.write(content) + # Atomic rename (single syscall on POSIX systems) + os.replace(tmp_name, target) +except OSError as exc: + # Clean up temp file on error + with contextlib.suppress(OSError): + if tmp_name is not None: + os.unlink(tmp_name) + raise ConfigWriteError(f"Cannot write config: {exc}") from exc +``` + +**Why this matters:** + +- `Path.write_text()` overwrites in place. If the process dies mid-write, the file is left truncated or partially written. +- `os.replace()` is atomic on POSIX systems (single rename syscall) **only if source and target are on the same filesystem**. +- Creating the temp file in `target.parent` ensures atomicity. +- On Linux containers, this prevents config corruption and service degradation. + +**Atomic write helper:** + +A shared `atomic_write(path: Path, content: str)` helper is available in `app/services/config_file_helpers.py`. 
This is the preferred way to perform atomic writes — it handles all the temp file and cleanup logic: + +```python +from app.services.config_file_helpers import atomic_write + +atomic_write(path, updated_content) # Atomic write, auto-cleanup on error +``` + +**Files requiring atomic writes:** + +- All config files under `jail.d/` (created/modified by `_write_conf_file`, `_create_conf_file`, `set_jail_config_enabled`, and `write_jail_config_file`) +- Any critical state files that fail2ban relies on + +**Examples in the codebase:** + +- `app/services/config_file_helpers.py`: `_write_conf_file`, `_create_conf_file`, `atomic_write` +- `app/services/raw_config_io_service.py`: `set_jail_config_enabled`, `write_jail_config_file` +- `app/services/jail_config_service.py`: `_write_local_file_sync`, `_restore_local_file_sync` + +--- + +## 16. Git & Workflow - **Branch naming:** `feature/`, `fix/`, `chore/`. - **Commit messages:** imperative tense, max 72 chars first line (`Add jail reload endpoint`, `Fix ban history query`). - Every merge request must pass: ruff, type checker, all tests. - Do not merge with failing CI. - Keep pull requests small and focused — one feature or fix per PR. +- **E2E test results** (`e2e/results/`) are gitignored — never commit test outputs or HTML reports. --- -## 13. Coding Principles +## 17. Coding Principles These principles are **non-negotiable**. Every backend contributor must internalise and apply them daily. -### 13.1 Clean Code +### 17.1 Clean Code - Write code that **reads like well-written prose** — a new developer should understand intent without asking. - **Meaningful names** — variables, functions, and classes must reveal their purpose. Avoid abbreviations (`cnt`, `mgr`, `tmp`) unless universally understood. 
@@ -367,7 +3045,7 @@ async def check(ip, j): raise Exception("not found") ``` -### 13.2 Separation of Concerns (SoC) +### 17.2 Separation of Concerns (SoC) - Each module, class, and function must have a **single, well-defined responsibility**. - **Routers** → HTTP layer only (parse requests, return responses). @@ -377,29 +3055,29 @@ async def check(ip, j): - **Tasks** → scheduled background jobs. - Never mix layers — a router must not execute SQL, and a repository must not raise `HTTPException`. -### 13.3 Single Responsibility Principle (SRP) +### 17.3 Single Responsibility Principle (SRP) - A class or module should have **one and only one reason to change**. - If a service handles both ban management *and* email notifications, split it into `BanService` and `NotificationService`. -### 13.4 Don't Repeat Yourself (DRY) +### 17.4 Don't Repeat Yourself (DRY) - Extract shared logic into utility functions, base classes, or dependency providers. - If the same block of code appears in more than one place, **refactor it** into a single source of truth. - But don't over-abstract — premature DRY that couples unrelated features is worse than a little duplication (see **Rule of Three**: refactor when something appears a third time). -### 13.5 KISS — Keep It Simple, Stupid +### 17.5 KISS — Keep It Simple, Stupid - Choose the simplest solution that works correctly. - Avoid clever tricks, premature optimisation, and over-engineering. - If a standard library function does the job, prefer it over a custom implementation. -### 13.6 YAGNI — You Aren't Gonna Need It +### 17.6 YAGNI — You Aren't Gonna Need It - Do **not** build features, abstractions, or config options "just in case". - Implement what is required **now**. Extend later when a real need emerges. -### 13.7 Dependency Inversion Principle (DIP) +### 17.7 Dependency Inversion Principle (DIP) - High-level modules (services) must not depend on low-level modules (repositories) directly. 
Both should depend on **abstractions** (protocols / interfaces). - Use FastAPI's `Depends()` to inject implementations — this makes swapping and testing trivial. @@ -417,17 +3095,288 @@ class SqliteBanRepository: async def save_ban(self, ban: Ban) -> None: ... ``` -### 13.8 Composition over Inheritance +#### 17.7.1 Repository Module Pattern — Module-as-Protocol Structural Compatibility + +BanGUI uses **module-level functions** for repository implementations, not classes. Each repository module (e.g., `session_repo.py`, `blocklist_repo.py`) exports async functions that match the signatures defined in the Protocol interface in `protocols.py`. This is a **structural typing pattern** — mypy accepts the module as a valid Protocol implementation because the function signatures match, *even though* the module is not explicitly annotated as implementing the Protocol. + +This approach works correctly with FastAPI's dependency injection via `cast()`: + +```python +# In app/repositories/session_repo.py +async def create_session(db: aiosqlite.Connection, token: str, created_at: str, expires_at: str) -> Session: + """Insert a new session row.""" + ... + +# In app/repositories/protocols.py +class SessionRepository(Protocol): + async def create_session( + self, + db: aiosqlite.Connection, + token: str, + created_at: str, + expires_at: str, + ) -> Session: + ... + +# In app/dependencies.py +async def get_session_repo() -> SessionRepository: + """Provide the concrete session repository implementation.""" + from app.repositories import session_repo + return session_repo # ← mypy accepts this because the module has matching functions +``` + +**Why this pattern is used:** +- **Simplicity** — no boilerplate class/instance wrapping. +- **Compatibility** — Python's **structural typing** (PEP 544) means the module automatically satisfies the Protocol interface if function signatures match. 
+- **Testability** — the same DIP principle applies; services depend on the Protocol, not the module directly, so tests can mock the Protocol. + +**Risks and mitigations:** +- **Silent breakage if function signatures change** — If a parameter is added or removed from a module function, the module no longer satisfies the Protocol, but mypy does not flag this as an error because the module is loosely coupled. To prevent this, **Protocol signatures in `protocols.py` are the source of truth**. Always check that module functions match the Protocol definitions before merging changes. The CI/CD pipeline validates this compatibility at build time. + +**How the validation works (CI check):** +- Before each deployment, run `mypy --strict` to ensure all dependency providers return values compatible with their Protocol types. +- The `cast()` calls in `dependencies.py` are a documented signal that structural compatibility is being verified externally, not via explicit class inheritance. +- Automated tests in `backend/tests/test_repositories/test_protocol_compliance.py` verify that each repository module implements all protocol methods, preventing silent protocol drift. + +#### 17.7.1.1 Repository Protocol Coverage Checklist + +All public repository functions must be defined in a corresponding Protocol. To add a new repository: + +1. **Create the repository module** — `backend/app/repositories/my_repo.py` with async functions. +2. **Define the Protocol** — Add a `MyRepository(Protocol)` class in `backend/app/repositories/protocols.py` with methods matching every public function signature. +3. **Add imports** — If the Protocol uses custom return types, import them in `protocols.py`. +4. **Run compliance tests** — Execute `pytest backend/tests/test_repositories/test_protocol_compliance.py` to verify coverage. +5. **Verify type safety** — Run `mypy --strict backend/app/repositories/protocols.py` to ensure all types are correct. 
+ +**Current repository protocol coverage** (all 7 repositories fully covered): +- `SessionRepository` — 4 methods +- `SettingsRepository` — 4 methods +- `BlocklistRepository` — 6 methods +- `ImportLogRepository` — 4 methods +- `GeoCacheRepository` — 13 methods +- `HistoryArchiveRepository` — 5 methods +- `Fail2BanDbRepository` — 8 methods + +#### 17.7.2 Session Token Hashing — One-Way Protection Against Database Exposure + +Session tokens must be protected against database exposure. **Session tokens are stored as one-way SHA256 hashes in the database** to ensure that if the database file is compromised (volume mount misconfiguration, backup leak, etc.), the session tokens themselves cannot be directly used to hijack sessions. + +**Implementation pattern:** + +```python +import hashlib +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import aiosqlite + +from app.models.auth import Session + +def _hash_token(token: str) -> str: + """Return the SHA256 hash of a session token.""" + return hashlib.sha256(token.encode()).hexdigest() + +async def create_session( + db: "aiosqlite.Connection", + token: str, + created_at: str, + expires_at: str, +) -> Session: + """Insert a new session row with the token hash.""" + token_hash = _hash_token(token) + cursor = await db.execute( + "INSERT INTO sessions (token_hash, created_at, expires_at) VALUES (?, ?, ?)", + (token_hash, created_at, expires_at), + ) + await db.commit() + # Return the Session with the ORIGINAL token (not the hash) + # so the service layer can sign and return it to the client. 
+ return Session( + id=int(cursor.lastrowid) if cursor.lastrowid else 0, + token=token, # ← raw token, not the hash + created_at=created_at, + expires_at=expires_at, + ) + +async def get_session( + db: "aiosqlite.Connection", + token: str +) -> Session | None: + """Look up a session by token hash.""" + token_hash = _hash_token(token) + async with db.execute( + "SELECT id, token_hash, created_at, expires_at FROM sessions WHERE token_hash = ?", + (token_hash,), + ) as cursor: + row = await cursor.fetchone() + + if row is None: + return None + + # Return the Session with the INCOMING token (the one the client sent). + return Session( + id=int(row[0]), + token=token, # ← the raw token passed in + created_at=str(row[2]), + expires_at=str(row[3]), + ) +``` + +**Key points:** + +1. **Hash on write** — When inserting a session, hash the token before storage. +2. **Hash on read** — When validating a session, hash the incoming token before the database lookup. +3. **Never store raw tokens** — The `token_hash` column contains only hashes; raw tokens are never persisted. +4. **Return raw tokens to the service layer** — The `Session` model's `token` field contains the raw token (for signing and response), not the hash. +5. **Database schema** — Use `token_hash TEXT NOT NULL UNIQUE` instead of `token TEXT NOT NULL UNIQUE`, and create an index on `token_hash`. +6. **Migration strategy** — When upgrading from plaintext to hashed tokens, drop the old table and recreate it. This invalidates all existing sessions, which is acceptable because the database was exposed in plaintext. + +**Why one-way hashing is safe:** +- If an attacker obtains a token hash from the database, they cannot reverse the SHA256 hash to recover the original token. +- The attacker cannot use the hash directly in a client request — they would need the original token to pass the hash check. 
+- This forces the attacker to either compromise the client (where they'd also get the raw token) or perform a brute-force attack against the hash space (infeasible for random 128-bit tokens). + +**Never use symmetric encryption** — symmetric encryption stores a key in the database or environment, which merely shifts the exposure risk. A one-way hash is the correct choice for protecting tokens. + +#### 17.7.2a Session Token Signing Format — HMAC-SHA256 Integrity Protection + +**All session tokens sent to clients are signed using HMAC-SHA256.** The signed token format is: + +``` +<raw_token>.<signature> +``` + +where: +- `<raw_token>` is a 16-byte (128-bit) random hex string generated by `secrets.token_hex(16)`. +- `.` is the separator (defined in `app.utils.constants.SESSION_TOKEN_SIGNATURE_SEPARATOR`). +- `<signature>` is the HMAC-SHA256 hex digest of `<raw_token>` using the configured `session_secret`. + +**Example (illustrative only — a real raw token is 32 hex characters and a real signature is 64 hex characters):** `a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6.f7e6d5c4b3a2918f7e6d5c4b3a29180` + +**Signing and verification pattern:** + +```python +import hashlib +import hmac + +def _session_token_signature(token: str, secret: str) -> str: + """Return the HMAC-SHA256 signature for a session token.""" + return hmac.new(secret.encode(), token.encode(), hashlib.sha256).hexdigest() + +def sign_session_token(token: str, secret: str) -> str: + """Return a signed session token string for the client.""" + return f"{token}.{_session_token_signature(token, secret)}" + +def unwrap_session_token(token: str, secret: str) -> str: + """Verify and return the raw token from a signed session token. + + Raises ValueError if the token lacks a signature or signature is invalid. + """ + if "." not in token: + raise ValueError("Invalid session token.") + + raw_token, signature = token.rsplit(".", 1) + expected_signature = _session_token_signature(raw_token, secret) + if not hmac.compare_digest(expected_signature, signature): + raise ValueError("Invalid session token.") + return raw_token +``` + +**Key points:** + +1. 
**All tokens must be signed** — Tokens without a signature (no separator) are rejected immediately. +2. **Signature is mandatory** — The `unwrap_session_token()` function raises `ValueError` if the separator is absent. +3. **Use HMAC-SHA256** — Always use `hmac.compare_digest()` for signature verification to prevent timing attacks. +4. **Sign on login** — `login()` creates a raw token, stores it (hashed) in the database, then returns the signed token to the client. +5. **Verify on every request** — The `validate_session()` service verifies the signature by calling `unwrap_session_token()` with the `session_secret`, then looks up the raw token in the database. +6. **Session invalidation** — When upgrading from plaintext to signed tokens (TASK-022), all existing sessions must be invalidated because raw tokens will no longer be stored unencrypted. + +**Why HMAC signing is necessary:** + +- **Prevents token forgery** — An attacker cannot create a valid token without knowing the `session_secret`. +- **Works alongside hashed storage** — Even if the database is compromised (plaintext before hashing), the attacker gets only the raw token, not a signed token. A raw token without a valid signature is rejected by `unwrap_session_token()`. +- **Timing attack resistance** — `hmac.compare_digest()` compares signatures in constant time, preventing attackers from using timing differences to guess valid signatures. + +#### 17.7.3 Session Cache Pluggability — Process-Local vs. Shared Backends + +Session validation is expensive (SQLite lookup + password verification). To improve performance, **validated session tokens are cached** using the `SessionCache` interface (`app.utils.session_cache`). The default implementation, `InMemorySessionCache`, stores cached sessions in process-local memory. 
+ +**Current implementation (single-worker):** + +```python +from app.utils.session_cache import SessionCache, InMemorySessionCache, NoOpSessionCache + +class SessionCache(Protocol): + """Interface for session token validation cache backends.""" + def get(self, token: str) -> Session | None: ... + def set(self, token: str, session: Session, ttl_seconds: float) -> None: ... + def invalidate(self, token: str) -> None: ... + def clear(self) -> None: ... + +# Default in-memory implementation — PROCESS-LOCAL +class InMemorySessionCache: + def __init__(self) -> None: + self._entries: dict[str, tuple[Session, float]] = {} +``` + +**Single-worker constraint:** + +`InMemorySessionCache` is **process-local** — each worker process has its own dict. In single-worker mode (enforced by TASK-002), this is safe and improves performance. In multi-worker deployments: +- A logout by worker A clears the session from A's cache, but worker B still has it → logout doesn't work. +- Enabling/disabling the cache requires restarting all workers to take effect. + +**Multi-worker solution:** + +To support multiple workers (future enhancement), implement a shared backend behind the same `SessionCache` Protocol: + +```python +# Example Redis implementation (not yet in codebase) +class RedisSessionCache: + """Session cache backed by Redis.""" + def __init__(self, redis_url: str) -> None: + self.client = aioredis.from_url(redis_url) + + async def get(self, token: str) -> Session | None: + data = await self.client.get(f"session:{token}") + return Session.model_validate_json(data) if data else None + + async def set(self, token: str, session: Session, ttl_seconds: float) -> None: + await self.client.setex( + f"session:{token}", + int(ttl_seconds), + session.model_dump_json() + ) + + async def invalidate(self, token: str) -> None: + await self.client.delete(f"session:{token}") + + async def clear(self) -> None: + await self.client.flushdb() +``` + +To adopt a Redis backend: +1. 
Create `RedisSessionCache` in `app.utils.session_cache`. +2. Update `app.utils.runtime_state.set_runtime_settings()` to instantiate `RedisSessionCache` when `REDIS_URL` env var is set. +3. Update `app.config.Settings` to accept optional `REDIS_URL`. +4. Tests continue to use `InMemorySessionCache` (no Redis dependency in dev). + +**Implementation rules:** +- All cache methods must be `async` (even if the backend is sync). +- Never log session tokens or session data. +- TTL must be respected — expired entries must be removed on access. +- See `app/utils/session_cache.py` for the full Protocol definition and current implementations. + +### 17.8 Composition over Inheritance - Favour **composing** small, focused objects over deep inheritance hierarchies. - Use mixins or protocols only when a clear "is-a" relationship exists; otherwise, pass collaborators as constructor arguments. -### 13.9 Fail Fast +### 17.9 Fail Fast - Validate inputs as early as possible — at the API boundary with Pydantic, at service entry with assertions or domain checks. - Raise specific exceptions immediately rather than letting bad data propagate silently. -### 13.10 Law of Demeter (Principle of Least Knowledge) +### 17.10 Law of Demeter (Principle of Least Knowledge) - A function should only call methods on: 1. Its own object (`self`). @@ -435,22 +3384,104 @@ class SqliteBanRepository: 3. Objects it creates. - Avoid long accessor chains like `request.state.db.cursor().execute(...)` — wrap them in a meaningful method. -### 13.11 Defensive Programming +### 17.11 Defensive Programming - Never trust external input — validate and sanitise everything that crosses a boundary (HTTP request, file, socket, environment variable). - Handle edge cases explicitly: empty lists, `None` values, negative numbers, empty strings. - Use type narrowing and exhaustive pattern matching (`match` / `case`) to eliminate impossible states. ---- +### 17.12 SSRF Prevention (Server-Side Request Forgery) -## 14. 
Quick Reference — Do / Don't +When user-supplied URLs are fetched by the backend, validate them before making any HTTP requests: + +1. **Use Pydantic's `AnyHttpUrl` type** to restrict schemes to `http://` and `https://` only. + - Rejects `file://`, `ftp://`, `gopher://`, and other non-http schemes at the model boundary. + +2. **Validate resolved IP addresses** before fetching: + - Parse the hostname and resolve it via DNS (using `socket.getaddrinfo()`). + - Use the `ipaddress.ip_address()` properties (`is_private`, `is_loopback`, `is_link_local`, `is_multicast`, `is_reserved`) to reject non-public ranges — `is_private` alone does not cover multicast: + - RFC 1918: `10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16` + - Loopback: `127.0.0.0/8`, `::1/128` + - Link-local: `169.254.0.0/16`, `fe80::/10` + - IPv6 site-local, multicast, and reserved ranges. + - Raise `ValueError` if validation fails; let the router convert it to HTTP 400. + +3. **Guard against DNS rebinding**: + - Validate DNS at URL creation/validation time (performed during request deserialization). + - For additional safety, re-validate the connection IP at HTTP client time (e.g., custom `aiohttp.TCPConnector` can inspect the resolved address during connect). + +4. **Example implementation** (see `backend/app/utils/ip_utils.py`): + - `is_private_ip(ip_str: str) → bool`: Checks if IP is private/reserved/loopback/link-local. + - `async validate_blocklist_url(url: AnyHttpUrl) → None`: Async DNS resolution + private IP check. + - Service layer calls `await validate_blocklist_url(url)` before persisting; router catches `ValueError` and returns 400. + +### 17.13 Function Complexity Limits + +Functions exceeding ~100 lines introduce maintenance burden and hidden bugs. Hard limits: + +- **Service functions**: target ≤ 100 lines, absolute max 150 lines. +- **Utility functions**: target ≤ 50 lines, absolute max 80 lines. +- **Router handlers**: target ≤ 40 lines, absolute max 60 lines. + +When a function grows beyond its target: + +1. **Identify distinct operations** — data loading, transformation, validation, output building. +2. 
**Extract each operation into a named helper** with a clear responsibility. +3. **Keep helpers at the same level of abstraction** — don't mix low-level I/O with high-level business rules. + +Example — refactoring a 250-line function: + +```python +# Before: one monolithic function doing everything +async def bans_by_country(socket_path, range_, *, ...): + # 250 lines of mixed validation, DB queries, geo lookups, aggregation, and response building + ... + +# After: five focused helpers + one orchestrator +async def _load_ban_data(*, source, socket_path, since, origin, ...): + """Step 1: Query per-IP ban counts from the right source.""" ... + +async def _resolve_geo(unique_ips, *, http_session, geo_cache_lookup, ...): + """Step 2: Resolve geo info from cache or enricher.""" ... + +async def _load_companion_rows(*, source, country_code, geo_map, ...): + """Step 3: Load companion ban rows, optionally filtered by country.""" ... + +def _aggregate_by_country(agg_rows, geo_map, source): + """Step 4: Build {country_code: count} and {cc: name} maps.""" ... + +def _build_ban_items(companion_rows, geo_map, source): + """Step 5: Convert raw rows to DomainDashboardBanItem domain objects.""" ... + +async def bans_by_country(socket_path, range_, *, ...): + agg_rows, total, unique_ips = await _load_ban_data(...) + geo_map = await _resolve_geo(unique_ips, ...) + companion_rows, _ = await _load_companion_rows(...) + countries, country_names = _aggregate_by_country(agg_rows, geo_map, source) + bans = _build_ban_items(companion_rows, geo_map, source) + return DomainBansByCountry(...) +``` + +**Why this works**: +- Each helper is independently testable. +- Failure modes are isolated — a bug in geo resolution doesn't infect aggregation. +- Code review becomes line-based rather than block-based. +- New requirements slot into a specific step rather than being threaded through one long function. 
+ +**Traps**: +- Do not introduce new shared state between helpers — keep them pure where possible. +- Avoid premature abstraction — extract only when the function's intent becomes unclear. +- Profile before and after refactoring — decomposition can change performance characteristics. + +## 18. Quick Reference — Do / Don't | Do | Don't | |---|---| | Type every function, variable, return | Leave types implicit | | Use `async def` for I/O | Use sync functions for I/O | | Validate with Pydantic at the boundary | Pass raw dicts through the codebase | -| Log with structlog + context keys | Use `print()` or format strings in logs | +| Log with structlog + context keys (INFO/WARNING/ERROR/DEBUG) | Use `print()` or format strings in logs | +| Use `log.exception()` in catch-all handlers (captures traceback) | Use `log.error()` for exceptions; let exceptions get lost | | Write tests for every feature | Ship untested code | | Use `aiohttp` for HTTP calls | Use `requests` | | Handle errors with custom exceptions | Use bare `except:` | diff --git a/Docs/CONFIGURATION.md b/Docs/CONFIGURATION.md new file mode 100644 index 0000000..fdeade1 --- /dev/null +++ b/Docs/CONFIGURATION.md @@ -0,0 +1,194 @@ +# Configuration Reference + +All runtime settings are environment variables prefixed with `BANGUI_`. Values are validated at startup — missing required fields or invalid values cause the application to refuse to start. + +For setup instructions, see [Instructions.md](./Instructions.md). For deployment, see [Deployment.md](./Deployment.md). + +--- + +## Database + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_DATABASE_PATH` | string | `bangui.db` | Filesystem path to the SQLite application database. Parent directory must exist and be writable at startup. 
| + +--- + +## Session & Security + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_SESSION_SECRET` | string | **(required)** | Secret key for signing session tokens. Must be ≥ 32 characters. Generate with `python -c "import secrets; print(secrets.token_hex(32))"`. Never reuse across environments. | +| `BANGUI_SESSION_SECRET_PREVIOUS` | string | `null` | Previous session secret used during rotation. Set to the old secret while rotating; unset once all old tokens expire. | +| `BANGUI_SESSION_DURATION_MINUTES` | int | `60` | Session lifetime in minutes. Must be ≥ 1. | +| `BANGUI_SESSION_CACHE_ENABLED` | bool | `false` | Enable in-memory session validation cache. Disable in multi-worker deployments to avoid stale revoked sessions. | +| `BANGUI_SESSION_CACHE_TTL_SECONDS` | float | `10.0` | TTL for cached session entries. Ignored when `BANGUI_SESSION_CACHE_ENABLED` is `false`. Must be ≥ 0. | +| `BANGUI_SESSION_COOKIE_HTTPONLY` | bool | `true` | Mark the session cookie as `HttpOnly` (JavaScript cannot access it). | +| `BANGUI_SESSION_COOKIE_SAMESITE` | string | `lax` | SameSite policy for the session cookie. Valid values: `lax`, `strict`, `none`. | +| `BANGUI_SESSION_COOKIE_SECURE` | bool | `true` | Set the `Secure` flag on the session cookie. `true` required for HTTPS. Set to `false` only for local HTTP development. | + +--- + +## fail2ban Integration + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_FAIL2BAN_SOCKET` | string | `/var/run/fail2ban/fail2ban.sock` | Path to the fail2ban Unix domain socket. Socket must exist and be readable at startup (warning issued if not). | +| `BANGUI_FAIL2BAN_CONFIG_DIR` | string | `/config/fail2ban` | Path to the fail2ban configuration directory. Must contain `jail.d/`, `filter.d/`, and `action.d/`. 
| +| `BANGUI_FAIL2BAN_START_COMMAND` | string | `fail2ban-client start` | Shell command to start the fail2ban daemon (no shell interpretation). Used during recovery rollback. Must be parseable by `shlex.split`. | +| `BANGUI_ALLOWED_LOG_DIRS` | list | `/var/log,/config/log` | Allowed directory prefixes for jail log paths. Any log path must resolve within one of these directories. | +| `BANGUI_TRUSTED_PROXIES` | list | `[]` | Trusted reverse proxy IP addresses or CIDR ranges (e.g., `192.168.1.1,10.0.0.0/8`). Only these sources can set `X-Forwarded-For` and `X-Real-IP`. | + +--- + +## HTTP Client + +These settings control outbound HTTP requests made by the backend (geolocation fallback, blocklist downloads). + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_HTTP_REQUEST_TIMEOUT_SECONDS` | float | `20.0` | Maximum total time for an outbound HTTP request. Must be ≥ 0. | +| `BANGUI_HTTP_CONNECT_TIMEOUT_SECONDS` | float | `5.0` | Maximum time to establish a TCP connection. Must be ≥ 0. | +| `BANGUI_HTTP_MAX_CONNECTIONS` | int | `10` | Maximum concurrent outbound HTTP connections. Must be ≥ 1. | +| `BANGUI_HTTP_KEEPALIVE_TIMEOUT_SECONDS` | float | `15.0` | How long idle keepalive connections are retained. Must be ≥ 0. | + +--- + +## Geolocation + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_GEOIP_DB_PATH` | string | `null` | Path to a MaxMind GeoLite2-Country `.mmdb` file. Primary resolver for IP geolocation when set. Download from https://dev.maxmind.com/geoip/geolite2-country. | +| `BANGUI_GEOIP_ALLOW_HTTP_FALLBACK` | bool | `false` | Allow HTTP fallback to `ip-api.com` when the MMDB is unavailable. **Warning**: sends IP addresses unencrypted. Only enable when MMDB cannot be mounted. 
| + +--- + +## Cross-Origin (CORS) + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_CORS_ALLOWED_ORIGINS` | list | `http://localhost:5173,http://127.0.0.1:5173,https://localhost:5173,https://127.0.0.1:5173` | Allowed CORS origins. Comma-separated string or YAML list. Empty list disables CORS. **Never use `"*"` in production** when credentials are enabled. | + +--- + +## Display + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_TIMEZONE` | string | `UTC` | IANA timezone name used when displaying timestamps in the UI (e.g., `America/New_York`, `Europe/London`). | + +--- + +## External Logging + +Enable with `BANGUI_EXTERNAL_LOGGING_ENABLED=true`, then set the provider and provider-specific variables. + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_EXTERNAL_LOGGING_ENABLED` | bool | `false` | Send logs to a centralized logging platform instead of stdout only. | +| `BANGUI_EXTERNAL_LOGGING_PROVIDER` | string | `null` | Logging provider: `datadog`, `papertrail`, or `elasticsearch`. Required when external logging is enabled. | +| `BANGUI_EXTERNAL_LOGGING_BUFFER_SIZE` | int | `1000` | Max log records buffered in memory before dropping oldest. Must be ≥ 10. | +| `BANGUI_EXTERNAL_LOGGING_FLUSH_INTERVAL_SECONDS` | float | `5.0` | Max seconds before flushing a log batch. Must be > 0. | + +### Datadog + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_DATADOG_API_KEY` | string | `null` | Datadog API key. Required when provider is `datadog`. | +| `BANGUI_DATADOG_SITE` | string | `datadoghq.com` | Datadog site: `datadoghq.com` (US) or `datadoghq.eu` (EU). | +| `BANGUI_DATADOG_BATCH_SIZE` | int | `10` | Number of log records per batch. Must be ≥ 1. 
| + +### Papertrail + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_PAPERTRAIL_HOST` | string | `null` | Papertrail host address (e.g., `logs1.papertrailapp.com`). Required when provider is `papertrail`. | +| `BANGUI_PAPERTRAIL_PORT` | int | `null` | Papertrail port. Required when provider is `papertrail`. Range: 1–65535. | +| `BANGUI_PAPERTRAIL_PROGRAM_NAME` | string | `bangui` | Program name in Syslog messages sent to Papertrail. | + +### Elasticsearch + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_ELASTICSEARCH_HOSTS` | list | `[]` | Elasticsearch host URLs (e.g., `http://elasticsearch:9200`). Required when provider is `elasticsearch`. | +| `BANGUI_ELASTICSEARCH_INDEX_PREFIX` | string | `bangui` | Prefix for Elasticsearch indices. | +| `BANGUI_ELASTICSEARCH_BATCH_SIZE` | int | `10` | Number of log documents per batch. Must be ≥ 1. | + +--- + +## Rate Limiting + +Per-IP rate limits applied to API endpoints. + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_RATE_LIMIT_BANS_PER_MINUTE` | int | `100` | Max ban/unban requests per IP per minute. | +| `BANGUI_RATE_LIMIT_BLOCKLIST_IMPORT_PER_HOUR` | int | `100` | Max blocklist import requests per IP per hour. | +| `BANGUI_RATE_LIMIT_CONFIG_UPDATE_PER_MINUTE` | int | `50` | Max config update requests per IP per minute. | + +**Rate limit reset mechanism:** Each limit is applied per-client IP. To bypass the blocklist import rate limit in automated tests (E2E-4), send a unique `X-Forwarded-For` header with each import request — e.g., `X-Forwarded-For: 10.0.0.99`. The header is only honoured when the client IP falls within `BANGUI_TRUSTED_PROXIES`; otherwise the real client IP is used. + +--- + +## Pagination & Display Limits + +Configurable limits that affect API response sizes and data retention. 
+ +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_MAX_PAGE_SIZE` | int | `500` | Maximum records returned per paginated API response. Individual endpoints may further limit this. Must be 1–10000. | +| `BANGUI_PREVIEW_MAX_LINES` | int | `100` | Maximum IP lines returned in a blocklist source preview. Must be ≥ 1. | +| `BANGUI_HISTORY_RETENTION_DAYS` | int | `90` | Number of days historical ban records are retained before archival cleanup. Must be ≥ 1. | + +--- + +## Observability + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `BANGUI_LOG_LEVEL` | string | `info` | Application log level. Valid values: `debug`, `info`, `warning`, `error`, `critical`. | +| `BANGUI_ENABLE_DOCS` | bool | `false` | Enable FastAPI interactive docs at `/api/docs` (Swagger UI) and `/api/redoc` (ReDoc). Enable only in development. | + +--- + +## Quick Reference + +```bash +# Generate a session secret +python -c "import secrets; print(secrets.token_hex(32))" + +# Minimal production .env +BANGUI_SESSION_SECRET= +BANGUI_CORS_ALLOWED_ORIGINS=https://your-frontend.example.com +BANGUI_TIMEZONE=America/New_York +``` + +--- + +## `manual-Jail` Fail2ban Jail (E2E Test Dependency) + +The E2E test **E2E-3** (`e2e/tests/02_ban_records.robot`) writes authentication-failure lines via `Docker/simulate_failed_logins.sh` and asserts that the resulting ban appears in the BanGUI UI. The test depends on the following `manual-Jail` configuration in `Docker/fail2ban-dev-config/fail2ban/jail.d/manual-Jail.conf`: + +| Parameter | Value | Relevance to E2E-3 | +|-----------|-------|---------------------| +| `maxretry` | `3` | Ban triggers after 3 matching lines. `simulate_failed_logins.sh` writes 5 lines by default — enough to trigger the ban reliably. | +| `findtime` | `120` | Time window in seconds during which `maxretry` failures accumulate. | +| `bantime` | `60` | Ban duration in seconds. 
Teardown unbans via `check_ban_status.sh --unban` regardless of bantime. | +| `logpath` | `/remotelogs/bangui/auth.log` | fail2ban reads this path inside the container. `simulate_failed_logins.sh` writes to `Docker/logs/auth.log`, which must be volume-mapped to `/remotelogs/bangui/auth.log`. | +| `backend` | `polling` | fail2ban re-reads the log file on its own interval (not event-driven). A 15 s sleep in the E2E test gives fail2ban time to detect the ban. | +| `ignoreip` | `127.0.0.0/8 ::1 172.16.0.0/12` | Test IP `192.168.100.99` is not ignored. Ensure local overrides do not add this IP to `ignoreip`. | + +**Log path mapping (Docker/Podman compose):** The host file `Docker/logs/auth.log` must be mounted to `/remotelogs/bangui/auth.log` inside the `bangui-fail2ban-dev` container. If the volume mapping is changed, `simulate_failed_logins.sh` will write to a path fail2ban does not watch, and the test will fail at Step 2 with no ban recorded. + +**Test IP:** `192.168.100.99` (private, non-routable address in the RFC 1918 `192.168.0.0/16` range). Safe to use because it is outside all `ignoreip` ranges and unlikely to appear in real traffic. + +**Scheduling note:** The backend does not receive push notifications from fail2ban. `GET /api/bans/active` queries the fail2ban Unix socket directly (on-demand). The history archive is populated by `history_sync`, a periodic job running every 300 s (`HISTORY_SYNC_INTERVAL` in `backend/app/tasks/history_sync.py`). The E2E test uses `GET /api/bans/active` for the API assertion (avoids the archive lag) and the History page with `?page_size=500` for the UI assertion.
+ +--- + +## Cross-References + +- [Deployment.md](./Deployment.md) — Docker configuration, health checks, graceful shutdown +- [Security.md](./Security.md) — Security recommendations and hardening +- [Observability.md](./Observability.md) — Logging, metrics, and monitoring +- [Backend-Development.md](./Backend-Development.md) — Backend coding conventions diff --git a/Docs/DATABASE_SCHEMA.md b/Docs/DATABASE_SCHEMA.md new file mode 100644 index 0000000..9f73d89 --- /dev/null +++ b/Docs/DATABASE_SCHEMA.md @@ -0,0 +1,347 @@ +# Database Schema Documentation + +BanGUI uses two SQLite databases: + +| Database | Purpose | Location | +|---|---|---| +| **BanGUI app DB** | Own configuration, sessions, blocklist sources, import logs, geo cache | `bangui.db` | +| **fail2ban DB** | fail2ban's internal ban/jail data (read-only) | Configured via `FAIL2BAN_DB` env var | + +--- + +## 1. BanGUI Application Schema + +Single source of truth: `backend/app/db.py`. + +### 1.1 `settings` + +Key-value store for application configuration. + +| Column | Type | Constraints | +|---|---|---| +| `id` | INTEGER | PRIMARY KEY AUTOINCREMENT | +| `key` | TEXT | NOT NULL UNIQUE | +| `value` | TEXT | NOT NULL | +| `created_at` | TEXT | NOT NULL DEFAULT ISO 8601 | +| `updated_at` | TEXT | NOT NULL DEFAULT ISO 8601 | + +**Indexes:** PK only. + +**Purpose:** Stores app-wide settings (e.g., timezone, UI preferences). All settings access goes through `settings_repo` / `settings_service`. + +--- + +### 1.2 `sessions` + +Session tokens for web authentication. + +| Column | Type | Constraints | +|---|---|---| +| `id` | INTEGER | PRIMARY KEY AUTOINCREMENT | +| `token_hash` | TEXT | NOT NULL UNIQUE | +| `created_at` | TEXT | NOT NULL DEFAULT ISO 8601 | +| `expires_at` | TEXT | NOT NULL | + +**Indexes:** `idx_sessions_token_hash` (UNIQUE) on `token_hash`. + +**Purpose:** Web session management. Tokens are SHA-256 hashed before storage. Sessions expire and are cleaned up by `session_cleanup` task. 
See `auth_service.py`. + +--- + +### 1.3 `blocklist_sources` + +Blocklist source definitions for the import pipeline. + +| Column | Type | Constraints | +|---|---|---| +| `id` | INTEGER | PRIMARY KEY AUTOINCREMENT | +| `name` | TEXT | NOT NULL | +| `url` | TEXT | NOT NULL UNIQUE | +| `enabled` | INTEGER | NOT NULL DEFAULT 1 (boolean) | +| `created_at` | TEXT | NOT NULL DEFAULT ISO 8601 | +| `updated_at` | TEXT | NOT NULL DEFAULT ISO 8601 | + +**Indexes:** PK only. + +**Purpose:** Defines sources for blocklist imports. See `blocklist_repo`, `blocklist_service`, `blocklist_import_workflow`. + +--- + +### 1.4 `import_log` + +Audit log of individual blocklist import operations. + +| Column | Type | Constraints | +|---|---|---| +| `id` | INTEGER | PRIMARY KEY AUTOINCREMENT | +| `source_id` | INTEGER | REFERENCES `blocklist_sources(id)` ON DELETE RESTRICT | +| `source_url` | TEXT | NOT NULL | +| `timestamp` | INTEGER | NOT NULL (UNIX epoch) | +| `ips_imported` | INTEGER | NOT NULL DEFAULT 0 | +| `ips_skipped` | INTEGER | NOT NULL DEFAULT 0 | +| `errors` | TEXT | | + +**Indexes:** +- `idx_import_log_id_desc` on `(id DESC)` — cursor pagination +- `idx_import_log_source_id_desc` on `(source_id, id DESC)` — filtered pagination + +**Purpose:** Audit trail for imports. `source_id` RESTRICT prevents source deletion when logs exist. See migration 9. + +**Migration 8:** `timestamp` migrated from TEXT ISO 8601 to INTEGER UNIX epoch. + +--- + +### 1.5 `geo_cache` + +Geo-IP lookup cache for ban IP metadata. + +| Column | Type | Constraints | +|---|---|---| +| `ip` | TEXT | PRIMARY KEY | +| `country_code` | TEXT | | +| `country_name` | TEXT | | +| `asn` | TEXT | | +| `org` | TEXT | | +| `cached_at` | TEXT | NOT NULL DEFAULT ISO 8601 | + +**Additional (migration 3):** +| Column | Type | Constraints | +|---|---|---| +| `last_seen` | TEXT | NOT NULL DEFAULT ISO 8601 | + +**Indexes:** PK only. + +**Purpose:** Caches GeoIP results to reduce third-party API calls. 
TTL managed by `geo_cache_cleanup` task. See `geo_cache_repo`, `geo_service`. + +--- + +### 1.6 `history_archive` + +Archived ban/unban history mirrored from fail2ban DB. + +| Column | Type | Constraints | +|---|---|---| +| `id` | INTEGER | PRIMARY KEY AUTOINCREMENT | +| `jail` | TEXT | NOT NULL | +| `ip` | TEXT | NOT NULL | +| `timeofban` | INTEGER | NOT NULL (UNIX epoch) | +| `bancount` | INTEGER | NOT NULL | +| `data` | TEXT | NOT NULL (JSON) | +| `action` | TEXT | NOT NULL CHECK IN ('ban', 'unban') | +| `created_at` | TEXT | NOT NULL DEFAULT ISO 8601 | + +**Constraints:** `UNIQUE(ip, jail, action, timeofban)` prevents duplicate archive rows. + +**Indexes:** +- `idx_history_archive_jail_timeofban` on `(jail, timeofban DESC)` — dashboard filter by jail + time ordering +- `idx_history_archive_timeofban_jail_action` on `(timeofban DESC, jail, action)` — timeline filters +- `idx_history_archive_ip` on `(ip)` — IP prefix/exact searches +- `idx_history_archive_action` on `(action)` — ban/unban filtering + +**Purpose:** Long-term ban history. Synced from fail2ban DB by `history_sync` task. See `history_archive_repo`, `history_service`. + +--- + +### 1.7 `scheduler_lock` + +Database-backed mutex for multi-worker scheduler safety. + +| Column | Type | Constraints | +|---|---|---| +| `id` | INTEGER | PRIMARY KEY CHECK (id = 1) — singleton row | +| `pid` | INTEGER | NOT NULL | +| `hostname` | TEXT | NOT NULL | +| `created_at` | REAL | NOT NULL (UNIX epoch) | +| `heartbeat_at` | REAL | NOT NULL (UNIX epoch) | + +**Indexes:** PK only (singleton constraint). + +**Purpose:** Only one worker process holds the scheduler lock at a time. Lock is heartbeat-renewed by `scheduler_lock_heartbeat` task. Uses `BEGIN IMMEDIATE` transaction to acquire atomically. See `scheduler_lock.py`. + +--- + +### 1.8 `import_runs` + +Tracks unique blocklist imports for idempotent retries. 
+ +| Column | Type | Constraints | +|---|---|---| +| `id` | INTEGER | PRIMARY KEY AUTOINCREMENT | +| `source_id` | INTEGER | NOT NULL REFERENCES `blocklist_sources(id)` ON DELETE CASCADE | +| `content_hash` | TEXT | NOT NULL | +| `status` | TEXT | NOT NULL CHECK IN ('pending', 'completed', 'failed') | +| `imported_count` | INTEGER | NOT NULL DEFAULT 0 | +| `skipped_count` | INTEGER | NOT NULL DEFAULT 0 | +| `error_message` | TEXT | | +| `created_at` | TEXT | NOT NULL DEFAULT ISO 8601 | +| `updated_at` | TEXT | NOT NULL DEFAULT ISO 8601 | + +**Constraints:** `UNIQUE(source_id, content_hash)` — same source + content = same import run. + +**Indexes:** `idx_import_runs_source_status` on `(source_id, status)` — lookup completed imports by source. + +**Purpose:** Prevents duplicate IP bans on import crash/retry. See migration 6 and `blocklist_import_workflow`. + +--- + +### 1.9 `schema_migrations` + +Tracks applied schema versions. + +| Column | Type | Constraints | +|---|---|---| +| `version` | INTEGER | PRIMARY KEY | +| `migrated_at` | TEXT | NOT NULL DEFAULT ISO 8601 | + +**Indexes:** PK only. + +**Purpose:** Idempotent schema migration tracker. Records each applied version number. See `init_db()` and `_migrate_schema()`. + +--- + +## 2. Fail2ban Database Schema + +Read-only access via `fail2ban_db_repo`. Fail2ban manages this DB; BanGUI mirrors data into `history_archive`. + +### 2.1 `fail2banDb` + +| Column | Type | Constraints | +|---|---|---| +| `version` | INTEGER | | + +Single row tracking DB schema version. + +--- + +### 2.2 `jails` + +| Column | Type | Constraints | +|---|---|---| +| `name` | TEXT | NOT NULL UNIQUE | +| `enabled` | INTEGER | NOT NULL DEFAULT 1 | + +**Indexes:** `jails_name` on `(name)`. 
+ +--- + +### 2.3 `logs` + +| Column | Type | Constraints | +|---|---|---| +| `jail` | TEXT | NOT NULL FK → `jails(name)` ON DELETE CASCADE | +| `path` | TEXT | | +| `firstlinemd5` | TEXT | | +| `lastfilepos` | INTEGER | DEFAULT 0 | +| `UNIQUE(jail, path)` | | | +| `UNIQUE(jail, path, firstlinemd5)` | | | + +**Indexes:** `logs_path` on `(path)`, `logs_jail_path` on `(jail, path)`. + +--- + +### 2.4 `bans` + +| Column | Type | Constraints | +|---|---|---| +| `jail` | TEXT | NOT NULL FK → `jails(name)` | +| `ip` | TEXT | | +| `timeofban` | INTEGER | NOT NULL | +| `bantime` | INTEGER | NOT NULL | +| `bancount` | INTEGER | NOT NULL DEFAULT 1 | +| `data` | JSON | | + +**Indexes:** +- `bans_jail_timeofban_ip` on `(jail, timeofban)` +- `bans_jail_ip` on `(jail, ip)` +- `bans_ip` on `(ip)` + +--- + +### 2.5 `bips` + +Backup IPs table (ban backup). + +| Column | Type | Constraints | +|---|---|---| +| `ip` | TEXT | NOT NULL | +| `jail` | TEXT | NOT NULL FK → `jails(name)` | +| `timeofban` | INTEGER | NOT NULL | +| `bantime` | INTEGER | NOT NULL | +| `bancount` | INTEGER | NOT NULL DEFAULT 1 | +| `data` | JSON | | +| PRIMARY KEY | `(ip, jail)` | | + +**Indexes:** `bips_timeofban` on `(timeofban)`, `bips_ip` on `(ip)`. + +--- + +## 3. Relationships and Constraints + +``` +blocklist_sources (1) ──(id)──→ import_log.source_id [RESTRICT on delete] + └──→ import_runs.source_id [CASCADE on delete] + +settings: standalone (key-value, no FK) +sessions: standalone (token hash, no FK) +geo_cache: standalone (IP → geo data, no FK) +history_archive: standalone (archived ban history, no FK) +scheduler_lock: singleton row (id=1), no FK +schema_migrations: standalone (migration tracking, no FK) +``` + +Fail2ban tables are separate and read-only from BanGUI's perspective. + +--- + +## 4. 
Indexes Summary + +| Table | Index | Columns | +|---|---|---| +| `sessions` | `idx_sessions_token_hash` | `token_hash` UNIQUE | +| `import_log` | `idx_import_log_id_desc` | `id DESC` | +| `import_log` | `idx_import_log_source_id_desc` | `source_id, id DESC` | +| `import_runs` | `idx_import_runs_source_status` | `source_id, status` | +| `history_archive` | `idx_history_archive_jail_timeofban` | `jail, timeofban DESC` | +| `history_archive` | `idx_history_archive_timeofban_jail_action` | `timeofban DESC, jail, action` | +| `history_archive` | `idx_history_archive_ip` | `ip` | +| `history_archive` | `idx_history_archive_action` | `action` | +| `jails` | `jails_name` | `name` | +| `logs` | `logs_path` | `path` | +| `logs` | `logs_jail_path` | `jail, path` | +| `bans` | `bans_jail_timeofban_ip` | `jail, timeofban` | +| `bans` | `bans_jail_ip` | `jail, ip` | +| `bans` | `bans_ip` | `ip` | +| `bips` | `bips_timeofban` | `timeofban` | +| `bips` | `bips_ip` | `ip` | + +--- + +## 5. Migration History + +| Version | Description | +|---|---| +| 1 | Initial schema: `settings`, `sessions`, `blocklist_sources`, `import_log`, `geo_cache`, `history_archive`, `schema_migrations` | +| 2 | Hash session tokens (`token_hash` column). Invalidates all existing sessions. | +| 3 | Add `last_seen` to `geo_cache` for retention policy. | +| 4 | Add `scheduler_lock` table for multi-worker scheduler mutex. | +| 5 | Add indexes to `history_archive` for query performance (4 indexes). | +| 6 | Add `import_runs` table for idempotent import tracking. | +| 7 | Add indexes to `import_log` for cursor-based pagination. | +| 8 | Migrate `import_log.timestamp` from TEXT ISO 8601 → INTEGER UNIX epoch. | +| 9 | Change `import_log.source_id` FK to `ON DELETE RESTRICT` (prevents orphaned logs). Recreate table with new FK semantics. | + +**Current schema version:** 9 (`_CURRENT_SCHEMA_VERSION` in `db.py`). + +--- + +## 6. 
Performance Notes + +- **WAL mode** (`PRAGMA journal_mode=WAL`) — concurrent reads allowed, better write performance under concurrency. +- **Foreign keys enforced** (`PRAGMA foreign_keys=ON`) — data integrity at DB level. +- **Busy timeout** 5000 ms — prevents "database is locked" errors under contention. +- **`history_archive` indexes** — tuned for dashboard filter + time ordering + pagination. See migration 5 and `PERFORMANCE.md`. +- **`import_log` indexes** — tuned for cursor-based pagination (newest-first by id). See migration 7. +- **`geo_cache` PK on `ip`** — O(1) lookup for geo enrichment on ban events. +- **`scheduler_lock` singleton** (`CHECK (id = 1)`) — trivial lock existence check. + +For detailed query patterns and benchmarks, see `Docs/PERFORMANCE.md`. diff --git a/Docs/DOMAIN_MODELS.md b/Docs/DOMAIN_MODELS.md new file mode 100644 index 0000000..6445cf1 --- /dev/null +++ b/Docs/DOMAIN_MODELS.md @@ -0,0 +1,124 @@ +# Domain Models — Reference Guide + +This document explains the domain model pattern used in BanGUI's backend and where to find examples. + +--- + +## What Are Domain Models? + +Domain models (e.g., `DomainActiveBanList`, `DomainJailConfig`) are **frozen dataclasses** that represent pure business logic. They are defined in `app/models/{domain}_domain.py` and are **returned by services**. + +Response models (e.g., `ActiveBanListResponse`, `JailConfigResponse`) are **Pydantic models** defined in `app/models/{domain}.py`. They are used **only by routers** for HTTP serialization. + +--- + +## Why This Separation? 
+ +``` +Service (returns domain model) + ↓ +Router (converts domain → response via mapper) + ↓ +HTTP Response (Pydantic model) +``` + +**Benefits:** +- Domain logic evolves without affecting API shape +- Services are reusable across different frontends (GraphQL, gRPC, CLI) +- Testing is simpler (no Pydantic overhead) +- Changes to endpoint responses don't require service changes + +--- + +## Existing Domain Models + +| Domain | Domain Model(s) | Mapper Module | +|--------|----------------|---------------| +| **Ban** | `DomainActiveBanList`, `DomainActiveBan`, `DomainBansByCountry` | `ban_mappers.py` | +| **Jail** | `DomainJailList`, `DomainJailDetail`, `DomainJailBannedIps`, `DomainActiveBan` | `jail_mappers.py` | +| **Config** | `DomainJailConfig`, `DomainJailConfigList`, `DomainGlobalConfig`, `DomainServiceStatus`, `DomainBantimeEscalation`, `DomainFilterConfig`, `DomainFilterList`, `DomainRegexTest`, `DomainMapColorThresholds` | `config_mappers.py` | +| **History** | `DomainHistoryList`, `DomainHistoryBanItem`, `DomainIpDetail`, `DomainIpTimelineEvent` | `history_mappers.py` | +| **Server** | `DomainServerSettings`, `DomainServerSettingsResult` | `server_mappers.py` | +| **Blocklist** | `DomainBlocklistSource`, `DomainImportLogEntry`, `DomainImportLogList`, `DomainImportSourceResult`, `DomainImportRunResult`, `DomainPreviewResult`, `DomainScheduleConfig`, `DomainScheduleInfo` | `blocklist_mappers.py` | + +--- + +## The Pattern — Step by Step + +### Step 1: Define Domain Model in `app/models/{domain}_domain.py` + +```python +from dataclasses import dataclass + +@dataclass(frozen=True) +class DomainJailConfig: + """Configuration snapshot of a single jail (domain model).""" + + name: str + ban_time: int + max_retry: int + find_time: int + fail_regex: list[str] + actions: list[str] # ← last field without a default; placing it after a defaulted field raises TypeError + date_pattern: str | None = None # ← all fields with defaults come AFTER + log_encoding: LogEncoding = "UTF-8" +``` + +**⚠️ Field Order
Rule:** All fields without defaults must appear before all fields with defaults. + +### Step 2: Add Mapper in `app/mappers/{domain}_mappers.py` + +```python +def map_domain_jail_config_to_response(domain: DomainJailConfig) -> JailConfig: + """Convert domain jail config to response model.""" + return JailConfig( + name=domain.name, + ban_time=domain.ban_time, + ... + ) +``` + +### Step 3: Service Returns Domain Model + +```python +# In app/services/jail_service.py +from app.models.config_domain import DomainJailConfig, DomainJailConfigList + +async def get_jail_config(socket_path: str, name: str) -> DomainJailConfig: + ... + return DomainJailConfig(...) # ← return domain model +``` + +### Step 4: Router Uses Mapper at Boundary + +```python +# In app/routers/jail_config.py +from app.mappers import config_mappers + +@router.get("/{name}", response_model=JailConfigResponse) +async def get_jail_config(...) -> JailConfigResponse: + domain_result = await config_service.get_jail_config(socket_path, name) + return config_mappers.map_domain_jail_config_to_response(domain_result) +``` + +--- + +## Reference Implementation + +`ban_service.py` + `ban_mappers.py` is the canonical example of the correct pattern. Study it first when adding a new service. + +--- + +## Common Issues + +### Field Ordering Error + +``` +TypeError: non-default argument 'actions' follows default argument +``` + +**Fix:** Move all fields with defaults (`field: T | None = None`) after all fields without defaults. + +### Forgetting the Mapper + +If you refactor a service to return a domain model but forget to update the router, you'll get a type mismatch at the boundary. Always update router + service together. 
diff --git a/Docs/Deployment.md b/Docs/Deployment.md new file mode 100644 index 0000000..4e55a52 --- /dev/null +++ b/Docs/Deployment.md @@ -0,0 +1,1071 @@ +# Deployment Guide + +## Graceful Shutdown + +BanGUI implements graceful shutdown to ensure in-flight operations complete before the process exits. This prevents: +- Incomplete blocklist imports leaving stale data +- Interrupted ban requests +- Corrupted background job states +- Unclean database connection closures + +### How It Works + +1. **SIGTERM received** — Docker sends SIGTERM when `docker stop` is called +2. **Uvicorn catches SIGTERM** — Notifies the FastAPI lifespan handler +3. **Lifespan shutdown begins** — Scheduler stops accepting new jobs +4. **In-flight tasks drain** — Up to 25 seconds for running jobs to complete +5. **Resources cleaned up** — HTTP session, external logging, scheduler lock, DB connection + +### Docker Configuration + +```yaml +backend: + stop_grace_period: 30s # Give lifespan 30s to complete before SIGKILL +``` + +The `stop_grace_period` of 30s gives the Python code a 25s graceful timeout, leaving a 5s safety margin before Docker sends SIGKILL. 
+ +### Shutdown Sequence + +| Step | Action | Timeout | +|------|--------|---------| +| 1 | Scheduler stops accepting new jobs | Immediate | +| 2 | Wait for pending background tasks | 25s max | +| 3 | Close HTTP session | Immediate | +| 4 | Flush external logging handler | Immediate | +| 5 | Release scheduler lock | Immediate | +| 6 | Close database connection | Immediate | + +### Background Tasks That Drain + +- Blocklist imports +- Geo IP cache resolutions +- History sync operations +- Geo cache cleanup +- Geo cache flush +- Session cleanup +- Rate limiter cleanup +- Scheduler lock heartbeat + +### Monitoring Shutdown + +Logs during shutdown: + +``` +bangui_shutting_down timeout_seconds=25.0 +scheduler_stopped_accepting_jobs +waiting_for_pending_tasks count=3 timeout_seconds=25.0 +pending_tasks_completed +http_session_closed +external_logging_shutdown_complete +scheduler_lock_released +bangui_shut_down +``` + +If tasks exceed the timeout: +``` +pending_tasks_timeout cancelled_count=3 +``` + +### Rolling Deployments + +During rolling deployments: +1. Old instance releases scheduler lock immediately on shutdown +2. New instance acquires lock without waiting for TTL expiry +3. 
Zero downtime for background job execution + +--- + +## Health Checks + +The backend container includes **three** health check endpoints: + +### Combined Health Check — `GET /api/v1/health` + +Reports application and component status for Docker HEALTHCHECK and legacy +monitoring integration: + +- **HTTP 200** with `{"status": "ok", ...}` — all components healthy +- **HTTP 200** with `{"status": "degraded", ...}` — some components unhealthy (e.g., database error) but fail2ban reachable +- **HTTP 503** with `{"status": "unavailable", ...}` — fail2ban is unreachable (backend will restart) + +**Component checks performed:** + +| Component | Check | Notes | +|---|---|---| +| fail2ban | Socket ping via cached status | Returns 503 when offline | +| database | Opens and closes a test connection | Returns degraded when failing | +| scheduler | `scheduler.running` attribute | Returns degraded when stopped | +| cache | Session cache presence | Returns degraded when not initialised | +| external_logging | Handler initialization status | Returns degraded when failed | + +### Kubernetes Probes — Liveness and Readiness + +Two separate probes following Kubernetes conventions: + +| Endpoint | Purpose | HTTP Code | Kubernetes Action | +|---|---|---|---| +| `GET /api/v1/health/live` | Process alive | Always 200 | Restart container if non-2xx | +| `GET /api/v1/health/ready` | All subsystems ready | 200 (all pass) / 503 (any fail) | Stop routing traffic if non-2xx | + +**`/health/live` — Liveness probe:** +Returns 200 when the Python process and event loop are responsive. No subsystem checks are performed — this endpoint is always fast. Use for Kubernetes `livenessProbe`. + +**`/health/ready` — Readiness probe:** +Verifies all critical sub-systems are reachable before routing traffic. Returns 200 only when all pass; returns 503 with a JSON body listing every failed check otherwise. 
+ +| Subsystem | Check | Timeout | +|---|---|---| +| database | Opens and closes a test connection | 2 s | +| fail2ban | Socket reachability via cached server status | N/A (instant) | +| config_dir | Config directory read access (`os.R_OK`) | 2 s | +| scheduler | `scheduler.running` attribute | N/A (instant) | + +**Readiness response example (all healthy — HTTP 200):** +```json +{ + "status": "ok", + "checks": [ + {"name": "database", "healthy": true}, + {"name": "fail2ban", "healthy": true}, + {"name": "config_dir", "healthy": true}, + {"name": "scheduler", "healthy": true} + ], + "failed_count": 0 +} +``` + +**Readiness response example (fail2ban offline — HTTP 503):** +```json +{ + "status": "error", + "checks": [ + {"name": "database", "healthy": true}, + {"name": "fail2ban", "healthy": false, "message": "Socket not reachable"}, + {"name": "config_dir", "healthy": true}, + {"name": "scheduler", "healthy": true} + ], + "failed_count": 1 +} +``` + +**Why separate liveness and readiness?** +Liveness (`/health/live`) must be cheap — a slow or hanging liveness probe causes Kubernetes to restart a perfectly healthy container. Readiness (`/health/ready`) can afford to check sub-systems because traffic is only held back temporarily while a pod recovers. + +**Docker Health Check:** + +The Dockerfile includes a HEALTHCHECK that queries the endpoint. Docker interprets HTTP 503 as unhealthy and restarts the container after 3 consecutive failures (90 seconds by default). + +**Why 503 for offline fail2ban?** + +If fail2ban goes offline but the backend always returns 200, Docker treats the container as healthy. This masks infrastructure failures. By returning 503 when fail2ban is unreachable, orchestration tools (Docker, Kubernetes, Docker Swarm) automatically restart the backend container until fail2ban recovers. 
+ +**Docker Compose health check parameters:** + +| Parameter | Value | Rationale | +|---|---|---| +| `interval` | 30s | Balance between responsiveness and load | +| `timeout` | 10s | Allows for slow probe on busy system | +| `retries` | 3 | ~90 seconds before restart (3 × 30s) | +| `start_period` | 40s | Allows app and fail2ban to fully start | + +--- + + +## Rate Limiting + +Rate limiting is enforced at two levels: + +1. **Global middleware** — Per-IP request rate limit across all endpoints (default: 200 requests/minute per IP) +2. **Per-bucket limits** — Stricter limits on specific operations: + + | Bucket | Limit | Window | Purpose | + |--------|-------|--------|---------| + | `bans:ban` | 100/min | 60s | Ban operations | + | `bans:unban` | 100/min | 60s | Unban operations | + | `blocklist:import` | 10/hour | 3600s | Import operations | + | `config:update` | 50/min | 60s | Config write operations | + | `jail:*` | 100/min | 60s | Jail management | + | `filter:*` | 50/min | 60s | Filter management | + | `action:*` | 50/min | 60s | Action management | + +### Process-Local Scope + +**Current implementation is process-local.** Each worker maintains independent in-memory counters. In a multi-worker deployment (N workers), an attacker can send up to N × limit requests before any single worker triggers a block — effectively multiplying the allowed request rate by the number of workers. + +**Short-term mitigation:** The scheduler lock enforces single-worker mode. The startup warning log (`rate_limiting_process_local_only`) documents this constraint. Deploy with one worker. + +**Long-term solution:** Replace the in-process GlobalRateLimiter with a Redis-backed adapter. The `check_allowed()` and `check_allowed_for_bucket()` interfaces are designed for a drop-in replacement using atomic `INCR` + `EXPIRE` semantics — no changes needed in middleware or router code. 
+ +### Redis Migration (Future) + +When migrating to Redis, replace the in-memory deque store with: + +```python +# Atomic increment with expiry (pseudo-code) +count = redis.incr(f"rl:{ip}") +if count == 1: # First request, set expiry + redis.expire(f"rl:{ip}", window_seconds) +if count > max_requests: + return False, redis.ttl(f"rl:{ip}") # retry after the remaining window time +return True, 0 +``` + +The bucket variants use `INCR` + `EXPIRE` on `rl:{bucket}:{ip}` keys. Note that `INCR` + `EXPIRE` implements a fixed-window counter, which only approximates the current in-memory sliding window (a burst straddling a window boundary can briefly exceed the limit); it does, however, provide shared state across all workers. + +### Monitoring + +Check logs for these events: +- `global_rate_limit_exceeded` — Global middleware blocked a request (WARNING) +- `rate_limiting_process_local_only` — Startup warning about multi-worker limitation (WARNING) +- `rate_limiter_cleanup` — Periodic cleanup of expired entries (DEBUG) + +--- + +## CORS Configuration + +Cross-Origin Resource Sharing (CORS) must be explicitly configured when the frontend and backend are served from different origins. + +### Development + +By default, the backend allows requests from common localhost development origins: + +- `http://localhost:5173` +- `http://127.0.0.1:5173` +- `https://localhost:5173` +- `https://127.0.0.1:5173` + +No additional configuration is needed for local development — just run the frontend and backend normally.
+ +### Production + +In production, override the default with your actual frontend origin(s): + +**Docker Compose:** +```yaml +environment: + BANGUI_CORS_ALLOWED_ORIGINS: "https://example.com,https://www.example.com" +``` + +**Environment File (.env):** +``` +BANGUI_CORS_ALLOWED_ORIGINS=https://example.com,https://www.example.com +``` + +**Multiple Origins:** +Separate multiple allowed origins with commas (no spaces): +``` +BANGUI_CORS_ALLOWED_ORIGINS=https://example.com,https://app.example.com,https://admin.example.com +``` + +**Disable CORS:** +To disable CORS entirely (e.g., when the frontend is served from the same origin as the backend): +``` +BANGUI_CORS_ALLOWED_ORIGINS= +``` + +### Security Considerations + +- **Always specify exact origins** — never use wildcard `*` in production, especially with `allow_credentials=true` (credentials mode is required for the session cookie). +- **Use HTTPS in production** — the backend enforces the Secure cookie flag, which requires HTTPS (or localhost for development). +- **Validate in reverse proxy** — if using Nginx or a CDN reverse proxy, validate the `Origin` header before forwarding requests to ensure only legitimate origins reach the backend. 
+ +### Troubleshooting + +| Symptom | Cause | Solution | +|---------|-------|----------| +| `Access-Control-Allow-Origin` header missing from response | CORS not configured or origin not whitelisted | Check `BANGUI_CORS_ALLOWED_ORIGINS` and ensure your frontend origin is included | +| Browser blocks requests with CORS error | Credentials mode enabled but origin not exactly whitelisted | Ensure `BANGUI_CORS_ALLOWED_ORIGINS` includes the exact origin (protocol + domain + port) of your frontend | +| Works in development but fails in production | Default localhost origins used instead of production frontend domain | Override `BANGUI_CORS_ALLOWED_ORIGINS` in production environment | + +--- + +## Scheduler Lock + +In multi-instance deployments (e.g., Kubernetes, Docker Swarm), the scheduler lock prevents duplicate execution of background tasks by ensuring only one instance runs the scheduler at a time. + +### How It Works + +The lock is stored in the SQLite database and enforced via: + +1. **Lock Acquisition** — At startup, each instance tries to insert a lock record. Only one succeeds; the others abort startup with a clear error message. +2. **Heartbeat** — The lock-holding instance sends a heartbeat every 5 seconds to prove it's still alive. +3. **Stale Lock Cleanup** — On startup, any lock older than 60 seconds (without a heartbeat) is automatically deleted, allowing recovery from instance crashes. + +### Configuration + +| Parameter | Value | Rationale | +|-----------|-------|-----------| +| **Heartbeat Interval** | 5 seconds | Allows ~12 missed heartbeats before lock expires | +| **Lock TTL** | 60 seconds | Time before a lock without heartbeat is considered abandoned | +| **Min Safe Ratio** | 12x (TTL / interval) | Robust protection against temporary delays or high load | + +With a 60-second TTL and 5-second heartbeat interval, the lock survives even if the instance becomes unresponsive for up to ~55 seconds.
This provides strong protection against false positives while still detecting genuine crashes. + +### Monitoring + +Check logs for these key events: + +- `scheduler_lock_acquired` — Lock successfully acquired at startup (INFO) +- `scheduler_lock_heartbeat_updated` — Heartbeat successfully updated (DEBUG) +- `scheduler_lock_heartbeat_failed` — Heartbeat update failed; lock may be lost (WARNING) +- `scheduler_lock_heartbeat_timeout` — Heartbeat exceeded 5-second timeout (ERROR) +- `scheduler_lock_held_by_other_instance` — Another instance holds the lock (WARNING at startup) + +### Troubleshooting: "Blocklist import runs twice" + +**Symptom:** Blocklist import task executes simultaneously in two instances, causing duplicate entries or data corruption. + +**Cause:** The scheduler lock was released prematurely (e.g., instance crash, database timeout) while a task was still running. + +**Solution:** + +1. **Check heartbeat timing** — Ensure the instance isn't hanging for >60 seconds (monitor CPU/memory/disk). +2. **Verify database health** — Run `SELECT * FROM scheduler_lock;` to see if a stale lock exists. If present, delete it: `DELETE FROM scheduler_lock;` +3. **Review logs** — Look for `scheduler_lock_heartbeat_failed` or `scheduler_lock_heartbeat_timeout` errors in the time window when duplication occurred. +4. **Increase resource limits** — If the backend is memory/CPU constrained, increase limits in `docker-compose.yml` to prevent slowdowns that trigger false lock timeouts. +5. **Check database performance** — Slow database queries can delay heartbeat updates. Run `PRAGMA integrity_check;` to check for corruption. + +If duplication occurs frequently, consider migrating to Redis-backed locking (see Advanced section below) for higher reliability. + +### Troubleshooting: "Scheduler stops completely" + +**Symptom:** Background tasks (blocklist import, geo cache cleanup, history sync, session cleanup) stop running. No errors in logs but tasks don't execute. 
+ +**Cause:** Instance holding the scheduler lock crashed without releasing it, or heartbeat is failing silently. + +**Diagnosis:** + +1. Check if lock exists: `SELECT * FROM scheduler_lock;` +2. If lock exists with a PID that no longer runs, it's orphaned +3. Check logs for `scheduler_lock_heartbeat_lost` warnings + +**Solution:** + +1. **Clear the orphaned lock:** `DELETE FROM scheduler_lock;` +2. **Restart the instance** that should hold the lock +3. Verify lock acquisition: `grep "scheduler_lock_acquired" logs` +4. If heartbeat keeps failing, check database latency (SQLite heartbeats should be <100ms) + +**Prevention:** + +- Monitor `scheduler_lock_heartbeat_lost` events — more than 3 in an hour indicates a problem +- Ensure database I/O is not bottlenecked (SSD recommended for SQLite) +- Consider reducing heartbeat interval if network latency causes false timeouts + +### Advanced: Migrating to Redis + +For very high-traffic deployments with strict data consistency requirements, you can replace the SQLite-backed lock with Redis: + +- **Why:** Redis is single-threaded and atomic by design; clock skew and timeout issues are eliminated. +- **How:** Install `redlock-py` or `aioredis`, replace `scheduler_lock.py` with a Redis implementation, update heartbeat interval to 2-3 seconds. +- **Trade-off:** Adds a Redis dependency but eliminates database lock contention and provides microsecond-precision atomicity. + +This is not required for typical deployments but is recommended if you see frequent scheduler conflicts in logs. + +--- + +All containers have hard limits (max usage) and soft reservations (guaranteed allocation). 
This ensures: +- **Isolation**: A misbehaving container cannot crash others or the host +- **Predictability**: Reservations guarantee minimum resources even under load +- **Efficiency**: Unused reserved capacity can be borrowed by other containers + +### Container Resource Limits + +| Container | Limit CPU | Limit Memory | Reserved CPU | Reserved Memory | Purpose | +|-----------|-----------|--------------|--------------|-----------------|---------| +| **fail2ban** | 0.5 | 128M | 0.1 | 64M | Monitors logs, bans IPs—typically idle | +| **backend** | 2.0 | 512M | 1.0 | 256M | Core app: database, fail2ban API, config management | +| **frontend** | 0.5 | 128M | 0.25 | 64M | Nginx: serves SPA + API proxy | + +### Rationale + +- **fail2ban**: Lightweight log monitoring. Occasionally CPU spikes during ban processing but memory usage is minimal. +- **backend**: Heavy lifting—Python runtime, SQLite database, background jobs. May need extra memory for large blocklists. Reservation of 1.0 CPU ensures responsive API even when frontend is busy. +- **frontend**: Nginx is efficient. Limit of 0.5 CPU and 128M memory is more than sufficient for reverse proxy duties. + +--- + +## Memory Considerations + +### Backend Memory Requirements + +The backend typically runs in 256–512M under normal load. Memory usage depends on: +- **Blocklist size**: Large blocklists (>1M entries) require more heap space +- **Cache warmth**: First query after startup may require more memory as caches fill +- **Concurrent connections**: Each active user session uses a small amount of memory + +**Tuning:** If you see OOM kills in logs, increase backend limits and reservations (e.g., 1024M limit). Test under realistic load before finalizing. + +### Frontend Memory Usage + +Nginx is typically <50M. 
If you see memory pressure on frontend, check for: +- Misconfigured cache headers on static assets +- Large log volumes (nginx access logs) + +--- + +## Docker Swarm & Kubernetes + +For production deployments using orchestration platforms: + +### Docker Swarm + +The `deploy` sections in `docker-compose.yml` are compatible with `docker stack deploy`: + +```bash +docker stack deploy -c Docker/docker-compose.yml bangui +``` + +Swarm respects the same `limits` and `reservations` fields. + +### Kubernetes + +For Kubernetes, translate resource constraints to equivalent `resources` fields in your deployment manifests: + +```yaml +containers: + - name: backend + image: git.lpl-mind.de/lukas.pupkalipinski/bangui/backend:latest + resources: + limits: + cpu: "2" + memory: "512Mi" + requests: + cpu: "1" + memory: "256Mi" +``` + +Kubernetes equivalent mappings: +- Docker `deploy.limits` → Kubernetes `resources.limits` +- Docker `deploy.reservations` → Kubernetes `resources.requests` + +--- + +## Monitoring Resource Usage + +### Docker Compose (Development) + +```bash +docker stats +``` + +Shows real-time CPU and memory usage for all running containers. + +### Production (Docker Swarm / Kubernetes) + +Use native monitoring: +- **Docker Swarm**: Prometheus + Grafana +- **Kubernetes**: Metrics Server + dashboard or Prometheus + +--- + +## Configuration + +All runtime settings are documented in [CONFIGURATION.md](./CONFIGURATION.md), including database, session, fail2ban, HTTP client, geolocation, CORS, logging, rate limiting, and observability options. + +--- + +## Environment Variables + +Resource limits are configured in `Docker/docker-compose.yml` and cannot be overridden via environment variables. To adjust limits: + +1. Edit `Docker/docker-compose.yml` +2. Modify the `deploy.limits` and `deploy.reservations` sections +3. 
Restart containers: `make down && make up` + +--- + +## Troubleshooting + +| Issue | Symptom | Solution | +|-------|---------|----------| +| Backend OOM kills | "Exit code 137" in logs | Increase backend `memory` limit | +| Throttling | CPU at 100%, requests slow | Increase CPU limit or optimize code | +| Service startup timeout | Containers not becoming "healthy" | Increase reservation to guarantee capacity at startup | +| Host unresponsive | System-wide lag | Reduce container limits to prevent host starvation | + +--- + +## Disaster Recovery + +### Database Migration Failures + +If a migration fails mid-transaction, the application refuses to start. This is intentional — it prevents inconsistent schema states. + +**Diagnosis:** + +1. Check current schema version: + ```bash + sqlite3 /var/lib/bangui/bangui.db "SELECT MAX(version) FROM schema_migrations;" + ``` + +2. Check which tables exist: + ```bash + sqlite3 /var/lib/bangui/bangui.db "SELECT name FROM sqlite_master WHERE type='table';" + ``` + +3. Check application logs for the specific error. + +**Recovery Options:** + +- **Automatic rollback**: Next startup re-applies the same migration from scratch +- **Manual completion**: Apply the migration manually, then insert the version record. Run everything in a single `sqlite3` session so that `BEGIN`/`COMMIT` wrap the whole change (separate `sqlite3` invocations do not share a transaction): + ```bash + sqlite3 /var/lib/bangui/bangui.db <<'SQL' + BEGIN IMMEDIATE; + -- Run your SQL here + INSERT INTO schema_migrations (version) VALUES (<version>); -- replace <version> with the migration number + COMMIT; + SQL + ``` +- **Full reset** (development only): `rm bangui.db bangui.db-wal bangui.db-shm` + +**Prevention:** + +- Never modify `bangui.db` manually while an instance is running +- Always back up before major migrations +- Monitor startup logs for `migrating_database_schema` events + +### Orphaned WAL Files + +After crashes, SQLite WAL mode may leave orphaned `-wal` files. The database auto-recovers on next open. 
If you see WAL-related errors: + +```bash +# Check for orphaned WAL files +ls -la /var/lib/bangui/bangui.db* + +# Force checkpoint to merge WAL into main database +sqlite3 /var/lib/bangui/bangui.db "PRAGMA wal_checkpoint(FULL);" +``` + +See `Docs/DATABASE_MIGRATIONS.md` for full recovery procedures. + +--- + +## Next Steps + +- **Development**: Run `make up` to start with default limits +- **Staging**: Test with realistic data volumes and monitor resource usage +- **Production**: Adjust limits based on observed usage patterns, then commit changes + +--- + +## Security Best Practices + +### Secrets Management + +**Never hard-code secrets.** All secrets must be injected at runtime via environment variables. + +| Secret | Purpose | Generation | +|--------|---------|------------| +| `BANGUI_SESSION_SECRET` | Signs session cookies | `python -c 'import secrets; print(secrets.token_hex(32))'` | +| fail2ban credentials | jail config access | From fail2ban configuration | + +- Store secrets in a secrets manager (e.g., Docker secrets, Kubernetes Secrets, HashiCorp Vault) +- Rotate `BANGUI_SESSION_SECRET` periodically — sessions become invalid, users must re-login +- Never log or expose session secrets + +### Container Security Hardening + +**Non-root user**: Backend runs as `bangui:bangui` (UID 1000). Frontend runs as nginx default. This limits container breakout damage. + +**Filesystem permissions**: +```bash +# Data directory (SQLite DB) — only bangui user rw +chmod 700 /data +chown 1000:1000 /data + +# Config directory — read-only for backend (it reads fail2ban config) +# Write access only for config management operations via BanGUI +chmod 755 /config +``` + +**Capabilities**: fail2ban container requires `NET_ADMIN` and `NET_RAW` for raw socket manipulation and iptables interaction. No additional capabilities needed for app containers. + +**No privileged mode**: BanGUI containers must not run `--privileged`. 
The fail2ban container needs only specific capabilities, not full host access. + +### Network Security + +- **Internal network only**: All BanGUI containers communicate on `bangui-net`. Only the frontend port (default 8080) is exposed to the host. +- **fail2ban socket**: Mounted read-only (`ro`) from host — backend reads status only +- **fail2ban config**: Mounted read-write — BanGUI modifies jail configurations as requested +- **Drop traffic between containers**: Use Docker network isolation to prevent lateral movement: + ```yaml + networks: + bangui-net: + driver: bridge + internal: false # Allow external only for frontend + ``` + +### TLS / HTTPS + +BanGUI does not terminate TLS. Handle TLS at the reverse proxy or load balancer level: + +**Nginx (existing frontend container)**: +```nginx +server { + listen 443 ssl http2; + server_name bangui.example.com; + + ssl_certificate /etc/ssl/certs/bangui.crt; + ssl_certificate_key /etc/ssl/private/bangui.key; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256; + + # Proxy to existing frontend container + location / { + proxy_pass http://bangui-frontend:80; + ... + } +} +``` + +**Security headers** (already in nginx.conf): +- CSP, X-Frame-Options, X-Content-Type-Options, Referrer-Policy, Permissions-Policy +- Uncomment HSTS header when HTTPS is fully configured + +**HTTP to HTTPS redirect**: Add in your TLS terminator: +```nginx +server { + listen 80; + server_name bangui.example.com; + return 301 https://$host$request_uri; +} +``` + +### Dependency Scanning + +Scan base images for vulnerabilities regularly: + +```bash +# Trivy (Docker/Podman compatible) +trivy image python:3.12-slim +trivy image nginx:1.27-alpine +trivy image node:22-alpine + +# CI integration +trivy image --exit-code 1 --severity HIGH,CRITICAL git.lpl-mind.de/lukas.pupkalipinski/bangui/backend:latest +``` + +Update base images quarterly or when CVEs are published. 
+ +### Rate Limiting at Deployment Level + +The application-level rate limiter (`BANGUI_RATE_LIMIT_*` env vars) handles API requests. Add deployment-level protection: + +**Nginx** (existing reverse proxy): +```nginx +# Limit concurrent connections per IP +limit_conn_zone $binary_remote_addr zone=conn_limit:10m; +server { + limit_conn conn_limit 100; +} +``` + +**Fail2ban** (already running): +- BanGUI manages fail2ban jails +- Additional deployment-level rate limits should target infrastructure endpoints (SSH, management UIs), not BanGUI itself + +### Audit Logging + +All authentication events are logged via structlog: + +| Event | Log Key | Severity | +|-------|---------|----------| +| Login success | `auth_login_success` | INFO | +| Login failure | `auth_login_failure` | WARNING | +| Session created | `session_created` | INFO | +| Session destroyed | `session_destroyed` | INFO | +| Session expired | `session_expired` | INFO | + +Forward these logs to a SIEM or log aggregator for security monitoring. See [Structured Logging](#structured-logging) below. + +--- + +## Performance Tuning + +### SQLite Performance + +SQLite is single-writer. Under write-heavy load (blocklist imports, history writes), writes may queue. + +**WAL mode** (default, do not disable): +``` +PRAGMA journal_mode=WAL; -- Already enabled by default +``` + +**Synchronous mode** for production: +``` +PRAGMA synchronous=NORMAL; -- Balanced (not FULL, not OFF) +``` +This survives process crashes without corruption while maintaining good write performance. 
+ +**Cache size** (increase for production): +```sql +-- In-memory cache: 64MB (adjust based on available RAM) +PRAGMA cache_size=-65536; -- negative = KB +``` + +**temp_store** for large sorts: +``` +PRAGMA temp_store=MEMORY; +``` + +**Read performance**: +- Most reads are point queries by IP or jail name — indexes handle this efficiently +- Large history scans (dashboard) — paginate, use `LIMIT/OFFSET` +- Avoid `SELECT *` on large tables — always specify needed columns + +### Gzip Compression + +Already enabled in nginx.conf. Verify effective compression: +```bash +curl -H "Accept-Encoding: gzip" -I http://localhost:8080/api/v1/dashboard/status +# Should show: Content-Encoding: gzip +``` + +### Backend Performance + +**Startup warm-up**: On first request after start, caches are cold. First blocklist query may be slower. This is normal — subsequent requests hit cache. + +**Memory tuning**: +```yaml +# docker-compose.yml — increase if OOM +backend: + deploy: + limits: + memory: 1024M # Up from 512M for large blocklists +``` + +**Single worker enforced**: The session cache is process-local. Multiple workers would cause random logouts. This is intentional — scale horizontally via orchestration, not vertically via workers. + +#### Single-Worker Requirement + +BanGUI enforces single-worker mode at startup. It fails immediately with a clear error if more than one worker is configured. + +**Why this matters:** + +- **In-memory session cache** — each worker has its own cache copy. A session cached in worker A is invisible to worker B. A user validated by A may be rejected by B. +- **Rate-limit windows** — per-IP counters are process-local. With 4 workers, a client hitting different workers gets 4× the intended rate limit. +- **Runtime state** — fail2ban status, pending recovery records, and jail service capability flags are all per-process. Dashboard queries to different workers return inconsistent data. 
+- **Background scheduler** — the database lock ensures only one instance runs scheduled jobs, but each worker's scheduler still fires. With multi-worker, the same job runs N times. + +**Detection:** + +The check runs at application startup in ``create_app()``: + +- ``WEB_CONCURRENCY`` env var — set by gunicorn, and by uvicorn in recent versions when ``--workers N`` is passed +- ``BANGUI_WORKERS`` env var — explicit override (discouraged) + +If either is set to a value > 1, ``RuntimeError`` is raised with instructions and a reference to this document. + +**Test mode:** + +The check is automatically skipped when ``TESTING=1`` is set. This allows the test suite to run with an arbitrary number of workers. + +**What to do instead of multi-worker:** + +Scale horizontally via container orchestration — run multiple containers behind a load balancer. Each container runs a single worker. The database lock ensures only one container runs background jobs at a time. + +### Frontend Performance + +**Static asset caching** (already configured): +``` +location /assets/ { + expires 1y; + add_header Cache-Control "public, immutable"; +} +``` + +**Bundle size**: Production build uses esbuild minification. Monitor with: +```bash +du -sh frontend/dist/ +ls -lh frontend/dist/assets/*.js +``` + +### Database Maintenance + +**Periodic checkpoint** (production, monthly or after large blocklist imports): +```bash +sqlite3 /data/bangui.db "PRAGMA wal_checkpoint(FULL);" +``` + +**Analyze for query planner** (after bulk inserts/deletes): +```bash +sqlite3 /data/bangui.db "ANALYZE;" +``` + +--- + +## Monitoring Setup + +### Health Check Endpoints + +**Combined health check** — `GET /api/v1/health` — primary monitoring target for Docker HEALTHCHECK. 
+ +| Status | HTTP Code | Meaning | +|--------|-----------|---------| +| `ok` | 200 | All components healthy | +| `degraded` | 200 | Some components unhealthy — investigate | +| `unavailable` | 503 | fail2ban unreachable — container will be restarted | + +**Kubernetes probes:** + +`GET /api/v1/health/live` — Liveness probe. Always returns 200 if the process is alive. + +`GET /api/v1/health/ready` — Readiness probe. Returns 200 when all subsystems pass, 503 otherwise. + +| Probe | URL | Success | Failure | +|-------|---|---------|---------| +| Liveness | `/api/v1/health/live` | 200 | Non-2xx → restart | +| Readiness | `/api/v1/health/ready` | 200 | Non-2xx → stop traffic | + +### Structured Logging + +All logs are structured (JSON via structlog). Key fields: + +| Log field | Description | +|-----------|-------------| +| `event` | Event name (e.g., `auth_login_success`) | +| `request_id` | Per-request correlation ID | +| `user_id` | Session user (if authenticated) | +| `duration_ms` | Request duration | +| `component` | Component name (e.g., `scheduler`, `database`) | + +**Log levels**: + +| Level | Use | +|-------|-----| +| DEBUG | Detailed debugging (query SQL, cache hits) | +| INFO | Operational events (startup, shutdown, login, ban action) | +| WARNING | Recoverable issues (cache miss, lock contention) | +| ERROR | Failures requiring attention (DB error, fail2ban offline) | + +**Configure via env**: +``` +BANGUI_LOG_LEVEL=info # debug, info, warning, error +``` + +### Log Aggregation + +**Docker Compose** — forward container logs to aggregator: +```yaml +services: + backend: + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" +``` + +**External aggregators**: +```yaml +# Fluentd example +services: + backend: + logging: + driver: fluentd + options: + fluentd-address: localhost:24224 + tag: bangui-backend +``` + +**ELK Stack** — send JSON logs directly to Logstash or via Filebeat. 
+ +### Metrics to Monitor + +| Metric | Source | Alert Threshold | +|--------|--------|----------------| +| Health check failures | `/api/v1/health` | 3 consecutive → container restart | +| Backend memory | `docker stats` | >450M (of 512M limit) | +| Backend CPU | `docker stats` | >80% sustained | +| Disk usage (`/data`) | `df -h` | >80% | +| fail2ban container restarts | `docker ps` | >2/hour | +| Backend container restarts | `docker ps` | >2/hour | +| Database file size | `ls -lh /data/bangui.db` | Grows >10MB/day indicates issue | +| Session count | `/api/v1/sessions` | Sudden drop indicates cache issue | +| Blocklist import duration | Logs (`blocklist_import_completed`) | >5 minutes may indicate performance issue | + +### Uptime Monitoring + +**External checks**: +- Monitor `https://your-domain.com/api/v1/health` from multiple geographic locations +- Use services: Better Uptime, UptimeRobot, Pingdom +- Alert on: HTTP 503, HTTP 200 + `degraded` status, connection timeout + +### Alerting + +**Critical (PagerDuty / immediate)**: +- Health check HTTP 503 for >30 seconds +- Backend OOM kill (exit code 137) +- fail2ban offline for >5 minutes + +**Warning (Slack / email)**: +- Health check returns `degraded` +- Disk usage >80% +- Memory usage >450M +- Backend restarts >2/hour + +--- + +## Scaling Guidelines + +### Horizontal Scaling + +BanGUI is **designed for horizontal scaling** via container orchestration (not multiple workers): + +``` +┌─────────────────────────────────────────────────┐ +│ Load Balancer │ +│ (nginx, HAProxy, Traefik) │ +└──────────────────┬─────────────────────────────┘ + │ + ┌─────────────┼─────────────┐ + ▼ ▼ ▼ +┌──────────┐ ┌──────────┐ ┌──────────┐ +│ Backend │ │ Backend │ │ Backend │ +│ (inst 1) │ │ (inst 2) │ │ (inst 3) │ +└────┬─────┘ └────┬─────┘ └────┬─────┘ + │ │ │ + └────────────┼────────────┘ + ▼ + ┌───────────────┐ + │ Scheduler │ + │ Lock (DB) │ ← Only one instance runs jobs + └───────────────┘ + │ + ▼ + ┌───────────────┐ + │ SQLite 
│ + │ (shared fs) │ + └───────────────┘ +``` + +**How it works**: +- Scheduler lock ensures only one instance runs background jobs +- Session cache is per-instance — use sticky sessions at load balancer, OR configure `BANGUI_SESSION_CACHE=redis` for shared sessions +- SQLite on shared storage — use network file system (NFS, GlusterFS) or block storage (AWS EBS). Caution: SQLite's WAL mode (BanGUI's default journal mode) requires every process that opens the database to run on the same host, and file locking is unreliable on many network file systems — verify that your storage and deployment topology satisfy this before pointing multiple instances at one database file + +### Stateless Design + +For true stateless scaling without sticky sessions, migrate session cache to Redis: + +```yaml +# docker-compose.yml +backend: + environment: + - BANGUI_SESSION_CACHE=redis + - BANGUI_REDIS_URL=redis://redis:6379/0 + depends_on: + redis: + condition: service_healthy + + redis: + image: docker.io/library/redis:7-alpine + deploy: + limits: + cpus: '0.5' + memory: 256M +``` + +Benefits: +- Sessions shared across all instances → no sticky sessions needed +- Load balancer can distribute freely +- Scales linearly + +Trade-offs: +- Redis is another dependency to monitor +- Redis persistence required for session survival across Redis restarts +- Redis failure causes mass logouts + +### Database Scaling + +SQLite does not support read replicas. Scaling reads is limited. 
+ +**Read scaling** (if needed): +- Cache aggressively — BanGUI caches blocklist data in-memory +- Add read-only views for dashboard queries +- Consider periodic snapshot exports to separate read-optimized store + +**Write scaling**: +- Single writer only — SQLite WAL helps but doesn't parallelize writes +- If write throughput becomes a bottleneck, consider: + - Periodic batching (already used for blocklist imports) + - Sharding by jail (separate DB per jail) — architectural change + - Migration to PostgreSQL — significant effort + +### CDN for Static Assets + +For large-scale deployments, serve `/assets/` from a CDN: + +```nginx +# Replace /assets/ proxy with CDN origin +location /assets/ { + proxy_pass https://your-cdn.cloudfront.net/assets/; + proxy_cache_valid 1y; + add_header Cache-Control "public, immutable"; +} +``` + +Benefits: +- Reduces frontend container load +- Assets served from edge locations close to users +- Reduces bandwidth costs + +### Autoscaling + +**Docker Swarm**: Use the `labels` + `update_config` pattern for rolling updates. Autoscaling requires external metrics (Prometheus + VPA or similar). 
+ +**Kubernetes**: HorizontalPodAutoscaler (HPA) based on CPU/memory: +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: bangui-backend +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: bangui-backend + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +``` + +### Load Balancer Configuration + +**Health check**: +```yaml +# HAProxy example +backend-check: + option httpchk GET /api/v1/health + http-check expect status 200 +``` + +**Sticky sessions** (if NOT using Redis): +```yaml +# HAProxy +appsession _SESSION_ID len 64 timeout 24h +``` + +**Connection limits**: +```yaml +# Per-backend limit to prevent overload +server backend1 backend:8000 maxconn 50 +``` + +--- + +## Next Steps diff --git a/Docs/Features.md b/Docs/Features.md index a4880d9..20040f8 100644 --- a/Docs/Features.md +++ b/Docs/Features.md @@ -14,7 +14,7 @@ A web application to monitor, manage, and configure fail2ban from a clean, acces ### Options -- **Master Password** — Set a single global password that protects the entire web interface. +- **Master Password** — Set a single global password that protects the entire web interface. Must be between 8 and 72 characters long (72-byte limit is due to bcrypt truncation) and include one uppercase letter, one number, and one special character from `!@#$%^&*()`. - **Database Path** — Define where the application stores its own SQLite database. - **fail2ban Connection** — Specify how the application connects to the running fail2ban instance (socket path or related settings). - **General Preferences** — Any additional application-level settings such as default time zone, date format, or session duration. 
@@ -30,6 +30,22 @@ A web application to monitor, manage, and configure fail2ban from a clean, acces - After entering the correct password the user is taken to the page they originally requested. - A logout option is available from every page so the user can end their session. +### Session Validation on App Load + +- On app mount (page reload or initial load), the frontend validates the cached session with the backend by calling `GET /api/auth/session`. +- While the validation check is in flight, a loading spinner is displayed to avoid UI flicker. +- If the backend returns **200**, the session is valid and the app proceeds normally. +- If the backend returns **401**, the session has expired or been revoked (server-side DB deletion, restart, etc.), and the user is logged out and redirected to the login page. +- If a **network error** occurs (backend temporarily unreachable), the user is not logged out — the app assumes the backend will recover and continues with the cached session state. The next API call will trigger a 401 if the session is actually invalid. + +### Login Rate Limiting + +- The login endpoint (`POST /api/auth/login`) is protected against brute-force attacks with per-IP rate limiting. +- **Rate limit:** 5 login attempts per minute per IP address. +- When the limit is exceeded, the server returns **HTTP 429 Too Many Requests** with a `Retry-After` header indicating when requests will be accepted again. +- Each failed login attempt triggers a progressive server-side delay (exponential back-off from 1 to 10 seconds) to further slow down attack attempts, on top of the bcrypt password hashing cost. The penalty grows with consecutive failures and resets after the rate-limit window expires. +- The rate limiter tracks attempts in memory per IP, ensuring that rapid-fire attacks from a single source are quickly throttled. + --- ## 3. 
Ban Overview (Dashboard) @@ -196,11 +212,12 @@ A page to inspect and modify the fail2ban configuration without leaving the web - Option to register additional log files that fail2ban should monitor. - For each new log, specify: - - The path to the log file. + - The path to the log file (must be within allowed directories to prevent unauthorized access to sensitive files). - One or more regex patterns that define what constitutes a failure. - The jail name and basic jail settings (ban time, retries, etc.). - Choose whether the file should be read from the beginning or only new lines (head vs. tail). - Preview matching lines from the log against the provided regex before saving, so the user can verify the pattern works. +- **Log Path Security:** Added log paths must resolve to locations within a configured allowlist of safe directories (default: `/var/log` and `/config/log`). This prevents authenticated users from instructing fail2ban to monitor sensitive system files. Paths containing symlinks are resolved to their canonical targets before validation. ### Regex Tester @@ -211,8 +228,10 @@ A page to inspect and modify the fail2ban configuration without leaving the web ### Server Settings -- View and change the fail2ban log level (e.g. Critical, Error, Warning, Info, Debug). -- View and change the log target (file path, stdout, stderr, syslog, systemd journal). +- View and change the fail2ban log level using valid values: `CRITICAL`, `ERROR`, `WARNING`, `NOTICE`, `INFO`, `DEBUG`. +- View and change the log target, which can be: + - Special values: `STDOUT`, `STDERR`, `SYSLOG` + - A file path that resolves to one of the configured safe log directories (default: `/var/log` and `/config/log`). Symlinks are resolved to their canonical targets before validation. - View and change the syslog socket if syslog is used. - Flush and re-open log files (useful after log rotation). - View and change the fail2ban database file location. 
@@ -247,8 +266,8 @@ A page to inspect and modify the fail2ban configuration without leaving the web - **Auto-refresh** toggle with interval selector (5 s / 10 s / 30 s) for live monitoring. - Truncation notice when the total log file line count exceeds the requested tail limit. - Container automatically scrolls to the bottom after each data update. -- When fail2ban is configured to log to a non-file target (STDOUT, STDERR, SYSLOG, SYSTEMD-JOURNAL), an informational banner explains that file-based log viewing is unavailable. -- The log file path is validated against a safe prefix allowlist on the backend to prevent path-traversal reads. +- When fail2ban is configured to log to a non-file target (`STDOUT`, `STDERR`, or `SYSLOG`), an informational banner explains that file-based log viewing is unavailable. +- Log file paths are validated against a configurable allowlist of safe directories on the backend to prevent unauthorized reads of sensitive system files. --- @@ -295,6 +314,17 @@ Automated downloading and applying of external IP blocklists to block known mali - Support for plain-text lists with one IP address per line. - Preview the contents of a blocklist URL before enabling it (download and display a sample of entries). +#### URL Validation & Security + +- **Scheme restriction:** Only `http://` and `https://` schemes are accepted. `file://`, `ftp://`, and other schemes are rejected. +- **Hostname validation:** The hostname is resolved via DNS and the resulting IP address is validated to prevent SSRF attacks: + - Private IP ranges (`10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`) are rejected. + - Loopback addresses (`127.0.0.1`, `::1`) are rejected. + - Link-local addresses (`169.254.0.0/16`, `fe80::/10`) are rejected. + - Reserved and multicast addresses are rejected. +- **Error handling:** If a URL fails validation (invalid scheme, unresolvable hostname, or resolves to a private IP), the API returns a `400 Bad Request` with a descriptive error message. 
+- **Ports:** URLs may specify custom ports (e.g. `https://example.com:8443/list.txt`), but the hostname must still resolve to a public IP address. + ### Schedule - Configure when the blocklist import runs using a simple time-and-frequency picker (no raw cron syntax required). @@ -306,6 +336,12 @@ Automated downloading and applying of external IP blocklists to block known mali - Option to run an import manually at any time via a "Run Now" button. - Show the date and time of the last successful import and the next scheduled run. +#### Scheduling Reliability + +- **Deterministic updates:** Schedule changes are applied immediately and deterministically. The schedule update endpoint waits for the reschedule operation to complete and surface any errors before returning the response. +- **Error observability:** If a schedule update fails (e.g., due to a database error), the HTTP response will reflect the error with an appropriate status code and error message. The user is never left wondering whether their schedule change took effect. +- **Atomicity:** The schedule is persisted to the database and the APScheduler job is updated in a coordinated manner. Both operations are completed before the update request returns success to the client. + ### Import Behaviour - On each scheduled run, download all enabled blocklist sources. @@ -322,6 +358,13 @@ Automated downloading and applying of external IP blocklists to block known mali - Display the import log in the web interface, filterable by source and date range. - Show a warning badge in the navigation if the most recent import encountered errors. +### Data Retention & Deletion + +- Import logs are retained for audit and troubleshooting purposes. +- A blocklist source **cannot be deleted** while it has associated import logs (foreign key RESTRICT constraint). +- Before deleting a source, delete all its import logs first via the API. 
+- Attempting to delete a source with logs returns **HTTP 409 Conflict** with error code `blocklist_source_has_logs`. + ### Error Handling - If a blocklist URL is unreachable, log the error and continue with remaining sources. diff --git a/Docs/Instructions.md b/Docs/Instructions.md index 17670ca..a7e82d5 100644 --- a/Docs/Instructions.md +++ b/Docs/Instructions.md @@ -72,13 +72,8 @@ Supporting documentation you must know and respect: Repeat the following cycle for every task. Do not skip steps. -### Step 1 — Pick a Task -- Open `tasks.md` and pick the next unfinished task (highest priority first). -- Mark the task as **in progress**. -- Read the task description thoroughly. Understand the expected outcome before proceeding. - -### Step 2 — Plan Your Steps +### Step 1 — Plan Your Steps - Break the task into concrete implementation steps. - Identify which files need to be created, modified, or deleted. @@ -86,7 +81,7 @@ Repeat the following cycle for every task. Do not skip steps. - Identify edge cases and error scenarios. - Write down your plan before touching any code. -### Step 3 — Write Code +### Step 2 — Write Code - Implement the feature or fix following the plan. - Follow all rules from the relevant development docs: @@ -97,14 +92,14 @@ Repeat the following cycle for every task. Do not skip steps. - Write clean, well-structured, fully typed code. - Keep commits atomic — one logical change per commit. -### Step 4 — Add Logging +### Step 3 — Add Logging - Add structured log statements at key points in new or modified code. - Backend: use **structlog** with contextual key-value pairs — never `print()`. - Log at appropriate levels: `info` for operational events, `warning` for recoverable issues, `error` for failures. - Never log sensitive data (passwords, tokens, session IDs). -### Step 5 — Write Tests +### Step 4 — Write Tests - Write tests for every new or changed piece of functionality. - Backend: use `pytest` + `pytest-asyncio` + `httpx.AsyncClient`. 
See [Backend-Development.md § 9](Backend-Development.md). @@ -113,24 +108,24 @@ Repeat the following cycle for every task. Do not skip steps. - Mock external dependencies — tests must never touch real infrastructure. - Follow the naming pattern: `test_<unit>_<scenario>_<expected>`. -### Step 6 — Review Your Code +### Step 5 — Review Your Code Run a thorough self-review before considering the task done. Check **all** of the following: -#### 6.1 — Warnings and Errors +#### 5.1 — Warnings and Errors - Backend: run `ruff check` and `mypy --strict` (or `pyright --strict`). Fix every warning and error. - Frontend: run `tsc --noEmit` and `eslint`. Fix every warning and error. - Zero warnings, zero errors — no exceptions. -#### 6.2 — Test Coverage +#### 5.2 — Test Coverage - Run the test suite with coverage enabled. - Aim for **>80 % line coverage** overall. - Critical paths (auth, banning, scheduling, API endpoints) must be **100 %** covered. - If coverage is below the threshold, write additional tests before proceeding. -#### 6.3 — Coding Principles +#### 5.3 — Coding Principles Verify your code against the coding principles defined in [Backend-Development.md § 13](Backend-Development.md) and [Web-Development.md](Web-Development.md): @@ -141,7 +136,7 @@ Verify your code against the coding principles defined in [Backend-Development.m - [ ] **KISS** — The simplest correct solution is used. No over-engineering. - [ ] **Type Safety** — All types are explicit. No `any` / `Any`. No `# type: ignore` without justification. -#### 6.4 — Architecture Compliance +#### 5.4 — Architecture Compliance Verify against [Architekture.md](Architekture.md) and the project structure rules: @@ -153,7 +148,7 @@ Verify against [Architekture.md](Architekture.md) and the project structure rule - [ ] Pydantic models separate request, response, and domain shapes. - [ ] Frontend types live in `types/`, not scattered across components. 
-### Step 7 — Update Documentation +### Step 6 — Update Documentation - If your change introduces new features, new endpoints, new components, or changes existing behaviour, update the relevant docs: - [Features.md](Features.md) — if feature behaviour changed. @@ -161,51 +156,6 @@ Verify against [Architekture.md](Architekture.md) and the project structure rule - [Backend-Development.md](Backend-Development.md) or [Web-Development.md](Web-Development.md) — if new conventions were established. - Keep documentation accurate and in sync with the code. Outdated docs are worse than no docs. -### Step 8 — Mark Task Complete - -- Open `tasks.md` and mark the task as **done**. -- Add a brief summary of what was implemented or changed. - -### Step 9 — Commit - -- Stage all changed files. -- Write a commit message in **imperative tense**, max 72 characters for the subject line. - - Good: `Add jail reload endpoint` - - Bad: `added stuff` / `WIP` / `fix` -- If the change is large, include a body explaining **why**, not just **what**. -- Branch naming: `feature/`, `fix/`, `chore/`. -- Ensure the commit passes: linter, type checker, all tests. - -### Step 10 — Next Task - -- Return to **Step 1** and pick the next task. - ---- - -## 4. Workflow Summary - -``` -┌─────────────────────────────────────────┐ -│ 1. Pick task from tasks.md │ -│ 2. Plan your steps │ -│ 3. Write code │ -│ 4. Add logging │ -│ 5. Write tests │ -│ 6. Review your code │ -│ ├── 6.1 Check warnings & errors │ -│ ├── 6.2 Check test coverage │ -│ ├── 6.3 Check coding principles │ -│ └── 6.4 Check architecture │ -│ 7. Update documentation if needed │ -│ 8. Mark task complete in tasks.md │ -│ 9. Git commit │ -│ 10. Pick next task ──────── loop ───┐ │ -│ ▲ │ │ -│ └───────────────────────────┘ │ -└─────────────────────────────────────────┘ -``` - ---- ## 5. When You Are Stuck @@ -229,7 +179,37 @@ Verify against [Architekture.md](Architekture.md) and the project structure rule --- -## 7. Dev Quick-Reference +## 7. 
First-Run Setup + +### Initialize the Development Environment + +Before starting the stack for the first time, set up the required environment variables: + +1. **Copy the example environment file:** + ```bash + cp .env.example .env + ``` + +2. **Generate a session secret:** + ```bash + python -c 'import secrets; print(secrets.token_hex(32))' + ``` + Copy the output and paste it as the value for `BANGUI_SESSION_SECRET` in your `.env` file. + +3. **Optional: Customize other settings** + - Edit `.env` to adjust timezone, port numbers, or other settings + - Default values are sensible for development (UTC, ports 8000/5173) + +4. **Start the stack:** + ```bash + make up + ``` + +**Note:** The session secret is critical for security. Do not commit `.env` to version control — it is already in `.gitignore`. Each environment (dev, staging, production) must have its own unique secret. + +--- + +## 8. Dev Quick-Reference ### Start / stop the stack @@ -244,16 +224,17 @@ Backend: `http://127.0.0.1:8000` · Frontend (Vite proxy): `http://127.0.0.1:517 ### API login (dev) The frontend SHA256-hashes the password before sending it to the API. +The initial setup password must be at least 8 characters long and include one uppercase letter, one number, and one special character from `!@#$%^&*()`. The session cookie is named `bangui_session`. ```bash # Dev master password: Hallo123! HASHED=$(echo -n "Hallo123!" 
| sha256sum | awk '{print $1}') -TOKEN=$(curl -s -X POST http://127.0.0.1:8000/api/auth/login \ +TOKEN=$(curl -s -X POST http://127.0.0.1:8000/api/v1/auth/login \ -H 'Content-Type: application/json' \ -d "{\"password\":\"$HASHED\"}" \ | python3 -c 'import sys,json; print(json.load(sys.stdin)["token"])') # Use token in subsequent requests: -curl -H "Cookie: bangui_session=$TOKEN" http://127.0.0.1:8000/api/dashboard/status +curl -H "Cookie: bangui_session=$TOKEN" http://127.0.0.1:8000/api/v1/dashboard/status ``` diff --git a/Docs/Observability.md b/Docs/Observability.md new file mode 100644 index 0000000..0aebfa6 --- /dev/null +++ b/Docs/Observability.md @@ -0,0 +1,845 @@ +# Observability + +BanGUI provides comprehensive observability through structured logging, metrics, and tracing capabilities. This document outlines the observability architecture and how to configure it for production deployments. + +--- + +## Logging Architecture + +### Overview + +BanGUI uses **structlog** to emit structured, machine-readable logs in JSON format. 
All logs are automatically enriched with: + +- **Timestamps** in ISO 8601 format (`timestamp`) +- **Log levels** (`level` - debug, info, warning, error, critical) +- **Logger names** (`logger_name`) +- **Correlation IDs** for request tracking (`correlation_id`) +- **Custom context** from business logic (via context variables) + +### Log Output + +By default, logs are written to **stdout** in JSON format, making them suitable for: +- Container environments (Docker, Kubernetes) +- Log aggregation systems (ELK, Datadog, Papertrail) +- CI/CD pipelines and monitoring platforms + +```bash +# Example log output (formatted for readability) +{ + "timestamp": "2024-05-01T18:17:19.080+02:00", + "level": "info", + "logger_name": "app.main", + "event": "bangui_starting_up", + "database_path": "/var/lib/bangui/bangui.db", + "pid": 1234 +} +``` + +### Sensitive Data Handling + +**CRITICAL: Never log sensitive data.** The following must NEVER appear in logs: + +- Session tokens or cookies +- API keys or secrets +- Passwords or password hashes +- Private cryptographic keys +- Personal information (PII) +- Full IP addresses (when not required for security auditing) + +When logging authentication or sensitive operations: + +```python +# ✓ Correct: Log event type and result, not credentials +log.info("user_login_attempt", username=username, ip=client_ip, success=True) + +# ✓ Correct: Log sanitized identifiers +log.error("auth_token_validation_failed", token_hash=hashlib.sha256(token).hexdigest()[:16]) + +# ✗ WRONG: Don't do this +log.debug("raw_token", token=token) # Never! +log.info("password_check", password=password_hash) # Never! +``` + +Structlog provides context variable filtering to prevent accidental logging of sensitive data. Code reviews must verify compliance with this rule. 
+ +### Log Sanitization + +All external output (subprocess results, API responses, config file contents) passed to structlog **must** be sanitized first using `sanitize_for_logging()` from `app.utils.log_sanitizer`. + +This prevents sensitive data — passwords, API keys, tokens, private keys — from leaking into logs. + +```python +from app.utils.log_sanitizer import sanitize_for_logging + +# ✓ Correct: Sanitize before logging +log.error( + "fail2ban_start_failed", + command=" ".join(start_cmd_parts), + returncode=process.returncode, + stdout=sanitize_for_logging(stdout.decode("utf-8", errors="replace")), + stderr=sanitize_for_logging(stderr.decode("utf-8", errors="replace")), +) + +# ✗ Wrong: Raw output may contain secrets +log.error("fail2ban_start_failed", stdout=stdout_raw, stderr=stderr_raw) # Never! +``` + +`sanitize_for_logging()` redacts the following patterns: + +| Pattern | Example match | Replacement | +|---------|---------------|-------------| +| `password=X` | `password=Secret123` | `password=***` | +| `api_key=X` / `api-key=X` | `api_key=key123` | `api_key=***` | +| `token=X` | `token=eyJhbG...` | `token=***` | +| `Authorization: Bearer X` | `Authorization: Bearer tok...` | `Authorization: ***` | +| `secret=X` | `secret=myvalue` | `secret=***` | +| `-----BEGIN RSA PRIVATE KEY-----` | (key header) | `*** PRIVATE KEY ***` | +| `AKIA...` | `AKIAIOSFODNN7EXAMPLE` | `AKIA***` | + +--- + +## Third-Party Library Logs + +BanGUI uses **structlog** for all application logs, but third-party libraries often emit plain text through Python's standard `logging` module. To maintain uniform JSON output and reduce noise, the following libraries have their log levels overridden to `WARNING`: + +| Library | Logger Name | Level | Rationale | +|---------|-------------|-------|-----------| +| APScheduler | `apscheduler` | `WARNING` | Suppresses routine scheduler polling ("Looking for jobs to run", "Next wakeup is due at...") while preserving job failure warnings. 
| +| aiosqlite | `aiosqlite` | `WARNING` | Suppresses database operation traces and connection details while preserving connection errors. | + +These overrides are applied in `backend/app/main.py::_configure_logging()` immediately after `logging.basicConfig()`. + +### Disabling Suppression + +Set the environment variable `BANGUI_SUPPRESS_THIRD_PARTY_LOGS=false` to allow APScheduler and aiosqlite to emit their normal DEBUG/INFO logs. This is useful when troubleshooting scheduler or database issues in development. + +```bash +BANGUI_SUPPRESS_THIRD_PARTY_LOGS=false python -m uvicorn app.main:create_app +``` + +When suppression is disabled, the loggers inherit the application's `BANGUI_LOG_LEVEL` (e.g., `debug`). + +### Uniform JSON Formatting + +All stdlib logs — including those from third-party libraries — are intercepted by `structlog.stdlib.ProcessorFormatter` and rendered as JSON. This ensures every log line in `bangui.log` is machine-readable, regardless of its source. + +### Adding New Overrides + +When integrating a new library that emits verbose DEBUG logs: + +```python +# In backend/app/main.py, inside _configure_logging() +logging.getLogger("new_library").setLevel(logging.WARNING) +``` + +Use `WARNING` as the default to still capture errors and warnings. Only use `ERROR` if the library is exceptionally noisy and its warnings are not actionable. 
+ +--- + +## Structured Logging Best Practices + +### Log Levels + +Use log levels consistently: + +| Level | Use Case | Example | +|-------|----------|---------| +| **debug** | Verbose diagnostic information | `log.debug("parsing_config_file", lines=1024)` | +| **info** | Operational events | `log.info("jail_created", jail_name="sshd", action_count=3)` | +| **warning** | Recoverable issues | `log.warning("config_reload_skipped", reason="no_changes")` | +| **error** | Failures that impact functionality | `log.error("fail2ban_connection_lost", error=str(e))` | +| **critical** | System failures | `log.critical("database_corrupted", error=str(e))` | + +### Context Variables + +Use structlog's context variables to automatically include request-scoped information in all logs within a request: + +```python +import structlog + +log = structlog.get_logger() + +# In middleware or early in request processing +structlog.contextvars.clear_contextvars() +structlog.contextvars.bind_contextvars( + correlation_id=request_id, + user_id=user_id, + client_ip=client_ip, +) + +# All subsequent logs in this request will include these context variables +log.info("user_action", action="create_jail") # Automatically includes correlation_id, user_id, etc. + +# Clear context at end of request +structlog.contextvars.clear_contextvars() +``` + +### Background Task Correlation + +Background tasks (APScheduler jobs) run outside the HTTP request context. 
+Use the `app.utils.correlation` module to propagate correlation IDs through tasks: + +```python +from app.utils.correlation import get_correlation_id, reset_correlation_id, set_correlation_id + +async def my_background_task(correlation_id: str | None = None) -> None: + # Generate a new ID if not provided (scheduled tasks have no parent request) + if correlation_id is None: + import uuid + correlation_id = str(uuid.uuid4()) + + # Set the correlation ID for all logs in this task + token = set_correlation_id(correlation_id) + try: + log.info("task_started") # Now includes correlation_id + # ... task logic ... + finally: + reset_correlation_id(token) + +# When scheduling, optionally pass the current correlation ID: +# scheduler.add_job(my_background_task, kwargs={"correlation_id": get_correlation_id()}) +``` + +Scheduled tasks (no parent request) generate a fresh UUID for each run. +Tasks triggered by a request inherit the request's correlation ID. + +### Event Naming Convention + +Use snake_case for event names, prefixed with the component or module name: + +```python +# ✓ Good naming +log.info("service_initialized", service="BanService", version="1.0") +log.warning("blocklist_import_slow", duration_ms=5000) +log.error("fail2ban_command_failed", command="list", exit_code=1) + +# ✗ Bad naming +log.info("init") # Too generic +log.warning("slow operation") # Not machine-readable +log.error("ERROR: FAIL2BAN FAILED!") # Inconsistent formatting +``` + +### Attaching Structured Data + +Always provide context as key-value pairs, not as unstructured strings: + +```python +# ✓ Correct: Structured, queryable +log.info( + "ban_executed", + jail="sshd", + ip="192.0.2.1", + duration_seconds=3600, + reason="brute_force", +) + +# ✗ Wrong: Unstructured, hard to query +log.info(f"Banned {ip} in jail {jail} for 3600 seconds because brute_force") +``` + +--- + +## Centralized Logging Configuration + +### Environment Variables + +External logging is configured via environment variables (all 
prefixed with `BANGUI_`): + +#### Datadog + +Enable logging to Datadog via HTTP API: + +```bash +BANGUI_EXTERNAL_LOGGING_ENABLED=true +BANGUI_EXTERNAL_LOGGING_PROVIDER=datadog +BANGUI_DATADOG_API_KEY=your-api-key-here +BANGUI_DATADOG_SITE=datadoghq.com # or datadoghq.eu for EU +BANGUI_DATADOG_BATCH_SIZE=10 # Optional: logs per batch +BANGUI_DATADOG_FLUSH_INTERVAL_SECONDS=5 # Optional: flush interval +``` + +#### Papertrail + +Enable logging to Papertrail via Syslog protocol: + +```bash +BANGUI_EXTERNAL_LOGGING_ENABLED=true +BANGUI_EXTERNAL_LOGGING_PROVIDER=papertrail +BANGUI_PAPERTRAIL_HOST=logs1.papertrailapp.com +BANGUI_PAPERTRAIL_PORT=12345 +BANGUI_PAPERTRAIL_PROGRAM_NAME=bangui # Optional: program name in syslog +``` + +#### ELK Stack + +Enable logging to Elasticsearch/Logstash: + +```bash +BANGUI_EXTERNAL_LOGGING_ENABLED=true +BANGUI_EXTERNAL_LOGGING_PROVIDER=elasticsearch +BANGUI_ELASTICSEARCH_HOSTS=http://elasticsearch:9200 +BANGUI_ELASTICSEARCH_INDEX_PREFIX=bangui # Optional: index prefix +BANGUI_ELASTICSEARCH_BATCH_SIZE=10 # Optional: docs per batch +BANGUI_ELASTICSEARCH_FLUSH_INTERVAL_SECONDS=5 # Optional: flush interval +``` + +### Local Development (Disabled by Default) + +External logging is **disabled by default**. In development, logs continue to write to stdout only: + +```bash +# No configuration needed — logs go to stdout +docker compose up +``` + +To enable external logging in development for testing: + +```bash +BANGUI_EXTERNAL_LOGGING_ENABLED=true \ +BANGUI_EXTERNAL_LOGGING_PROVIDER=datadog \ +BANGUI_DATADOG_API_KEY=test-key \ +python -m uvicorn app.main:create_app --host 0.0.0.0 --port 8000 +``` + +--- + +## Performance and Reliability + +### Non-Blocking Delivery + +External log delivery uses **asynchronous buffering** to prevent blocking the application: + +1. Logs are written to an in-memory buffer +2. After the configured flush interval or batch size, the buffer is sent asynchronously +3. Send failures do not block application logic +4. 
Retries use exponential backoff (up to 5 attempts) + +This ensures that external logging never degrades application performance. + +### Failure Modes + +If external logging becomes unavailable: + +- **Transient failures** (network timeouts, temporary 5xx errors): Logs are retried with exponential backoff +- **Permanent failures** (invalid API key, host unreachable): A warning is logged; application continues +- **Steady-state**: Logs are buffered up to a maximum queue size (default: 1000 logs); older logs are dropped if buffer fills + +The application **never crashes** due to external logging failures. + +### Log Volume and Rate Limiting + +Large log volumes can increase data transfer and storage costs. To manage log volume: + +1. **Reduce log level in production**: Set `BANGUI_LOG_LEVEL=warning` or `error` to suppress debug/info logs +2. **Sample logs**: Some providers (Datadog, Papertrail) support sampling rules +3. **Filter sensitive paths**: Middleware can suppress verbose logging for noisy endpoints + +Monitor actual log volume and adjust settings based on usage patterns. + +--- + +## Integration Examples + +### Docker Compose (Development with Datadog) + +```yaml +version: "3.9" +services: + bangui: + build: + context: . 
+ dockerfile: Docker/Dockerfile.app + environment: + BANGUI_EXTERNAL_LOGGING_ENABLED: "true" + BANGUI_EXTERNAL_LOGGING_PROVIDER: "datadog" + BANGUI_DATADOG_API_KEY: "${DATADOG_API_KEY}" + BANGUI_DATADOG_SITE: "datadoghq.com" + BANGUI_LOG_LEVEL: "info" + ports: + - "8000:8000" +``` + +### Kubernetes Deployment (Papertrail) + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: bangui-logging +data: + BANGUI_EXTERNAL_LOGGING_ENABLED: "true" + BANGUI_EXTERNAL_LOGGING_PROVIDER: "papertrail" + BANGUI_PAPERTRAIL_HOST: "logs1.papertrailapp.com" + BANGUI_PAPERTRAIL_PORT: "12345" + BANGUI_PAPERTRAIL_PROGRAM_NAME: "bangui" + BANGUI_LOG_LEVEL: "info" + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: bangui +spec: + template: + spec: + containers: + - name: bangui + image: bangui:latest + envFrom: + - configMapRef: + name: bangui-logging + env: + - name: BANGUI_DATADOG_API_KEY + valueFrom: + secretKeyRef: + name: bangui-secrets + key: datadog-api-key +``` + +--- + +## Monitoring Logging Infrastructure + +### Datadog Dashboard Query + +Search for all BanGUI logs: + +``` +service:bangui +``` + +Search for errors in authentication: + +``` +service:bangui status:error component:auth +``` + +### Papertrail Search + +Search for all startup events: + +``` +program:bangui bangui_starting_up +``` + +Search for authentication failures: + +``` +program:bangui auth_token_validation_failed +``` + +### Elasticsearch Query (ELK) + +```json +{ + "query": { + "bool": { + "must": [ + { "match": { "logger_name": "app.auth" } }, + { "match": { "level": "error" } } + ] + } + } +} +``` + +--- + +## Testing and Debugging + +### Verify JSON Output + +Inspect the actual JSON emitted by the logging system: + +```bash +# Start the app and capture logs +python -m uvicorn app.main:create_app --host 0.0.0.0 --port 8000 2>&1 | head -10 | python -m json.tool +``` + +Expected output: + +```json +{ + "timestamp": "2024-05-01T18:20:45.123456+02:00", + "level": "info", + "logger_name": 
"app.main", + "event": "bangui_starting_up", + "database_path": "/var/lib/bangui/bangui.db" +} +``` + +### Enable Debug Logging for External Log Delivery + +Set the log level to `debug` to see internal logs from the external logging system: + +```bash +BANGUI_LOG_LEVEL=debug BANGUI_EXTERNAL_LOGGING_ENABLED=true python -m uvicorn app.main:create_app +``` + +This will emit logs like: + +```json +{ + "level": "debug", + "event": "external_log_batch_sent", + "provider": "datadog", + "batch_size": 10, + "duration_ms": 42 +} +``` + +### Validate Configuration + +Validate external logging configuration on startup: + +```bash +python -c "from app.config import get_settings; s = get_settings(); print(s.model_dump())" +``` + +--- + +## Security Considerations + +### API Key Rotation + +Rotate API keys regularly: + +1. Update `BANGUI_DATADOG_API_KEY` with the new key +2. Restart the application +3. Old keys can be revoked after restart + +### Network Security + +When sending logs over the network: + +- **Datadog HTTP API**: Uses HTTPS, encrypted in transit +- **Papertrail Syslog**: Use TLS-enabled Syslog (if supported) or send over VPN/private network +- **Elasticsearch**: Use HTTPS and HTTP Basic Auth or API Key authentication + +Never send logs over unencrypted channels in production. + +### Compliance + +Ensure that your external logging platform complies with your organization's data protection requirements: + +- **GDPR**: Verify the platform's data processing agreements +- **HIPAA**: Ensure the provider is HIPAA-eligible +- **SOC 2**: Request audit reports from your logging provider +- **Data retention**: Configure appropriate log retention policies + +--- + +## Troubleshooting + +### Logs Not Appearing in External System + +1. **Verify configuration**: Check that environment variables are set correctly +2. **Check API credentials**: Ensure the API key or credentials are valid +3. **Check network connectivity**: Verify the external system is reachable +4. 
**Review logs locally**: Run with `BANGUI_LOG_LEVEL=debug` and check stdout for errors +5. **Check disk space**: Ensure the local buffer directory has sufficient disk space + +### Performance Degradation + +1. **Check buffer size**: If the buffer is full, logs are dropped; increase `BANGUI_EXTERNAL_LOGGING_BUFFER_SIZE` +2. **Adjust flush interval**: Decrease flush interval if experiencing large batches +3. **Reduce log level**: Set `BANGUI_LOG_LEVEL=warning` to reduce log volume +4. **Monitor network**: Check bandwidth usage between application and external system + +### Lost Logs + +In the rare event that logs are lost: + +1. **Buffer overflow**: The in-memory buffer has a maximum size; excess logs are dropped with a warning +2. **Network failure during batch send**: Logs are retried; after max retries, a warning is logged +3. **External system outage**: Logs may be dropped if buffer fills before service is restored + +To minimize data loss: + +- Increase buffer size (`BANGUI_EXTERNAL_LOGGING_BUFFER_SIZE`) +- Use persistent external logging platforms +- Monitor for warnings in application logs about dropped batches + +--- + +## Application Performance Monitoring (Metrics) + +BanGUI collects comprehensive metrics for request performance, application health, and resource utilization through **Prometheus**. Metrics are exposed in standard Prometheus text format and can be scraped by monitoring systems. 
+ +### Backend Metrics + +#### HTTP Request Metrics + +The backend automatically tracks HTTP request performance: + +- **`bangui_http_requests_total`** (Counter) — Total HTTP requests by method, endpoint, and status code + ``` + bangui_http_requests_total{method="GET",endpoint="/api/jails",status_code="200"} 125 + ``` + +- **`bangui_http_request_duration_seconds`** (Histogram) — Request latency distribution by method and endpoint + ``` + bangui_http_request_duration_seconds_bucket{method="GET",endpoint="/api/jails",le="0.1"} 120 + bangui_http_request_duration_seconds_sum{method="GET",endpoint="/api/jails"} 45.23 + ``` + +- **`bangui_http_active_requests`** (Gauge) — Current number of in-flight requests by method and endpoint + ``` + bangui_http_active_requests{method="GET",endpoint="/api/jails"} 5 + ``` + +#### Application Metrics + +Domain-specific metrics track application state: + +- **`bangui_bans_total`** (Gauge) — Total number of currently banned IPs across all jails +- **`bangui_jails_total`** (Gauge) — Total number of fail2ban jails +- **`bangui_fail2ban_connection_errors_total`** (Counter) — Total fail2ban connection errors + +#### Accessing Metrics + +Prometheus metrics are exposed at the `/metrics` endpoint: + +```bash +curl http://localhost:8000/metrics +``` + +Response format: +``` +# HELP bangui_http_requests_total Total HTTP requests by method, endpoint, and status code +# TYPE bangui_http_requests_total counter +bangui_http_requests_total{method="GET",endpoint="/api/dashboard/status",status_code="200"} 1523.0 + +# HELP bangui_http_request_duration_seconds HTTP request latency in seconds by method and endpoint +# TYPE bangui_http_request_duration_seconds histogram +bangui_http_request_duration_seconds_bucket{method="GET",endpoint="/api/dashboard/status",le="0.01"} 1200.0 +bangui_http_request_duration_seconds_sum{method="GET",endpoint="/api/dashboard/status"} 156.78 +``` + +### Frontend Metrics + +#### Web Vitals + +The frontend automatically measures 
Core Web Vitals using the `web-vitals` library: + +- **Cumulative Layout Shift (CLS)** — Visual stability score (good: ≤0.1) +- **First Contentful Paint (FCP)** — Time until first content appears (good: ≤1.8s) +- **First Input Delay (FID)** — Responsiveness to user input (good: ≤100ms) +- **Largest Contentful Paint (LCP)** — Time until largest content is visible (good: ≤2.5s) +- **Time to First Byte (TTFB)** — Server response time (good: ≤600ms) + +#### API Call Metrics + +API calls are automatically tracked with: + +- HTTP method and endpoint +- Response status code +- Duration in milliseconds +- Timestamp + +### Integrating with Monitoring Systems + +#### Prometheus + Grafana + +Configure Prometheus to scrape BanGUI metrics: + +```yaml +# prometheus.yml +scrape_configs: + - job_name: "bangui" + static_configs: + - targets: ["localhost:8000"] + metrics_path: "/metrics" +``` + +Then import a Grafana dashboard to visualize: + +- Request rates by endpoint +- Latency percentiles (p50, p95, p99) +- Error rate trends +- Active request counts + +#### Datadog + +Configure BanGUI to send metrics via StatsD or HTTP API: + +```bash +BANGUI_METRICS_ENABLED=true +BANGUI_METRICS_PROVIDER=datadog +BANGUI_DATADOG_API_KEY=your-api-key +BANGUI_DATADOG_SITE=datadoghq.com +``` + +#### New Relic + +Send metrics to New Relic (custom event collection): + +```bash +BANGUI_METRICS_ENABLED=true +BANGUI_METRICS_PROVIDER=newrelic +BANGUI_NEWRELIC_API_KEY=your-api-key +BANGUI_NEWRELIC_ACCOUNT_ID=your-account-id +``` + +### Metrics Best Practices + +#### Cardinality Management + +Metric labels (tags) can cause cardinality explosion if not carefully managed. 
BanGUI uses: + +- Path normalization — `/api/jails/123` becomes `/api/jails/{id}` to prevent unique labels per resource +- Status code grouping — errors are grouped by category, not individual codes +- Endpoint aggregation — only significant endpoints are tracked + +#### Performance Considerations + +- Metrics collection has negligible performance impact (<1ms per request) +- In-memory buffering prevents database writes on every request +- High-cardinality labels are avoided +- Metric export (scraping) does not block request processing + +#### PII Protection + +**NEVER include sensitive data in metric labels:** + +- User IDs or session tokens +- Passwords or API keys +- Private IP addresses +- Full request/response bodies + +Allowed: HTTP method, endpoint path (normalized), status code, duration, timestamp. + +### Query Examples + +#### Prometheus Queries + +Find p95 request latency for `/api/jails`: + +```promql +histogram_quantile(0.95, sum by (le) (rate(bangui_http_request_duration_seconds_bucket{endpoint="/api/jails"}[5m]))) +``` + +Find error rate (5xx responses): + +```promql +rate(bangui_http_requests_total{status_code=~"5.."}[5m]) +``` + +Find active requests per endpoint: + +```promql +bangui_http_active_requests +``` + +#### Grafana Dashboard + +Recommended panels: + +1. **Request Rate** — `rate(bangui_http_requests_total[1m])` by endpoint +2. **Latency Percentiles** — `histogram_quantile(0.95, sum by (le) (rate(bangui_http_request_duration_seconds_bucket[5m])))`, one query per quantile (0.5, 0.95, 0.99) +3. **Error Rate** — `rate(bangui_http_requests_total{status_code=~"5.."}[5m])` +4. **Active Requests** — `bangui_http_active_requests` (gauge) +5. **fail2ban Connection Health** — `rate(bangui_fail2ban_connection_errors_total[5m])` + +### Troubleshooting Metrics + +#### Metrics endpoint not responding + +1. Verify the `/metrics` endpoint is accessible: `curl http://localhost:8000/metrics` +2. Check application logs for errors during middleware initialization +3. 
Ensure prometheus-client is installed: `pip show prometheus-client` + +#### High cardinality warnings + +If Prometheus warns about high cardinality: + +1. Check if custom labels are being added to metrics +2. Ensure path normalization is working (IDs should be replaced with `{id}`) +3. Consider sampling metrics for high-volume endpoints + +#### Missing metrics + +1. Check that endpoints are being called (look for 200 responses in logs) +2. Verify the metrics middleware is registered (check `app.add_middleware(MetricsMiddleware)`) +3. Ensure metrics are being recorded (call `recordApiCall()` on frontend) + +--- + +## Future Enhancements + +Planned observability improvements: + +- [x] Application metrics collection (Prometheus) +- [x] Web Vitals tracking (frontend) +- [ ] Distributed tracing (OpenTelemetry integration) +- [ ] Custom metric hooks for business events +- [ ] Alerting rules and thresholds +- [ ] Log sampling strategies +- [ ] Additional provider support (Splunk, New Relic, CloudWatch) + +--- + +## Scheduler Lock Health Monitoring + +The scheduler lock ensures only one instance runs background tasks. Monitoring its health is critical for production reliability. 
+ +### Key Metrics + +Monitor these log events for scheduler lock health: + +| Event | Level | Meaning | +|-------|-------|---------| +| `scheduler_lock_acquired` | info | Successfully acquired the scheduler lock | +| `scheduler_lock_held_by_other_instance` | warning | Another instance holds the lock (expected during normal multi-instance operation) | +| `scheduler_lock_stale_overwrite` | info | Took over a stale lock from a crashed instance | +| `scheduler_lock_heartbeat_lost` | warning | Heartbeat update failed; we lost the lock | +| `scheduler_lock_release_mismatch` | warning | Release attempted but we don't hold the lock | + +### Lock Health Check + +Query current lock status via `get_lock_health()`: + +```python +from app.utils.scheduler_lock import get_lock_health + +health = await get_lock_health(db) +# Returns: {"locked": bool, "pid": int|None, "hostname": str|None, +# "age_seconds": float|None, "is_stale": bool, "ttl_remaining": float|None} +``` + +### Alerting Rules + +**Critical alerts:** +- `scheduler_lock_acquired` not seen for >5 minutes during startup → Instance may not have acquired lock +- `scheduler_lock_heartbeat_lost` repeated >3 times → Lock keeps being stolen, possible contention issue + +**Warning alerts:** +- `scheduler_lock_held_by_other_instance` every few minutes → Normal if multiple instances, abnormal if single instance + +### Database Query + +Check lock state directly in SQLite: + +```sql +SELECT pid, hostname, heartbeat_at, heartbeat_timeout, + (CAST(strftime('%s', 'now') AS INTEGER) - heartbeat_at) AS age_seconds +FROM scheduler_lock WHERE id = 1; +``` + +### Common Issues + +1. **Lock not acquired on startup**: Check logs for `scheduler_lock_held_by_other_instance`. If another instance holds it, verify if that instance is healthy. + +2. **Background tasks not running**: Use `get_lock_health()` to verify the lock is held. If not held, the instance cannot run scheduled tasks. + +3. 
**Frequent lock steals**: If `scheduler_lock_stale_overwrite` occurs frequently, the heartbeat interval may be too long or network latency is causing false staleness detection. + +--- + +## References + +- [structlog Documentation](https://www.structlog.org/) +- [Datadog Logging Documentation](https://docs.datadoghq.com/logs/) +- [Papertrail Documentation](https://help.papertrailapp.com/) +- [Elasticsearch JSON Logging](https://www.elastic.co/guide/en/elasticsearch/reference/current/logging.html) +- [Observability Best Practices (OpenTelemetry)](https://opentelemetry.io/docs/concepts/observability-primer/) diff --git a/Docs/PERFORMANCE.md b/Docs/PERFORMANCE.md new file mode 100644 index 0000000..ea244c9 --- /dev/null +++ b/Docs/PERFORMANCE.md @@ -0,0 +1,146 @@ +# Performance Guidelines + +Query optimization patterns for BanGUI backend services. + +--- + +## Never Load Unbounded Result Sets + +Loading large result sets into Python memory causes OOM crashes, slow responses, and unbounded growth. Every query that processes large datasets must use one of the following strategies. + +### The Problem + +With millions of ban records: +- Loading all rows as Python dicts → 200-400 MB+ memory spike +- Python loop aggregation (one interpreter-level iteration per row, O(n) total) → seconds of CPU time +- Offset pagination on large tables → O(n) scan before returning results + +### The Solution: SQL Aggregation + +SQL GROUP BY executes inside SQLite's optimized query engine, using indexes where available, and returns only the aggregated result (typically a few KB). + +```python +# BAD: loads 1M rows into Python +all_rows = await get_all_archived_history(db, since=since) +agg = {} +for row in all_rows: # O(n) Python loop + agg[row["ip"]] = agg.get(row["ip"], 0) + 1 + +# GOOD: SQL aggregation, returns lightweight {ip, count} pairs +ip_counts = await get_ip_ban_counts(db, since=since) +# [{ip: "1.2.3.4", event_count: 42}, ...] 
— a few KB regardless of table size +``` + +### Aggregation Reference + +| Use Case | SQL Pattern | Repository Function | +|----------|-------------|-------------------| +| Ban count per IP | `SELECT ip, COUNT(*) FROM history_archive ... GROUP BY ip` | `get_ip_ban_counts()` | +| Ban count per jail | `SELECT jail, COUNT(*) FROM history_archive ... GROUP BY jail ORDER BY COUNT(*) DESC` | `get_jail_ban_counts()` | +| Ban count per time bucket | `SELECT CAST((timeofban - ?) / ? AS INTEGER), COUNT(*) ... GROUP BY bucket_idx` | `get_ban_counts_by_bucket()` | +| Paginated rows (no offset) | `WHERE id < ? ORDER BY id DESC LIMIT ?` | `get_archived_history_keyset()` | +| Total count | `SELECT COUNT(*) FROM ...` (fast with where clause) | included in `get_jail_ban_counts()` return | + +### Pagination vs Aggregation + +Use **aggregation** when: +- Displaying summary data (counts, totals, group-by results) +- Building country/jail/timeline dashboards +- Only need counts, not individual row data + +Use **pagination** when: +- Displaying individual records (ban list, history) +- Clients need access to specific rows +- Exporting or bulk operations + +### Batch Geo Lookups + +When you need geo data for many IPs, batch in a single call rather than per-IP: + +```python +# BAD: N sequential API calls +for ip in unique_ips: + geo = await geo_service.lookup(ip) # 45 req/min rate limit × N calls + +# GOOD: one batch call, geo_service handles rate limiting +geo_map, uncached = geo_cache_lookup(unique_ips) # uses in-memory cache +if uncached: + asyncio.create_task(geo_cache.lookup_batch(uncached, http_session)) # fire-and-forget +``` + +### Index Requirements + +SQLite needs indexes on: +- Columns used in WHERE clauses (timeofban, jail, action) +- Columns used in GROUP BY (ip, jail, bucket index) +- Sort columns for pagination (id) + +Current indexes on `history_archive`: +- `idx_history_archive_timeofban` — for time-range filtering +- `idx_history_archive_jail_timeofban` — for jail + time 
filtering +- `idx_history_archive_action_timeofban` — for action + time filtering +- `idx_history_archive_id` — for keyset pagination + +Before adding a new query pattern, verify it uses an existing index or add one with a benchmark test. + +### Memory Monitoring + +Watch for these warning signs: +- Python RSS > 500 MB in container metrics +- Response time > 5s for dashboard endpoints +- Query time > 1s (SQLite has no `EXPLAIN ANALYZE`; time queries with `.timer on` in the `sqlite3` shell) + +Use `EXPLAIN QUERY PLAN` to verify index usage: +```sql +EXPLAIN QUERY PLAN SELECT ip, COUNT(*) FROM history_archive WHERE timeofban >= ? GROUP BY ip; +``` + +Expected: `USING INDEX idx_history_archive_timeofban` in the output. + +--- + +## Fail2ban Database Indexes + +BanGUI reads from fail2ban's SQLite database (`/var/run/fail2ban/fail2ban.db`). Query performance degrades without appropriate indexes. + +### Current fail2ban bans Indexes + +Fail2ban creates these indexes on the `bans` table: +- `bans_jail_timeofban_ip` — composite (jail, timeofban, ip) +- `bans_jail_ip` — composite (jail, ip) +- `bans_ip` — single (ip) + +**Missing**: a standalone index on `timeofban`. + +### BanGUI Automatic Index Creation + +On startup, BanGUI calls `ensure_fail2ban_indexes()` to add missing indexes idempotently: + +```sql +-- From fail2ban_db_utils.py +CREATE INDEX IF NOT EXISTS idx_bans_timeofban_desc ON bans(timeofban DESC); +``` + +This improves queries like: +```sql +SELECT * FROM bans WHERE timeofban >= ? 
ORDER BY timeofban DESC; +``` + +### Verifying Index Usage + +Check if a query uses the index: +```sql +EXPLAIN QUERY PLAN SELECT * FROM bans WHERE timeofban >= 1700000000 ORDER BY timeofban DESC; +-- With index: SEARCH USING INDEX idx_bans_timeofban_desc +-- Without: SCAN TABLE bans +``` + +### Adding Indexes to Migrations + +For BanGUI's own `history_archive` table, indexes go in migrations via `Migration.add_table_indexes()`: + +```python +def _add_history_archive_indexes(m: Migration) -> None: + m.add_index("history_archive", ["timeofban"], unique=False, if_not_exists=True) + m.add_index("history_archive", ["jail", "timeofban"], unique=False, if_not_exists=True) +``` \ No newline at end of file diff --git a/Docs/Refactoring.md b/Docs/Refactoring.md index 5aae694..0e9e2cf 100644 --- a/Docs/Refactoring.md +++ b/Docs/Refactoring.md @@ -3,3 +3,20 @@ This document catalogues architecture violations, code smells, and structural issues found during a full project review. Issues are grouped by category and prioritised. --- + +## Security Fixes + +- Fixed open redirect vulnerability in `frontend/src/pages/LoginPage.tsx` by validating the `?next=` parameter to ensure it is a relative path (starts with `/` but not `//`). The validation regex `/^\/(?!\/)/.test(next)` prevents protocol-relative URLs and external redirects. Invalid paths fall back to `"/"`. + +--- + +## Completed Refactors + +- Moved `Fail2BanConnectionError` and `Fail2BanProtocolError` from `backend/app/utils/fail2ban_client.py` into `backend/app/exceptions.py`. Updated all router, service, and test call sites to import these domain exceptions from `app.exceptions` and retained backward compatibility through re-exporting in `app.utils.fail2ban_client`. +- Moved config file exceptions (`ConfigDirError`, `ConfigFileNotFoundError`, `ConfigFileExistsError`, `ConfigFileWriteError`, `ConfigFileNameError`) from `backend/app/services/raw_config_io_service.py` into `backend/app/exceptions.py`. 
Updated router and tests to import the shared domain exceptions from `app.exceptions`. +- Added global domain exception handlers to `backend/app/main.py` so domain exceptions like `JailNotFoundError`, `ConfigValidationError`, and `ConfigWriteError` map consistently to 404, 400, and 500 responses. +- Fixed stale activation tracking in `backend/app/routers/jail_config.py` by recording `last_activation` only after a successful jail activation and preventing a failed activation attempt from leaving a stale runtime state record. +- Fixed infinite re-fetch loop in `frontend/src/hooks/useJailConfigs.ts` by wrapping the `onSuccess` callback in `useCallback` with empty dependencies. The bug occurred because `useListData` includes `onSuccess` in its internal `refresh` function's dependency array; an inline callback created a new reference on each render, causing `refresh` to be recreated, which triggered the `useEffect` again, leading to an unbounded fetch loop. Callers of `useListData` must always wrap `onSuccess` callbacks in `useCallback` to maintain reference stability. +- **T-11 — Repository module-as-Protocol structural type-safety:** Resolved the fragile `cast()` pattern where repository modules were loosely typed against Protocol interfaces. Created a **validation script** (`backend/scripts/validate_repository_protocols.py`) that runs at CI time to ensure all repository modules satisfy their Protocol interfaces. Fixed signature mismatches in `protocols.py` to match actual implementations in `session_repo`, `settings_repo`, `blocklist_repo`, `import_log_repo`, `geo_cache_repo`, `history_archive_repo`, and `fail2ban_db_repo` (correcting return types like `dict[str, Any]` vs `dict[str, object]`, `Sequence` vs `Iterable`, and typed models). Updated `backend/app/dependencies.py` with explicit documentation linking each repository provider to the pattern explained in Backend-Development.md § 13.7.1. 
**Option B (minimal):** Instead of refactoring to class-based repositories (Option A), the pattern is now formally documented and validated, preventing silent breakage. +- **T-3 — Blocklist import flow refactoring:** Extracted the monolithic `import_source()` function (776 lines with mixed responsibilities) into focused, testable components. Created `BlocklistDownloader` (HTTP download with retry logic), `BlocklistParser` (parsing and validation), `BanExecutor` (ban execution with error handling), and `BlocklistImportWorkflow` (thin orchestrator). This separation improves testability, evolution, and error handling. Each component has a single responsibility and clear boundaries. All 53 existing tests pass; added 17 new component unit tests achieving 96%+ coverage on new modules. + diff --git a/Docs/Security.md b/Docs/Security.md new file mode 100644 index 0000000..2cda7d6 --- /dev/null +++ b/Docs/Security.md @@ -0,0 +1,176 @@ +# Security — Guidelines and Implementation + +Security considerations and implementation details for BanGUI. + +--- + +## HTTP Security Headers + +BanGUI implements defense-in-depth against client-side attacks by sending security-related HTTP response headers on all responses. + +### Headers Implemented + +| Header | Value | Purpose | +|---|---|---| +| `Content-Security-Policy` | `default-src 'self'` | Prevents XSS attacks by restricting script, style, font, image, and other resource origins to `self` only. Browsers refuse to load resources from other origins. | +| `X-Frame-Options` | `DENY` | Prevents clickjacking attacks by forbidding the page from being embedded in `