# File: BanGUI/backend/app/services/jail_config_service.py (792 lines, 28 KiB, Python)
"""Jail configuration management for BanGUI.
Handles parsing, validation, and lifecycle operations (activate/deactivate)
for fail2ban jail configurations. Provides functions to discover inactive
jails, validate their configurations before activation, and manage jail
overrides in jail.d/*.local files.
"""
from __future__ import annotations
import asyncio
import contextlib
import os
import re
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, cast
import structlog
from app.exceptions import (
ConfigWriteError,
JailAlreadyActiveError,
JailAlreadyInactiveError,
JailNotFoundError,
JailNotFoundInConfigError,
)
import app.services.config_file_service as config_file_service
from app.models.config import (
ActivateJailRequest,
InactiveJail,
InactiveJailListResponse,
JailActivationResponse,
JailValidationResult,
RollbackResponse,
)
from app.tasks.health_check import run_probe
from app.utils.async_utils import run_blocking
from app.utils.fail2ban_client import Fail2BanClient
from app.utils.runtime_state import (
clear_activation_record,
clear_pending_recovery,
create_pending_recovery,
record_activation,
)
if TYPE_CHECKING: # pragma: no cover
from fastapi import FastAPI
log: structlog.stdlib.BoundLogger = structlog.get_logger()
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Timeout in seconds, presumably for fail2ban socket operations.
# NOTE(review): not referenced in the visible part of this module — confirm
# it is still used before relying on it.
_SOCKET_TIMEOUT: float = 10.0
# Sections that are not jail definitions.
_META_SECTIONS: frozenset[str] = frozenset({"INCLUDES", "DEFAULT"})
# Seconds to wait between fail2ban liveness probes after a reload.
# The probe loops sleep only before the 2nd and later attempts.
_POST_RELOAD_PROBE_INTERVAL: float = 2.0
# Maximum number of post-reload probe attempts (initial attempt + retries).
# Combined with the interval above, a probe loop sleeps at most
# (4 - 1) * 2.0 = 6 seconds in total.
_POST_RELOAD_MAX_ATTEMPTS: int = 4
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _write_local_override_sync(
    config_dir: Path,
    jail_name: str,
    enabled: bool,
    overrides: dict[str, object],
) -> None:
    """Write a ``jail.d/{name}.local`` file atomically.

    Always writes to ``jail.d/{jail_name}.local``.  If the file already
    exists it is replaced entirely.  The write is atomic: content is
    written to a temp file in the same directory first, then renamed into
    place with :func:`os.replace`.

    Args:
        config_dir: The fail2ban configuration root directory.
        jail_name: Validated jail name (used as section name and filename
            stem).
        enabled: Value to write for ``enabled =``.
        overrides: Optional setting overrides.  ``bantime``, ``findtime``,
            ``maxretry`` and ``port`` are written when present and not
            ``None``; ``logpath`` (a list of paths) is written when
            non-empty.  Any other key is ignored.

    Raises:
        ConfigWriteError: If ``jail.d`` cannot be created, if an override
            value contains a line break, or if writing the file fails.
    """
    jail_d = config_dir / "jail.d"
    try:
        jail_d.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        raise ConfigWriteError(f"Cannot create jail.d directory: {exc}") from exc

    local_path = jail_d / f"{jail_name}.local"

    def _single_line(key: str, value: object) -> str:
        # A line break inside a value would start a new option line in the
        # generated file, letting a caller smuggle in arbitrary settings.
        text = f"{value}"
        if "\n" in text or "\r" in text:
            raise ConfigWriteError(
                f"Override {key!r} for jail {jail_name!r} must not contain line breaks"
            )
        return text

    lines: list[str] = [
        "# Managed by BanGUI — do not edit manually",
        "",
        f"[{jail_name}]",
        "",
        f"enabled = {'true' if enabled else 'false'}",
        # Provide explicit banaction defaults so fail2ban can resolve the
        # %(banaction)s interpolation used in the built-in action_ chain.
        "banaction = iptables-multiport",
        "banaction_allports = iptables-allports",
    ]

    # Scalar overrides are emitted in a fixed order, skipping unset values.
    for key in ("bantime", "findtime", "maxretry", "port"):
        value = overrides.get(key)
        if value is not None:
            lines.append(f"{key} = {_single_line(key, value)}")

    if overrides.get("logpath"):
        paths: list[str] = cast("list[str]", overrides["logpath"])
        lines.append(f"logpath = {_single_line('logpath', paths[0])}")
        # Additional paths go on whitespace-prefixed continuation lines.
        lines.extend(f" {_single_line('logpath', p)}" for p in paths[1:])

    content = "\n".join(lines) + "\n"

    # ``tmp_name`` starts as None so the error handler is safe even when the
    # temp file could not be created at all.
    tmp_name: str | None = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="w",
            encoding="utf-8",
            dir=jail_d,
            delete=False,
            suffix=".tmp",
        ) as tmp:
            # Record the name before writing so a failed write is cleaned up.
            tmp_name = tmp.name
            tmp.write(content)
        os.replace(tmp_name, local_path)
    except OSError as exc:
        # Best-effort removal of the orphaned temp file.
        if tmp_name is not None:
            with contextlib.suppress(OSError):
                os.unlink(tmp_name)
        raise ConfigWriteError(f"Failed to write {local_path}: {exc}") from exc

    log.info(
        "jail_local_written",
        jail=jail_name,
        path=str(local_path),
        enabled=enabled,
    )
def _restore_local_file_sync(local_path: Path, original_content: bytes | None) -> None:
    """Restore a ``.local`` file to its pre-activation state.

    If *original_content* is ``None``, the file is deleted (it did not exist
    before the activation); a file that is already missing is not an error.
    Otherwise the original bytes are written back atomically via a temp file
    in the same directory followed by :func:`os.replace`.

    Args:
        local_path: Absolute path to the ``.local`` file to restore.
        original_content: Original raw bytes to write back, or ``None`` to
            delete the file.

    Raises:
        ConfigWriteError: If the write or delete operation fails.
    """
    if original_content is None:
        try:
            local_path.unlink(missing_ok=True)
        except OSError as exc:
            raise ConfigWriteError(f"Failed to delete {local_path} during rollback: {exc}") from exc
        return

    tmp_name: str | None = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="wb",
            dir=local_path.parent,
            delete=False,
            suffix=".tmp",
        ) as tmp:
            # Record the name *before* writing: if the write itself fails
            # (e.g. disk full) the handler below can still remove the file.
            tmp_name = tmp.name
            tmp.write(original_content)
        os.replace(tmp_name, local_path)
    except OSError as exc:
        # Best-effort removal of the orphaned temp file.
        if tmp_name is not None:
            with contextlib.suppress(OSError):
                os.unlink(tmp_name)
        raise ConfigWriteError(f"Failed to restore {local_path} during rollback: {exc}") from exc
def _validate_regex_patterns(patterns: list[str]) -> None:
    """Check that every entry of *patterns* compiles with Python's ``re``.

    Stops at the first pattern that does not compile.

    Args:
        patterns: Regex source strings to check.

    Raises:
        FilterInvalidRegexError: For the first pattern that ``re.compile``
            rejects; carries the pattern and the compiler's message.
    """
    for candidate in patterns:
        try:
            re.compile(candidate)
        except re.error as compile_err:
            # Deferred import, kept local to this error path.
            from app.exceptions import FilterInvalidRegexError

            raise FilterInvalidRegexError(candidate, str(compile_err)) from compile_err
# Shared functions from config_file_service are imported directly from the
# canonical shared helper module.
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
async def list_inactive_jails(
    config_dir: str,
    socket_path: str,
) -> InactiveJailListResponse:
    """List every configured jail that fail2ban does not report as running.

    The jail definitions come from ``config_file_service._parse_jails_sync``
    and the running set from ``config_file_service._get_active_jail_names``.
    Any defined jail missing from the running set is returned, whatever its
    ``enabled`` setting says.  Results are ordered by jail name.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.

    Returns:
        :class:`~app.models.config.InactiveJailListResponse` holding the
        inactive jails and their count.
    """
    config_root = Path(config_dir)
    parsed: tuple[dict[str, dict[str, str]], dict[str, str]] = await run_blocking(
        config_file_service._parse_jails_sync,
        config_root,
    )
    defined_jails, origins = parsed

    running: set[str] = await config_file_service._get_active_jail_names(socket_path)

    # Jails that fail2ban reports as running are left out; when no source
    # file is recorded for a jail, the config directory itself is used.
    dormant: list[InactiveJail] = [
        config_file_service.build_inactive_jail(
            jail,
            defined_jails[jail],
            origins.get(jail, config_dir),
            config_root,
        )
        for jail in sorted(defined_jails)
        if jail not in running
    ]

    log.info(
        "inactive_jails_listed",
        total_defined=len(defined_jails),
        active=len(running),
        inactive=len(dormant),
    )
    return InactiveJailListResponse(jails=dormant, total=len(dormant))
async def activate_jail(
    app: FastAPI,
    config_dir: str,
    socket_path: str,
    name: str,
    req: ActivateJailRequest,
) -> JailActivationResponse:
    """Activate a jail and keep the crash-recovery state in sync.

    Records the activation via ``record_activation`` first, then runs the
    file-based workflow in :func:`_activate_jail`.  When the result says
    fail2ban is not running, a pending-recovery entry is created for this
    jail.  A health probe is run before returning in either case.

    Args:
        app: The FastAPI application holding the runtime state.
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.
        name: Name of the jail to activate.
        req: Optional override values for the activation.

    Returns:
        The :class:`~app.models.config.JailActivationResponse` produced by
        :func:`_activate_jail`, unchanged.
    """
    started_at = record_activation(app, name)

    outcome = await _activate_jail(config_dir, socket_path, name, req)

    daemon_down = not outcome.fail2ban_running
    if daemon_down:
        create_pending_recovery(
            app,
            jail_name=name,
            activated_at=started_at,
        )

    # Refresh the cached health status right away.
    await run_probe(app)
    return outcome
async def _activate_jail(
    config_dir: str,
    socket_path: str,
    name: str,
    req: ActivateJailRequest,
) -> JailActivationResponse:
    """Enable an inactive jail and reload fail2ban.

    Performs pre-activation validation, writes ``enabled = true`` (plus any
    override values from *req*) to ``jail.d/{name}.local``, and triggers a
    fail2ban reload.  After the reload a multi-attempt health probe
    determines whether fail2ban (and the specific jail) are running.  Every
    failure after the file was written rolls the ``.local`` file back to its
    previous content and is reported through the returned response rather
    than raised.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.
        name: Name of the jail to activate.  Must exist in the parsed config.
        req: Optional override values to write alongside ``enabled = true``.

    Returns:
        :class:`~app.models.config.JailActivationResponse` including
        ``fail2ban_running`` and ``validation_warnings`` fields.  ``active``
        is ``True`` only when the jail shows up as running after the reload.

    Raises:
        JailNameError: If *name* contains invalid characters.
        JailNotFoundInConfigError: If *name* is not defined in any config file.
        JailAlreadyActiveError: If fail2ban already reports *name* as running.
        ConfigWriteError: If writing the ``.local`` file fails.

    NOTE(review): exceptions from the reload call are caught here, so a
    connection error during reload is *not* raised.  Whether the active-jail
    queries can raise a connection error is not visible in this module.
    """
    config_file_service.safe_jail_name(name)

    all_jails, _source_files = await run_blocking(config_file_service._parse_jails_sync, Path(config_dir))
    if name not in all_jails:
        raise JailNotFoundInConfigError(name)

    active_names = await config_file_service._get_active_jail_names(socket_path)
    if name in active_names:
        raise JailAlreadyActiveError(name)

    # ---------------------------------------------------------------------- #
    # Pre-activation validation — every issue becomes a warning; only        #
    # filter/logpath issues block the activation.                            #
    # ---------------------------------------------------------------------- #
    validation_result: JailValidationResult = await run_blocking(
        config_file_service._validate_jail_config_sync,
        Path(config_dir),
        name,
    )
    warnings: list[str] = [f"{i.field}: {i.message}" for i in validation_result.issues]
    if warnings:
        log.warning(
            "jail_activation_validation_warnings",
            jail=name,
            warnings=warnings,
        )

    # Block activation on critical validation failures (missing filter or logpath).
    blocking = [i for i in validation_result.issues if i.field in ("filter", "logpath")]
    if blocking:
        log.warning(
            "jail_activation_blocked",
            jail=name,
            issues=[f"{i.field}: {i.message}" for i in blocking],
        )
        # Nothing was written or reloaded, so the daemon state is untouched.
        return JailActivationResponse(
            name=name,
            active=False,
            fail2ban_running=True,
            validation_warnings=warnings,
            message=(f"Jail {name!r} cannot be activated: " + "; ".join(i.message for i in blocking)),
        )

    overrides: dict[str, object] = {
        "bantime": req.bantime,
        "findtime": req.findtime,
        "maxretry": req.maxretry,
        "port": req.port,
        "logpath": req.logpath,
    }

    # ---------------------------------------------------------------------- #
    # Backup the existing .local file (if any) before overwriting it so that #
    # we can restore it if activation fails.                                 #
    # ---------------------------------------------------------------------- #
    local_path = Path(config_dir) / "jail.d" / f"{name}.local"

    def _read_backup() -> bytes | None:
        # Read first and treat "missing" as "no backup": a separate exists()
        # check could race with the file being removed in between.
        try:
            return local_path.read_bytes()
        except (FileNotFoundError, NotADirectoryError):
            return None

    original_content: bytes | None = await run_blocking(_read_backup)

    await run_blocking(
        _write_local_override_sync,
        Path(config_dir),
        name,
        True,
        overrides,
    )

    # ---------------------------------------------------------------------- #
    # Activation reload — if it fails, roll back immediately                 #
    # ---------------------------------------------------------------------- #
    try:
        await config_file_service.jail_service.reload_all(socket_path, include_jails=[name])
    except JailNotFoundError as exc:
        # Jail configuration is invalid (e.g. missing logpath that prevents
        # fail2ban from loading the jail).  Roll back and provide a specific error.
        log.warning(
            "reload_after_activate_failed_jail_not_found",
            jail=name,
            error=str(exc),
        )
        return await _rollback_and_report_activation_failure(
            config_dir,
            socket_path,
            name,
            original_content,
            warnings,
            fail2ban_running=False,
            message_prefix=(
                f"Jail {name!r} activation failed: {str(exc)}. "
                "Check that all logpath files exist and are readable. "
                "The configuration was "
            ),
        )
    except Exception as exc:  # noqa: BLE001
        log.warning("reload_after_activate_failed", jail=name, error=str(exc))
        return await _rollback_and_report_activation_failure(
            config_dir,
            socket_path,
            name,
            original_content,
            warnings,
            fail2ban_running=False,
            message_prefix=(f"Jail {name!r} activation failed during reload and the " "configuration was "),
        )

    # ---------------------------------------------------------------------- #
    # Post-reload health probe with retries                                  #
    # ---------------------------------------------------------------------- #
    fail2ban_running = False
    for attempt in range(_POST_RELOAD_MAX_ATTEMPTS):
        # The first probe is immediate; later ones wait for the interval.
        if attempt > 0:
            await asyncio.sleep(_POST_RELOAD_PROBE_INTERVAL)
        if await config_file_service._probe_fail2ban_running(socket_path):
            fail2ban_running = True
            break

    if not fail2ban_running:
        log.warning(
            "fail2ban_down_after_activate",
            jail=name,
            message="fail2ban socket unreachable after reload — initiating rollback.",
        )
        return await _rollback_and_report_activation_failure(
            config_dir,
            socket_path,
            name,
            original_content,
            warnings,
            fail2ban_running=False,
            message_prefix=(
                f"Jail {name!r} activation failed: fail2ban stopped responding "
                "after reload. The configuration was "
            ),
        )

    # Verify the jail actually started (config error may prevent it silently).
    post_reload_names = await config_file_service._get_active_jail_names(socket_path)
    actually_running = name in post_reload_names
    if not actually_running:
        log.warning(
            "jail_activation_unverified",
            jail=name,
            message="Jail did not appear in running jails — initiating rollback.",
        )
        return await _rollback_and_report_activation_failure(
            config_dir,
            socket_path,
            name,
            original_content,
            warnings,
            fail2ban_running=True,
            message_prefix=(
                f"Jail {name!r} was written to config but did not start after "
                "reload. The configuration was "
            ),
        )

    log.info("jail_activated", jail=name)
    return JailActivationResponse(
        name=name,
        active=True,
        fail2ban_running=True,
        validation_warnings=warnings,
        message=f"Jail {name!r} activated successfully.",
    )


async def _rollback_and_report_activation_failure(
    config_dir: str,
    socket_path: str,
    name: str,
    original_content: bytes | None,
    warnings: list[str],
    *,
    fail2ban_running: bool,
    message_prefix: str,
) -> JailActivationResponse:
    """Roll back a failed activation and build the matching failure response.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.
        name: Name of the jail whose activation failed.
        original_content: Pre-activation bytes of the ``.local`` file, or
            ``None`` if it did not exist.
        warnings: Validation warnings to carry into the response.
        fail2ban_running: Value to report in the response's
            ``fail2ban_running`` field.
        message_prefix: Start of the user-facing message; the recovery
            outcome sentence is appended to it.

    Returns:
        A :class:`~app.models.config.JailActivationResponse` with
        ``active=False`` and ``recovered`` set from the rollback result.
    """
    recovered = await _rollback_activation_async(config_dir, name, socket_path, original_content)
    outcome = "automatically recovered." if recovered else "not recovered — manual intervention is required."
    return JailActivationResponse(
        name=name,
        active=False,
        fail2ban_running=fail2ban_running,
        recovered=recovered,
        validation_warnings=warnings,
        message=message_prefix + outcome,
    )
async def _rollback_activation_async(
    config_dir: str,
    name: str,
    socket_path: str,
    original_content: bytes | None,
) -> bool:
    """Undo a failed activation and check that fail2ban is responsive again.

    Used by the activation workflow once the jail's ``.local`` file has
    already been written.  The steps run in order and the first failure
    aborts the rollback:

    1. Put the original file content back (or delete the file when it was
       created by the activation attempt).
    2. Reload fail2ban so it picks up the restored configuration.
    3. Probe fail2ban, up to ``_POST_RELOAD_MAX_ATTEMPTS`` times, waiting
       ``_POST_RELOAD_PROBE_INTERVAL`` seconds between attempts.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        name: Name of the jail whose ``.local`` file should be restored.
        socket_path: Path to the fail2ban Unix domain socket.
        original_content: Raw bytes of the original ``.local`` file, or
            ``None`` if the file did not exist before the activation.

    Returns:
        ``True`` if a probe succeeded after the rollback; ``False`` if the
        restore failed, the reload failed, or no probe succeeded.
    """
    override_file = Path(config_dir) / "jail.d" / f"{name}.local"

    # Step 1 — restore original file (or delete it).
    try:
        await run_blocking(_restore_local_file_sync, override_file, original_content)
        log.info("jail_activation_rollback_file_restored", jail=name)
    except ConfigWriteError as restore_err:
        log.error("jail_activation_rollback_restore_failed", jail=name, error=str(restore_err))
        return False

    # Step 2 — reload fail2ban with the restored config.
    try:
        await config_file_service.jail_service.reload_all(socket_path)
        log.info("jail_activation_rollback_reload_ok", jail=name)
    except Exception as reload_err:  # noqa: BLE001
        log.warning("jail_activation_rollback_reload_failed", jail=name, error=str(reload_err))
        return False

    # Step 3 — wait for fail2ban to come back; the first probe is immediate.
    for attempt_no in range(_POST_RELOAD_MAX_ATTEMPTS):
        if attempt_no:
            await asyncio.sleep(_POST_RELOAD_PROBE_INTERVAL)
        responsive = await config_file_service._probe_fail2ban_running(socket_path)
        if responsive:
            log.info("jail_activation_rollback_recovered", jail=name)
            return True

    log.warning("jail_activation_rollback_still_down", jail=name)
    return False
async def deactivate_jail(
    app: FastAPI,
    config_dir: str,
    socket_path: str,
    name: str,
) -> JailActivationResponse:
    """Deactivate a jail, then refresh the health-check state.

    Delegates the config change and reload to :func:`_deactivate_jail` and
    runs a health probe afterwards.  If the delegate raises, no probe is
    run and the exception propagates.

    Args:
        app: The FastAPI application passed to the health probe.
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.
        name: Name of the jail to deactivate.

    Returns:
        The response produced by :func:`_deactivate_jail`, unchanged.
    """
    outcome = await _deactivate_jail(config_dir, socket_path, name)
    await run_probe(app)
    return outcome
async def _deactivate_jail(
    config_dir: str,
    socket_path: str,
    name: str,
) -> JailActivationResponse:
    """Disable an active jail and reload fail2ban.

    Writes ``enabled = false`` to ``jail.d/{name}.local`` and triggers a
    fail2ban reload that excludes the jail.  A failing reload is logged and
    reported in the response message instead of being raised; the config
    file stays written in that case.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.
        name: Name of the jail to deactivate.  Must exist in the parsed config.

    Returns:
        :class:`~app.models.config.JailActivationResponse` with
        ``active=False``.  The message states whether the reload succeeded.

    Raises:
        JailNameError: If *name* contains invalid characters.
        JailNotFoundInConfigError: If *name* is not defined in any config file.
        JailAlreadyInactiveError: If fail2ban already reports *name* as not
            running.
        ConfigWriteError: If writing the ``.local`` file fails.
    """
    config_file_service.safe_jail_name(name)

    all_jails, _source_files = await run_blocking(config_file_service._parse_jails_sync, Path(config_dir))
    if name not in all_jails:
        raise JailNotFoundInConfigError(name)

    active_names = await config_file_service._get_active_jail_names(socket_path)
    if name not in active_names:
        raise JailAlreadyInactiveError(name)

    await run_blocking(
        _write_local_override_sync,
        Path(config_dir),
        name,
        False,
        {},
    )

    try:
        await config_file_service.jail_service.reload_all(socket_path, exclude_jails=[name])
    except Exception as exc:  # noqa: BLE001
        # Best effort: the override is already on disk, so keep going, but do
        # not claim success — the jail may still be running until a reload
        # goes through.
        log.warning("reload_after_deactivate_failed", jail=name, error=str(exc))
        return JailActivationResponse(
            name=name,
            active=False,
            message=(
                f"Jail {name!r} was disabled in the configuration, but reloading "
                f"fail2ban failed: {exc}. The change takes effect after the next "
                "successful reload."
            ),
        )

    log.info("jail_deactivated", jail=name)
    return JailActivationResponse(
        name=name,
        active=False,
        message=f"Jail {name!r} deactivated successfully.",
    )
async def delete_jail_local_override(
    config_dir: str,
    socket_path: str,
    name: str,
) -> None:
    """Remove the ``jail.d/{name}.local`` override of a jail that is not running.

    Intended as a clean-up step for an inactive jail that still carries a
    ``.local`` override (e.g. ``enabled = false``).  The file is simply
    deleted and no fail2ban reload is triggered.  A file that does not
    exist is not treated as an error.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.
        name: Name of the jail whose ``.local`` file should be removed.

    Raises:
        JailNameError: If *name* contains invalid characters.
        JailNotFoundInConfigError: If *name* is not defined in any config file.
        JailAlreadyActiveError: If the jail is currently active (refusing to
            delete the live config file).
        ConfigWriteError: If the file cannot be deleted.
    """
    config_file_service.safe_jail_name(name)

    defined_jails, _origins = await run_blocking(config_file_service._parse_jails_sync, Path(config_dir))
    if name not in defined_jails:
        raise JailNotFoundInConfigError(name)

    running = await config_file_service._get_active_jail_names(socket_path)
    if name in running:
        raise JailAlreadyActiveError(name)

    override_file = Path(config_dir) / "jail.d" / f"{name}.local"

    def _remove_override() -> None:
        override_file.unlink(missing_ok=True)

    try:
        await run_blocking(_remove_override)
    except OSError as unlink_err:
        raise ConfigWriteError(f"Failed to delete {override_file}: {unlink_err}") from unlink_err

    log.info("jail_local_override_deleted", jail=name, path=str(override_file))
async def validate_jail_config(
    config_dir: str,
    name: str,
) -> JailValidationResult:
    """Validate a jail's configuration before it is activated.

    Checks the jail name, then runs
    ``config_file_service._validate_jail_config_sync`` through
    ``run_blocking``.

    NOTE(review): per the previous docstring the checks cover filter/action
    file existence, regex compilation and log path existence — those live
    in ``config_file_service`` and are not visible here.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        name: Name of the jail to validate.

    Returns:
        :class:`~app.models.config.JailValidationResult` with any issues found.

    Raises:
        JailNameError: If *name* contains invalid characters.
    """
    config_file_service.safe_jail_name(name)
    config_root = Path(config_dir)
    verdict = await run_blocking(
        config_file_service._validate_jail_config_sync,
        config_root,
        name,
    )
    return verdict
async def rollback_jail(
    app: FastAPI,
    config_dir: str,
    socket_path: str,
    name: str,
    start_cmd_parts: list[str],
) -> RollbackResponse:
    """Roll a jail back and clear the recovery state once fail2ban is up.

    Delegates to :func:`_rollback_jail`.  The pending-recovery entry and the
    activation record are cleared only when the result reports fail2ban as
    running; otherwise both are left in place.

    Args:
        app: The FastAPI application holding the runtime state.
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.
        name: Name of the jail to disable.
        start_cmd_parts: Argument list for the daemon start command.

    Returns:
        The :class:`~app.models.config.RollbackResponse` from
        :func:`_rollback_jail`, unchanged.
    """
    outcome = await _rollback_jail(config_dir, socket_path, name, start_cmd_parts)
    if not outcome.fail2ban_running:
        return outcome

    clear_pending_recovery(app)
    clear_activation_record(app)
    return outcome
async def _rollback_jail(
    config_dir: str,
    socket_path: str,
    name: str,
    start_cmd_parts: list[str],
) -> RollbackResponse:
    """Switch off a bad jail config and try to bring fail2ban back up.

    First writes ``enabled = false`` to ``jail.d/{name}.local`` — a plain
    file write, so it works while the daemon is down.  Then runs the start
    command given in *start_cmd_parts* and waits up to 10 seconds, polling
    every 2 seconds, for the socket to respond.

    Args:
        config_dir: Absolute path to the fail2ban configuration directory.
        socket_path: Path to the fail2ban Unix domain socket.
        name: Name of the jail to disable.
        start_cmd_parts: Argument list for the daemon start command, e.g.
            ``["fail2ban-client", "start"]``.

    Returns:
        :class:`~app.models.config.RollbackResponse` with ``disabled=True``;
        ``fail2ban_running`` and ``active_jails`` reflect the wait result.

    Raises:
        JailNameError: If *name* contains invalid characters.
        ConfigWriteError: If writing the ``.local`` file fails.
    """
    config_file_service.safe_jail_name(name)

    # Write enabled=false — this must succeed even when fail2ban is down.
    await run_blocking(_write_local_override_sync, Path(config_dir), name, False, {})
    log.info("jail_rolled_back_disabled", jail=name)

    # Attempt to start the daemon; the outcome is only logged.
    start_ok = await config_file_service.start_daemon(start_cmd_parts)
    log.info("jail_rollback_start_attempted", jail=name, start_ok=start_ok)

    # Wait for the socket to come back.
    daemon_up = await config_file_service.wait_for_fail2ban(
        socket_path,
        max_wait_seconds=10.0,
        poll_interval=2.0,
    )

    if not daemon_up:
        log.warning("jail_rollback_fail2ban_still_down", jail=name)
        return RollbackResponse(
            jail_name=name,
            disabled=True,
            fail2ban_running=False,
            active_jails=0,
            message=(
                f"Jail {name!r} was disabled but fail2ban did not come back online. "
                "Check the fail2ban log for additional errors."
            ),
        )

    running_names = await config_file_service._get_active_jail_names(socket_path)
    running_count = len(running_names)
    log.info("jail_rollback_success", jail=name, active_jails=running_count)
    return RollbackResponse(
        jail_name=name,
        disabled=True,
        fail2ban_running=True,
        active_jails=running_count,
        message=(f"Jail {name!r} disabled and fail2ban restarted successfully with {running_count} active jail(s)."),
    )