Files
BanGUI/backend/app/services/health_service.py
Lukas 0d5882b32f Fix HIGH priority issues: unbounded queries, rate limiting, health checks
Issue #3 - Unbounded Query Results (OOM):
- get_all_archived_history() now uses keyset pagination with bounded max_rows (50k default)
- Added 'id' field to records from get_archived_history() and get_archived_history_keyset()
- Protocol signature updated with page_size, max_rows, last_ban_id params

Issue #7 - Docker Health Check Fails:
- Added curl to Dockerfile.backend runtime image
- HEALTHCHECK now uses 'curl -f http://localhost:8000/api/health'
- compose.prod.yml: increased start_period to 40s, timeout to 10s
- Frontend healthcheck proxies to backend /api/health

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-01 21:47:36 +02:00

232 lines
7.6 KiB
Python

"""Health service.
Probes the fail2ban socket to determine whether the daemon is reachable and
collects aggregated server statistics (version, jail count, ban counts).
The probe is intentionally lightweight — it is meant to be called every 30
seconds by the background health-check task, not on every HTTP request.
"""
from __future__ import annotations
import asyncio
from collections.abc import Awaitable, Callable
from typing import TypeVar, cast
import structlog
from app import __version__
from app.models.config_domain import DomainServiceStatus
from app.models.server import ServerStatus
from app.utils.constants import FAIL2BAN_SOCKET_TIMEOUT_FAST
from app.utils.fail2ban_client import (
Fail2BanClient,
Fail2BanCommand,
Fail2BanConnectionError,
Fail2BanProtocolError,
)
from app.utils.fail2ban_response import (
ok,
to_dict,
)
log: structlog.stdlib.BoundLogger = structlog.get_logger()
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
T = TypeVar("T")
async def _safe_get(
client: Fail2BanClient,
command: Fail2BanCommand,
default: object | None = None,
) -> object | None:
"""Send a command and return *default* if it fails."""
try:
return ok(await client.send(command))
except (
Fail2BanConnectionError,
Fail2BanProtocolError,
ValueError,
OSError,
):
return default
async def _safe_get_typed(
client: Fail2BanClient,
command: Fail2BanCommand,
default: T,
) -> T:
"""Send a command and return the result typed as ``default``'s type."""
return cast("T", await _safe_get(client, command, default))
async def get_service_status(
socket_path: str,
probe_fn: Callable[[str], Awaitable[ServerStatus]] | None = None,
) -> DomainServiceStatus:
"""Return fail2ban service health status with log configuration.
Delegates to an injectable *probe_fn* (defaults to
:func:`~app.services.health_service.probe`).
Args:
socket_path: Path to the fail2ban Unix domain socket.
probe_fn: Optional probe function.
Returns:
:class:`~app.models.config_domain.DomainServiceStatus`.
"""
if probe_fn is None:
raise ValueError(
"probe_fn is required to avoid service-to-service coupling"
)
server_status = await probe_fn(socket_path)
if server_status.online:
client = Fail2BanClient(
socket_path=socket_path,
timeout=FAIL2BAN_SOCKET_TIMEOUT_FAST,
)
log_level_raw, log_target_raw = await asyncio.gather(
_safe_get_typed(client, ["get", "loglevel"], "INFO"),
_safe_get_typed(client, ["get", "logtarget"], "STDOUT"),
)
log_level = str(log_level_raw or "INFO").upper()
log_target = str(log_target_raw or "STDOUT")
else:
log_level = "UNKNOWN"
log_target = "UNKNOWN"
log.info(
"service_status_fetched",
online=server_status.online,
jail_count=server_status.active_jails,
)
return DomainServiceStatus(
online=server_status.online,
version=__version__,
jail_count=server_status.active_jails,
total_bans=server_status.total_bans,
total_failures=server_status.total_failures,
log_level=log_level,
log_target=log_target,
)
# ---------------------------------------------------------------------------
# Public interface
# ---------------------------------------------------------------------------
async def probe(
socket_path: str,
timeout: float = FAIL2BAN_SOCKET_TIMEOUT_FAST,
) -> ServerStatus:
"""Probe the fail2ban daemon and return a
:class:`~app.models.server.ServerStatus`.
Sends ``ping``, ``version``, ``status``, and per-jail ``status <jail>``
commands. Any socket or protocol error is caught and results in an
``online=False`` status so the dashboard can always return a safe default.
Args:
socket_path: Path to the fail2ban Unix domain socket.
timeout: Per-command socket timeout in seconds.
Returns:
A :class:`~app.models.server.ServerStatus` snapshot. ``online`` is
``True`` when the daemon is reachable, ``False`` otherwise.
"""
client = Fail2BanClient(socket_path=socket_path, timeout=timeout)
try:
# ------------------------------------------------------------------ #
# 1. Connectivity check #
# ------------------------------------------------------------------ #
ping_data = ok(await client.send(["ping"]))
if ping_data != "pong":
log.warning(
"fail2ban_unexpected_ping_response",
response=ping_data,
)
return ServerStatus(online=False)
# ------------------------------------------------------------------ #
# 2. Version
# ------------------------------------------------------------------ #
try:
version: str | None = str(ok(await client.send(["version"])))
except (ValueError, TypeError):
version = None
# ------------------------------------------------------------------ #
# 3. Global status — jail count and names #
# ------------------------------------------------------------------ #
status_data = to_dict(ok(await client.send(["status"])))
active_jails: int = int(str(status_data.get("Number of jail", 0) or 0))
jail_list_raw: str = str(
status_data.get("Jail list", "") or ""
).strip()
jail_names: list[str] = (
[j.strip() for j in jail_list_raw.split(",") if j.strip()]
if jail_list_raw
else []
)
# ------------------------------------------------------------------ #
# 4. Per-jail aggregation #
# ------------------------------------------------------------------ #
total_bans: int = 0
total_failures: int = 0
for jail_name in jail_names:
try:
jail_resp = to_dict(
ok(await client.send(["status", jail_name]))
)
filter_stats = to_dict(jail_resp.get("Filter") or [])
action_stats = to_dict(jail_resp.get("Actions") or [])
total_failures += int(
str(filter_stats.get("Currently failed", 0) or 0)
)
total_bans += int(
str(action_stats.get("Currently banned", 0) or 0)
)
except (ValueError, TypeError, KeyError) as exc:
log.warning(
"fail2ban_jail_status_parse_error",
jail=jail_name,
error=str(exc),
)
log.debug(
"fail2ban_probe_ok",
version=version,
active_jails=active_jails,
total_bans=total_bans,
total_failures=total_failures,
)
return ServerStatus(
online=True,
version=version,
active_jails=active_jails,
total_bans=total_bans,
total_failures=total_failures,
)
except (Fail2BanConnectionError, Fail2BanProtocolError) as exc:
log.warning("fail2ban_probe_failed", error=str(exc))
return ServerStatus(online=False)
except ValueError as exc:
log.error("fail2ban_probe_parse_error", error=str(exc))
return ServerStatus(online=False)