fix: retry, semaphore, reload lock, activation verify, bans_by_jail diagnostics
Stage 1.1-1.3: reload_all include/exclude_jails params already implemented; added keyword-arg assertions in router and service tests. Stage 2.1/6.1: _send_command_sync retry loop (3 attempts, 150ms exp backoff) retrying on EAGAIN/ECONNREFUSED/ENOBUFS; immediate raise on all other errors. Stage 2.2: asyncio.Lock at module level in jail_service.reload_all to serialize concurrent reload--all commands. Stage 3.1: activate_jail re-queries _get_active_jail_names after reload; returns active=False with descriptive message if jail did not start. Stage 4.1/6.2: asyncio.Semaphore (max 10) in Fail2BanClient.send, lazy- initialized; logs fail2ban_command_waiting_semaphore at debug when waiting. Stage 5.1/5.2: unit tests asserting reload_all is called with include_jails and exclude_jails; activation verification happy/sad path tests. Stage 6.3: TestSendCommandSyncRetry (5 cases) + TestFail2BanClientSemaphore concurrency test. Stage 7.1-7.3: _since_unix uses time.time(); bans_by_jail debug logging with since_iso; diagnostic warning when total==0 despite table rows; unit test verifying the warning fires for stale data.
This commit is contained in:
@@ -12,6 +12,7 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from datetime import UTC, datetime
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
@@ -76,6 +77,13 @@ def _origin_sql_filter(origin: BanOrigin | None) -> tuple[str, tuple[str, ...]]:
|
||||
def _since_unix(range_: TimeRange) -> int:
|
||||
"""Return the Unix timestamp representing the start of the time window.
|
||||
|
||||
Uses :func:`time.time` (always UTC epoch seconds on all platforms) to be
|
||||
consistent with how fail2ban stores ``timeofban`` values in its SQLite
|
||||
database. fail2ban records ``time.time()`` values directly, so
|
||||
comparing against a timezone-aware ``datetime.now(UTC).timestamp()`` would
|
||||
theoretically produce the same number but using :func:`time.time` avoids
|
||||
any tz-aware datetime pitfalls on misconfigured systems.
|
||||
|
||||
Args:
|
||||
range_: One of the supported time-range presets.
|
||||
|
||||
@@ -83,7 +91,7 @@ def _since_unix(range_: TimeRange) -> int:
|
||||
Unix timestamp (seconds since epoch) equal to *now − range_*.
|
||||
"""
|
||||
seconds: int = TIME_RANGE_SECONDS[range_]
|
||||
return int(datetime.now(tz=UTC).timestamp()) - seconds
|
||||
return int(time.time()) - seconds
|
||||
|
||||
|
||||
def _ts_to_iso(unix_ts: int) -> str:
|
||||
@@ -626,10 +634,11 @@ async def bans_by_jail(
|
||||
origin_clause, origin_params = _origin_sql_filter(origin)
|
||||
|
||||
db_path: str = await _get_fail2ban_db_path(socket_path)
|
||||
log.info(
|
||||
log.debug(
|
||||
"ban_service_bans_by_jail",
|
||||
db_path=db_path,
|
||||
since=since,
|
||||
since_iso=_ts_to_iso(since),
|
||||
range=range_,
|
||||
origin=origin,
|
||||
)
|
||||
@@ -644,6 +653,24 @@ async def bans_by_jail(
|
||||
count_row = await cur.fetchone()
|
||||
total: int = int(count_row[0]) if count_row else 0
|
||||
|
||||
# Diagnostic guard: if zero results were returned, check whether the
|
||||
# table has *any* rows and log a warning with min/max timeofban so
|
||||
# operators can diagnose timezone or filter mismatches from logs.
|
||||
if total == 0:
|
||||
async with f2b_db.execute(
|
||||
"SELECT COUNT(*), MIN(timeofban), MAX(timeofban) FROM bans"
|
||||
) as cur:
|
||||
diag_row = await cur.fetchone()
|
||||
if diag_row and diag_row[0] > 0:
|
||||
log.warning(
|
||||
"ban_service_bans_by_jail_empty_despite_data",
|
||||
table_row_count=diag_row[0],
|
||||
min_timeofban=diag_row[1],
|
||||
max_timeofban=diag_row[2],
|
||||
since=since,
|
||||
range=range_,
|
||||
)
|
||||
|
||||
async with f2b_db.execute(
|
||||
"SELECT jail, COUNT(*) AS cnt "
|
||||
"FROM bans "
|
||||
@@ -657,4 +684,9 @@ async def bans_by_jail(
|
||||
jails: list[JailBanCount] = [
|
||||
JailBanCount(jail=str(row["jail"]), count=int(row["cnt"])) for row in rows
|
||||
]
|
||||
log.debug(
|
||||
"ban_service_bans_by_jail_result",
|
||||
total=total,
|
||||
jail_count=len(jails),
|
||||
)
|
||||
return BansByJailResponse(jails=jails, total=total)
|
||||
|
||||
@@ -899,10 +899,30 @@ async def activate_jail(
|
||||
)
|
||||
|
||||
try:
|
||||
await jail_service.reload_all(socket_path)
|
||||
await jail_service.reload_all(socket_path, include_jails=[name])
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning("reload_after_activate_failed", jail=name, error=str(exc))
|
||||
|
||||
# Verify the jail actually started after the reload. A config error
|
||||
# (bad regex, missing log file, etc.) may silently prevent fail2ban from
|
||||
# starting the jail even though the reload command succeeded.
|
||||
post_reload_names = await _get_active_jail_names(socket_path)
|
||||
actually_running = name in post_reload_names
|
||||
if not actually_running:
|
||||
log.warning(
|
||||
"jail_activation_unverified",
|
||||
jail=name,
|
||||
message="Jail did not appear in running jails after reload.",
|
||||
)
|
||||
return JailActivationResponse(
|
||||
name=name,
|
||||
active=False,
|
||||
message=(
|
||||
f"Jail {name!r} was written to config but did not start after "
|
||||
"reload — check the jail configuration (filters, log paths, regex)."
|
||||
),
|
||||
)
|
||||
|
||||
log.info("jail_activated", jail=name)
|
||||
return JailActivationResponse(
|
||||
name=name,
|
||||
@@ -962,7 +982,7 @@ async def deactivate_jail(
|
||||
)
|
||||
|
||||
try:
|
||||
await jail_service.reload_all(socket_path)
|
||||
await jail_service.reload_all(socket_path, exclude_jails=[name])
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning("reload_after_deactivate_failed", jail=name, error=str(exc))
|
||||
|
||||
|
||||
@@ -37,6 +37,12 @@ log: structlog.stdlib.BoundLogger = structlog.get_logger()
|
||||
|
||||
_SOCKET_TIMEOUT: float = 10.0
|
||||
|
||||
# Guard against concurrent reload_all calls. Overlapping ``reload --all``
|
||||
# commands sent to fail2ban's socket produce undefined behaviour and may cause
|
||||
# jails to be permanently removed from the daemon. Serialising them here
|
||||
# ensures only one reload stream is in-flight at a time.
|
||||
_reload_all_lock: asyncio.Lock = asyncio.Lock()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Custom exceptions
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -540,7 +546,12 @@ async def reload_jail(socket_path: str, name: str) -> None:
|
||||
raise JailOperationError(str(exc)) from exc
|
||||
|
||||
|
||||
async def reload_all(socket_path: str) -> None:
|
||||
async def reload_all(
|
||||
socket_path: str,
|
||||
*,
|
||||
include_jails: list[str] | None = None,
|
||||
exclude_jails: list[str] | None = None,
|
||||
) -> None:
|
||||
"""Reload all fail2ban jails at once.
|
||||
|
||||
Fetches the current jail list first so that a ``['start', name]`` entry
|
||||
@@ -548,8 +559,14 @@ async def reload_all(socket_path: str) -> None:
|
||||
non-empty stream the end-of-reload phase deletes every jail that received
|
||||
no configuration commands.
|
||||
|
||||
*include_jails* are added to the stream (e.g. a newly activated jail that
|
||||
is not yet running). *exclude_jails* are removed from the stream (e.g. a
|
||||
jail that was just deactivated and should not be restarted).
|
||||
|
||||
Args:
|
||||
socket_path: Path to the fail2ban Unix domain socket.
|
||||
include_jails: Extra jail names to add to the start stream.
|
||||
exclude_jails: Jail names to remove from the start stream.
|
||||
|
||||
Raises:
|
||||
JailOperationError: If fail2ban reports the operation failed.
|
||||
@@ -557,17 +574,26 @@ async def reload_all(socket_path: str) -> None:
|
||||
cannot be reached.
|
||||
"""
|
||||
client = Fail2BanClient(socket_path=socket_path, timeout=_SOCKET_TIMEOUT)
|
||||
try:
|
||||
# Resolve jail names so we can build the minimal config stream.
|
||||
status_raw = _ok(await client.send(["status"]))
|
||||
status_dict = _to_dict(status_raw)
|
||||
jail_list_raw: str = str(status_dict.get("Jail list", ""))
|
||||
jail_names = [n.strip() for n in jail_list_raw.split(",") if n.strip()]
|
||||
stream: list[list[str]] = [["start", n] for n in jail_names]
|
||||
_ok(await client.send(["reload", "--all", [], stream]))
|
||||
log.info("all_jails_reloaded")
|
||||
except ValueError as exc:
|
||||
raise JailOperationError(str(exc)) from exc
|
||||
async with _reload_all_lock:
|
||||
try:
|
||||
# Resolve jail names so we can build the minimal config stream.
|
||||
status_raw = _ok(await client.send(["status"]))
|
||||
status_dict = _to_dict(status_raw)
|
||||
jail_list_raw: str = str(status_dict.get("Jail list", ""))
|
||||
jail_names = [n.strip() for n in jail_list_raw.split(",") if n.strip()]
|
||||
|
||||
# Merge include/exclude sets so the stream matches the desired state.
|
||||
names_set: set[str] = set(jail_names)
|
||||
if include_jails:
|
||||
names_set.update(include_jails)
|
||||
if exclude_jails:
|
||||
names_set -= set(exclude_jails)
|
||||
|
||||
stream: list[list[str]] = [["start", n] for n in sorted(names_set)]
|
||||
_ok(await client.send(["reload", "--all", [], stream]))
|
||||
log.info("all_jails_reloaded")
|
||||
except ValueError as exc:
|
||||
raise JailOperationError(str(exc)) from exc
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user