fix: retry, semaphore, reload lock, activation verify, bans_by_jail diagnostics

Stage 1.1-1.3: reload_all include/exclude_jails params already implemented;
  added keyword-arg assertions in router and service tests.

Stage 2.1/6.1: _send_command_sync retry loop (3 attempts, 150ms exp backoff)
  retrying on EAGAIN/ECONNREFUSED/ENOBUFS; immediate raise on all other errors.

Stage 2.2: asyncio.Lock at module level in jail_service.reload_all to
  serialize concurrent reload--all commands.

Stage 3.1: activate_jail re-queries _get_active_jail_names after reload;
  returns active=False with descriptive message if jail did not start.

Stage 4.1/6.2: asyncio.Semaphore (max 10) in Fail2BanClient.send, lazy-
  initialized; logs fail2ban_command_waiting_semaphore at debug when waiting.

Stage 5.1/5.2: unit tests asserting reload_all is called with include_jails
  and exclude_jails; activation verification happy/sad path tests.

Stage 6.3: TestSendCommandSyncRetry (5 cases) + TestFail2BanClientSemaphore
  concurrency test.

Stage 7.1-7.3: _since_unix uses time.time(); bans_by_jail debug logging with
  since_iso; diagnostic warning when total==0 despite table rows; unit test
  verifying the warning fires for stale data.
This commit is contained in:
2026-03-14 11:09:55 +01:00
parent 2274e20123
commit 2f2e5a7419
9 changed files with 880 additions and 115 deletions

View File

@@ -12,6 +12,7 @@ from __future__ import annotations
import asyncio
import json
import time
from datetime import UTC, datetime
from typing import TYPE_CHECKING, Any
@@ -76,6 +77,13 @@ def _origin_sql_filter(origin: BanOrigin | None) -> tuple[str, tuple[str, ...]]:
def _since_unix(range_: TimeRange) -> int:
"""Return the Unix timestamp representing the start of the time window.
Uses :func:`time.time` (always UTC epoch seconds on all platforms) to be
consistent with how fail2ban stores ``timeofban`` values in its SQLite
database. fail2ban records ``time.time()`` values directly, so
comparing against a timezone-aware ``datetime.now(UTC).timestamp()`` would
theoretically produce the same number but using :func:`time.time` avoids
any tz-aware datetime pitfalls on misconfigured systems.
Args:
range_: One of the supported time-range presets.
@@ -83,7 +91,7 @@ def _since_unix(range_: TimeRange) -> int:
Unix timestamp (seconds since epoch) equal to *now range_*.
"""
seconds: int = TIME_RANGE_SECONDS[range_]
return int(datetime.now(tz=UTC).timestamp()) - seconds
return int(time.time()) - seconds
def _ts_to_iso(unix_ts: int) -> str:
@@ -626,10 +634,11 @@ async def bans_by_jail(
origin_clause, origin_params = _origin_sql_filter(origin)
db_path: str = await _get_fail2ban_db_path(socket_path)
log.info(
log.debug(
"ban_service_bans_by_jail",
db_path=db_path,
since=since,
since_iso=_ts_to_iso(since),
range=range_,
origin=origin,
)
@@ -644,6 +653,24 @@ async def bans_by_jail(
count_row = await cur.fetchone()
total: int = int(count_row[0]) if count_row else 0
# Diagnostic guard: if zero results were returned, check whether the
# table has *any* rows and log a warning with min/max timeofban so
# operators can diagnose timezone or filter mismatches from logs.
if total == 0:
async with f2b_db.execute(
"SELECT COUNT(*), MIN(timeofban), MAX(timeofban) FROM bans"
) as cur:
diag_row = await cur.fetchone()
if diag_row and diag_row[0] > 0:
log.warning(
"ban_service_bans_by_jail_empty_despite_data",
table_row_count=diag_row[0],
min_timeofban=diag_row[1],
max_timeofban=diag_row[2],
since=since,
range=range_,
)
async with f2b_db.execute(
"SELECT jail, COUNT(*) AS cnt "
"FROM bans "
@@ -657,4 +684,9 @@ async def bans_by_jail(
jails: list[JailBanCount] = [
JailBanCount(jail=str(row["jail"]), count=int(row["cnt"])) for row in rows
]
log.debug(
"ban_service_bans_by_jail_result",
total=total,
jail_count=len(jails),
)
return BansByJailResponse(jails=jails, total=total)