refactor(ban_service): extract _bans_by_country_load_data helper

Break up the long bans_by_country function into focused helpers. Data-loading logic is now separate from aggregation.
This commit is contained in:
2026-05-03 17:00:34 +02:00
parent 5058a50143
commit 2df029f7e8
8 changed files with 458 additions and 321 deletions

View File

@@ -13,7 +13,7 @@ from __future__ import annotations
import asyncio
import contextlib
import ipaddress
from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING, Any, cast
import aiohttp
import structlog
@@ -514,6 +514,262 @@ async def list_bans(
# Cap on the number of companion ban rows requested per query; passed as
# ``limit`` / ``page_size`` to the repository calls that load companion rows.
_MAX_COMPANION_BANS: int = 200
# ---------------------------------------------------------------------------
# bans_by_country — implementation helpers
# ---------------------------------------------------------------------------
async def _bans_by_country_load_data(
    *,
    source: str,
    socket_path: str,
    since: int,
    origin: BanOrigin | None,
    history_archive_repo: HistoryArchiveRepository,
    app_db: aiosqlite.Connection | None,
) -> tuple[dict[str, int], int, list[str]]:
    """Fetch per-IP ban event counts and the overall total for a time window.

    Args:
        source: ``"archive"`` reads from the history archive via
            *history_archive_repo*; any other value reads from the fail2ban
            database located through *socket_path*.
        socket_path: Socket path used to resolve the fail2ban database path.
        since: Lower time bound forwarded unchanged to the repository queries.
        origin: Optional origin filter forwarded to the repository queries.
        history_archive_repo: Repository queried when *source* is ``"archive"``.
        app_db: Application database connection; required for ``"archive"``.

    Returns:
        ``(counts_by_ip, total, unique_ips)`` where ``counts_by_ip`` maps each
        IP to its event count and ``unique_ips`` lists the keys of that
        mapping.  For the archive source ``total`` is the sum of the event
        counts; otherwise it is the total reported by
        ``fail2ban_db_repo.get_currently_banned``.

    Raises:
        ValueError: If *source* is ``"archive"`` and *app_db* is ``None``.
    """
    if source != "archive":
        fail2ban_db: str = await get_fail2ban_db_path(socket_path)
        log.info(
            "ban_service_bans_by_country",
            db_path=fail2ban_db,
            since=since,
            origin=origin,
        )

        # Only the total is needed from this call; the row list is discarded.
        _unused_rows, total_bans = await fail2ban_db_repo.get_currently_banned(
            db_path=fail2ban_db,
            since=since,
            origin=origin,
            limit=0,
            offset=0,
        )
        event_records = await fail2ban_db_repo.get_ban_event_counts(
            db_path=fail2ban_db,
            since=since,
            origin=origin,
        )

        counts_by_ip: dict[str, int] = {}
        for record in event_records:
            counts_by_ip[record.ip] = record.event_count
        return counts_by_ip, total_bans, list(counts_by_ip)

    if app_db is None:
        raise ValueError("app_db must be provided when source is 'archive'")

    archived_counts = await history_archive_repo.get_ip_ban_counts(
        db=app_db,
        since=since,
        origin=origin,
        action="ban",
    )
    counts_by_ip = {}
    for row in archived_counts:
        counts_by_ip[row["ip"]] = int(row["event_count"])
    return counts_by_ip, sum(counts_by_ip.values()), list(counts_by_ip)
# Strong references to fire-and-forget geo cache warm-up tasks.  The event
# loop itself only keeps weak references to tasks, so a task whose handle is
# dropped may be garbage-collected before it completes.  Each task removes
# itself from this set when it is done.
_GEO_BACKGROUND_TASKS: set[asyncio.Task[Any]] = set()


async def _bans_by_country_resolve_geo(
    unique_ips: list[str],
    *,
    http_session: aiohttp.ClientSession | None,
    geo_cache_lookup: GeoCacheLookup | None,
    geo_cache: GeoCache | None,
    geo_enricher: GeoEnricher | None,
    app_db: aiosqlite.Connection | None,
) -> dict[str, GeoInfo]:
    """Resolve geo information for a list of unique IPs.

    When both *http_session* and *geo_cache_lookup* are provided, only what
    the cache lookup already knows is returned; IPs it reports as uncached are
    handed to ``geo_cache.lookup_batch`` in a background task (if *geo_cache*
    is provided) so later requests can be served from a warmer cache.
    Otherwise, if *geo_enricher* is provided, every IP is looked up
    concurrently through it.

    Args:
        unique_ips: IP addresses to resolve.
        http_session: HTTP session passed to the background batch lookup.
        geo_cache_lookup: Callable returning ``(cached_map, uncached_ips)``
            for a list of IPs.
        geo_cache: Cache object whose ``lookup_batch`` resolves uncached IPs.
        geo_enricher: Legacy per-IP async lookup used as a fallback.
        app_db: Database connection forwarded to ``lookup_batch`` as ``db``.

    Returns:
        Mapping of IP to geo info for every IP that could be resolved.  Empty
        when *unique_ips* is empty or no resolver is available.

    Raises:
        Exception: Any error from *geo_enricher* other than ``TimeoutError``,
            ``aiohttp.ClientError`` or ``OSError`` is logged and re-raised.
    """
    if not unique_ips:
        return {}

    if http_session is not None and geo_cache_lookup is not None:
        geo_map, uncached = geo_cache_lookup(unique_ips)
        if uncached:
            log.info(
                "ban_service_geo_background_scheduled",
                uncached=len(uncached),
                cached=len(geo_map),
            )
            if geo_cache is not None:
                warmup_task = asyncio.create_task(
                    logged_task(
                        geo_cache.lookup_batch(uncached, http_session, db=app_db),
                        "geo_bans_by_country",
                    ),
                    name="geo_bans_by_country",
                )
                # Keep the task alive until it finishes, then drop the
                # reference so the set does not grow without bound.
                _GEO_BACKGROUND_TASKS.add(warmup_task)
                warmup_task.add_done_callback(_GEO_BACKGROUND_TASKS.discard)
        return geo_map

    if geo_enricher is not None:

        async def _safe_lookup(ip: str) -> tuple[str, GeoInfo | None]:
            try:
                return ip, await geo_enricher(ip)
            except (TimeoutError, aiohttp.ClientError, OSError):
                # Transient lookup failure: skip this IP, keep the rest.
                log.warning("ban_service_geo_lookup_failed", ip=ip)
                return ip, None
            except Exception as exc:
                log.error(
                    "ban_service_geo_lookup_unexpected_error",
                    ip=ip,
                    error=type(exc).__name__,
                )
                raise

        results = await asyncio.gather(*(_safe_lookup(ip) for ip in unique_ips))
        return {ip: geo for ip, geo in results if geo is not None}

    return {}
async def _bans_by_country_load_companion(
    *,
    source: str,
    country_code: str | None,
    geo_map: dict[str, GeoInfo],
    since: int,
    origin: BanOrigin | None,
    db_path: str | None,
    app_db: aiosqlite.Connection | None,
    history_archive_repo: HistoryArchiveRepository,
) -> tuple[list[dict[str, Any] | fail2ban_db_repo.BanRecord], list[str]]:
    """Fetch the companion ban rows, optionally restricted to one country.

    Args:
        source: ``"archive"`` queries *history_archive_repo*; any other value
            queries ``fail2ban_db_repo``.
        country_code: When given, only IPs whose entry in *geo_map* carries
            this country code are queried.  ``None`` disables the filter.
        geo_map: Mapping of IP to resolved geo info.
        since: Lower time bound forwarded to the repository query.
        origin: Optional origin filter forwarded to the repository query.
        db_path: fail2ban database path (used for the non-archive source).
        app_db: Application database connection (used for the archive source).
        history_archive_repo: Repository queried for the archive source.

    Returns:
        ``(companion_rows, matched_ips)``.  ``matched_ips`` is empty when no
        country filter is requested.  When a filter is requested but no IP
        matches, both lists are empty and no query is issued.
    """
    ips_in_country: list[str] = []
    if country_code is not None:
        for candidate_ip, info in geo_map.items():
            if info is not None and info.country_code == country_code:
                ips_in_country.append(candidate_ip)
        if not ips_in_country:
            # Nothing resolved to the requested country: skip the query.
            return [], ips_in_country

    if source == "archive":
        if country_code is None:
            found, _ = await history_archive_repo.get_archived_history(
                db=app_db,
                since=since,
                origin=origin,
                action="ban",
                page=1,
                page_size=_MAX_COMPANION_BANS,
            )
        else:
            found, _ = await history_archive_repo.get_archived_history(
                db=app_db,
                since=since,
                origin=origin,
                action="ban",
                ip_filter=ips_in_country,
                page=1,
                page_size=_MAX_COMPANION_BANS,
            )
        return found, ips_in_country

    if country_code is None:
        found, _ = await fail2ban_db_repo.get_currently_banned(
            db_path=db_path,
            since=since,
            origin=origin,
            limit=_MAX_COMPANION_BANS,
            offset=0,
        )
    else:
        # NOTE(review): unlike the three other queries here, this call passes
        # no limit/offset — confirm whether get_currently_banned applies a
        # default cap when ip_filter is used.
        found, _ = await fail2ban_db_repo.get_currently_banned(
            db_path=db_path,
            since=since,
            origin=origin,
            ip_filter=ips_in_country,
        )
    return found, ips_in_country
def _bans_by_country_aggregate(
    agg_rows: dict[str, int],
    geo_map: dict[str, GeoInfo],
    source: str,
) -> tuple[dict[str, int], dict[str, str]]:
    """Sum per-IP event counts into per-country totals.

    IPs without an entry in *geo_map*, or whose entry has no country code,
    are left out of the result.

    Args:
        agg_rows: Mapping of IP to event count.
        geo_map: Mapping of IP to resolved geo info.
        source: Accepted for signature parity with the sibling helpers; not
            read by this function.

    Returns:
        ``(countries, country_names)`` where ``countries`` maps country code
        to the summed event count and ``country_names`` maps country code to
        the first non-empty country name encountered for it.
    """
    totals_by_code: dict[str, int] = {}
    names_by_code: dict[str, str] = {}

    for address, hits in agg_rows.items():
        info = geo_map.get(address)
        if not info:
            continue
        code, label = info.country_code, info.country_name
        if not code:
            continue
        totals_by_code[code] = totals_by_code.get(code, 0) + hits
        if label:
            # First name seen for a code wins.
            names_by_code.setdefault(code, label)

    return totals_by_code, names_by_code
def _bans_by_country_build_ban_items(
    companion_rows: list[dict[str, Any] | fail2ban_db_repo.BanRecord],
    geo_map: dict[str, GeoInfo],
    source: str,
) -> list[DomainDashboardBanItem]:
    """Convert raw companion rows into ``DomainDashboardBanItem`` objects.

    Args:
        companion_rows: Rows to convert.  For the ``"archive"`` source they
            are read by key (``ip``, ``jail``, ``timeofban``, ``bancount``);
            otherwise they are read by attribute, and ``data`` is parsed to
            obtain the service.
        geo_map: Mapping of IP to resolved geo info, used to attach country,
            ASN and organisation fields.
        source: Selects how rows are read (see *companion_rows*).

    Returns:
        One item per input row, in input order.  Geo fields are ``None`` for
        IPs without a truthy entry in *geo_map*; ``service`` is always
        ``None`` for archive rows.
    """
    from_archive = source == "archive"
    items: list[DomainDashboardBanItem] = []

    for row in companion_rows:
        if from_archive:
            address = row["ip"]
            jail_name = row["jail"]
            banned_iso = ts_to_iso(int(row["timeofban"]))
            times_banned = int(row["bancount"])
            first_match = None
        else:
            address = row.ip
            jail_name = row.jail
            banned_iso = ts_to_iso(row.timeofban)
            times_banned = row.bancount
            parsed_matches, _ = parse_data_json(row.data)
            first_match = parsed_matches[0] if parsed_matches else None

        info = geo_map.get(address)
        if info:
            code = info.country_code
            label = info.country_name
            asn: str | None = info.asn
            org: str | None = info.org
        else:
            code = label = None
            asn = org = None

        item = DomainDashboardBanItem(
            ip=address,
            jail=jail_name,
            banned_at=banned_iso,
            service=first_match,
            country_code=code,
            country_name=label,
            asn=asn,
            org=org,
            ban_count=times_banned,
            origin=_derive_origin(jail_name),
        )
        items.append(item)

    return items
async def bans_by_country(
socket_path: str,
range_: TimeRange,
@@ -569,211 +825,47 @@ async def bans_by_country(
if source not in ("fail2ban", "archive"):
raise ValueError(f"Unsupported source: {source!r}")
if source == "archive":
if app_db is None:
raise ValueError("app_db must be provided when source is 'archive'")
# Step 1: Load per-IP ban counts and total.
db_path: str | None = None
if source == "fail2ban":
db_path = await get_fail2ban_db_path(socket_path)
# SQL aggregation — no row materialisation into Python memory.
ip_counts = await history_archive_repo.get_ip_ban_counts(
db=app_db,
since=since,
origin=origin,
action="ban",
)
agg_rows, total, unique_ips = await _bans_by_country_load_data(
source=source,
socket_path=socket_path,
since=since,
origin=origin,
history_archive_repo=history_archive_repo,
app_db=app_db,
)
# Total = sum of all event counts.
total = sum(int(row["event_count"]) for row in ip_counts)
# Step 2: Resolve geo for unique IPs (from cache or enricher).
geo_map = await _bans_by_country_resolve_geo(
unique_ips,
http_session=http_session,
geo_cache_lookup=geo_cache_lookup,
geo_cache=geo_cache,
geo_enricher=geo_enricher,
app_db=app_db,
)
# {ip: event_count} for downstream geo aggregation.
agg_rows = {row["ip"]: int(row["event_count"]) for row in ip_counts}
# Step 3: Load companion ban rows (filtered by country if provided).
companion_rows, _ = await _bans_by_country_load_companion(
source=source,
country_code=country_code,
geo_map=geo_map,
since=since,
origin=origin,
db_path=db_path,
app_db=app_db,
history_archive_repo=history_archive_repo,
)
unique_ips = list(agg_rows.keys())
else:
origin_clause, origin_params = _origin_sql_filter(origin)
db_path: str = await get_fail2ban_db_path(socket_path)
log.info(
"ban_service_bans_by_country",
db_path=db_path,
since=since,
range=range_,
origin=origin,
)
# Step 4: Aggregate counts by country.
countries, country_names = _bans_by_country_aggregate(agg_rows, geo_map, source)
# Total count and companion rows reuse the same SQL query logic.
# Passing limit=0 returns only the total from the count query.
_, total = await fail2ban_db_repo.get_currently_banned(
db_path=db_path,
since=since,
origin=origin,
limit=0,
offset=0,
)
agg_rows = await fail2ban_db_repo.get_ban_event_counts(
db_path=db_path,
since=since,
origin=origin,
)
unique_ips = [r.ip for r in agg_rows]
geo_map: dict[str, GeoInfo] = {}
if http_session is not None and unique_ips and geo_cache_lookup is not None:
# Serve only what is already in the in-memory cache — no API calls on
# the hot path. Uncached IPs are resolved asynchronously in the
# background so subsequent requests benefit from a warmer cache.
geo_map, uncached = geo_cache_lookup(unique_ips)
if uncached:
log.info(
"ban_service_geo_background_scheduled",
uncached=len(uncached),
cached=len(geo_map),
)
if geo_cache is not None:
# Fire-and-forget: lookup_batch handles rate-limiting / retries.
# The dirty-set flush task persists results to the DB.
asyncio.create_task(
logged_task(
geo_cache.lookup_batch(uncached, http_session, db=app_db),
"geo_bans_by_country",
),
name="geo_bans_by_country",
)
elif geo_enricher is not None and unique_ips:
# Fallback: legacy per-IP enricher (used in tests / older callers).
async def _safe_lookup(ip: str) -> tuple[str, GeoInfo | None]:
try:
return ip, await geo_enricher(ip)
except (TimeoutError, aiohttp.ClientError, OSError):
log.warning("ban_service_geo_lookup_failed", ip=ip)
return ip, None
except Exception as exc:
log.error("ban_service_geo_lookup_unexpected_error", ip=ip, error=type(exc).__name__)
raise # Bubble programming errors to global handler
results = await asyncio.gather(*(_safe_lookup(ip) for ip in unique_ips))
geo_map = {ip: geo for ip, geo in results if geo is not None}
companion_rows: list[dict[str, Any] | fail2ban_db_repo.BanRecord]
if country_code is None:
if source == "archive":
companion_rows, _ = await history_archive_repo.get_archived_history(
db=app_db,
since=since,
origin=origin,
action="ban",
page=1,
page_size=_MAX_COMPANION_BANS,
)
else:
companion_rows, _ = await fail2ban_db_repo.get_currently_banned(
db_path=db_path,
since=since,
origin=origin,
limit=_MAX_COMPANION_BANS,
offset=0,
)
else:
matched_ips = [
ip
for ip, geo in geo_map.items()
if geo is not None and geo.country_code == country_code
]
if source == "archive":
if matched_ips:
# Use keyset pagination instead of loading all matched IPs at once.
companion_rows, _ = await history_archive_repo.get_archived_history(
db=app_db,
since=since,
origin=origin,
action="ban",
ip_filter=matched_ips,
page=1,
page_size=_MAX_COMPANION_BANS,
)
else:
companion_rows = []
else:
if matched_ips:
companion_rows, _ = await fail2ban_db_repo.get_currently_banned(
db_path=db_path,
since=since,
origin=origin,
ip_filter=matched_ips,
)
else:
companion_rows = []
# Build country aggregation from the SQL-grouped rows.
countries: dict[str, int] = {}
country_names: dict[str, str] = {}
if source == "archive":
agg_items = [
{
"ip": ip,
"event_count": count,
}
for ip, count in agg_rows.items()
]
else:
agg_items = agg_rows
for agg_row in agg_items:
if source == "archive":
ip = agg_row["ip"]
event_count = agg_row["event_count"]
else:
ip = agg_row.ip
event_count = agg_row.event_count
geo = geo_map.get(ip)
cc: str | None = geo.country_code if geo else None
cn: str | None = geo.country_name if geo else None
if cc:
countries[cc] = countries.get(cc, 0) + event_count
if cn and cc not in country_names:
country_names[cc] = cn
# Build companion table from recent rows (geo already cached from batch step).
bans: list[DomainDashboardBanItem] = []
for companion_row in companion_rows:
if source == "archive":
ip = companion_row["ip"]
jail = companion_row["jail"]
banned_at = ts_to_iso(int(companion_row["timeofban"]))
ban_count = int(companion_row["bancount"])
service = None
else:
ip = companion_row.ip
jail = companion_row.jail
banned_at = ts_to_iso(companion_row.timeofban)
ban_count = companion_row.bancount
matches, _ = parse_data_json(companion_row.data)
service = matches[0] if matches else None
geo = geo_map.get(ip)
cc = geo.country_code if geo else None
cn = geo.country_name if geo else None
asn: str | None = geo.asn if geo else None
org: str | None = geo.org if geo else None
bans.append(
DomainDashboardBanItem(
ip=ip,
jail=jail,
banned_at=banned_at,
service=service,
country_code=cc,
country_name=cn,
asn=asn,
org=org,
ban_count=ban_count,
origin=_derive_origin(jail),
)
)
# Step 5: Build companion ban items for the response.
bans = _bans_by_country_build_ban_items(companion_rows, geo_map, source)
return DomainBansByCountry(
countries=countries,