Fix empty error field in geo_lookup_request_failed log events

- Replace str(exc) with repr(exc) in lookup() and _batch_api_call()
  so exception class name is always present even for no-message errors
  (e.g. aiohttp.ServerDisconnectedError() whose str() is empty)
- Add exc_type=type(exc).__name__ field to network-error log events
  for easy structured-log filtering
- Move import aiohttp to runtime import; use aiohttp.ClientTimeout()
  instead of raw float, removing # type: ignore[arg-type] workarounds
- Add TestErrorLogging with 3 tests covering empty-message exceptions
This commit is contained in:
2026-03-12 17:50:58 +01:00
parent 029c094e18
commit a61c9dc969
3 changed files with 351 additions and 690 deletions

View File

@@ -38,14 +38,15 @@ Usage::
from __future__ import annotations
import asyncio
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING
import aiohttp
import structlog
if TYPE_CHECKING:
import aiohttp
import aiosqlite
import geoip2.database
import geoip2.errors
@@ -81,6 +82,14 @@ _REQUEST_TIMEOUT: float = 5.0
#: eligible for a new API attempt. Default: 5 minutes.
_NEG_CACHE_TTL: float = 300.0
#: Minimum delay in seconds between consecutive batch HTTP requests to
#: ip-api.com. The free tier allows 45 requests/min; 1.5 s ≈ 40 req/min.
_BATCH_DELAY: float = 1.5
#: Maximum number of retries for a batch chunk that fails with a
#: transient error (e.g. connection reset due to rate limiting).
_BATCH_MAX_RETRIES: int = 2
# ---------------------------------------------------------------------------
# Domain model
# ---------------------------------------------------------------------------
@@ -146,6 +155,49 @@ def clear_neg_cache() -> None:
_neg_cache.clear()
def is_cached(ip: str) -> bool:
"""Return ``True`` if *ip* has a positive entry in the in-memory cache.
A positive entry is one with a non-``None`` ``country_code``. This is
useful for skipping IPs that have already been resolved when building
a list for :func:`lookup_batch`.
Args:
ip: IPv4 or IPv6 address string.
Returns:
``True`` when *ip* is in the cache with a known country code.
"""
return ip in _cache and _cache[ip].country_code is not None
async def cache_stats(db: aiosqlite.Connection) -> dict[str, int]:
"""Return diagnostic counters for the geo cache subsystem.
Queries the persistent store for the number of unresolved entries and
combines it with in-memory counters.
Args:
db: Open BanGUI application database connection.
Returns:
Dict with keys ``cache_size``, ``unresolved``, ``neg_cache_size``,
and ``dirty_size``.
"""
async with db.execute(
"SELECT COUNT(*) FROM geo_cache WHERE country_code IS NULL"
) as cur:
row = await cur.fetchone()
unresolved: int = int(row[0]) if row else 0
return {
"cache_size": len(_cache),
"unresolved": unresolved,
"neg_cache_size": len(_neg_cache),
"dirty_size": len(_dirty),
}
def init_geoip(mmdb_path: str | None) -> None:
"""Initialise the MaxMind GeoLite2-Country database reader.
@@ -322,7 +374,7 @@ async def lookup(
url: str = _API_URL.format(ip=ip)
api_ok = False
try:
async with http_session.get(url, timeout=_REQUEST_TIMEOUT) as resp: # type: ignore[arg-type]
async with http_session.get(url, timeout=aiohttp.ClientTimeout(total=_REQUEST_TIMEOUT)) as resp:
if resp.status != 200:
log.warning("geo_lookup_non_200", ip=ip, status=resp.status)
else:
@@ -345,7 +397,12 @@ async def lookup(
message=data.get("message", "unknown"),
)
except Exception as exc: # noqa: BLE001
log.warning("geo_lookup_request_failed", ip=ip, error=str(exc))
log.warning(
"geo_lookup_request_failed",
ip=ip,
exc_type=type(exc).__name__,
error=repr(exc),
)
if not api_ok:
# Try local MaxMind database as fallback.
@@ -421,9 +478,36 @@ async def lookup_batch(
log.info("geo_batch_lookup_start", total=len(uncached))
for chunk_start in range(0, len(uncached), _BATCH_SIZE):
for batch_idx, chunk_start in enumerate(range(0, len(uncached), _BATCH_SIZE)):
chunk = uncached[chunk_start : chunk_start + _BATCH_SIZE]
chunk_result = await _batch_api_call(chunk, http_session)
# Throttle: pause between consecutive HTTP calls to stay within the
# ip-api.com free-tier rate limit (45 req/min).
if batch_idx > 0:
await asyncio.sleep(_BATCH_DELAY)
# Retry transient failures (e.g. connection-reset from rate limit).
chunk_result: dict[str, GeoInfo] | None = None
for attempt in range(_BATCH_MAX_RETRIES + 1):
chunk_result = await _batch_api_call(chunk, http_session)
# If every IP in the chunk came back with country_code=None and the
# batch wasn't tiny, that almost certainly means the whole request
# was rejected (connection reset / 429). Retry after a back-off.
all_failed = all(
info.country_code is None for info in chunk_result.values()
)
if not all_failed or attempt >= _BATCH_MAX_RETRIES:
break
backoff = _BATCH_DELAY * (2 ** (attempt + 1))
log.warning(
"geo_batch_retry",
attempt=attempt + 1,
chunk_size=len(chunk),
backoff=backoff,
)
await asyncio.sleep(backoff)
assert chunk_result is not None # noqa: S101
for ip, info in chunk_result.items():
if info.country_code is not None:
@@ -493,14 +577,19 @@ async def _batch_api_call(
async with http_session.post(
_BATCH_API_URL,
json=payload,
timeout=_REQUEST_TIMEOUT * 2, # type: ignore[arg-type]
timeout=aiohttp.ClientTimeout(total=_REQUEST_TIMEOUT * 2),
) as resp:
if resp.status != 200:
log.warning("geo_batch_non_200", status=resp.status, count=len(ips))
return fallback
data: list[dict[str, object]] = await resp.json(content_type=None)
except Exception as exc: # noqa: BLE001
log.warning("geo_batch_request_failed", count=len(ips), error=str(exc))
log.warning(
"geo_batch_request_failed",
count=len(ips),
exc_type=type(exc).__name__,
error=repr(exc),
)
return fallback
out: dict[str, GeoInfo] = {}