"""GeoCache service — encapsulates geo service mutable state.

This module defines the :class:`GeoCache` class which encapsulates all
module-level mutable state from the original geo_service into a single
injectable instance.

The cache manages:

- In-memory positive results cache (``ip → GeoInfo``)
- Negative cache (failed lookups with TTL)
- Dirty set (entries pending persistence)
- Lock protecting cache mutations
- MaxMind GeoLite2 reader initialization

An instance should be created once at startup and stored on
``app.state.geo_cache``.
"""

from __future__ import annotations

import asyncio
import sqlite3
import time
from typing import TYPE_CHECKING

import aiohttp
import structlog

from app.models.geo import GeoInfo
from app.repositories import geo_cache_repo

if TYPE_CHECKING:
    import aiosqlite
    import geoip2.database
    import geoip2.errors

log: structlog.stdlib.BoundLogger = structlog.get_logger()

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

#: ip-api.com single-IP lookup endpoint (HTTP only on the free tier).
_API_URL: str = (
    "http://ip-api.com/json/{ip}?fields=status,message,country,countryCode,org,as"
)

#: ip-api.com batch endpoint — accepts up to 100 IPs per POST.
_BATCH_API_URL: str = (
    "http://ip-api.com/batch?fields=status,message,country,countryCode,org,as,query"
)

#: Maximum IPs per batch request (ip-api.com hard limit is 100).
_BATCH_SIZE: int = 100

#: Maximum number of entries kept in the in-process cache before it is
#: flushed completely.  A simple eviction strategy — the cache is cheap to
#: rebuild from the persistent store.
_MAX_CACHE_SIZE: int = 50_000

#: Timeout for outgoing geo API requests in seconds.
_REQUEST_TIMEOUT: float = 5.0

#: How many seconds a failed lookup result is suppressed before the IP is
#: eligible for a new API attempt.  Default: 5 minutes.
_NEG_CACHE_TTL: float = 300.0

#: Minimum delay in seconds between consecutive batch HTTP requests to
#: ip-api.com.  The free tier allows 45 requests/min; 1.5 s ≈ 40 req/min.
_BATCH_DELAY: float = 1.5

#: Maximum number of retries for a batch chunk that fails with a
#: transient error (e.g. connection reset due to rate limiting).
_BATCH_MAX_RETRIES: int = 2

#: Exceptions treated as a recoverable failure of a best-effort write to the
#: persistent cache.  SQLite failures (locked database, disk full, ...) raise
#: ``sqlite3.Error``, which is *not* an ``OSError`` subclass, so both must be
#: listed for the "log and carry on" handlers to actually fire.
_PERSIST_ERRORS: tuple[type[Exception], ...] = (OSError, sqlite3.Error)


class GeoCache:
    """Manages IP geolocation caching with positive and negative caches.

    Encapsulates all mutable state needed for geo-IP resolution.  Provides
    methods for single lookups, batch lookups, persistence, and cache
    management.

    Primary resolution strategy:

    1. Check in-memory cache
    2. Check negative cache (recently failed IPs within TTL)
    3. Try local MaxMind GeoLite2-Country database (if available)
    4. If allow_http_fallback is True, try ip-api.com HTTP API
    5. Record as negative cache entry if all resolvers fail

    State:
        _cache: In-memory positive results cache (``ip → GeoInfo``).
        _neg_cache: Failed lookup timestamps (``ip → time.monotonic()``).
        _dirty: IPs added but not yet persisted to database.
        _geoip_reader: Optional MaxMind GeoLite2 reader.
        _geoip_initialized: ``True`` once :meth:`init_geoip` has successfully
            loaded a reader.
        _allow_http_fallback: Whether to use ip-api.com as fallback.
        _cache_lock: Async lock protecting cache mutations.
    """

    def __init__(self, allow_http_fallback: bool = False) -> None:
        """Initialize an empty GeoCache.

        Args:
            allow_http_fallback: Whether to fall back to ip-api.com HTTP API
                when the MaxMind database cannot resolve an IP.  Default is
                False (fail rather than send IPs unencrypted).
        """
        self._cache: dict[str, GeoInfo] = {}
        self._neg_cache: dict[str, float] = {}
        self._dirty: set[str] = set()
        self._geoip_reader: geoip2.database.Reader | None = None
        self._geoip_initialized: bool = False
        self._allow_http_fallback: bool = allow_http_fallback
        self._cache_lock: asyncio.Lock = asyncio.Lock()

    # ------------------------------------------------------------------
    # Small internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _unresolved() -> GeoInfo:
        """Return a :class:`GeoInfo` with every field set to ``None``."""
        return GeoInfo(country_code=None, country_name=None, asn=None, org=None)

    @staticmethod
    def _as_row(
        ip: str, info: GeoInfo
    ) -> tuple[str, str | None, str | None, str | None, str | None]:
        """Return the ``(ip, country_code, country_name, asn, org)`` row for *info*."""
        return (ip, info.country_code, info.country_name, info.asn, info.org)

    def _is_neg_cached(self, ip: str, now: float) -> bool:
        """Return ``True`` if *ip* failed less than ``_NEG_CACHE_TTL`` seconds before *now*."""
        failed_at = self._neg_cache.get(ip)
        return failed_at is not None and (now - failed_at) < _NEG_CACHE_TTL

    async def _mark_negative(self, ips: list[str]) -> None:
        """Stamp every IP in *ips* into the negative cache under one lock acquisition."""
        if not ips:
            return
        async with self._cache_lock:
            now = time.monotonic()
            for ip in ips:
                self._neg_cache[ip] = now

    async def _persist_positive(
        self, db: aiosqlite.Connection | None, ip: str, info: GeoInfo
    ) -> None:
        """Best-effort write of a resolved entry; failures are logged, never raised."""
        if db is None or info.country_code is None:
            return
        try:
            await geo_cache_repo.upsert_entry_and_commit(
                db=db,
                ip=ip,
                country_code=info.country_code,
                country_name=info.country_name,
                asn=info.asn,
                org=info.org,
            )
        except _PERSIST_ERRORS as exc:
            log.warning("geo_persist_failed", ip=ip, error=type(exc).__name__)

    async def _record_negative(
        self, ip: str, db: aiosqlite.Connection | None
    ) -> None:
        """Negative-cache *ip* in memory and, best-effort, in the database."""
        await self._mark_negative([ip])
        if db is None:
            return
        try:
            await geo_cache_repo.upsert_neg_entry_and_commit(db=db, ip=ip)
        except _PERSIST_ERRORS as exc:
            log.warning("geo_persist_neg_failed", ip=ip, error=type(exc).__name__)

    # ------------------------------------------------------------------
    # Cache management
    # ------------------------------------------------------------------

    async def clear(self) -> None:
        """Flush both the positive and negative lookup caches.

        Also clears the dirty set so any pending-but-unpersisted entries are
        discarded.  Useful in tests and when the operator suspects stale data.
        """
        async with self._cache_lock:
            self._cache.clear()
            self._neg_cache.clear()
            self._dirty.clear()

    async def clear_neg_cache(self) -> None:
        """Flush only the negative (failed-lookups) cache.

        Useful when triggering a manual re-resolve so that previously failed
        IPs are immediately eligible for a new resolution attempt.
        """
        async with self._cache_lock:
            self._neg_cache.clear()

    def is_cached(self, ip: str) -> bool:
        """Return ``True`` if *ip* has a positive entry in the in-memory cache.

        A positive entry is one with a non-``None`` ``country_code``.

        Args:
            ip: IPv4 or IPv6 address string.

        Returns:
            ``True`` when *ip* is in the cache with a known country code.
        """
        info = self._cache.get(ip)
        return info is not None and info.country_code is not None

    async def cache_stats(self, db: aiosqlite.Connection) -> dict[str, int]:
        """Return diagnostic counters for the geo cache subsystem.

        Queries the persistent store for the number of unresolved entries and
        combines it with in-memory counters.

        Args:
            db: Open BanGUI application database connection.

        Returns:
            Dict with keys ``cache_size``, ``unresolved``, ``neg_cache_size``,
            and ``dirty_size``.
        """
        unresolved = await geo_cache_repo.count_unresolved(db)
        return {
            "cache_size": len(self._cache),
            "unresolved": unresolved,
            "neg_cache_size": len(self._neg_cache),
            "dirty_size": len(self._dirty),
        }

    async def count_unresolved(self, db: aiosqlite.Connection) -> int:
        """Return the number of unresolved entries in the persistent geo cache."""
        return await geo_cache_repo.count_unresolved(db)

    async def get_unresolved_ips(self, db: aiosqlite.Connection) -> list[str]:
        """Return geo cache IPs where the country code has not yet been resolved.

        Args:
            db: Open BanGUI application database connection.

        Returns:
            List of IP addresses that are candidates for re-resolution.
        """
        return await geo_cache_repo.get_unresolved_ips(db)

    async def re_resolve_all(
        self,
        db: aiosqlite.Connection,
        http_session: aiohttp.ClientSession,
    ) -> dict[str, int]:
        """Retry geo resolution for all unresolved cache entries.

        Clears the in-memory negative cache before attempting a fresh batch
        lookup, then returns counters for how many IPs were retried and how
        many gained a resolved country code.

        Args:
            db: BanGUI application database connection.
            http_session: Shared aiohttp client session.

        Returns:
            A dict with ``resolved`` and ``total`` counts.
        """
        unresolved = await self.get_unresolved_ips(db)
        if not unresolved:
            return {"resolved": 0, "total": 0}

        # Otherwise the IPs we are about to retry would be skipped as
        # "recently failed".
        await self.clear_neg_cache()
        geo_map = await self.lookup_batch(unresolved, http_session, db=db)
        resolved_count = sum(
            1 for info in geo_map.values() if info.country_code is not None
        )

        log.info(
            "geo_re_resolve_complete",
            total=len(unresolved),
            resolved=resolved_count,
        )
        return {"resolved": resolved_count, "total": len(unresolved)}

    # ------------------------------------------------------------------
    # MaxMind (local) resolver
    # ------------------------------------------------------------------

    def init_geoip(self, mmdb_path: str | None) -> None:
        """Initialise the MaxMind GeoLite2-Country database reader.

        Intended to be called at startup, before request handling begins.

        If *mmdb_path* is ``None``, empty, or the file does not exist, no
        reader is loaded and local resolution stays disabled; lookups then
        depend on the HTTP fallback, which is only used when
        ``allow_http_fallback`` was enabled.  In those cases the instance is
        *not* marked as initialised, so a later call may still load a reader.

        Args:
            mmdb_path: Path to a ``GeoLite2-Country.mmdb`` file.

        Raises:
            RuntimeError: If a reader has already been loaded by an earlier
                call.
        """
        if self._geoip_initialized:
            raise RuntimeError("GeoIP reader already initialised")

        if not mmdb_path:
            return

        # Imported lazily so geoip2 is only needed when a path is configured.
        from pathlib import Path  # noqa: PLC0415

        import geoip2.database  # noqa: PLC0415

        if not Path(mmdb_path).is_file():
            log.warning("geoip_mmdb_not_found", path=mmdb_path)
            return

        self._geoip_reader = geoip2.database.Reader(mmdb_path)
        self._geoip_initialized = True
        log.info("geoip_mmdb_loaded", path=mmdb_path)

    def _geoip_lookup(self, ip: str) -> GeoInfo | None:
        """Attempt a local MaxMind GeoLite2 lookup for *ip*.

        Args:
            ip: IPv4 or IPv6 address string.

        Returns:
            A :class:`GeoInfo` with ``country_code`` populated (``asn`` and
            ``org`` are always ``None`` here), or ``None`` when no reader is
            loaded, the address is not in the database, the record has no
            ISO code, or the reader raised an error.
        """
        if self._geoip_reader is None:
            return None

        import geoip2.errors  # noqa: PLC0415

        try:
            response = self._geoip_reader.country(ip)
            code: str | None = response.country.iso_code or None
            name: str | None = response.country.name or None
            if code is None:
                return None
            return GeoInfo(country_code=code, country_name=name, asn=None, org=None)
        except geoip2.errors.AddressNotFoundError:
            return None
        except (OSError, geoip2.errors.GeoIP2Error) as exc:
            log.warning("geoip_lookup_failed", ip=ip, error=type(exc).__name__)
            return None

    # ------------------------------------------------------------------
    # Cache population
    # ------------------------------------------------------------------

    async def load_cache_from_db(self, db: aiosqlite.Connection) -> None:
        """Pre-populate the in-memory cache from the ``geo_cache`` table.

        Should be called once during application startup so the service
        starts with a warm cache instead of resolving everything again on the
        first request.  Rows without a country code are skipped.

        Args:
            db: Open :class:`aiosqlite.Connection` to the BanGUI application
                database (not the fail2ban database).
        """
        rows = await geo_cache_repo.load_all(db)
        loaded: list[tuple[str, GeoInfo]] = [
            (
                row["ip"],
                GeoInfo(
                    country_code=row["country_code"],
                    country_name=row["country_name"],
                    asn=row["asn"],
                    org=row["org"],
                ),
            )
            for row in rows
            if row["country_code"] is not None
        ]

        async with self._cache_lock:
            self._cache.update(loaded)

        log.info("geo_cache_loaded_from_db", entries=len(loaded))

    async def _store(self, ip: str, info: GeoInfo) -> None:
        """Insert *info* into the cache, flushing if over capacity.

        When the cache already holds ``_MAX_CACHE_SIZE`` entries, both the
        cache and the dirty set are emptied before the new entry is added.
        When the IP resolved successfully (``country_code is not None``) it
        is also added to the dirty set so :meth:`flush_dirty` can persist it
        on the next scheduled flush.

        Args:
            ip: The IP address key.
            info: The :class:`GeoInfo` to store.
        """
        async with self._cache_lock:
            if len(self._cache) >= _MAX_CACHE_SIZE:
                self._cache.clear()
                self._dirty.clear()
                log.info("geo_cache_flushed", reason="capacity")
            self._cache[ip] = info
            if info.country_code is not None:
                self._dirty.add(ip)

    # ------------------------------------------------------------------
    # Single lookup
    # ------------------------------------------------------------------

    async def lookup(
        self,
        ip: str,
        http_session: aiohttp.ClientSession,
        db: aiosqlite.Connection | None = None,
    ) -> GeoInfo | None:
        """Resolve an IP address to country, ASN, and organisation metadata.

        Resolution strategy (in order):

        1. Check in-memory cache
        2. Check negative cache (skip if within TTL)
        3. Try local MaxMind GeoLite2-Country database (primary resolver)
        4. If allow_http_fallback is True, try ip-api.com HTTP API (unencrypted)
        5. Record as negative cache entry if all resolvers fail

        Successful results are cached in-process; if the cache has reached
        ``_MAX_CACHE_SIZE`` entries it is flushed before the new result is
        stored.  When *db* is provided, successful resolutions are written to
        the persistent cache and failed ones are recorded as negative
        entries.  Database failures are logged and never abort the lookup.

        Failed lookups are remembered in the negative cache and are not
        retried until ``_NEG_CACHE_TTL`` seconds have passed.

        Args:
            ip: IPv4 or IPv6 address string.
            http_session: Shared :class:`aiohttp.ClientSession` (from
                ``app.state.http_session``).
            db: Optional BanGUI application database.  When provided, results
                are persisted for cross-restart cache warming.

        Returns:
            A :class:`GeoInfo` instance; all fields are ``None`` when the IP
            could not be resolved.  The ``| None`` in the annotation is kept
            for backward compatibility — this implementation never returns
            ``None``.
        """
        cached = self._cache.get(ip)
        if cached is not None:
            return cached

        # Negative cache: skip IPs that recently failed to avoid hammering
        # the resolvers.
        if self._is_neg_cached(ip, time.monotonic()):
            return self._unresolved()

        # PRIMARY RESOLVER: local MaxMind database.
        result = self._geoip_lookup(ip)
        if result is not None:
            await self._store(ip, result)
            await self._persist_positive(db, ip, result)
            log.debug("geo_lookup_success_mmdb", ip=ip, country=result.country_code)
            return result

        # FALLBACK RESOLVER: ip-api.com, only if explicitly allowed.
        if self._allow_http_fallback:
            result = await self._http_lookup(ip, http_session)
            if result is not None:
                await self._store(ip, result)
                await self._persist_positive(db, ip, result)
                log.debug(
                    "geo_lookup_success_http",
                    ip=ip,
                    country=result.country_code,
                    asn=result.asn,
                )
                return result
        else:
            log.debug("geo_lookup_failed_no_http_fallback", ip=ip)

        # Every available resolver failed — remember that for the TTL.
        await self._record_negative(ip, db)
        return self._unresolved()

    async def _http_lookup(
        self, ip: str, http_session: aiohttp.ClientSession
    ) -> GeoInfo | None:
        """Query the ip-api.com single-IP endpoint for *ip*.

        Returns:
            The parsed :class:`GeoInfo` when the API answered with
            ``status == "success"``; ``None`` on a non-200 status, a request
            error, a malformed body, or an API-level failure record.
        """
        url: str = _API_URL.format(ip=ip)
        try:
            async with http_session.get(
                url, timeout=aiohttp.ClientTimeout(total=_REQUEST_TIMEOUT)
            ) as resp:
                if resp.status != 200:
                    log.warning("geo_lookup_http_non_200", ip=ip, status=resp.status)
                    return None
                data: object = await resp.json(content_type=None)
        except (TimeoutError, aiohttp.ClientError, ValueError) as exc:
            log.warning(
                "geo_lookup_http_request_failed",
                ip=ip,
                exc_type=type(exc).__name__,
                error=repr(exc),
            )
            return None

        # The body arrives over plain HTTP and may be any JSON value; only a
        # JSON object can be a valid answer.
        if not isinstance(data, dict):
            log.debug("geo_lookup_http_failed", ip=ip, message="unexpected payload")
            return None
        if data.get("status") != "success":
            log.debug(
                "geo_lookup_http_failed",
                ip=ip,
                message=data.get("message", "unknown"),
            )
            return None
        return self._parse_single_response(data)

    def lookup_cached_only(
        self,
        ips: list[str],
    ) -> tuple[dict[str, GeoInfo], list[str]]:
        """Return cached geo data for *ips* without making any external API calls.

        Used by callers that want to return a fast response using only what
        is already in memory, while deferring resolution of uncached IPs to a
        background task.

        Args:
            ips: IP address strings to look up.  Duplicates are ignored.

        Returns:
            A ``(geo_map, uncached)`` tuple where *geo_map* maps every IP
            that was already in the in-memory cache to its :class:`GeoInfo`,
            and *uncached* is the list of IPs that were not found in the
            cache.  Entries in the negative cache (recently failed) appear in
            neither, so they are not re-queued immediately.
        """
        geo_map: dict[str, GeoInfo] = {}
        uncached: list[str] = []
        now = time.monotonic()

        for ip in dict.fromkeys(ips):  # deduplicate, preserve order
            if ip in self._cache:
                geo_map[ip] = self._cache[ip]
            elif not self._is_neg_cached(ip, now):
                uncached.append(ip)
            # else: still within the cool-down window — do not re-queue.

        return geo_map, uncached

    # ------------------------------------------------------------------
    # Batch lookup
    # ------------------------------------------------------------------

    async def lookup_batch(
        self,
        ips: list[str],
        http_session: aiohttp.ClientSession,
        db: aiosqlite.Connection | None = None,
    ) -> dict[str, GeoInfo]:
        """Resolve multiple IP addresses in bulk.

        Resolution strategy:

        1. Return cached entries immediately (both positive and negative cache)
        2. For uncached IPs, try local MaxMind database first
        3. If allow_http_fallback is True, use ip-api.com batch endpoint for
           the remaining IPs, in chunks of up to :data:`_BATCH_SIZE`
        4. Record unresolvable IPs in negative cache

        When *db* is provided, resolved entries and negative entries are
        written to the persistent cache using the repository's bulk helpers.
        Database failures are logged and never abort the lookup.

        Args:
            ips: List of IP address strings to resolve.  Duplicates are
                ignored.
            http_session: Shared :class:`aiohttp.ClientSession`.
            db: Optional BanGUI application database for persistent cache
                writes.

        Returns:
            Dict mapping ``ip → GeoInfo`` for every input IP.  IPs whose
            resolution failed will have a ``GeoInfo`` with all-``None``
            fields.
        """
        geo_result: dict[str, GeoInfo] = {}
        uncached: list[str] = []
        empty = self._unresolved()

        now = time.monotonic()
        for ip in dict.fromkeys(ips):  # deduplicate, preserve order
            if ip in self._cache:
                geo_result[ip] = self._cache[ip]
            elif self._is_neg_cached(ip, now):
                # Recently failed — skip resolution, return empty result.
                geo_result[ip] = empty
            else:
                uncached.append(ip)

        if not uncached:
            return geo_result

        log.info("geo_batch_lookup_start", total=len(uncached))

        # PRIMARY: local MaxMind database for all uncached IPs.
        mmdb_rows: list[tuple[str, str | None, str | None, str | None, str | None]] = []
        remaining: list[str] = []
        for ip in uncached:
            mmdb_result = self._geoip_lookup(ip)
            if mmdb_result is None:
                # Keep for the HTTP fallback or final failure.
                remaining.append(ip)
                continue
            await self._store(ip, mmdb_result)
            geo_result[ip] = mmdb_result
            if db is not None:
                mmdb_rows.append(self._as_row(ip, mmdb_result))

        if db is not None and mmdb_rows:
            try:
                await geo_cache_repo.bulk_upsert_entries_and_commit(db, mmdb_rows)
            except _PERSIST_ERRORS as exc:
                log.warning(
                    "geo_batch_persist_mmdb_failed",
                    count=len(mmdb_rows),
                    error=type(exc).__name__,
                )

        if self._allow_http_fallback and remaining:
            # FALLBACK: ip-api.com batch endpoint, one chunk at a time.
            for batch_idx, chunk_start in enumerate(
                range(0, len(remaining), _BATCH_SIZE)
            ):
                chunk = remaining[chunk_start : chunk_start + _BATCH_SIZE]

                # Throttle: pause between consecutive HTTP calls to stay
                # within the ip-api.com free-tier rate limit (45 req/min).
                if batch_idx > 0:
                    await asyncio.sleep(_BATCH_DELAY)

                chunk_result = await self._batch_api_call_with_retry(
                    chunk, http_session
                )

                pos_rows: list[
                    tuple[str, str | None, str | None, str | None, str | None]
                ] = []
                failed: list[str] = []
                for ip, info in chunk_result.items():
                    if info.country_code is None:
                        failed.append(ip)
                        geo_result[ip] = empty
                        continue
                    await self._store(ip, info)
                    geo_result[ip] = info
                    if db is not None:
                        pos_rows.append(self._as_row(ip, info))
                await self._mark_negative(failed)

                if db is not None and (pos_rows or failed):
                    try:
                        await geo_cache_repo.bulk_upsert_entries_and_neg_entries_and_commit(
                            db,
                            pos_rows,
                            failed,
                        )
                    except _PERSIST_ERRORS as exc:
                        log.warning(
                            "geo_batch_persist_failed",
                            positive_count=len(pos_rows),
                            negative_count=len(failed),
                            error=type(exc).__name__,
                        )
        else:
            # No HTTP fallback (or nothing left): whatever MaxMind could not
            # resolve is a failure.
            await self._mark_negative(remaining)
            for ip in remaining:
                geo_result[ip] = empty
            if db is not None and remaining:
                try:
                    await geo_cache_repo.bulk_upsert_neg_entries_and_commit(
                        db, remaining
                    )
                except _PERSIST_ERRORS as exc:
                    log.warning(
                        "geo_batch_persist_neg_failed",
                        count=len(remaining),
                        error=type(exc).__name__,
                    )

        # Count only the IPs this call actually had to resolve, so that
        # ``resolved`` is comparable with ``requested`` (cache hits excluded).
        log.info(
            "geo_batch_lookup_complete",
            requested=len(uncached),
            resolved=sum(
                1 for ip in uncached if geo_result[ip].country_code is not None
            ),
        )
        return geo_result

    async def _batch_api_call_with_retry(
        self,
        ips: list[str],
        http_session: aiohttp.ClientSession,
    ) -> dict[str, GeoInfo]:
        """Call :meth:`_batch_api_call`, retrying when the whole chunk fails.

        A chunk in which *every* IP came back unresolved is treated as a
        rejected request (e.g. connection reset / 429) and retried up to
        ``_BATCH_MAX_RETRIES`` times with exponential back-off based on
        ``_BATCH_DELAY``.  The check does not look at the chunk size, so a
        chunk made up solely of genuinely unresolvable IPs is retried too.

        Returns:
            The result of the last attempt.
        """
        attempt = 0
        while True:
            chunk_result = await self._batch_api_call(ips, http_session)
            any_resolved = any(
                info.country_code is not None for info in chunk_result.values()
            )
            if any_resolved or attempt >= _BATCH_MAX_RETRIES:
                return chunk_result
            attempt += 1
            backoff = _BATCH_DELAY * (2**attempt)
            log.warning(
                "geo_batch_retry",
                attempt=attempt,
                chunk_size=len(ips),
                backoff=backoff,
            )
            await asyncio.sleep(backoff)

    async def _batch_api_call(
        self,
        ips: list[str],
        http_session: aiohttp.ClientSession,
    ) -> dict[str, GeoInfo]:
        """Send one batch request to the ip-api.com batch endpoint.

        Args:
            ips: Up to :data:`_BATCH_SIZE` IP address strings.
            http_session: Shared HTTP session.

        Returns:
            Dict mapping ``ip → GeoInfo`` with exactly one key per IP in
            *ips*.  IPs where the API returned a failure record, that are
            missing from the response, or whose request raised an exception
            get an all-``None`` :class:`GeoInfo`.  Response entries for IPs
            that were not requested are ignored.
        """
        empty = self._unresolved()
        fallback: dict[str, GeoInfo] = dict.fromkeys(ips, empty)

        payload = [{"query": ip} for ip in ips]
        try:
            async with http_session.post(
                _BATCH_API_URL,
                json=payload,
                timeout=aiohttp.ClientTimeout(total=_REQUEST_TIMEOUT * 2),
            ) as resp:
                if resp.status != 200:
                    log.warning("geo_batch_non_200", status=resp.status, count=len(ips))
                    return fallback
                data: object = await resp.json(content_type=None)
        except (TimeoutError, aiohttp.ClientError, ValueError) as exc:
            log.warning(
                "geo_batch_request_failed",
                count=len(ips),
                exc_type=type(exc).__name__,
                error=repr(exc),
            )
            return fallback

        # The body arrives over plain HTTP and may be any JSON value; a batch
        # answer must be a JSON array.
        if not isinstance(data, list):
            log.warning(
                "geo_batch_unexpected_payload",
                count=len(ips),
                payload_type=type(data).__name__,
            )
            return fallback

        # Start with every requested IP unresolved, then overwrite with what
        # the response actually contains.
        out: dict[str, GeoInfo] = dict(fallback)
        for entry in data:
            if not isinstance(entry, dict):
                continue
            ip_str: str = str(entry.get("query", ""))
            if ip_str not in out:
                # Empty or unrequested address — never let the response
                # introduce keys the caller did not ask for.
                continue
            if entry.get("status") != "success":
                out[ip_str] = empty
                log.debug(
                    "geo_batch_entry_failed",
                    ip=ip_str,
                    message=entry.get("message", "unknown"),
                )
                continue
            out[ip_str] = self._parse_single_response(entry)
        return out

    def _parse_single_response(self, data: dict[str, object]) -> GeoInfo:
        """Build a :class:`GeoInfo` from a single ip-api.com response dict.

        Args:
            data: A ``status == "success"`` JSON response from ip-api.com.

        Returns:
            Populated :class:`GeoInfo`; missing or blank fields become
            ``None``.
        """
        country_code: str | None = self._str_or_none(data.get("countryCode"))
        country_name: str | None = self._str_or_none(data.get("country"))
        asn_raw: str | None = self._str_or_none(data.get("as"))
        org_raw: str | None = self._str_or_none(data.get("org"))

        # The "as" field looks like "AS12345 Some Org"; keep only the first
        # whitespace-separated token (the AS number).
        asn: str | None = asn_raw.split()[0] if asn_raw else None

        return GeoInfo(
            country_code=country_code,
            country_name=country_name,
            asn=asn,
            org=org_raw,
        )

    def _str_or_none(self, value: object) -> str | None:
        """Return *value* as a non-empty string, or ``None``.

        Args:
            value: Raw JSON value which may be ``None``, empty, or a string.

        Returns:
            ``str(value)`` stripped of surrounding whitespace if that is
            non-empty, else ``None``.
        """
        if value is None:
            return None
        text = str(value).strip()
        return text or None

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    async def flush_dirty(self, db: aiosqlite.Connection) -> int:
        """Persist all new in-memory geo entries to the ``geo_cache`` table.

        Takes an atomic snapshot of the dirty set, clears it, then hands all
        entries that are still present in the cache to the repository's bulk
        upsert in a single call.

        If the database write fails the snapshot is re-added to the dirty set
        so the entries are retried on the next flush cycle.

        Args:
            db: Open :class:`aiosqlite.Connection` to the BanGUI application
                database.

        Returns:
            The number of rows handed to the bulk upsert, or ``0`` when there
            was nothing to write or the write failed.
        """
        async with self._cache_lock:
            if not self._dirty:
                return 0
            # Snapshot, clear and read the cache under one lock acquisition
            # so the rows match the snapshot exactly.
            to_flush = self._dirty.copy()
            self._dirty.clear()
            rows = [
                self._as_row(ip, self._cache[ip])
                for ip in to_flush
                if ip in self._cache
            ]

        if not rows:
            return 0

        try:
            await geo_cache_repo.bulk_upsert_entries_and_commit(db, rows)
        except _PERSIST_ERRORS as exc:
            log.warning("geo_flush_dirty_failed", error=type(exc).__name__)
            # Re-add to dirty so they are retried on the next flush cycle.
            async with self._cache_lock:
                self._dirty.update(to_flush)
            return 0

        log.info("geo_flush_dirty_complete", count=len(rows))
        return len(rows)