diff --git a/backend/app/startup.py b/backend/app/startup.py index e93ad08..cebf138 100644 --- a/backend/app/startup.py +++ b/backend/app/startup.py @@ -41,6 +41,7 @@ from app.tasks import ( geo_re_resolve, health_check, history_sync, + rate_limiter_cleanup, session_cleanup, ) from app.utils.async_utils import run_blocking @@ -395,6 +396,7 @@ async def _stage_register_tasks(app: FastAPI, scheduler: AsyncIOScheduler) -> No - geo_re_resolve: Periodic re-resolution of stale records - history_sync: Periodic synchronization of ban history - session_cleanup: Periodic cleanup of expired sessions + - rate_limiter_cleanup: Periodic cleanup of expired rate-limiter entries Args: app: The FastAPI application instance. @@ -407,5 +409,6 @@ async def _stage_register_tasks(app: FastAPI, scheduler: AsyncIOScheduler) -> No geo_re_resolve.register(app) history_sync.register(app) session_cleanup.register(app) + rate_limiter_cleanup.register(app) - log.info("startup_tasks_registered", count=7) + log.info("startup_tasks_registered", count=8) diff --git a/backend/app/tasks/rate_limiter_cleanup.py b/backend/app/tasks/rate_limiter_cleanup.py new file mode 100644 index 0000000..45719c5 --- /dev/null +++ b/backend/app/tasks/rate_limiter_cleanup.py @@ -0,0 +1,71 @@ +"""Rate limiter cleanup background task. + +Registers an APScheduler job that periodically removes expired rate-limit +entries from the in-memory rate limiter. Without this cleanup, the +rate-limiter state dictionary grows unbounded over long runtimes, eventually +consuming excessive memory. + +The cleanup is conservative: it only removes IPs with no recent attempts +(all timestamps outside the rate-limit window), so active or recently-active +IPs are preserved. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import structlog + +if TYPE_CHECKING: + from fastapi import FastAPI + +log: structlog.stdlib.BoundLogger = structlog.get_logger() + +#: How often the cleanup job fires (seconds). 
Chosen to balance memory
+#: management against CPU overhead. A 30-minute interval handles typical
+#: brute-force attack patterns while staying lightweight.
+RATE_LIMITER_CLEANUP_INTERVAL: int = 30 * 60  # 30 minutes
+
+#: Stable APScheduler job ID — ensures re-registration replaces, not duplicates.
+JOB_ID: str = "rate_limiter_cleanup"
+
+
+def _run_cleanup(app: FastAPI) -> None:
+    """Run one cleanup pass on the login rate limiter, warning if it is absent.
+
+    Args:
+        app: The FastAPI application; reads ``app.state.login_rate_limiter``.
+    """
+    rate_limiter = getattr(app.state, "login_rate_limiter", None)
+    if rate_limiter is None:
+        log.warning(
+            "rate_limiter_cleanup_skipped",
+            reason="login_rate_limiter not found on app.state",
+        )
+        return
+    # NOTE(review): sync jobs may run off the event loop — confirm thread-safety.
+    rate_limiter.cleanup_expired()
+
+
+def register(app: FastAPI) -> None:
+    """Add (or replace) the rate-limiter cleanup job in the application scheduler.
+
+    Must be called after the scheduler has been started (i.e., inside the
+    lifespan handler, after ``scheduler.start()``).
+
+    Args:
+        app: The :class:`fastapi.FastAPI` application instance whose
+            ``app.state.scheduler`` will receive the job.
+    """
+    app.state.scheduler.add_job(
+        _run_cleanup,
+        trigger="interval",
+        seconds=RATE_LIMITER_CLEANUP_INTERVAL,
+        kwargs={"app": app},
+        id=JOB_ID,
+        replace_existing=True,
+    )
+    log.info(
+        "rate_limiter_cleanup_scheduled",
+        interval_seconds=RATE_LIMITER_CLEANUP_INTERVAL,
+    )
diff --git a/backend/app/utils/rate_limiter.py b/backend/app/utils/rate_limiter.py
index cc004b5..1ff00df 100644
--- a/backend/app/utils/rate_limiter.py
+++ b/backend/app/utils/rate_limiter.py
@@ -11,6 +11,23 @@ attacks to a single worker. The penalty strategy for failed login attempts is also managed here: record_failure() records a failure timestamp and returns the penalty delay to apply, enabling progressive back-off without exhausting request capacity.
+ +Operational Notes +----------------- + +**Cleanup Lifecycle**: The rate limiter state (_attempts, _failures, _lock_counts) +grows as IPs interact with the system. To prevent unbounded memory growth during +long runtimes, a scheduled background task (rate_limiter_cleanup) calls the +cleanup_expired() method every 30 minutes. This is safe because: + +- cleanup_expired() only removes IPs with no recent attempts (all timestamps + outside the rate-limit window), so active IPs are never disrupted. +- The cleanup is non-blocking and logged for observability. +- Individual requests already prune old timestamps from each IP's deque during + is_allowed() and record_failure(), so cleanup primarily handles dormant IPs. + +For monitoring, check logs for "rate_limiter_cleanup" events to observe how +many IPs are being retired from memory each cleanup cycle. """ from __future__ import annotations