From c2dd9f5f559f4077c076f38d7af5fc35a7961020 Mon Sep 17 00:00:00 2001 From: Lukas Date: Wed, 29 Apr 2026 19:28:45 +0200 Subject: [PATCH] Add scheduled cleanup for rate limiter (#32) Implement periodic cleanup of expired rate-limiter entries to prevent unbounded memory growth during long runtimes. Changes: - Create rate_limiter_cleanup task that calls cleanup_expired() every 30 minutes - Register the task in the startup DAG alongside other background jobs - Update rate_limiter module documentation with operational notes about the cleanup lifecycle and memory management strategy The cleanup is conservative and only removes IPs with no recent attempts (all timestamps outside the rate-limit window), so active IPs are preserved. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- backend/app/startup.py | 5 +- backend/app/tasks/rate_limiter_cleanup.py | 71 +++++++++++++++++++++++ backend/app/utils/rate_limiter.py | 17 ++++++ 3 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 backend/app/tasks/rate_limiter_cleanup.py diff --git a/backend/app/startup.py b/backend/app/startup.py index e93ad08..cebf138 100644 --- a/backend/app/startup.py +++ b/backend/app/startup.py @@ -41,6 +41,7 @@ from app.tasks import ( geo_re_resolve, health_check, history_sync, + rate_limiter_cleanup, session_cleanup, ) from app.utils.async_utils import run_blocking @@ -395,6 +396,7 @@ async def _stage_register_tasks(app: FastAPI, scheduler: AsyncIOScheduler) -> No - geo_re_resolve: Periodic re-resolution of stale records - history_sync: Periodic synchronization of ban history - session_cleanup: Periodic cleanup of expired sessions + - rate_limiter_cleanup: Periodic cleanup of expired rate-limiter entries Args: app: The FastAPI application instance. @@ -407,5 +409,6 @@ async def _stage_register_tasks(app: FastAPI, scheduler: AsyncIOScheduler) -> No geo_re_resolve.register(app) history_sync.register(app) session_cleanup.register(app) + rate_limiter_cleanup.register(app) - log.info("startup_tasks_registered", count=7) + log.info("startup_tasks_registered", count=8) diff --git a/backend/app/tasks/rate_limiter_cleanup.py b/backend/app/tasks/rate_limiter_cleanup.py new file mode 100644 index 0000000..45719c5 --- /dev/null +++ b/backend/app/tasks/rate_limiter_cleanup.py @@ -0,0 +1,71 @@ +"""Rate limiter cleanup background task. + +Registers an APScheduler job that periodically removes expired rate-limit +entries from the in-memory rate limiter. Without this cleanup, the +rate-limiter state dictionary grows unbounded over long runtimes, eventually +consuming excessive memory. + +The cleanup is conservative: it only removes IPs with no recent attempts +(all timestamps outside the rate-limit window), so active or recently-active +IPs are preserved. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import structlog + +if TYPE_CHECKING: + from fastapi import FastAPI + +log: structlog.stdlib.BoundLogger = structlog.get_logger() + +#: How often the cleanup job fires (seconds). Chosen to balance memory +#: management against CPU overhead. A 30-minute interval handles typical +#: brute-force attack patterns while staying lightweight. +RATE_LIMITER_CLEANUP_INTERVAL: int = 30 * 60 # 30 minutes + +#: Stable APScheduler job ID — ensures re-registration replaces, not duplicates. +JOB_ID: str = "rate_limiter_cleanup" + + +def _run_cleanup(app: FastAPI) -> None: + """Trigger cleanup of expired rate-limiter entries. + + Args: + app: The FastAPI application instance (holds the rate limiter). + """ + rate_limiter = getattr(app.state, "login_rate_limiter", None) + if rate_limiter is None: + log.warning( + "rate_limiter_cleanup_skipped", + reason="rate_limiter not found on app.state", + ) + return + + rate_limiter.cleanup_expired() + + +def register(app: FastAPI) -> None: + """Add (or replace) the rate-limiter cleanup job in the application scheduler. + + Must be called after the scheduler has been started (i.e., inside the + lifespan handler, after ``scheduler.start()``). + + Args: + app: The :class:`fastapi.FastAPI` application instance whose + ``app.state.scheduler`` will receive the job. + """ + app.state.scheduler.add_job( + _run_cleanup, + trigger="interval", + seconds=RATE_LIMITER_CLEANUP_INTERVAL, + kwargs={"app": app}, + id=JOB_ID, + replace_existing=True, + ) + log.info( + "rate_limiter_cleanup_scheduled", + interval_seconds=RATE_LIMITER_CLEANUP_INTERVAL, + ) diff --git a/backend/app/utils/rate_limiter.py b/backend/app/utils/rate_limiter.py index cc004b5..1ff00df 100644 --- a/backend/app/utils/rate_limiter.py +++ b/backend/app/utils/rate_limiter.py @@ -11,6 +11,23 @@ attacks to a single worker. The penalty strategy for failed login attempts is also managed here: record_failure() records a failure timestamp and returns the penalty delay to apply, enabling progressive back-off without exhausting request capacity. + +Operational Notes +----------------- + +**Cleanup Lifecycle**: The rate limiter state (_attempts, _failures, _lock_counts) +grows as IPs interact with the system. To prevent unbounded memory growth during +long runtimes, a scheduled background task (rate_limiter_cleanup) calls the +cleanup_expired() method every 30 minutes. This is safe because: + +- cleanup_expired() only removes IPs with no recent attempts (all timestamps + outside the rate-limit window), so active IPs are never disrupted. +- The cleanup is non-blocking and logged for observability. +- Individual requests already prune old timestamps from each IP's deque during + is_allowed() and record_failure(), so cleanup primarily handles dormant IPs. + +For monitoring, check logs for "rate_limiter_cleanup" events to observe how +many IPs are being retired from memory each cleanup cycle. """ from __future__ import annotations