Add scheduled cleanup for rate limiter (#32)

Implement periodic cleanup of expired rate-limiter entries to prevent
unbounded memory growth during long runtimes.

Changes:
- Create rate_limiter_cleanup task that calls cleanup_expired() every 30 minutes
- Register the task in the startup DAG alongside other background jobs
- Update rate_limiter module documentation with operational notes about the
  cleanup lifecycle and memory management strategy

The cleanup is conservative and only removes IPs with no recent attempts
(all timestamps outside the rate-limit window), so active IPs are preserved.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-04-29 19:28:45 +02:00
parent 18036d53bf
commit c2dd9f5f55
3 changed files with 92 additions and 1 deletions

View File

@@ -41,6 +41,7 @@ from app.tasks import (
geo_re_resolve,
health_check,
history_sync,
rate_limiter_cleanup,
session_cleanup,
)
from app.utils.async_utils import run_blocking
@@ -395,6 +396,7 @@ async def _stage_register_tasks(app: FastAPI, scheduler: AsyncIOScheduler) -> No
- geo_re_resolve: Periodic re-resolution of stale records
- history_sync: Periodic synchronization of ban history
- session_cleanup: Periodic cleanup of expired sessions
- rate_limiter_cleanup: Periodic cleanup of expired rate-limiter entries
Args:
app: The FastAPI application instance.
@@ -407,5 +409,6 @@ async def _stage_register_tasks(app: FastAPI, scheduler: AsyncIOScheduler) -> No
geo_re_resolve.register(app)
history_sync.register(app)
session_cleanup.register(app)
rate_limiter_cleanup.register(app)
log.info("startup_tasks_registered", count=7)
log.info("startup_tasks_registered", count=8)

View File

@@ -0,0 +1,71 @@
"""Rate limiter cleanup background task.
Registers an APScheduler job that periodically removes expired rate-limit
entries from the in-memory rate limiter. Without this cleanup, the
rate-limiter state dictionary grows unbounded over long runtimes, eventually
consuming excessive memory.
The cleanup is conservative: it only removes IPs with no recent attempts
(all timestamps outside the rate-limit window), so active or recently-active
IPs are preserved.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import structlog
if TYPE_CHECKING:
from fastapi import FastAPI
#: Module-level structlog logger used by the cleanup job and by ``register``.
log: structlog.stdlib.BoundLogger = structlog.get_logger()
#: How often the cleanup job fires, in seconds; ``register`` passes this value
#: as ``seconds=`` to the scheduler's interval trigger. Chosen to balance memory
#: management against CPU overhead. A 30-minute interval handles typical
#: brute-force attack patterns while staying lightweight.
RATE_LIMITER_CLEANUP_INTERVAL: int = 30 * 60 # 30 minutes
#: Stable APScheduler job ID — ensures re-registration replaces, not duplicates
#: (``register`` passes it together with ``replace_existing=True``).
JOB_ID: str = "rate_limiter_cleanup"
def _run_cleanup(app: FastAPI) -> None:
    """Run one cleanup pass over the login rate limiter held on ``app.state``.

    Looks up ``app.state.login_rate_limiter`` and invokes its
    ``cleanup_expired()`` method. If the attribute is missing or ``None``,
    a ``rate_limiter_cleanup_skipped`` warning is logged and nothing else
    happens.

    Args:
        app: The FastAPI application instance whose ``state`` is expected to
            carry the rate limiter.
    """
    limiter = getattr(app.state, "login_rate_limiter", None)
    if limiter is not None:
        limiter.cleanup_expired()
        return

    # No limiter attached to the application state: report it and do nothing.
    log.warning(
        "rate_limiter_cleanup_skipped",
        reason="rate_limiter not found on app.state",
    )
def register(app: FastAPI) -> None:
    """Schedule the rate-limiter cleanup job on the application's scheduler.

    The job is added under the fixed ID ``JOB_ID`` with
    ``replace_existing=True``, so calling this function again replaces the
    existing job instead of adding a second one.

    Call this once the scheduler has been started (i.e., from the lifespan
    handler, after ``scheduler.start()``).

    Args:
        app: The :class:`fastapi.FastAPI` application instance; the job is
            added to ``app.state.scheduler`` and ``app`` itself is handed to
            the job as its ``app`` keyword argument.
    """
    scheduler = app.state.scheduler
    interval_seconds = RATE_LIMITER_CLEANUP_INTERVAL

    scheduler.add_job(
        _run_cleanup,
        id=JOB_ID,
        replace_existing=True,
        trigger="interval",
        seconds=interval_seconds,
        kwargs={"app": app},
    )

    log.info("rate_limiter_cleanup_scheduled", interval_seconds=interval_seconds)

View File

@@ -11,6 +11,23 @@ attacks to a single worker.
The penalty strategy for failed login attempts is also managed here:
record_failure() records a failure timestamp and returns the penalty delay
to apply, enabling progressive back-off without exhausting request capacity.
Operational Notes
-----------------
**Cleanup Lifecycle**: The rate limiter state (_attempts, _failures, _lock_counts)
grows as IPs interact with the system. To prevent unbounded memory growth during
long runtimes, a scheduled background task (rate_limiter_cleanup) calls the
cleanup_expired() method every 30 minutes. This is safe because:
- cleanup_expired() only removes IPs with no recent attempts (all timestamps
outside the rate-limit window), so active IPs are never disrupted.
- The cleanup is non-blocking and logged for observability.
- Individual requests already prune old timestamps from each IP's deque during
is_allowed() and record_failure(), so cleanup primarily handles dormant IPs.
For monitoring, check the logs for "rate_limiter_cleanup" events to observe how
many IPs are retired from memory in each cleanup cycle.
"""
from __future__ import annotations