From bc87bee41617faef6ad01bfccf2021f2380333f0 Mon Sep 17 00:00:00 2001 From: Lukas Date: Wed, 27 May 2026 22:09:18 +0200 Subject: [PATCH] refactor(scheduler): drop separate scheduler.db in favour of MemoryJobStore Scheduler used a separate SQLite file (scheduler.db) only to persist one cron job. This was originally required because APScheduler's SQLAlchemyJobStore is sync-only, creating an async/sync driver conflict when accessing the same file. The job is rebuilt from config.json on every startup regardless (replace_existing=True), so the persisted state only served misfire detection. Moved misfire detection into the app layer by querying system_settings.last_scan_timestamp on startup: if the last scan is >23h but <25h ago, an immediate rescan is triggered. Change summary: - Remove SQLAlchemyJobStore; use default MemoryJobStore instead - Add _check_missed_run() that reads last_scan_timestamp from aniworld.db - Update docs/DEVELOPMENT.md scheduler troubleshooting section - Update the scheduler unit test that verified SQLAlchemyJobStore --- docs/DEVELOPMENT.md | 36 ++++------ src/server/services/scheduler_service.py | 87 ++++++++++++++++++++---- tests/unit/test_scheduler_service.py | 13 ++-- 3 files changed, 95 insertions(+), 41 deletions(-) diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 0a2e600..a96648e 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -162,24 +162,21 @@ await client.close() # May not be called if exception raised earlier ### Scheduler Persistence and Recovery -APScheduler stores jobs in `data/scheduler.db` (SQLite) so they survive process restarts: +The scheduler uses APScheduler's in-memory job store. Jobs are reconstructed from `config.json` on every startup — no separate database is needed. 
```python -from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore - -jobstores = { - "default": SQLAlchemyJobStore(url="sqlite:///./data/scheduler.db"), -} -scheduler = AsyncIOScheduler(jobstores=jobstores) +# Jobs are built from config on startup — no persistence DB required +scheduler = AsyncIOScheduler() # default MemoryJobStore +scheduler.add_job(..., replace_existing=True) ``` -**Grace period:** `misfire_grace_time=3600` (1 hour). If server is down at scheduled time and restarts within 1 hour, missed job runs automatically via APScheduler coalesce behavior. +**Startup misfire recovery:** On `start()`, the scheduler checks `system_settings.last_scan_timestamp` in `aniworld.db`. If the last scan was between 23 and 25 hours ago, an immediate rescan is triggered; if no scan has ever been recorded, a rescan is also triggered immediately. This replaces APScheduler's built-in misfire handling, which required a separate SQLite database. -**Startup recovery:** On `start()`, scheduler loads persisted jobs from DB. APScheduler handles missed jobs internally when `coalesce=True`. +**Grace period:** If the last scan is more than 25 hours old (for example after a long downtime), no automatic recovery occurs, to avoid surprise rescans. **Health endpoint:** `GET /health` returns `scheduler_next_run` and `scheduler_last_run` for external monitors (Uptime Kuma, Prometheus, etc.). -**If server is down >1 hour:** No automatic recovery. Manual trigger via `POST /api/scheduler/trigger-rescan` or wait for next scheduled run. +**If server is down too long:** Manual trigger via `POST /api/scheduler/trigger-rescan` or wait for next scheduled run. ### Database Session Management @@ -257,30 +254,27 @@ DNS checks are warnings because failures can be transient. anime_directory error #### Scheduler missed a run 1. Server was down at scheduled time (03:00 UTC by default). -2. Check `data/scheduler.db` exists — if not, jobs are not persisted. -3. If server was down >1 hour, missed job is dropped (misfire window exceeded). +2. 
On restart, the scheduler checks `last_scan_timestamp` — if the last scan was between 23 and 25 hours ago, it triggers a rescan immediately. +3. If the last scan was more than 25 hours ago, the missed job is skipped to avoid surprise rescans. 4. Trigger manually: `POST /api/scheduler/trigger-rescan` 5. Monitor next run: `GET /health` → `scheduler_next_run` -6. If problem repeats, increase `misfire_grace_time` in `scheduler_service.py`. #### Scheduler not firing (no events at scheduled time) If the scheduler appears configured but never triggers: -1. **Verify scheduler.db contains the job:** - ```bash - sqlite3 data/scheduler.db "SELECT id, next_run_time FROM apscheduler_jobs;" - ``` - - `next_run_time` should be in the future - - If it's in the past, the server was down when the job should have fired - -2. **Check application logs for scheduler startup:** +1. **Check application logs for scheduler startup:** ``` grep "Scheduler service started" fastapi_app.log ``` - If missing, the scheduler failed to start — check for errors above this line - If present, scheduler started successfully +2. **Verify the job is registered:** + ``` + grep "Scheduler started with cron trigger" fastapi_app.log + ``` + 3. **Verify APScheduler events in logs:** ``` grep "apscheduler.executors.default" fastapi_app.log diff --git a/src/server/services/scheduler_service.py b/src/server/services/scheduler_service.py index 0b44e70..968c85e 100644 --- a/src/server/services/scheduler_service.py +++ b/src/server/services/scheduler_service.py @@ -4,17 +4,16 @@ Uses APScheduler's AsyncIOScheduler with CronTrigger for precise cron-based scheduling. The legacy interval-based loop has been removed in favour of the cron approach. -Jobs are persisted to a SQLite database so they survive process restarts. -On startup, if the last scheduled run was missed (server was down at the -cron time), the job is triggered immediately within a grace period. +Jobs are held in memory (no separate scheduler database). 
On startup, +if the last scan timestamp indicates a missed run (server was down at the +scheduled cron time), a rescan is triggered immediately. """ from __future__ import annotations import logging -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from typing import List, Optional -from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore from apscheduler.schedulers.asyncio import AsyncIOScheduler from apscheduler.triggers.cron import CronTrigger @@ -83,10 +82,9 @@ class SchedulerService: logger.error("Failed to load scheduler configuration: %s", exc) raise SchedulerServiceError(f"Failed to load config: {exc}") from exc - jobstores = { - "default": SQLAlchemyJobStore(url="sqlite:///./data/scheduler.db"), - } - self._scheduler = AsyncIOScheduler(jobstores=jobstores) + # Use in-memory job store — no separate scheduler.db needed. + # Jobs are reconstructed from config on every startup. + self._scheduler = AsyncIOScheduler() if not self._config.enabled: logger.info("Scheduler is disabled in configuration — not adding jobs") @@ -125,10 +123,7 @@ class SchedulerService: self._scheduler.start() self._is_running = True - # Startup recovery: if the server was down at the scheduled time and - # the job is within the misfire window, APScheduler will run it - # automatically. Log the scheduled time for visibility. - # Note: next_run_time is only available AFTER scheduler.start() + # Log next scheduled run for visibility. job = self._scheduler.get_job(_JOB_ID) if job: next_run = job.next_run_time @@ -137,6 +132,11 @@ class SchedulerService: next_run.isoformat() if next_run else None, ) + # Startup misfire recovery: check if the last scan was missed while + # the server was down. If overdue by more than one interval but within + # the grace period, trigger an immediate rescan. 
+ await self._check_missed_run() + async def stop(self) -> None: """Stop the APScheduler gracefully.""" logger.info("SchedulerService.stop() called") @@ -303,6 +303,67 @@ class SchedulerService: ) return trigger + async def _check_missed_run(self) -> None: + """Check if a scheduled rescan was missed while the server was down. + + Compares system_settings.last_scan_timestamp against the expected + schedule. If the last scan is overdue (more than 24h ago for a daily + schedule) but within the grace period, triggers an immediate rescan. + """ + if not self._config or not self._config.enabled: + return + if not self._config.schedule_days: + return + + try: + from src.server.database.connection import ( # noqa: PLC0415 + get_db_session, + ) + from src.server.database.system_settings_service import ( # noqa: PLC0415 + SystemSettingsService, + ) + + async with get_db_session() as db: + settings = await SystemSettingsService.get_or_create(db) + last_scan = settings.last_scan_timestamp + + if last_scan is None: + # Never scanned before — trigger immediately + logger.info("No previous scan recorded — triggering immediate rescan") + await self._perform_rescan() + return + + # Ensure timezone-aware comparison + if last_scan.tzinfo is None: + last_scan = last_scan.replace(tzinfo=timezone.utc) + + now = datetime.now(timezone.utc) + elapsed = now - last_scan + + # If last scan was more than 24h + grace period ago, don't trigger + # (avoids surprise rescans after long downtime). + max_overdue = timedelta(hours=24, seconds=_MISFIRE_GRACE_SECONDS) + # If last scan was more than ~25h ago, skip (too stale) + if elapsed > max_overdue: + logger.info( + "Last scan was %s ago (> %s) — skipping missed-run recovery", + elapsed, + max_overdue, + ) + return + + # Check if a run should have happened between last_scan and now. + # Simple heuristic: if elapsed > 24h, we missed at least one daily run. 
+ if elapsed > timedelta(hours=23): + logger.info( + "Missed scheduled rescan detected (last scan %s ago) — triggering now", + elapsed, + ) + await self._perform_rescan() + + except Exception as exc: # pylint: disable=broad-exception-caught + logger.warning("Missed-run check failed (non-fatal): %s", exc) + async def _broadcast(self, event_type: str, data: dict) -> None: """Broadcast a WebSocket event to all connected clients.""" try: diff --git a/tests/unit/test_scheduler_service.py b/tests/unit/test_scheduler_service.py index 7a40502..5d5cff9 100644 --- a/tests/unit/test_scheduler_service.py +++ b/tests/unit/test_scheduler_service.py @@ -489,12 +489,12 @@ class TestSingletonHelpers: # --------------------------------------------------------------------------- -# 12.12 Persistent job store — SQLAlchemyJobStore passed to AsyncIOScheduler +# 12.12 In-memory job store — no separate scheduler.db needed # --------------------------------------------------------------------------- -class TestPersistentJobStore: +class TestInMemoryJobStore: @pytest.mark.asyncio - async def test_start_creates_scheduler_with_sqlalchemy_jobstore( + async def test_start_creates_scheduler_without_jobstore_arg( self, scheduler_service, mock_config_service ): with patch( @@ -508,10 +508,9 @@ class TestPersistentJobStore: MockScheduler.assert_called_once() call_kwargs = MockScheduler.call_args - jobstores = call_kwargs[1]["jobstores"] - assert "default" in jobstores - # Verify it's a SQLAlchemyJobStore (class check via module name) - assert "sqlalchemy" in type(jobstores["default"]).__module__ + # No jobstores argument — uses default MemoryJobStore + if call_kwargs[1]: + assert "jobstores" not in call_kwargs[1] @pytest.mark.asyncio async def test_job_options_include_misfire_grace_and_coalesce(