Update observability docs and task utilities

- Add Observability.md documentation
- Standardize task logging with correlation_id support
- Add log_sanitizer utility for PII masking
- Update Tasks.md tracking
- Update geo_cache tasks and other task modules with correlation_id

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-03 11:52:09 +02:00
parent 7b93499551
commit 0133489920
17 changed files with 582 additions and 124 deletions

View File

@@ -7,16 +7,21 @@ if the running instance experiences temporary delays or high load.
Without this heartbeat, stale lock detection (based on TTL) could incorrectly
determine that the scheduler instance has crashed when it's merely busy, and
a new instance could take over.
Correlation IDs are propagated through the task using :mod:`app.utils.correlation`
so that all log entries emitted during a single run share one correlation ID.
"""
from __future__ import annotations
import uuid
from typing import TYPE_CHECKING
import structlog
from app.tasks.db import task_db
from app.tasks.timeout_utils import run_with_timeout
from app.utils.correlation import get_correlation_id, reset_correlation_id, set_correlation_id
from app.utils.runtime_state import get_effective_settings
from app.utils.scheduler_lock import update_scheduler_lock_heartbeat
@@ -38,7 +43,10 @@ JOB_ID: str = "scheduler_lock_heartbeat"
TASK_TIMEOUT_SECONDS: int = 5
async def _update_heartbeat_with_resources(settings: Settings) -> None:
async def _update_heartbeat_with_resources(
settings: Settings,
correlation_id: str | None = None,
) -> None:
"""Update the scheduler lock heartbeat timestamp.
If the heartbeat update fails (e.g., we no longer hold the lock), log
@@ -51,17 +59,31 @@ async def _update_heartbeat_with_resources(settings: Settings) -> None:
Args:
settings: The resolved application settings used for database access.
correlation_id: Optional correlation ID from the triggering request; when omitted, a new UUID4 is generated for this run.
"""
if correlation_id is None:
correlation_id = str(uuid.uuid4())
token = set_correlation_id(correlation_id)
try:
await _do_update_heartbeat_with_settings(settings)
finally:
reset_correlation_id(token)
async def _do_update_heartbeat_with_settings(settings: Settings) -> None:
"""Inner heartbeat logic that runs with correlation context set."""
async def _do_update() -> None:
async with task_db(settings) as db:
success = await update_scheduler_lock_heartbeat(db)
if success:
log.debug("scheduler_lock_heartbeat_updated")
log.debug("scheduler_lock_heartbeat_updated", correlation_id=get_correlation_id())
else:
log.warning(
"scheduler_lock_heartbeat_failed",
correlation_id=get_correlation_id(),
message="Failed to update heartbeat; we no longer hold the lock. "
"Another instance may have taken over or the database connection failed.",
)
@@ -71,12 +93,14 @@ async def _update_heartbeat_with_resources(settings: Settings) -> None:
except TimeoutError:
log.error(
"scheduler_lock_heartbeat_timeout",
correlation_id=get_correlation_id(),
timeout_seconds=TASK_TIMEOUT_SECONDS,
message="Heartbeat update exceeded timeout. The database may be slow or unresponsive.",
)
except Exception as e:
log.error(
"scheduler_lock_heartbeat_error",
correlation_id=get_correlation_id(),
error=str(e),
message="Unexpected error during heartbeat update.",
)