Update observability docs and task utilities
- Add Observability.md documentation
- Standardize task logging with correlation_id support
- Add log_sanitizer utility for PII masking
- Update Tasks.md tracking
- Update geo_cache tasks and other task modules with correlation_id

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -7,16 +7,21 @@ if the running instance experiences temporary delays or high load.
|
||||
Without this heartbeat, stale lock detection (based on TTL) could incorrectly
|
||||
determine that the scheduler instance has crashed when it's merely busy, and
|
||||
a new instance could take over.
|
||||
|
||||
Correlation IDs are propagated through the task using :mod:`app.utils.correlation`
|
||||
so that task logs can be correlated across runs.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import structlog
|
||||
|
||||
from app.tasks.db import task_db
|
||||
from app.tasks.timeout_utils import run_with_timeout
|
||||
from app.utils.correlation import get_correlation_id, reset_correlation_id, set_correlation_id
|
||||
from app.utils.runtime_state import get_effective_settings
|
||||
from app.utils.scheduler_lock import update_scheduler_lock_heartbeat
|
||||
|
||||
@@ -38,7 +43,10 @@ JOB_ID: str = "scheduler_lock_heartbeat"
|
||||
TASK_TIMEOUT_SECONDS: int = 5
|
||||
|
||||
|
||||
async def _update_heartbeat_with_resources(settings: Settings) -> None:
|
||||
async def _update_heartbeat_with_resources(
|
||||
settings: Settings,
|
||||
correlation_id: str | None = None,
|
||||
) -> None:
|
||||
"""Update the scheduler lock heartbeat timestamp.
|
||||
|
||||
If the heartbeat update fails (e.g., we no longer hold the lock), log
|
||||
@@ -51,17 +59,31 @@ async def _update_heartbeat_with_resources(settings: Settings) -> None:
|
||||
|
||||
Args:
|
||||
settings: The resolved application settings used for database access.
|
||||
correlation_id: Optional correlation ID from the triggering request; a new UUID4 is generated when omitted.
|
||||
"""
|
||||
if correlation_id is None:
|
||||
correlation_id = str(uuid.uuid4())
|
||||
|
||||
token = set_correlation_id(correlation_id)
|
||||
try:
|
||||
await _do_update_heartbeat_with_settings(settings)
|
||||
finally:
|
||||
reset_correlation_id(token)
|
||||
|
||||
|
||||
async def _do_update_heartbeat_with_settings(settings: Settings) -> None:
|
||||
"""Inner heartbeat logic that runs with correlation context set."""
|
||||
|
||||
async def _do_update() -> None:
|
||||
async with task_db(settings) as db:
|
||||
success = await update_scheduler_lock_heartbeat(db)
|
||||
|
||||
if success:
|
||||
log.debug("scheduler_lock_heartbeat_updated")
|
||||
log.debug("scheduler_lock_heartbeat_updated", correlation_id=get_correlation_id())
|
||||
else:
|
||||
log.warning(
|
||||
"scheduler_lock_heartbeat_failed",
|
||||
correlation_id=get_correlation_id(),
|
||||
message="Failed to update heartbeat; we no longer hold the lock. "
|
||||
"Another instance may have taken over or the database connection failed.",
|
||||
)
|
||||
@@ -71,12 +93,14 @@ async def _update_heartbeat_with_resources(settings: Settings) -> None:
|
||||
except TimeoutError:
|
||||
log.error(
|
||||
"scheduler_lock_heartbeat_timeout",
|
||||
correlation_id=get_correlation_id(),
|
||||
timeout_seconds=TASK_TIMEOUT_SECONDS,
|
||||
message="Heartbeat update exceeded timeout. The database may be slow or unresponsive.",
|
||||
)
|
||||
except Exception as e:
|
||||
log.error(
|
||||
"scheduler_lock_heartbeat_error",
|
||||
correlation_id=get_correlation_id(),
|
||||
error=str(e),
|
||||
message="Unexpected error during heartbeat update.",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user