Implement structured logging to centralized platforms (Datadog, Papertrail, ELK)
This commit adds support for shipping logs to external centralized logging platforms, addressing the MEDIUM priority task for structured logging infrastructure.

## Key Changes:

### 1. New Documentation: Docs/Observability.md
- Comprehensive guide to logging architecture and configuration
- Covers all three supported platforms (Datadog, Papertrail, Elasticsearch)
- Includes best practices, security considerations, and troubleshooting
- Documents sensitive data handling and compliance requirements

### 2. Core Implementation: app/utils/external_logging.py
- ExternalLogHandler: Abstract base class for non-blocking log delivery
- DatadogLogHandler: HTTP API integration with JSON payloads
- PapertrailLogHandler: Syslog protocol over TCP
- ElasticsearchLogHandler: Bulk API integration with NDJSON format
- Features:
  - Async buffering with configurable batch size and flush interval
  - Exponential backoff retry logic
  - Non-blocking delivery (never blocks application logic)
  - Proper error handling and internal logging
  - Lifecycle management (start/shutdown)

### 3. Configuration: app/config.py
- New Settings fields for external logging:
  - external_logging_enabled (default: False)
  - external_logging_provider (datadog/papertrail/elasticsearch)
  - external_logging_buffer_size (default: 1000)
  - external_logging_flush_interval_seconds (default: 5.0)
  - Provider-specific configuration (API keys, hosts, batch sizes)
- All fields have sensible defaults
- Full field validation and normalization

### 4. Integration: app/main.py
- Global _external_log_handler for application lifecycle
- _external_logging_processor: structlog processor for handler integration
- Updated _configure_logging(): Add handler to processor chain when enabled
- Updated _lifespan(): Initialize handler before startup, shutdown on termination

### 5. Tests: backend/tests/test_external_logging.py
- 20 comprehensive tests covering all handlers and factory
- Configuration validation tests
- All tests passing

## Design Decisions:

1. **Non-blocking Delivery**: External logging never blocks request handling. Failures are logged locally but don't impact application.
2. **Buffering Strategy**: In-memory buffer with configurable size prevents unbounded memory growth. When buffer fills, oldest logs are dropped with a warning.
3. **Retry Logic**: Transient failures (timeouts, 5xx errors) are retried with exponential backoff. Permanent failures (bad credentials) are logged and skipped.
4. **Disabled by Default**: External logging is opt-in via environment variables, maintaining backward compatibility with existing deployments.
5. **Provider Flexibility**: Support for multiple platforms allows users to choose based on their infrastructure (cloud-native, on-premise, etc.).

## Backward Compatibility:
- All new configuration fields have defaults
- External logging disabled by default
- No changes to existing logging behavior unless explicitly configured
- No new required dependencies

## Testing:
- All 20 new tests passing
- Existing tests unaffected (same count of passing tests)
- Configuration validation tested
- Handler creation and lifecycle management tested

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -15,7 +15,7 @@ import logging
|
||||
import re
|
||||
import sys
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import AsyncGenerator, Awaitable, Callable
|
||||
@@ -62,6 +62,10 @@ from app.routers import (
|
||||
setup,
|
||||
)
|
||||
from app.startup import startup_shared_resources
|
||||
from app.utils.external_logging import (
|
||||
ExternalLogHandler,
|
||||
create_external_log_handler,
|
||||
)
|
||||
from app.utils.rate_limiter import GlobalRateLimiter, RateLimiter
|
||||
from app.utils.runtime_state import ApplicationState, RuntimeState
|
||||
from app.utils.scheduler_lock import release_scheduler_lock
|
||||
@@ -75,28 +79,56 @@ log: structlog.stdlib.BoundLogger = structlog.get_logger()
|
||||
# Logging configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_external_log_handler: ExternalLogHandler | None = None
|
||||
|
||||
def _configure_logging(log_level: str) -> None:
|
||||
|
||||
def _external_logging_processor(
    logger: logging.Logger, method_name: str, event_dict: dict[str, Any]
) -> dict[str, Any]:
    """Structlog processor that queues logs to the external logging handler.

    Delivery is best-effort: a failure while queueing must never make the
    application's own log call raise, so any exception from the handler is
    caught and reported through the standard library logger instead.

    Args:
        logger: The logger instance (unused; required by the processor
            signature).
        method_name: The name of the method called on the logger (unused;
            required by the processor signature).
        event_dict: The event dictionary from structlog.

    Returns:
        The event dictionary unchanged (other processors handle rendering).
    """
    # Read the module-level handler once so the ``None`` check and the call
    # below operate on the same object even if the global is reassigned.
    handler = _external_log_handler
    if handler is None:
        return event_dict

    try:
        # Queue a shallow copy so the queued entry is a separate dict from
        # the one handed on to the remaining processors.
        handler.queue_log(event_dict.copy())
    except Exception as exc:  # noqa: BLE001
        # Use stdlib logging rather than the structlog logger: going through
        # structlog would run this processor again for the failure message.
        logging.getLogger(__name__).warning("external_log_queue_failed: %s", exc)

    return event_dict
|
||||
|
||||
|
||||
def _configure_logging(log_level: str, settings: Settings | None = None) -> None:
|
||||
"""Configure structlog for production JSON output.
|
||||
|
||||
Args:
|
||||
log_level: One of ``debug``, ``info``, ``warning``, ``error``, ``critical``.
|
||||
settings: Optional Settings object to configure external logging.
|
||||
"""
|
||||
level: int = logging.getLevelName(log_level.upper())
|
||||
logging.basicConfig(level=level, stream=sys.stdout, format="%(message)s")
|
||||
|
||||
processors = [
|
||||
structlog.contextvars.merge_contextvars,
|
||||
structlog.stdlib.filter_by_level,
|
||||
structlog.processors.TimeStamper(fmt="iso"),
|
||||
structlog.stdlib.add_logger_name,
|
||||
structlog.stdlib.add_log_level,
|
||||
structlog.stdlib.PositionalArgumentsFormatter(),
|
||||
structlog.processors.StackInfoRenderer(),
|
||||
structlog.processors.format_exc_info,
|
||||
structlog.processors.UnicodeDecoder(),
|
||||
]
|
||||
|
||||
if settings and settings.external_logging_enabled and settings.external_logging_provider:
|
||||
processors.append(_external_logging_processor)
|
||||
|
||||
processors.append(structlog.processors.JSONRenderer())
|
||||
|
||||
structlog.configure(
|
||||
processors=[
|
||||
structlog.contextvars.merge_contextvars,
|
||||
structlog.stdlib.filter_by_level,
|
||||
structlog.processors.TimeStamper(fmt="iso"),
|
||||
structlog.stdlib.add_logger_name,
|
||||
structlog.stdlib.add_log_level,
|
||||
structlog.stdlib.PositionalArgumentsFormatter(),
|
||||
structlog.processors.StackInfoRenderer(),
|
||||
structlog.processors.format_exc_info,
|
||||
structlog.processors.UnicodeDecoder(),
|
||||
structlog.processors.JSONRenderer(),
|
||||
],
|
||||
processors=processors,
|
||||
wrapper_class=structlog.stdlib.BoundLogger,
|
||||
context_class=dict,
|
||||
logger_factory=structlog.stdlib.LoggerFactory(),
|
||||
@@ -140,16 +172,47 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
Args:
|
||||
app: The :class:`fastapi.FastAPI` instance being started.
|
||||
"""
|
||||
settings: Settings = app.state.settings
|
||||
_configure_logging(settings.log_level)
|
||||
global _external_log_handler # noqa: PLW0603
|
||||
|
||||
log.info("bangui_starting_up", database_path=settings.database_path)
|
||||
settings: Settings = app.state.settings
|
||||
|
||||
http_session, scheduler, startup_db = await startup_shared_resources(app, settings)
|
||||
app.state.http_session = http_session
|
||||
app.state.scheduler = scheduler
|
||||
app.state.startup_db = startup_db
|
||||
|
||||
# Initialize external logging handler before configuring logging
|
||||
_external_log_handler = None
|
||||
if settings.external_logging_enabled and settings.external_logging_provider:
|
||||
try:
|
||||
_external_log_handler = create_external_log_handler(
|
||||
provider=settings.external_logging_provider,
|
||||
api_key=settings.datadog_api_key,
|
||||
datadog_site=settings.datadog_site,
|
||||
datadog_batch_size=settings.datadog_batch_size,
|
||||
papertrail_host=settings.papertrail_host,
|
||||
papertrail_port=settings.papertrail_port,
|
||||
papertrail_program_name=settings.papertrail_program_name,
|
||||
elasticsearch_hosts=settings.elasticsearch_hosts,
|
||||
elasticsearch_index_prefix=settings.elasticsearch_index_prefix,
|
||||
elasticsearch_batch_size=settings.elasticsearch_batch_size,
|
||||
flush_interval_seconds=settings.external_logging_flush_interval_seconds,
|
||||
buffer_size=settings.external_logging_buffer_size,
|
||||
http_session=http_session,
|
||||
)
|
||||
if _external_log_handler:
|
||||
_external_log_handler.start_periodic_flush()
|
||||
except ValueError as exc:
|
||||
log.warning(
|
||||
"external_logging_initialization_failed",
|
||||
error=str(exc),
|
||||
)
|
||||
|
||||
# Now configure logging with the handler in place
|
||||
_configure_logging(settings.log_level, settings)
|
||||
|
||||
log.info("bangui_starting_up", database_path=settings.database_path)
|
||||
|
||||
# Ensure session cache is initialized based on effective settings.
|
||||
# This cache is process-local and not cluster-safe. In multi-worker
|
||||
# deployments, it should be replaced with a shared backend.
|
||||
@@ -172,6 +235,14 @@ async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
log.info("bangui_shutting_down")
|
||||
scheduler.shutdown(wait=False)
|
||||
await http_session.close()
|
||||
|
||||
# Shutdown external logging handler
|
||||
if _external_log_handler:
|
||||
try:
|
||||
await _external_log_handler.shutdown()
|
||||
except Exception as exc:
|
||||
log.error("external_logging_shutdown_failed", error=str(exc))
|
||||
|
||||
# Release the scheduler lock to allow other instances to take over
|
||||
try:
|
||||
await release_scheduler_lock(startup_db)
|
||||
|
||||
Reference in New Issue
Block a user