This commit is contained in:
2025-10-22 09:20:35 +02:00
parent 1c8c18c1ea
commit 9e686017a6
18 changed files with 5177 additions and 0 deletions

View File

@@ -0,0 +1,380 @@
"""Log management utilities for rotation, archival, and search."""
import gzip
import logging
import shutil
from dataclasses import dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
@dataclass
class LogFile:
    """Information about a log file."""
    filename: str  # base name of the file (no directory component)
    path: Path  # full path to the file
    size_bytes: int  # size as reported by stat()
    created_time: datetime  # from st_ctime (inode-change time on Unix, creation time on Windows)
    modified_time: datetime  # from st_mtime (local timezone)
class LogManager:
"""Manage application logs."""
def __init__(self, log_dir: str = "logs"):
"""Initialize log manager.
Args:
log_dir: Directory containing log files.
"""
self.log_dir = Path(log_dir)
self.log_dir.mkdir(parents=True, exist_ok=True)
self.archived_dir = self.log_dir / "archived"
self.archived_dir.mkdir(exist_ok=True)
def get_log_files(self, pattern: str = "*.log") -> List[LogFile]:
"""Get list of log files.
Args:
pattern: Glob pattern for log files.
Returns:
list: List of LogFile objects.
"""
log_files = []
for log_path in self.log_dir.glob(pattern):
if log_path.is_file():
stat = log_path.stat()
log_files.append(
LogFile(
filename=log_path.name,
path=log_path,
size_bytes=stat.st_size,
created_time=datetime.fromtimestamp(
stat.st_ctime
),
modified_time=datetime.fromtimestamp(
stat.st_mtime
),
)
)
return sorted(log_files, key=lambda x: x.modified_time, reverse=True)
def rotate_log(
self, log_file: str, max_size_bytes: int = 10485760
) -> bool:
"""Rotate a log file if it exceeds max size.
Args:
log_file: Name of the log file.
max_size_bytes: Maximum size before rotation (default 10MB).
Returns:
bool: True if rotation was needed and successful.
"""
try:
log_path = self.log_dir / log_file
if not log_path.exists():
logger.warning(f"Log file not found: {log_file}")
return False
stat = log_path.stat()
if stat.st_size < max_size_bytes:
return False
# Create rotated filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
rotated_name = f"{log_path.stem}_{timestamp}.log"
rotated_path = self.log_dir / rotated_name
shutil.move(str(log_path), str(rotated_path))
# Compress the rotated file
self._compress_log(rotated_path)
logger.info(f"Rotated log file: {log_file} -> {rotated_name}")
return True
except Exception as e:
logger.error(f"Failed to rotate log file {log_file}: {e}")
return False
def _compress_log(self, log_path: Path) -> bool:
"""Compress a log file.
Args:
log_path: Path to the log file.
Returns:
bool: True if compression was successful.
"""
try:
gz_path = log_path.parent / f"{log_path.name}.gz"
with open(log_path, "rb") as f_in:
with gzip.open(gz_path, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
log_path.unlink()
logger.debug(f"Compressed log file: {log_path.name}")
return True
except Exception as e:
logger.error(f"Failed to compress log {log_path}: {e}")
return False
def archive_old_logs(
self, days_old: int = 30
) -> int:
"""Archive log files older than specified days.
Args:
days_old: Archive logs older than this many days.
Returns:
int: Number of logs archived.
"""
try:
cutoff_time = datetime.now() - timedelta(days=days_old)
archived_count = 0
for log_file in self.get_log_files():
if log_file.modified_time < cutoff_time:
try:
archived_path = (
self.archived_dir / log_file.filename
)
shutil.move(str(log_file.path), str(archived_path))
self._compress_log(archived_path)
archived_count += 1
logger.debug(
f"Archived log: {log_file.filename}"
)
except Exception as e:
logger.warning(
f"Failed to archive {log_file.filename}: {e}"
)
logger.info(f"Archived {archived_count} old log files")
return archived_count
except Exception as e:
logger.error(f"Failed to archive logs: {e}")
return 0
def search_logs(
self, search_term: str, case_sensitive: bool = False
) -> Dict[str, List[str]]:
"""Search for lines matching a term in log files.
Args:
search_term: Text to search for.
case_sensitive: Whether search is case-sensitive.
Returns:
dict: Dictionary mapping log files to matching lines.
"""
try:
results = {}
for log_file in self.get_log_files():
try:
with open(log_file.path, "r", encoding="utf-8") as f:
matching_lines = []
for line_num, line in enumerate(f, 1):
if case_sensitive:
if search_term in line:
matching_lines.append(
f"{line_num}: {line.strip()}"
)
else:
if search_term.lower() in line.lower():
matching_lines.append(
f"{line_num}: {line.strip()}"
)
if matching_lines:
results[log_file.filename] = matching_lines
except Exception as e:
logger.warning(
f"Failed to search {log_file.filename}: {e}"
)
logger.debug(
f"Search for '{search_term}' found {len(results)} log files"
)
return results
except Exception as e:
logger.error(f"Failed to search logs: {e}")
return {}
def export_logs(
self,
output_file: str,
log_pattern: str = "*.log",
compress: bool = True,
) -> bool:
"""Export logs to a file or archive.
Args:
output_file: Path to output file.
log_pattern: Pattern for logs to include.
compress: Whether to compress the output.
Returns:
bool: True if export was successful.
"""
try:
output_path = Path(output_file)
if compress:
import tarfile
tar_path = output_path.with_suffix(".tar.gz")
with tarfile.open(tar_path, "w:gz") as tar:
for log_file in self.get_log_files(log_pattern):
tar.add(
log_file.path,
arcname=log_file.filename,
)
logger.info(f"Exported logs to: {tar_path}")
return True
else:
# Concatenate all logs
with open(output_path, "w") as out_f:
for log_file in self.get_log_files(log_pattern):
out_f.write(f"\n\n=== {log_file.filename} ===\n\n")
with open(log_file.path, "r") as in_f:
out_f.write(in_f.read())
logger.info(f"Exported logs to: {output_path}")
return True
except Exception as e:
logger.error(f"Failed to export logs: {e}")
return False
def get_log_stats(self) -> Dict[str, Any]:
"""Get statistics about log files.
Returns:
dict: Log statistics.
"""
try:
log_files = self.get_log_files()
total_size = sum(log.size_bytes for log in log_files)
total_files = len(log_files)
if not log_files:
return {
"total_files": 0,
"total_size_bytes": 0,
"total_size_mb": 0,
"average_size_bytes": 0,
"largest_file": None,
"oldest_file": None,
"newest_file": None,
}
return {
"total_files": total_files,
"total_size_bytes": total_size,
"total_size_mb": total_size / (1024 * 1024),
"average_size_bytes": total_size // total_files,
"largest_file": max(
log_files, key=lambda x: x.size_bytes
).filename,
"oldest_file": log_files[-1].filename,
"newest_file": log_files[0].filename,
}
except Exception as e:
logger.error(f"Failed to get log stats: {e}")
return {}
def cleanup_logs(
self, max_total_size_mb: int = 100, keep_files: int = 5
) -> int:
"""Clean up old logs to maintain size limit.
Args:
max_total_size_mb: Maximum total log size in MB.
keep_files: Minimum files to keep.
Returns:
int: Number of files deleted.
"""
try:
max_bytes = max_total_size_mb * 1024 * 1024
log_files = self.get_log_files()
if len(log_files) <= keep_files:
return 0
total_size = sum(log.size_bytes for log in log_files)
deleted_count = 0
for log_file in reversed(log_files):
if (
total_size <= max_bytes
or len(log_files) <= keep_files
):
break
try:
log_file.path.unlink()
total_size -= log_file.size_bytes
deleted_count += 1
logger.debug(f"Deleted log file: {log_file.filename}")
except Exception as e:
logger.warning(
f"Failed to delete {log_file.filename}: {e}"
)
logger.info(f"Cleaned up {deleted_count} log files")
return deleted_count
except Exception as e:
logger.error(f"Failed to cleanup logs: {e}")
return 0
def set_log_level(self, logger_name: str, level: str) -> bool:
"""Set log level for a specific logger.
Args:
logger_name: Name of the logger.
level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL).
Returns:
bool: True if successful.
"""
try:
log_level = getattr(logging, level.upper(), logging.INFO)
target_logger = logging.getLogger(logger_name)
target_logger.setLevel(log_level)
logger.info(f"Set {logger_name} log level to {level}")
return True
except Exception as e:
logger.error(f"Failed to set log level: {e}")
return False
# Module-level singleton holding the shared manager.
_log_manager: Optional[LogManager] = None


def get_log_manager() -> LogManager:
    """Return the process-wide log manager, creating it on first access.

    Returns:
        LogManager: The log manager instance.
    """
    global _log_manager
    manager = _log_manager
    if manager is None:
        manager = LogManager()
        _log_manager = manager
    return manager

358
src/server/utils/metrics.py Normal file
View File

@@ -0,0 +1,358 @@
"""Metrics collection for Prometheus and custom business metrics."""
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from threading import Lock
from typing import Any, Dict, Optional
logger = logging.getLogger(__name__)
class MetricType(Enum):
    """Types of metrics.

    The string values are emitted verbatim as the ``# TYPE`` line in the
    Prometheus text-format export.
    """
    COUNTER = "counter"  # monotonically increasing value
    GAUGE = "gauge"  # value that may go up or down
    HISTOGRAM = "histogram"  # distribution of observed values
    SUMMARY = "summary"  # quantile summary
@dataclass
class MetricValue:
    """A single metric value with metadata."""
    name: str  # metric name as exported
    value: float  # current value (counters accumulate, gauges overwrite)
    metric_type: MetricType  # counter / gauge / histogram / summary
    labels: Dict[str, str] = field(default_factory=dict)  # label key/value pairs
    timestamp: datetime = field(default_factory=datetime.now)  # creation time (local)
    help_text: str = ""  # optional HELP text for Prometheus export
@dataclass
class HistogramBucket:
    """Histogram bucket for latency tracking."""
    le: float  # bucket upper bound in seconds ("less than or equal")
    count: int = 0  # number of observations falling at or below `le`
class MetricsCollector:
"""Collect and export metrics for monitoring."""
def __init__(self):
"""Initialize metrics collector."""
self._metrics: Dict[str, MetricValue] = {}
self._request_timings: Dict[str, list[float]] = {}
self._download_stats: Dict[str, int] = {
"completed": 0,
"failed": 0,
"total_size_bytes": 0,
}
self._lock = Lock()
self._timers: Dict[str, float] = {}
def increment_counter(
self,
name: str,
value: float = 1.0,
labels: Optional[Dict[str, str]] = None,
help_text: str = "",
) -> None:
"""Increment a counter metric.
Args:
name: Metric name.
value: Amount to increment by.
labels: Optional labels for the metric.
help_text: Optional help text describing the metric.
"""
with self._lock:
if name not in self._metrics:
self._metrics[name] = MetricValue(
name=name,
value=value,
metric_type=MetricType.COUNTER,
labels=labels or {},
help_text=help_text,
)
else:
self._metrics[name].value += value
def set_gauge(
self,
name: str,
value: float,
labels: Optional[Dict[str, str]] = None,
help_text: str = "",
) -> None:
"""Set a gauge metric.
Args:
name: Metric name.
value: Gauge value.
labels: Optional labels for the metric.
help_text: Optional help text describing the metric.
"""
with self._lock:
self._metrics[name] = MetricValue(
name=name,
value=value,
metric_type=MetricType.GAUGE,
labels=labels or {},
help_text=help_text,
)
def observe_histogram(
self,
name: str,
value: float,
labels: Optional[Dict[str, str]] = None,
help_text: str = "",
) -> None:
"""Observe a value for histogram.
Args:
name: Metric name.
value: Value to record.
labels: Optional labels for the metric.
help_text: Optional help text describing the metric.
"""
with self._lock:
if name not in self._request_timings:
self._request_timings[name] = []
self._request_timings[name].append(value)
# Update histogram metric
if name not in self._metrics:
self._metrics[name] = MetricValue(
name=name,
value=value,
metric_type=MetricType.HISTOGRAM,
labels=labels or {},
help_text=help_text,
)
def start_timer(self, timer_name: str) -> None:
"""Start a timer for tracking operation duration.
Args:
timer_name: Name of the timer.
"""
self._timers[timer_name] = time.time()
def end_timer(
self,
timer_name: str,
metric_name: str,
labels: Optional[Dict[str, str]] = None,
) -> float:
"""End a timer and record the duration.
Args:
timer_name: Name of the timer to end.
metric_name: Name of the metric to record.
labels: Optional labels for the metric.
Returns:
Duration in seconds.
"""
if timer_name not in self._timers:
logger.warning(f"Timer {timer_name} not started")
return 0.0
duration = time.time() - self._timers[timer_name]
del self._timers[timer_name]
self.observe_histogram(
metric_name, duration, labels, "Request/operation duration"
)
return duration
def record_download_success(self, size_bytes: int) -> None:
"""Record a successful download.
Args:
size_bytes: Size of downloaded file in bytes.
"""
with self._lock:
self._download_stats["completed"] += 1
self._download_stats["total_size_bytes"] += size_bytes
self.increment_counter(
"downloads_completed_total",
help_text="Total successful downloads",
)
def record_download_failure(self) -> None:
"""Record a failed download."""
with self._lock:
self._download_stats["failed"] += 1
self.increment_counter(
"downloads_failed_total", help_text="Total failed downloads"
)
def get_download_stats(self) -> Dict[str, int]:
"""Get download statistics.
Returns:
dict: Download statistics.
"""
with self._lock:
return self._download_stats.copy()
def get_request_statistics(
self, metric_name: str
) -> Optional[Dict[str, float]]:
"""Get statistics for a request timing metric.
Args:
metric_name: Name of the metric to analyze.
Returns:
Statistics including count, sum, mean, min, max.
"""
with self._lock:
if metric_name not in self._request_timings:
return None
timings = self._request_timings[metric_name]
if not timings:
return None
return {
"count": len(timings),
"sum": sum(timings),
"mean": sum(timings) / len(timings),
"min": min(timings),
"max": max(timings),
"p50": sorted(timings)[len(timings) // 2],
"p99": sorted(timings)[int(len(timings) * 0.99)],
}
def export_prometheus_format(self) -> str:
"""Export metrics in Prometheus text format.
Returns:
str: Prometheus format metrics.
"""
with self._lock:
lines = []
for name, metric in self._metrics.items():
# Add help text if available
if metric.help_text:
lines.append(f"# HELP {name} {metric.help_text}")
lines.append(f"# TYPE {name} {metric.metric_type.value}")
# Format labels
label_str = ""
if metric.labels:
label_pairs = [
f'{k}="{v}"' for k, v in metric.labels.items()
]
label_str = "{" + ",".join(label_pairs) + "}"
# Add metric value
lines.append(f"{name}{label_str} {metric.value}")
return "\n".join(lines)
def export_json(self) -> Dict[str, Any]:
"""Export metrics as JSON.
Returns:
dict: Metrics in JSON-serializable format.
"""
with self._lock:
metrics_dict = {}
for name, metric in self._metrics.items():
metrics_dict[name] = {
"value": metric.value,
"type": metric.metric_type.value,
"labels": metric.labels,
"timestamp": metric.timestamp.isoformat(),
}
return {
"metrics": metrics_dict,
"downloads": self._download_stats,
"request_timings": {
name: self.get_request_statistics(name)
for name in self._request_timings
},
}
def reset_metrics(self) -> None:
"""Reset all collected metrics."""
with self._lock:
self._metrics.clear()
self._request_timings.clear()
self._download_stats = {
"completed": 0,
"failed": 0,
"total_size_bytes": 0,
}
def get_all_metrics(self) -> Dict[str, MetricValue]:
"""Get all collected metrics.
Returns:
dict: All metrics keyed by name.
"""
with self._lock:
return self._metrics.copy()
# Module-level singleton holding the shared collector.
_metrics_collector: Optional[MetricsCollector] = None


def get_metrics_collector() -> MetricsCollector:
    """Return the process-wide metrics collector, creating it lazily.

    Returns:
        MetricsCollector: The metrics collector instance.
    """
    global _metrics_collector
    collector = _metrics_collector
    if collector is None:
        collector = MetricsCollector()
        _metrics_collector = collector
    return collector
class TimerContext:
    """Context manager that times a ``with`` block and records the duration
    on the global metrics collector."""

    def __init__(
        self,
        metric_name: str,
        timer_name: Optional[str] = None,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Initialize timer context.

        Args:
            metric_name: Name of the metric to record.
            timer_name: Optional name for the timer; defaults to the metric
                name when omitted.
            labels: Optional labels for the metric.
        """
        self.metric_name = metric_name
        self.timer_name = (
            timer_name if timer_name is not None else metric_name
        )
        self.labels = labels
        self.collector = get_metrics_collector()

    def __enter__(self):
        """Start timing on block entry and return self."""
        self.collector.start_timer(self.timer_name)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop timing and record the metric, even if the block raised."""
        self.collector.end_timer(
            self.timer_name, self.metric_name, self.labels
        )

361
src/server/utils/system.py Normal file
View File

@@ -0,0 +1,361 @@
"""System utility functions for monitoring and management."""
import logging
import os
import shutil
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
import psutil
logger = logging.getLogger(__name__)
@dataclass
class DiskInfo:
    """Information about disk usage."""
    total_bytes: int  # total capacity of the filesystem
    used_bytes: int  # bytes currently in use
    free_bytes: int  # bytes available
    percent_used: float  # used space as a percentage (psutil's `percent`)
    path: str  # path / mount point the figures refer to
@dataclass
class ProcessInfo:
    """Information about a process."""
    pid: int  # process ID
    name: str  # executable name
    status: str  # psutil status string (e.g. "running", "sleeping")
    cpu_percent: float  # CPU usage percent; psutil returns 0.0 on the first sample
    memory_percent: float  # share of total system memory, in percent
    memory_mb: float  # resident set size, converted to MiB
    create_time: datetime  # process start time (local timezone)
class SystemUtilities:
"""Utilities for system monitoring and management."""
@staticmethod
def get_disk_usage(path: str = "/") -> Optional[DiskInfo]:
"""Get disk usage information.
Args:
path: Path to check disk usage for.
Returns:
DiskInfo: Disk usage information.
"""
try:
usage = psutil.disk_usage(path)
return DiskInfo(
total_bytes=usage.total,
used_bytes=usage.used,
free_bytes=usage.free,
percent_used=usage.percent,
path=path,
)
except Exception as e:
logger.error(f"Failed to get disk usage for {path}: {e}")
return None
@staticmethod
def get_all_disk_usage() -> List[DiskInfo]:
"""Get disk usage for all mounted partitions.
Returns:
list: List of DiskInfo for each partition.
"""
try:
partitions = psutil.disk_partitions()
disk_infos = []
for partition in partitions:
try:
usage = psutil.disk_usage(partition.mountpoint)
disk_infos.append(
DiskInfo(
total_bytes=usage.total,
used_bytes=usage.used,
free_bytes=usage.free,
percent_used=usage.percent,
path=partition.mountpoint,
)
)
except Exception as e:
logger.warning(
f"Failed to get usage for {partition.mountpoint}: {e}"
)
return disk_infos
except Exception as e:
logger.error(f"Failed to get all disk usage: {e}")
return []
@staticmethod
def cleanup_directory(
directory: str, pattern: str = "*", max_age_days: int = 30
) -> int:
"""Clean up files in a directory matching a pattern.
Args:
directory: Directory to clean.
pattern: File pattern to match (glob).
max_age_days: Only delete files older than this.
Returns:
int: Number of files deleted.
"""
try:
from datetime import timedelta
path = Path(directory)
if not path.exists():
logger.warning(f"Directory not found: {directory}")
return 0
deleted_count = 0
cutoff_time = datetime.now() - timedelta(days=max_age_days)
for file_path in path.glob(pattern):
if file_path.is_file():
file_time = datetime.fromtimestamp(
file_path.stat().st_mtime
)
if file_time < cutoff_time:
try:
file_path.unlink()
deleted_count += 1
logger.debug(f"Deleted file: {file_path}")
except Exception as e:
logger.warning(
f"Failed to delete {file_path}: {e}"
)
logger.info(f"Cleaned up {deleted_count} files from {directory}")
return deleted_count
except Exception as e:
logger.error(f"Failed to cleanup directory {directory}: {e}")
return 0
@staticmethod
def cleanup_empty_directories(directory: str) -> int:
"""Remove empty directories.
Args:
directory: Root directory to clean.
Returns:
int: Number of directories deleted.
"""
try:
path = Path(directory)
if not path.exists():
return 0
deleted_count = 0
# Walk from bottom to top to delete empty dirs
for root, dirs, files in os.walk(directory, topdown=False):
for dir_name in dirs:
dir_path = Path(root) / dir_name
try:
if not os.listdir(dir_path):
os.rmdir(dir_path)
deleted_count += 1
logger.debug(
f"Deleted empty directory: {dir_path}"
)
except Exception as e:
logger.debug(f"Cannot delete {dir_path}: {e}")
logger.info(f"Cleaned up {deleted_count} empty directories")
return deleted_count
except Exception as e:
logger.error(f"Failed to cleanup empty directories: {e}")
return 0
@staticmethod
def get_directory_size(directory: str) -> int:
"""Get total size of a directory.
Args:
directory: Directory path.
Returns:
int: Total size in bytes.
"""
try:
path = Path(directory)
if not path.exists():
return 0
total_size = 0
for entry in path.rglob("*"):
if entry.is_file():
total_size += entry.stat().st_size
return total_size
except Exception as e:
logger.error(f"Failed to get directory size for {directory}: {e}")
return 0
@staticmethod
def get_process_info(pid: Optional[int] = None) -> Optional[ProcessInfo]:
"""Get information about a process.
Args:
pid: Process ID. If None, uses current process.
Returns:
ProcessInfo: Process information.
"""
try:
if pid is None:
pid = os.getpid()
process = psutil.Process(pid)
with process.oneshot():
return ProcessInfo(
pid=process.pid,
name=process.name(),
status=process.status(),
cpu_percent=process.cpu_percent(),
memory_percent=process.memory_percent(),
memory_mb=process.memory_info().rss / (1024 * 1024),
create_time=datetime.fromtimestamp(
process.create_time()
),
)
except Exception as e:
logger.error(f"Failed to get process info for {pid}: {e}")
return None
@staticmethod
def get_all_processes() -> List[ProcessInfo]:
"""Get information about all running processes.
Returns:
list: List of ProcessInfo for each process.
"""
try:
processes = []
for proc in psutil.process_iter(
["pid", "name", "status", "cpu_num", "memory_percent"]
):
try:
info = SystemUtilities.get_process_info(proc.pid)
if info:
processes.append(info)
except Exception:
pass
return processes
except Exception as e:
logger.error(f"Failed to get all processes: {e}")
return []
@staticmethod
def get_system_info() -> Dict[str, Any]:
"""Get comprehensive system information.
Returns:
dict: System information.
"""
try:
import platform
return {
"platform": platform.platform(),
"processor": platform.processor(),
"cpu_count": psutil.cpu_count(logical=False),
"cpu_count_logical": psutil.cpu_count(logical=True),
"boot_time": datetime.fromtimestamp(
psutil.boot_time()
).isoformat(),
"hostname": platform.node(),
"python_version": platform.python_version(),
}
except Exception as e:
logger.error(f"Failed to get system info: {e}")
return {}
@staticmethod
def get_network_info() -> Dict[str, Any]:
"""Get network information.
Returns:
dict: Network statistics.
"""
try:
net_io = psutil.net_io_counters()
return {
"bytes_sent": net_io.bytes_sent,
"bytes_recv": net_io.bytes_recv,
"packets_sent": net_io.packets_sent,
"packets_recv": net_io.packets_recv,
"errors_in": net_io.errin,
"errors_out": net_io.errout,
"dropped_in": net_io.dropin,
"dropped_out": net_io.dropout,
}
except Exception as e:
logger.error(f"Failed to get network info: {e}")
return {}
@staticmethod
def copy_file_atomic(
src: str, dest: str, chunk_size: int = 1024 * 1024
) -> bool:
"""Copy a file atomically using temporary file.
Args:
src: Source file path.
dest: Destination file path.
chunk_size: Size of chunks for copying.
Returns:
bool: True if successful.
"""
try:
src_path = Path(src)
dest_path = Path(dest)
if not src_path.exists():
logger.error(f"Source file not found: {src}")
return False
# Create temporary file
temp_path = dest_path.parent / f"{dest_path.name}.tmp"
# Copy to temporary file
shutil.copyfile(src, temp_path)
# Atomic rename
temp_path.replace(dest_path)
logger.debug(f"Atomically copied {src} to {dest}")
return True
except Exception as e:
logger.error(f"Failed to copy file {src} to {dest}: {e}")
return False
# Module-level singleton holding the shared utilities object.
_system_utilities: Optional[SystemUtilities] = None


def get_system_utilities() -> SystemUtilities:
    """Return the process-wide system utilities object, creating it lazily.

    Returns:
        SystemUtilities: The system utilities instance.
    """
    global _system_utilities
    utilities = _system_utilities
    if utilities is None:
        utilities = SystemUtilities()
        _system_utilities = utilities
    return utilities