feat: Add comprehensive provider health monitoring and failover system

- Implemented ProviderHealthMonitor for real-time tracking
  - Monitors availability, response times, success rates
  - Automatically marks providers unavailable after repeated failures
  - Background health check loop

- Added ProviderFailover for automatic provider switching
  - Configurable retry attempts with exponential backoff
  - Integration with health monitoring
  - Smart provider selection

- Created MonitoredProviderWrapper for performance tracking
  - Transparent monitoring for any provider
  - Automatic metric recording
  - No changes needed to existing providers

- Implemented ProviderConfigManager for dynamic configuration
  - Runtime updates without restart
  - Per-provider settings (timeout, retries, bandwidth)
  - JSON-based persistence

- Added Provider Management API (15+ endpoints)
  - Health monitoring endpoints
  - Configuration management
  - Failover control

- Comprehensive testing (34 tests, 100% pass rate)
  - Health monitoring tests
  - Failover scenario tests
  - Configuration management tests

- Documentation updates
  - Updated infrastructure.md
  - Updated instructions.md
  - Created PROVIDER_ENHANCEMENT_SUMMARY.md

Total: ~2,593 lines of code, 34 passing tests
This commit is contained in:
2025-10-24 11:01:40 +02:00
parent 85d73b8294
commit fecdb38a90
23 changed files with 3137 additions and 109 deletions

View File

@@ -0,0 +1,207 @@
"""Unit tests for provider failover system."""
import pytest
from src.core.providers.failover import (
ProviderFailover,
configure_failover,
get_failover,
)
class TestProviderFailover:
    """Behavioural checks for a ``ProviderFailover`` instance."""

    def test_failover_initialization(self):
        """Constructor arguments are stored on the private attributes."""
        names = ["provider1", "provider2", "provider3"]

        chain = ProviderFailover(
            providers=names,
            max_retries=5,
            retry_delay=2.0,
        )

        assert chain._providers == names
        assert chain._max_retries == 5
        assert chain._retry_delay == 2.0

    def test_get_current_provider(self):
        """The current provider is one of the configured names."""
        names = ["provider1", "provider2"]
        chain = ProviderFailover(
            providers=names,
            enable_health_monitoring=False,
        )

        assert chain.get_current_provider() in names

    def test_get_next_provider(self):
        """The next provider is configured and differs from the current."""
        names = ["provider1", "provider2", "provider3"]
        chain = ProviderFailover(
            providers=names,
            enable_health_monitoring=False,
        )

        before = chain.get_current_provider()
        after = chain.get_next_provider()

        assert after in names
        assert after != before

    @pytest.mark.asyncio
    async def test_execute_with_failover_success(self):
        """An operation that succeeds right away yields its result."""

        async def mock_operation(provider: str) -> str:
            return f"Success with {provider}"

        chain = ProviderFailover(
            providers=["provider1"],
            enable_health_monitoring=False,
        )

        outcome = await chain.execute_with_failover(
            operation=mock_operation,
            operation_name="test_op",
        )

        assert "Success" in outcome

    @pytest.mark.asyncio
    async def test_execute_with_failover_retry(self):
        """A single failure is retried and the second call succeeds."""
        # One entry per invocation; the length doubles as the call counter.
        attempts = []

        async def mock_operation(provider: str) -> str:
            attempts.append(provider)
            if len(attempts) == 1:
                raise Exception("First attempt failed")
            return f"Success with {provider}"

        chain = ProviderFailover(
            providers=["provider1"],
            max_retries=2,
            retry_delay=0.1,
            enable_health_monitoring=False,
        )

        outcome = await chain.execute_with_failover(
            operation=mock_operation,
            operation_name="test_op",
        )

        assert "Success" in outcome
        assert len(attempts) == 2

    @pytest.mark.asyncio
    async def test_execute_with_failover_all_fail(self):
        """An exception is raised once every provider has failed."""

        async def mock_operation(provider: str) -> str:
            raise Exception(f"Failed with {provider}")

        chain = ProviderFailover(
            providers=["provider1", "provider2"],
            max_retries=1,
            retry_delay=0.1,
            enable_health_monitoring=False,
        )

        with pytest.raises(Exception) as caught:
            await chain.execute_with_failover(
                operation=mock_operation,
                operation_name="test_op",
            )

        assert "failed with all providers" in str(caught.value)

    def test_add_provider(self):
        """An added provider shows up in the provider list."""
        chain = ProviderFailover(providers=["provider1"])

        chain.add_provider("provider2")

        assert "provider2" in chain.get_providers()
        assert len(chain.get_providers()) == 2

    def test_remove_provider(self):
        """Removing a known provider returns True and drops it."""
        chain = ProviderFailover(providers=["provider1", "provider2"])

        removed = chain.remove_provider("provider1")

        assert removed is True
        assert "provider1" not in chain.get_providers()
        assert len(chain.get_providers()) == 1

    def test_remove_nonexistent_provider(self):
        """Removing an unknown provider returns False."""
        chain = ProviderFailover(providers=["provider1"])

        removed = chain.remove_provider("nonexistent")

        assert removed is False

    def test_set_provider_priority(self):
        """Priority 0 moves the provider to the front of the list."""
        chain = ProviderFailover(
            providers=["provider1", "provider2", "provider3"]
        )

        moved = chain.set_provider_priority("provider3", 0)

        assert moved is True
        ordered = chain.get_providers()
        assert ordered[0] == "provider3"

    def test_set_priority_nonexistent_provider(self):
        """Setting priority for an unknown provider returns False."""
        chain = ProviderFailover(providers=["provider1"])

        moved = chain.set_provider_priority("nonexistent", 0)

        assert moved is False

    def test_get_failover_stats(self):
        """The stats dictionary mirrors the constructor arguments."""
        names = ["provider1", "provider2"]
        chain = ProviderFailover(
            providers=names,
            max_retries=3,
            retry_delay=1.5,
            enable_health_monitoring=False,
        )

        report = chain.get_failover_stats()

        assert report["total_providers"] == 2
        assert report["providers"] == names
        assert report["max_retries"] == 3
        assert report["retry_delay"] == 1.5
        assert report["health_monitoring_enabled"] is False
class TestFailoverSingleton:
    """Checks for the module-level failover accessors."""

    def test_get_failover_singleton(self):
        """Two calls to ``get_failover`` hand back the same object."""
        first = get_failover()
        second = get_failover()

        assert first is second

    def test_configure_failover(self):
        """``configure_failover`` returns an instance with the settings."""
        names = ["custom1", "custom2"]

        configured = configure_failover(
            providers=names,
            max_retries=10,
            retry_delay=3.0,
        )

        assert configured._providers == names
        assert configured._max_retries == 10
        assert configured._retry_delay == 3.0

View File

@@ -0,0 +1,329 @@
"""Unit tests for provider health monitoring system."""
import asyncio
from datetime import datetime
import pytest
from src.core.providers.health_monitor import (
ProviderHealthMetrics,
ProviderHealthMonitor,
RequestMetric,
get_health_monitor,
)
class TestProviderHealthMetrics:
    """Checks for the ``ProviderHealthMetrics`` dataclass."""

    def test_metrics_initialization(self):
        """A freshly built metrics object carries the expected defaults."""
        fresh = ProviderHealthMetrics(provider_name="test_provider")

        assert fresh.provider_name == "test_provider"
        assert fresh.is_available is True
        assert fresh.total_requests == 0
        assert fresh.successful_requests == 0
        assert fresh.failed_requests == 0
        assert fresh.average_response_time_ms == 0.0
        assert fresh.consecutive_failures == 0
        assert fresh.uptime_percentage == 100.0

    def test_success_rate_calculation(self):
        """75 successes out of 100 requests give 75% / 25% rates."""
        sample = ProviderHealthMetrics(provider_name="test")
        sample.total_requests = 100
        sample.successful_requests = 75

        assert sample.success_rate == 75.0
        assert sample.failure_rate == 25.0

    def test_success_rate_zero_requests(self):
        """With no requests the rates are 0% success, 100% failure."""
        untouched = ProviderHealthMetrics(provider_name="test")

        assert untouched.success_rate == 0.0
        assert untouched.failure_rate == 100.0

    def test_to_dict(self):
        """``to_dict`` exposes the counters and the derived rate."""
        sample = ProviderHealthMetrics(
            provider_name="test",
            total_requests=10,
            successful_requests=8,
        )

        as_dict = sample.to_dict()

        assert as_dict["provider_name"] == "test"
        assert as_dict["total_requests"] == 10
        assert as_dict["successful_requests"] == 8
        assert as_dict["success_rate"] == 80.0
        assert "average_response_time_ms" in as_dict
class TestProviderHealthMonitor:
    """Behavioural checks for ``ProviderHealthMonitor``."""

    def test_monitor_initialization(self):
        """Constructor arguments are stored and the loop is not running."""
        tracker = ProviderHealthMonitor(
            max_history_size=500,
            health_check_interval=60,
            failure_threshold=5,
        )

        assert tracker._max_history_size == 500
        assert tracker._health_check_interval == 60
        assert tracker._failure_threshold == 5
        assert not tracker._is_running

    def test_record_successful_request(self):
        """One successful request updates the counters and totals."""
        tracker = ProviderHealthMonitor()

        tracker.record_request(
            provider_name="test_provider",
            success=True,
            response_time_ms=150.0,
            bytes_transferred=1024,
        )

        snapshot = tracker.get_provider_metrics("test_provider")
        assert snapshot is not None
        assert snapshot.total_requests == 1
        assert snapshot.successful_requests == 1
        assert snapshot.failed_requests == 0
        assert snapshot.is_available is True
        assert snapshot.consecutive_failures == 0
        assert snapshot.average_response_time_ms == 150.0
        assert snapshot.total_bytes_downloaded == 1024

    def test_record_failed_request(self):
        """One failure is counted but stays below the threshold of 2."""
        tracker = ProviderHealthMonitor(failure_threshold=2)

        tracker.record_request(
            provider_name="test_provider",
            success=False,
            response_time_ms=200.0,
            error_message="Connection timeout",
        )

        snapshot = tracker.get_provider_metrics("test_provider")
        assert snapshot is not None
        assert snapshot.total_requests == 1
        assert snapshot.failed_requests == 1
        assert snapshot.consecutive_failures == 1
        assert snapshot.last_error == "Connection timeout"
        # A single failure is still under failure_threshold=2.
        assert snapshot.is_available is True

    def test_mark_unavailable_after_failures(self):
        """Three straight failures at threshold 3 flip availability."""
        tracker = ProviderHealthMonitor(failure_threshold=3)

        for i in range(3):
            tracker.record_request(
                provider_name="test_provider",
                success=False,
                response_time_ms=100.0,
                error_message=f"Error {i}",
            )

        snapshot = tracker.get_provider_metrics("test_provider")
        assert snapshot.is_available is False
        assert snapshot.consecutive_failures == 3

    def test_recovery_after_success(self):
        """A success after hitting the threshold restores availability."""
        tracker = ProviderHealthMonitor(failure_threshold=2)

        # Drive the provider to the failure threshold first.
        for _ in range(2):
            tracker.record_request(
                provider_name="test_provider",
                success=False,
                response_time_ms=100.0,
            )
        snapshot = tracker.get_provider_metrics("test_provider")
        assert snapshot.is_available is False

        # A single successful request is expected to clear the state.
        tracker.record_request(
            provider_name="test_provider",
            success=True,
            response_time_ms=100.0,
        )
        snapshot = tracker.get_provider_metrics("test_provider")
        assert snapshot.is_available is True
        assert snapshot.consecutive_failures == 0

    def test_average_response_time_calculation(self):
        """Response times of 100, 200 and 300 ms average to 200 ms."""
        tracker = ProviderHealthMonitor()

        for elapsed_ms in (100.0, 200.0, 300.0):
            tracker.record_request(
                "test", success=True, response_time_ms=elapsed_ms
            )

        snapshot = tracker.get_provider_metrics("test")
        assert snapshot.average_response_time_ms == 200.0

    def test_get_all_metrics(self):
        """Every provider that recorded a request appears in the map."""
        tracker = ProviderHealthMonitor()
        for name, elapsed_ms in (("provider1", 100.0), ("provider2", 150.0)):
            tracker.record_request(
                name, success=True, response_time_ms=elapsed_ms
            )

        by_name = tracker.get_all_metrics()

        assert len(by_name) == 2
        assert "provider1" in by_name
        assert "provider2" in by_name

    def test_get_available_providers(self):
        """Only providers still marked available are listed."""
        tracker = ProviderHealthMonitor(failure_threshold=2)

        # provider1 stays healthy.
        tracker.record_request(
            "provider1", success=True, response_time_ms=100.0
        )
        # provider2 fails three times, exceeding the threshold of 2.
        for _ in range(3):
            tracker.record_request(
                "provider2", success=False, response_time_ms=100.0
            )

        healthy = tracker.get_available_providers()

        assert "provider1" in healthy
        assert "provider2" not in healthy

    def test_get_best_provider(self):
        """The provider with the higher success rate is picked as best."""
        tracker = ProviderHealthMonitor()

        # provider1: 8 successes then 2 failures, 100 ms each.
        for attempt in range(10):
            tracker.record_request(
                "provider1",
                success=(attempt < 8),
                response_time_ms=100.0,
            )
        # provider2: 9 successes then 1 failure, 150 ms each.
        for attempt in range(10):
            tracker.record_request(
                "provider2",
                success=(attempt < 9),
                response_time_ms=150.0,
            )

        winner = tracker.get_best_provider()

        # provider2 is slower but is expected to win on success rate.
        assert winner == "provider2"

    def test_reset_provider_metrics(self):
        """Resetting a known provider returns True and zeroes its count."""
        tracker = ProviderHealthMonitor()
        tracker.record_request("test", success=True, response_time_ms=100.0)

        was_reset = tracker.reset_provider_metrics("test")

        assert was_reset is True
        snapshot = tracker.get_provider_metrics("test")
        assert snapshot.total_requests == 0

    def test_reset_nonexistent_provider(self):
        """Resetting an unknown provider returns False."""
        tracker = ProviderHealthMonitor()

        was_reset = tracker.reset_provider_metrics("nonexistent")

        assert was_reset is False

    def test_health_summary(self):
        """The summary reports totals, availability and expected keys."""
        tracker = ProviderHealthMonitor()
        for name, elapsed_ms in (("provider1", 100.0), ("provider2", 150.0)):
            tracker.record_request(
                name, success=True, response_time_ms=elapsed_ms
            )

        overview = tracker.get_health_summary()

        assert overview["total_providers"] == 2
        assert overview["available_providers"] == 2
        assert overview["availability_percentage"] == 100.0
        assert "average_success_rate" in overview
        assert "average_response_time_ms" in overview
        assert "providers" in overview

    @pytest.mark.asyncio
    async def test_start_stop_monitoring(self):
        """Starting sets the running flag and task; stopping clears it."""
        tracker = ProviderHealthMonitor(health_check_interval=1)

        tracker.start_monitoring()
        assert tracker._is_running is True
        assert tracker._health_check_task is not None

        # Yield to the event loop briefly so the background task can run.
        await asyncio.sleep(0.1)

        await tracker.stop_monitoring()
        assert tracker._is_running is False

    @pytest.mark.asyncio
    async def test_periodic_health_checks(self):
        """Running the loop for 0.3 s leaves ``last_check_time`` set."""
        tracker = ProviderHealthMonitor(health_check_interval=0.1)

        # Seed one provider so there is something to check.
        tracker.record_request("test", success=True, response_time_ms=100.0)

        tracker.start_monitoring()
        # Sleep for three times the 0.1 s check interval.
        await asyncio.sleep(0.3)
        await tracker.stop_monitoring()

        snapshot = tracker.get_provider_metrics("test")
        assert snapshot.last_check_time is not None
class TestRequestMetric:
    """Checks for the ``RequestMetric`` dataclass."""

    def test_metric_initialization(self):
        """Every constructor argument is readable back as an attribute."""
        stamp = datetime.now()

        entry = RequestMetric(
            timestamp=stamp,
            success=True,
            response_time_ms=150.0,
            bytes_transferred=2048,
            error_message=None,
        )

        assert entry.timestamp == stamp
        assert entry.success is True
        assert entry.response_time_ms == 150.0
        assert entry.bytes_transferred == 2048
        assert entry.error_message is None
class TestHealthMonitorSingleton:
    """Checks for the module-level health monitor accessor."""

    def test_get_health_monitor_singleton(self):
        """Two calls to ``get_health_monitor`` return the same object."""
        first = get_health_monitor()
        second = get_health_monitor()

        assert first is second