# AniWorld Alerting Rules
#
# Prometheus alerting rules, split into three groups:
#   aniworld.rules        - application, host-resource, database, download,
#                           process, network, security and cache alerts
#   infrastructure.rules  - Redis and Nginx alerts
#   custom.rules          - collection/backlog alerts
#
# Each rule fires only after `expr` has been continuously true for the `for`
# duration. For rules built on a range selector (`[5m]`, `[1h]`, ...) the
# total time before firing is therefore the range window plus `for`.
groups:
  - name: aniworld.rules
    rules:
      # Application Health Alerts
      - alert: AniWorldDown
        expr: up{job="aniworld-web"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "AniWorld application is down"
          description: "AniWorld web application has been down for more than 1 minute."

      # The quantile is computed per exported label set; the buckets are not
      # aggregated across labels before histogram_quantile is applied.
      - alert: AniWorldHighResponseTime
        expr: histogram_quantile(0.95, rate(flask_request_duration_seconds_bucket[5m])) > 5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High response time for AniWorld"
          description: "95th percentile response time is {{ $value }} seconds."

      # System Resource Alerts
      - alert: HighCPUUsage
        expr: aniworld_cpu_usage_percent > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on AniWorld server"
          description: "CPU usage is above 80% for more than 5 minutes. Current value: {{ $value }}%"

      - alert: HighMemoryUsage
        expr: aniworld_memory_usage_percent > 85
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on AniWorld server"
          description: "Memory usage is above 85% for more than 3 minutes. Current value: {{ $value }}%"

      - alert: CriticalMemoryUsage
        expr: aniworld_memory_usage_percent > 95
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage on AniWorld server"
          description: "Memory usage is above 95%. Current value: {{ $value }}%"

      - alert: HighDiskUsage
        expr: aniworld_disk_usage_percent > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage on AniWorld server"
          description: "Disk usage is above 90% for more than 5 minutes. Current value: {{ $value }}%"

      - alert: CriticalDiskUsage
        expr: aniworld_disk_usage_percent > 95
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Critical disk usage on AniWorld server"
          description: "Disk usage is above 95%. Current value: {{ $value }}%"

      # Database Alerts
      # Requires the web target to be up, so a full application outage is
      # reported by AniWorldDown rather than by this rule.
      - alert: DatabaseConnectionFailure
        expr: up{job="aniworld-web"} == 1 and aniworld_database_connected == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Database connection failure"
          description: "AniWorld cannot connect to the database for more than 2 minutes."

      - alert: SlowDatabaseQueries
        expr: aniworld_database_query_duration_seconds > 5
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Slow database queries detected"
          description: "Database queries are taking longer than 5 seconds. Current duration: {{ $value }}s"

      # Download Performance Alerts
      # With no downloads in the window the ratio is 0/0 = NaN, which never
      # satisfies `> 0.1`, so an idle system does not trigger this rule.
      - alert: HighDownloadFailureRate
        expr: rate(aniworld_downloads_failed_total[5m]) / rate(aniworld_downloads_total[5m]) > 0.1
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High download failure rate"
          description: "Download failure rate is above 10% for the last 5 minutes."

      # The 1h window must show zero new downloads continuously for 2h before
      # the alert fires.
      - alert: NoDownloadActivity
        expr: increase(aniworld_downloads_total[1h]) == 0
        for: 2h
        labels:
          severity: info
        annotations:
          summary: "No download activity detected"
          description: "No downloads have been initiated in the last 2 hours."

      # Process Alerts
      - alert: HighThreadCount
        expr: aniworld_process_threads > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High thread count in AniWorld process"
          description: "Thread count is above 100 for more than 5 minutes. Current count: {{ $value }}"

      # delta() rather than increase(): increase() is meant for counters and
      # treats every drop in value as a counter reset, which overstates growth
      # for a value that can go down as well as up.
      # NOTE(review): assumes aniworld_process_memory_bytes is a gauge - confirm.
      - alert: ProcessMemoryLeak
        expr: delta(aniworld_process_memory_bytes[1h]) > 100000000  # 100 MB
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Potential memory leak detected"
          description: "Process memory usage has increased by more than 100MB in the last hour."

      # Network Alerts
      - alert: NetworkConnectivityIssue
        expr: aniworld_network_connectivity == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Network connectivity issue"
          description: "AniWorld is experiencing network connectivity issues."

      # Security Alerts
      - alert: HighFailedLoginAttempts
        expr: increase(aniworld_failed_login_attempts_total[5m]) > 10
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High number of failed login attempts"
          description: "More than 10 failed login attempts in the last 5 minutes."

      - alert: UnauthorizedAPIAccess
        expr: increase(aniworld_unauthorized_api_requests_total[5m]) > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High number of unauthorized API requests"
          description: "More than 50 unauthorized API requests in the last 5 minutes."

      # Cache Performance Alerts
      # The threshold 0.7 treats the metric as a 0-1 ratio, not a percentage.
      - alert: LowCacheHitRate
        expr: aniworld_cache_hit_rate < 0.7
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "Low cache hit rate"
          description: "Cache hit rate is below 70% for more than 10 minutes. Current rate: {{ $value }}"

  - name: infrastructure.rules
    rules:
      # Redis Alerts
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis server has been down for more than 1 minute."

      # Series whose max is 0 are dropped from the divisor: dividing by 0
      # yields +Inf, which would keep this alert firing permanently.
      # NOTE(review): presumably 0 means no maxmemory limit is configured -
      # confirm against the exporter in use.
      - alert: RedisHighMemoryUsage
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis high memory usage"
          description: "Redis memory usage is above 90%."

      # Nginx Alerts
      - alert: NginxDown
        expr: up{job="nginx"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Nginx is down"
          description: "Nginx reverse proxy has been down for more than 1 minute."

      # Both sides are summed per instance before dividing. Without the
      # aggregation the division matches series on identical labels, so each
      # 5xx series is divided by itself and the ratio is always 1.
      - alert: NginxHighErrorRate
        expr: >-
          sum by (instance) (rate(nginx_http_requests_total{status=~"5.."}[5m]))
          / sum by (instance) (rate(nginx_http_requests_total[5m]))
          > 0.05
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate in Nginx"
          description: "Nginx is returning more than 5% server errors."

  - name: custom.rules
    rules:
      # Custom Business Logic Alerts
      # The 24h window must show no increase continuously for 48h before the
      # alert fires.
      # NOTE(review): increase() is meant for counters; if aniworld_anime_total
      # is a gauge of the current collection size, delta() would be the
      # matching function - confirm the metric type.
      - alert: AnimeCollectionSizeIncreaseStalled
        expr: increase(aniworld_anime_total[24h]) == 0
        for: 48h
        labels:
          severity: info
        annotations:
          summary: "Anime collection size hasn't increased"
          description: "No new anime have been added to the collection in the last 48 hours."

      - alert: EpisodeDownloadBacklog
        expr: aniworld_episodes_pending > 1000
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Large episode download backlog"
          description: "More than 1000 episodes are pending download. Current backlog: {{ $value }}"