226 lines
7.6 KiB
YAML

# AniWorld Alerting Rules
groups:
# Alerting rules for the AniWorld web application: availability and latency,
# host resource usage, database, downloads, process health, network,
# security and cache behaviour.
- name: aniworld.rules
  rules:
  # Application Health Alerts

  # The aniworld-web scrape target has reported up == 0 for 1 minute.
  - alert: AniWorldDown
    expr: up{job="aniworld-web"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "AniWorld application is down"
      description: "AniWorld web application has been down for more than 1 minute."

  # NOTE(review): the buckets are not aggregated with `sum by (le)`, so the
  # quantile is computed separately for every label set of the histogram -
  # confirm that per-series evaluation is intended.
  - alert: AniWorldHighResponseTime
    expr: histogram_quantile(0.95, rate(flask_request_duration_seconds_bucket[5m])) > 5
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High response time for AniWorld"
      description: "95th percentile response time is {{ $value }} seconds."

  # System Resource Alerts

  - alert: HighCPUUsage
    expr: aniworld_cpu_usage_percent > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on AniWorld server"
      description: "CPU usage is above 80% for more than 5 minutes. Current value: {{ $value }}%"

  # Memory and disk each have a warning tier and a stricter, faster critical tier.
  - alert: HighMemoryUsage
    expr: aniworld_memory_usage_percent > 85
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on AniWorld server"
      description: "Memory usage is above 85% for more than 3 minutes. Current value: {{ $value }}%"

  - alert: CriticalMemoryUsage
    expr: aniworld_memory_usage_percent > 95
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Critical memory usage on AniWorld server"
      description: "Memory usage is above 95%. Current value: {{ $value }}%"

  - alert: HighDiskUsage
    expr: aniworld_disk_usage_percent > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High disk usage on AniWorld server"
      description: "Disk usage is above 90% for more than 5 minutes. Current value: {{ $value }}%"

  - alert: CriticalDiskUsage
    expr: aniworld_disk_usage_percent > 95
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Critical disk usage on AniWorld server"
      description: "Disk usage is above 95%. Current value: {{ $value }}%"

  # Database Alerts

  # Gated on up == 1 so this only fires while the application is reachable;
  # a down application is reported by AniWorldDown instead.
  - alert: DatabaseConnectionFailure
    expr: up{job="aniworld-web"} == 1 and aniworld_database_connected == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Database connection failure"
      description: "AniWorld cannot connect to the database for more than 2 minutes."

  - alert: SlowDatabaseQueries
    expr: aniworld_database_query_duration_seconds > 5
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Slow database queries detected"
      description: "Database queries are taking longer than 5 seconds. Current duration: {{ $value }}s"

  # Download Performance Alerts

  # Ratio of failed to total download rate over 5 minutes. With no downloads
  # the division is 0/0 = NaN, which never satisfies the comparison.
  - alert: HighDownloadFailureRate
    expr: rate(aniworld_downloads_failed_total[5m]) / rate(aniworld_downloads_total[5m]) > 0.1
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "High download failure rate"
      description: "Download failure rate is above 10% for the last 5 minutes."

  # The 1h window must show zero increase continuously for 2h before firing.
  - alert: NoDownloadActivity
    expr: increase(aniworld_downloads_total[1h]) == 0
    for: 2h
    labels:
      severity: info
    annotations:
      summary: "No download activity detected"
      description: "No downloads have been initiated in the last 2 hours."

  # Process Alerts

  - alert: HighThreadCount
    expr: aniworld_process_threads > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High thread count in AniWorld process"
      description: "Thread count is above 100 for more than 5 minutes. Current count: {{ $value }}"

  # delta(), not increase(): memory usage can fall as well as rise, and
  # increase() treats every drop as a counter reset, which inflates the
  # result and can fire this alert on a process whose memory merely fluctuates.
  - alert: ProcessMemoryLeak
    expr: delta(aniworld_process_memory_bytes[1h]) > 100000000  # 100MB
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "Potential memory leak detected"
      description: "Process memory usage has increased by more than 100MB in the last hour."

  # Network Alerts

  - alert: NetworkConnectivityIssue
    expr: aniworld_network_connectivity == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Network connectivity issue"
      description: "AniWorld is experiencing network connectivity issues."

  # Security Alerts

  - alert: HighFailedLoginAttempts
    expr: increase(aniworld_failed_login_attempts_total[5m]) > 10
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "High number of failed login attempts"
      description: "More than 10 failed login attempts in the last 5 minutes."

  - alert: UnauthorizedAPIAccess
    expr: increase(aniworld_unauthorized_api_requests_total[5m]) > 50
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High number of unauthorized API requests"
      description: "More than 50 unauthorized API requests in the last 5 minutes."

  # Cache Performance Alerts

  # The threshold 0.7 is described as 70%, i.e. the metric is read as a 0-1 ratio.
  - alert: LowCacheHitRate
    expr: aniworld_cache_hit_rate < 0.7
    for: 10m
    labels:
      severity: info
    annotations:
      summary: "Low cache hit rate"
      description: "Cache hit rate is below 70% for more than 10 minutes. Current rate: {{ $value }}"
# Alerting rules for the supporting services: the Redis and Nginx scrape
# targets (availability, Redis memory pressure, Nginx 5xx ratio).
- name: infrastructure.rules
  rules:
  # Redis Alerts

  - alert: RedisDown
    expr: up{job="redis"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Redis is down"
      description: "Redis server has been down for more than 1 minute."

  # The denominator is filtered with `> 0` before dividing. A max of 0
  # (presumably "no memory limit configured" - confirm against the exporter)
  # would otherwise make the ratio +Inf, which always satisfies > 0.9 and
  # keeps the alert firing permanently.
  - alert: RedisHighMemoryUsage
    expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Redis high memory usage"
      description: "Redis memory usage is above 90%."

  # Nginx Alerts

  - alert: NginxDown
    expr: up{job="nginx"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Nginx is down"
      description: "Nginx reverse proxy has been down for more than 1 minute."

  # Both sides are summed per (job, instance) before dividing. Without the
  # aggregation, vector matching pairs series with identical label sets, so
  # each 5xx series is divided by itself: the ratio is 1 whenever any 5xx
  # traffic exists, regardless of the real share of errors.
  - alert: NginxHighErrorRate
    expr: >-
      sum by (job, instance) (rate(nginx_http_requests_total{status=~"5.."}[5m]))
      /
      sum by (job, instance) (rate(nginx_http_requests_total[5m]))
      > 0.05
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High error rate in Nginx"
      description: "Nginx is returning more than 5% server errors."
# Business-level alerts: growth of the anime collection and the size of the
# pending episode download queue.
- name: custom.rules
  rules:
  # Custom Business Logic Alerts

  # The 24h window must show zero increase continuously for 48h before firing.
  # NOTE(review): increase() assumes aniworld_anime_total is a counter -
  # confirm; if it is a gauge of the current collection size, delta() is the
  # matching function.
  - alert: AnimeCollectionSizeIncreaseStalled
    expr: 'increase(aniworld_anime_total[24h]) == 0'
    for: 48h
    labels:
      severity: info
    annotations:
      summary: 'Anime collection size hasn''t increased'
      description: >-
        No new anime have been added to the collection
        in the last 48 hours.

  # Backlog above 1000 pending episodes, sustained for 1 hour.
  - alert: EpisodeDownloadBacklog
    expr: 'aniworld_episodes_pending > 1000'
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'Large episode download backlog'
      description: >-
        More than 1000 episodes are pending download.
        Current backlog: {{ $value }}