# AniWorld Alerting Rules
#
# Prometheus alerting rule file. Each entry under `groups` is a named rule
# group whose `rules` list holds the individual alert definitions.
groups:
# Alerts on the AniWorld application itself: availability, host resources,
# database, downloads, process health, network, security and cache.
- name: aniworld.rules
  rules:
  # Application Health Alerts

  # The aniworld-web scrape target has reported up == 0 for 1 minute.
  - alert: AniWorldDown
    expr: up{job="aniworld-web"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "AniWorld application is down"
      description: "AniWorld web application has been down for more than 1 minute."

  # 95th percentile of the Flask request-duration histogram (5m rate window)
  # above 5 seconds.
  # NOTE(review): the buckets are not aggregated with `sum by (le)`, so the
  # quantile is evaluated separately for every label set - confirm that
  # per-series alerting is intended.
  - alert: AniWorldHighResponseTime
    expr: histogram_quantile(0.95, rate(flask_request_duration_seconds_bucket[5m])) > 5
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High response time for AniWorld"
      description: "95th percentile response time is {{ $value }} seconds."

  # System Resource Alerts

  - alert: HighCPUUsage
    expr: aniworld_cpu_usage_percent > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on AniWorld server"
      description: "CPU usage is above 80% for more than 5 minutes. Current value: {{ $value }}%"

  # Warning tier; CriticalMemoryUsage below covers the > 95% case.
  - alert: HighMemoryUsage
    expr: aniworld_memory_usage_percent > 85
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on AniWorld server"
      description: "Memory usage is above 85% for more than 3 minutes. Current value: {{ $value }}%"

  - alert: CriticalMemoryUsage
    expr: aniworld_memory_usage_percent > 95
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Critical memory usage on AniWorld server"
      description: "Memory usage is above 95%. Current value: {{ $value }}%"

  # Warning tier; CriticalDiskUsage below covers the > 95% case.
  - alert: HighDiskUsage
    expr: aniworld_disk_usage_percent > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High disk usage on AniWorld server"
      description: "Disk usage is above 90% for more than 5 minutes. Current value: {{ $value }}%"

  - alert: CriticalDiskUsage
    expr: aniworld_disk_usage_percent > 95
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Critical disk usage on AniWorld server"
      description: "Disk usage is above 95%. Current value: {{ $value }}%"

  # Database Alerts

  # Only evaluated while the web target is up, so a full outage is reported
  # by AniWorldDown rather than by this alert.
  # NOTE(review): `and` matches on identical label sets - assumes both series
  # carry the same labels; confirm.
  - alert: DatabaseConnectionFailure
    expr: up{job="aniworld-web"} == 1 and aniworld_database_connected == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Database connection failure"
      description: "AniWorld cannot connect to the database for more than 2 minutes."

  - alert: SlowDatabaseQueries
    expr: aniworld_database_query_duration_seconds > 5
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Slow database queries detected"
      description: "Database queries are taking longer than 5 seconds. Current duration: {{ $value }}s"

  # Download Performance Alerts

  # Failed / total download rate over 5m. With no downloads both rates are 0
  # and the resulting NaN does not satisfy the comparison, so the alert stays
  # silent.
  - alert: HighDownloadFailureRate
    expr: rate(aniworld_downloads_failed_total[5m]) / rate(aniworld_downloads_total[5m]) > 0.1
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "High download failure rate"
      description: "Download failure rate is above 10% for the last 5 minutes."

  # The 1h increase() window has to stay at zero for the whole 2h `for`
  # period before this fires.
  - alert: NoDownloadActivity
    expr: increase(aniworld_downloads_total[1h]) == 0
    for: 2h
    labels:
      severity: info
    annotations:
      summary: "No download activity detected"
      description: "No downloads have been initiated in the last 2 hours."

  # Process Alerts

  - alert: HighThreadCount
    expr: aniworld_process_threads > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High thread count in AniWorld process"
      description: "Thread count is above 100 for more than 5 minutes. Current count: {{ $value }}"

  # delta() rather than increase(): a byte-size metric can fall as well as
  # rise, and increase() is counter-only - it would treat every drop as a
  # counter reset and over-report the growth.
  # NOTE(review): assumes aniworld_process_memory_bytes is a gauge - confirm
  # against the exporter.
  - alert: ProcessMemoryLeak
    expr: delta(aniworld_process_memory_bytes[1h]) > 100000000  # 100MB
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "Potential memory leak detected"
      description: "Process memory usage has increased by more than 100MB in the last hour."

  # Network Alerts

  - alert: NetworkConnectivityIssue
    expr: aniworld_network_connectivity == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Network connectivity issue"
      description: "AniWorld is experiencing network connectivity issues."

  # Security Alerts

  - alert: HighFailedLoginAttempts
    expr: increase(aniworld_failed_login_attempts_total[5m]) > 10
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "High number of failed login attempts"
      description: "More than 10 failed login attempts in the last 5 minutes."

  - alert: UnauthorizedAPIAccess
    expr: increase(aniworld_unauthorized_api_requests_total[5m]) > 50
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High number of unauthorized API requests"
      description: "More than 50 unauthorized API requests in the last 5 minutes."

  # Cache Performance Alerts

  # The threshold treats the metric as a 0-1 fraction (0.7 == 70%).
  - alert: LowCacheHitRate
    expr: aniworld_cache_hit_rate < 0.7
    for: 10m
    labels:
      severity: info
    annotations:
      summary: "Low cache hit rate"
      description: "Cache hit rate is below 70% for more than 10 minutes. Current rate: {{ $value }}"

# Alerts on the supporting services: Redis and the Nginx reverse proxy.
- name: infrastructure.rules
  rules:
  # Redis Alerts

  - alert: RedisDown
    expr: up{job="redis"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Redis is down"
      description: "Redis server has been down for more than 1 minute."

  # The `and ... > 0` guard drops instances whose max-bytes value is 0;
  # dividing by 0 yields +Inf, which would otherwise satisfy `> 0.9`
  # permanently.
  - alert: RedisHighMemoryUsage
    expr: (redis_memory_used_bytes / redis_memory_max_bytes > 0.9) and (redis_memory_max_bytes > 0)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Redis high memory usage"
      description: "Redis memory usage is above 90%."

  # Nginx Alerts

  - alert: NginxDown
    expr: up{job="nginx"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Nginx is down"
      description: "Nginx reverse proxy has been down for more than 1 minute."

  # Both sides are summed per instance before dividing. Without the
  # aggregation the division matches series on identical label sets, so each
  # 5xx series is divided by itself and the ratio is 1 whenever any 5xx
  # response occurs.
  - alert: NginxHighErrorRate
    expr: |
      sum by (instance) (rate(nginx_http_requests_total{status=~"5.."}[5m]))
        /
      sum by (instance) (rate(nginx_http_requests_total[5m]))
        > 0.05
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High error rate in Nginx"
      description: "Nginx is returning more than 5% server errors."

# Alerts on application-level (business) metrics.
- name: custom.rules
  rules:
  # Custom Business Logic Alerts

  # The 24h increase() window has to stay at zero for the whole 48h `for`
  # period before this fires.
  # NOTE(review): increase() is intended for counters; if aniworld_anime_total
  # is a gauge of the current collection size, delta() or changes() would be
  # the appropriate function - confirm against the exporter.
  - alert: AnimeCollectionSizeIncreaseStalled
    expr: increase(aniworld_anime_total[24h]) == 0
    for: 48h
    labels:
      severity: info
    annotations:
      summary: "Anime collection size hasn't increased"
      description: "No new anime have been added to the collection in the last 48 hours."

  # Pending-episode count above 1000, sustained for 1 hour.
  - alert: EpisodeDownloadBacklog
    expr: aniworld_episodes_pending > 1000
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "Large episode download backlog"
      description: "More than 1000 episodes are pending download. Current backlog: {{ $value }}"