second server version
This commit is contained in:
226
docker/prometheus/alerts.yml
Normal file
226
docker/prometheus/alerts.yml
Normal file
@@ -0,0 +1,226 @@
|
||||
# AniWorld Alerting Rules
|
||||
|
||||
groups:
|
||||
- name: aniworld.rules
|
||||
rules:
|
||||
# Application Health Alerts
|
||||
# Fires when the `aniworld-web` scrape target has reported up == 0
# continuously for one minute.
- alert: AniWorldDown
  expr: 'up{job="aniworld-web"} == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'AniWorld application is down'
    description: >-
      AniWorld web application has been down for more than 1 minute.
|
||||
|
||||
# Fires when the 95th-percentile value derived from the
# flask_request_duration_seconds histogram (5m rate) stays above 5
# for two minutes.
- alert: AniWorldHighResponseTime
  expr: >-
    histogram_quantile(0.95,
    rate(flask_request_duration_seconds_bucket[5m])) > 5
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'High response time for AniWorld'
    description: '95th percentile response time is {{ $value }} seconds.'
|
||||
|
||||
# System Resource Alerts
|
||||
# Fires when aniworld_cpu_usage_percent stays above 80 for five minutes.
- alert: HighCPUUsage
  expr: 'aniworld_cpu_usage_percent > 80'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'High CPU usage on AniWorld server'
    description: >-
      CPU usage is above 80% for more than 5 minutes.
      Current value: {{ $value }}%
|
||||
|
||||
# Fires when aniworld_memory_usage_percent stays above 85 for three minutes.
- alert: HighMemoryUsage
  expr: 'aniworld_memory_usage_percent > 85'
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: 'High memory usage on AniWorld server'
    description: >-
      Memory usage is above 85% for more than 3 minutes.
      Current value: {{ $value }}%
|
||||
|
||||
# Critical tier of the memory alert: same metric as HighMemoryUsage, but
# with a threshold of 95 and a one-minute hold.
- alert: CriticalMemoryUsage
  expr: 'aniworld_memory_usage_percent > 95'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'Critical memory usage on AniWorld server'
    description: >-
      Memory usage is above 95%.
      Current value: {{ $value }}%
|
||||
|
||||
# Fires when aniworld_disk_usage_percent stays above 90 for five minutes.
- alert: HighDiskUsage
  expr: 'aniworld_disk_usage_percent > 90'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'High disk usage on AniWorld server'
    description: >-
      Disk usage is above 90% for more than 5 minutes.
      Current value: {{ $value }}%
|
||||
|
||||
# Critical tier of the disk alert: same metric as HighDiskUsage, but
# with a threshold of 95 and a one-minute hold.
- alert: CriticalDiskUsage
  expr: 'aniworld_disk_usage_percent > 95'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'Critical disk usage on AniWorld server'
    description: >-
      Disk usage is above 95%.
      Current value: {{ $value }}%
|
||||
|
||||
# Database Alerts
|
||||
# Fires when the `aniworld-web` target is up (up == 1) yet
# aniworld_database_connected reads 0, sustained for two minutes.
# The `up == 1` guard keeps this from firing alongside AniWorldDown.
- alert: DatabaseConnectionFailure
  expr: >-
    up{job="aniworld-web"} == 1
    and aniworld_database_connected == 0
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: 'Database connection failure'
    description: >-
      AniWorld cannot connect to the database for more than 2 minutes.
|
||||
|
||||
# Fires when aniworld_database_query_duration_seconds stays above 5
# for one minute.
- alert: SlowDatabaseQueries
  expr: 'aniworld_database_query_duration_seconds > 5'
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: 'Slow database queries detected'
    description: >-
      Database queries are taking longer than 5 seconds.
      Current duration: {{ $value }}s
|
||||
|
||||
# Download Performance Alerts
|
||||
# Fires when the 5m rate of failed downloads divided by the 5m rate of all
# downloads stays above 0.1 for three minutes.
# NOTE(review): the division matches series one-to-one, so it presumably
# relies on both counters carrying identical label sets — confirm.
- alert: HighDownloadFailureRate
  expr: >-
    rate(aniworld_downloads_failed_total[5m])
    / rate(aniworld_downloads_total[5m]) > 0.1
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: 'High download failure rate'
    description: 'Download failure rate is above 10% for the last 5 minutes.'
|
||||
|
||||
# Informational: fires when aniworld_downloads_total shows zero increase
# over a 1h window, and that condition has held for two hours.
- alert: NoDownloadActivity
  expr: 'increase(aniworld_downloads_total[1h]) == 0'
  for: 2h
  labels:
    severity: info
  annotations:
    summary: 'No download activity detected'
    description: 'No downloads have been initiated in the last 2 hours.'
|
||||
|
||||
# Process Alerts
|
||||
# Fires when aniworld_process_threads stays above 100 for five minutes.
- alert: HighThreadCount
  expr: 'aniworld_process_threads > 100'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'High thread count in AniWorld process'
    description: >-
      Thread count is above 100 for more than 5 minutes.
      Current count: {{ $value }}
|
||||
|
||||
# Warns when process memory has grown by more than 100 MB (100000000 bytes)
# across a one-hour window, with that condition holding for a further hour.
#
# delta() is used rather than increase(): increase() is meant for counters
# and treats every decrease as a counter reset, so on a value that goes up
# and down it adds the post-drop value back in and over-reports growth.
# delta() simply compares the value at the two ends of the window.
# NOTE(review): assumes aniworld_process_memory_bytes is a gauge (its name
# has no _total suffix) — confirm against the application's exporter.
- alert: ProcessMemoryLeak
  expr: 'delta(aniworld_process_memory_bytes[1h]) > 100000000'
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: 'Potential memory leak detected'
    description: >-
      Process memory usage has increased by more than 100MB in the last hour.
|
||||
|
||||
# Network Alerts
|
||||
# Fires when aniworld_network_connectivity reads 0 for two minutes.
- alert: NetworkConnectivityIssue
  expr: 'aniworld_network_connectivity == 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Network connectivity issue'
    description: >-
      AniWorld is experiencing network connectivity issues.
|
||||
|
||||
# Security Alerts
|
||||
# Fires when aniworld_failed_login_attempts_total has increased by more
# than 10 within a 5m window, sustained for one minute.
- alert: HighFailedLoginAttempts
  expr: 'increase(aniworld_failed_login_attempts_total[5m]) > 10'
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: 'High number of failed login attempts'
    description: >-
      More than 10 failed login attempts in the last 5 minutes.
|
||||
|
||||
# Fires when aniworld_unauthorized_api_requests_total has increased by more
# than 50 within a 5m window, sustained for two minutes.
- alert: UnauthorizedAPIAccess
  expr: 'increase(aniworld_unauthorized_api_requests_total[5m]) > 50'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'High number of unauthorized API requests'
    description: >-
      More than 50 unauthorized API requests in the last 5 minutes.
|
||||
|
||||
# Cache Performance Alerts
|
||||
# Informational: fires when aniworld_cache_hit_rate stays below 0.7
# for ten minutes.
- alert: LowCacheHitRate
  expr: 'aniworld_cache_hit_rate < 0.7'
  for: 10m
  labels:
    severity: info
  annotations:
    summary: 'Low cache hit rate'
    description: >-
      Cache hit rate is below 70% for more than 10 minutes.
      Current rate: {{ $value }}
|
||||
|
||||
- name: infrastructure.rules
|
||||
rules:
|
||||
# Redis Alerts
|
||||
# Fires when the `redis` scrape target has reported up == 0 for one minute.
- alert: RedisDown
  expr: 'up{job="redis"} == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'Redis is down'
    description: >-
      Redis server has been down for more than 1 minute.
|
||||
|
||||
# Warns when Redis is using more than 90% of its configured memory limit
# for five minutes.
#
# The denominator is filtered with `> 0` first: when Redis has no maxmemory
# limit, redis_memory_max_bytes is 0 and a plain division yields +Inf, which
# is always > 0.9 and would make the alert fire permanently. Filtering drops
# those series, so instances without a limit are simply not evaluated.
- alert: RedisHighMemoryUsage
  expr: 'redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Redis high memory usage'
    description: 'Redis memory usage is above 90%.'
|
||||
|
||||
# Nginx Alerts
|
||||
# Fires when the `nginx` scrape target has reported up == 0 for one minute.
- alert: NginxDown
  expr: 'up{job="nginx"} == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'Nginx is down'
    description: >-
      Nginx reverse proxy has been down for more than 1 minute.
|
||||
|
||||
# Warns when more than 5% of Nginx requests over the last 5 minutes were
# answered with a 5xx status, sustained for two minutes.
#
# Both sides are aggregated per instance before dividing. Without the
# aggregation the division matches series one-to-one on identical label
# sets, so each 5xx series is divided by itself (ratio always 1) and the
# alert fires on any 5xx traffic instead of on the real error fraction.
# NOTE(review): relies on nginx_http_requests_total carrying a `status`
# label — confirm the deployed exporter exposes one.
- alert: NginxHighErrorRate
  expr: >-
    sum by (instance) (rate(nginx_http_requests_total{status=~"5.."}[5m]))
    / sum by (instance) (rate(nginx_http_requests_total[5m])) > 0.05
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'High error rate in Nginx'
    description: 'Nginx is returning more than 5% server errors.'
|
||||
|
||||
- name: custom.rules
|
||||
rules:
|
||||
# Custom Business Logic Alerts
|
||||
# Informational: fires when aniworld_anime_total shows zero increase over a
# 24h window, and that condition has held for 48 hours.
# NOTE(review): increase() is intended for counters; if aniworld_anime_total
# is a gauge of the current collection size, delta() may be the better fit —
# confirm the metric type.
- alert: AnimeCollectionSizeIncreaseStalled
  expr: 'increase(aniworld_anime_total[24h]) == 0'
  for: 48h
  labels:
    severity: info
  annotations:
    summary: "Anime collection size hasn't increased"
    description: >-
      No new anime have been added to the collection in the last 48 hours.
|
||||
|
||||
# Fires when aniworld_episodes_pending stays above 1000 for one hour.
- alert: EpisodeDownloadBacklog
  expr: 'aniworld_episodes_pending > 1000'
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: 'Large episode download backlog'
    description: >-
      More than 1000 episodes are pending download.
      Current backlog: {{ $value }}
|
||||
67
docker/prometheus/prometheus.yml
Normal file
67
docker/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
# Prometheus Configuration for AniWorld Monitoring
|
||||
|
||||
# Defaults for all jobs and rule groups: scrape targets and evaluate rules
# every 15 seconds unless a job sets its own interval.
global:
  evaluation_interval: 15s
  scrape_interval: 15s
|
||||
|
||||
# Rule files loaded together with this configuration.
rule_files:
  - 'alerts.yml'
|
||||
|
||||
# Alertmanager instance(s) that receive fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
|
||||
|
||||
scrape_configs:
|
||||
# AniWorld Application Metrics
|
||||
# Application metrics, scraped from /api/health/metrics on aniworld-web:5000
# every 30s with a 10s timeout.
- job_name: "aniworld-web"
  metrics_path: "/api/health/metrics"
  scrape_interval: 30s
  scrape_timeout: 10s
  static_configs:
    - targets:
        - "aniworld-web:5000"
|
||||
|
||||
# System Metrics (Node Exporter)
|
||||
# Host-level metrics from the node exporter at node-exporter:9100.
- job_name: "node-exporter"
  static_configs:
    - targets:
        - "node-exporter:9100"
|
||||
|
||||
# Redis Metrics
|
||||
# Redis metrics, scraped from the exporter at redis-exporter:9121.
- job_name: "redis"
  static_configs:
    - targets:
        - "redis-exporter:9121"
|
||||
|
||||
# Nginx Metrics
|
||||
# Nginx metrics, scraped from the exporter at nginx-exporter:9113.
- job_name: "nginx"
  static_configs:
    - targets:
        - "nginx-exporter:9113"
|
||||
|
||||
# Prometheus Self-Monitoring
|
||||
# Prometheus scraping its own metrics on localhost:9090.
- job_name: "prometheus"
  static_configs:
    - targets:
        - "localhost:9090"
|
||||
|
||||
# Health Check Monitoring
|
||||
# Second scrape of aniworld-web:5000, this time against /api/health/system,
# once every 60 seconds.
# NOTE(review): a scrape only succeeds if the endpoint serves the Prometheus
# exposition format — confirm /api/health/system does (and is not JSON).
- job_name: "aniworld-health"
  metrics_path: "/api/health/system"
  scrape_interval: 60s
  static_configs:
    - targets:
        - "aniworld-web:5000"
|
||||
|
||||
# Blackbox Exporter for External Monitoring
|
||||
# HTTP probes of the application's health endpoints via the blackbox
# exporter, using its `http_2xx` module.
- job_name: "blackbox"
  metrics_path: "/probe"
  params:
    module:
      - http_2xx
  static_configs:
    - targets:
        - "http://aniworld-web:5000/health"
        - "http://aniworld-web:5000/api/health/ready"
  # Order matters: each step reads labels written by the previous one.
  relabel_configs:
    # Copy the listed URL into the `target` query parameter of the probe.
    - source_labels: [__address__]
      target_label: __param_target
    # Use the probed URL as the `instance` label.
    - source_labels: [__param_target]
      target_label: instance
    # Point the scrape itself at the blackbox exporter.
    - target_label: __address__
      replacement: "blackbox-exporter:9115"
|
||||
Reference in New Issue
Block a user