---
# Prometheus Alert Rules
# This file defines alerting rules for monitoring the bot infrastructure

groups:
  # Bot Health Monitoring
  - name: bot_health
    rules:
      # Telegram Bot Health
      - alert: TelegramBotDown
        expr: up{job="telegram-bot"} == 0
        for: 1m
        labels:
          severity: critical
          service: telegram-bot
        annotations:
          summary: "Telegram Bot is down"
          description: "Telegram Bot has been down for more than 1 minute"
          runbook_url: "https://docs.example.com/runbooks/telegram-bot-down"

      - alert: TelegramBotHighErrorRate
        expr: rate(http_requests_total{job="telegram-bot",status=~"5.."}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          service: telegram-bot
        annotations:
          summary: "Telegram Bot high error rate"
          description: "Telegram Bot error rate is {{ $value }} errors per second"

      - alert: TelegramBotHighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="telegram-bot"}[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: telegram-bot
        annotations:
          summary: "Telegram Bot high response time"
          description: "95th percentile response time is {{ $value }} seconds"

      # AnonBot Health
      - alert: AnonBotDown
        expr: up{job="anon-bot"} == 0
        for: 1m
        labels:
          severity: critical
          service: anon-bot
        annotations:
          summary: "AnonBot is down"
          description: "AnonBot has been down for more than 1 minute"
          runbook_url: "https://docs.example.com/runbooks/anon-bot-down"

      - alert: AnonBotHighErrorRate
        expr: rate(http_requests_total{job="anon-bot",status=~"5.."}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          service: anon-bot
        annotations:
          summary: "AnonBot high error rate"
          description: "AnonBot error rate is {{ $value }} errors per second"

      - alert: AnonBotHighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="anon-bot"}[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: anon-bot
        annotations:
          summary: "AnonBot high response time"
          description: "95th percentile response time is {{ $value }} seconds"

  # Infrastructure Health Monitoring
  - name: infrastructure_health
    rules:
      # Prometheus Health
      - alert: PrometheusDown
        expr: up{job="prometheus"} == 0
        for: 1m
        labels:
          severity: critical
          service: prometheus
        annotations:
          summary: "Prometheus is down"
          description: "Prometheus has been down for more than 1 minute"

      - alert: PrometheusHighMemoryUsage
        expr: (prometheus_tsdb_head_series / prometheus_tsdb_head_series_limit) > 0.8
        for: 5m
        labels:
          severity: warning
          service: prometheus
        annotations:
          summary: "Prometheus high memory usage"
          description: "Prometheus memory usage is {{ $value | humanizePercentage }} of limit"

      # Grafana Health
      - alert: GrafanaDown
        expr: up{job="grafana"} == 0
        for: 1m
        labels:
          severity: critical
          service: grafana
        annotations:
          summary: "Grafana is down"
          description: "Grafana has been down for more than 1 minute"

      # Nginx Health
      - alert: NginxDown
        expr: up{job="nginx"} == 0
        for: 1m
        labels:
          severity: critical
          service: nginx
        annotations:
          summary: "Nginx is down"
          description: "Nginx has been down for more than 1 minute"

      - alert: NginxHighErrorRate
        expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          service: nginx
        annotations:
          summary: "Nginx high error rate"
          description: "Nginx error rate is {{ $value }} errors per second"

  # System Resource Monitoring
  - name: system_resources
    rules:
      # High CPU Usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          service: system
        annotations:
          summary: "High CPU usage"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: VeryHighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 2m
        labels:
          severity: critical
          service: system
        annotations:
          summary: "Very high CPU usage"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"

      # High Memory Usage
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
          service: system
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: VeryHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
          service: system
        annotations:
          summary: "Very high memory usage"
          description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"

      # Disk Space
      - alert: LowDiskSpace
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
          service: system
        annotations:
          summary: "Low disk space"
          description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})"

      - alert: VeryLowDiskSpace
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
          service: system
        annotations:
          summary: "Very low disk space"
          description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})"

  # Docker Container Monitoring
  - name: docker_containers
    rules:
      # Container Restart
      - alert: ContainerRestarting
        expr: rate(container_start_time_seconds[10m]) > 0
        for: 0m
        labels:
          severity: warning
          service: docker
        annotations:
          summary: "Container restarting"
          description: "Container {{ $labels.name }} is restarting frequently"

      # Container High Memory Usage
      - alert: ContainerHighMemoryUsage
        expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) * 100 > 80
        for: 5m
        labels:
          severity: warning
          service: docker
        annotations:
          summary: "Container high memory usage"
          description: "Container {{ $labels.name }} memory usage is {{ $value }}%"

      # Container High CPU Usage
      - alert: ContainerHighCPUUsage
        expr: (rate(container_cpu_usage_seconds_total[5m]) / container_spec_cpu_quota * 100) > 80
        for: 5m
        labels:
          severity: warning
          service: docker
        annotations:
          summary: "Container high CPU usage"
          description: "Container {{ $labels.name }} CPU usage is {{ $value }}%"

  # Database Monitoring
  - name: database_health
    rules:
      # Database Connection Issues
      - alert: DatabaseConnectionFailed
        expr: increase(database_connection_errors_total[5m]) > 5
        for: 1m
        labels:
          severity: critical
          service: database
        annotations:
          summary: "Database connection failures"
          description: "{{ $value }} database connection failures in the last 5 minutes"

      # Database High Query Time
      - alert: DatabaseHighQueryTime
        expr: histogram_quantile(0.95, rate(database_query_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
          service: database
        annotations:
          summary: "Database high query time"
          description: "95th percentile database query time is {{ $value }} seconds"