feat: integrate Uptime Kuma and Alertmanager into Docker setup
- Add Uptime Kuma service for status monitoring with health checks.
- Introduce Alertmanager service for alert management and notifications.
- Update docker-compose.yml to include new services and their configurations.
- Enhance Makefile with commands for managing Uptime Kuma and Alertmanager logs.
- Modify Ansible playbook to install necessary packages and configure SSL for new services.
- Update Nginx configuration to route traffic to Uptime Kuma and Alertmanager.
- Adjust Prometheus configuration to include alert rules and external URLs.
This commit is contained in:
253
infra/prometheus/alert_rules.yml
Normal file
253
infra/prometheus/alert_rules.yml
Normal file
@@ -0,0 +1,253 @@
|
||||
# Prometheus Alert Rules
|
||||
# This file defines alerting rules for monitoring the bot infrastructure
|
||||
|
||||
groups:
|
||||
# Bot Health Monitoring
|
||||
- name: bot_health
|
||||
rules:
|
||||
# Telegram Bot Health
|
||||
- alert: TelegramBotDown
|
||||
expr: up{job="telegram-bot"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: telegram-bot
|
||||
annotations:
|
||||
summary: "Telegram Bot is down"
|
||||
description: "Telegram Bot has been down for more than 1 minute"
|
||||
runbook_url: "https://docs.example.com/runbooks/telegram-bot-down"
|
||||
|
||||
- alert: TelegramBotHighErrorRate
|
||||
expr: rate(http_requests_total{job="telegram-bot",status=~"5.."}[5m]) > 0.1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
service: telegram-bot
|
||||
annotations:
|
||||
summary: "Telegram Bot high error rate"
|
||||
description: "Telegram Bot error rate is {{ $value }} errors per second"
|
||||
|
||||
- alert: TelegramBotHighResponseTime
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="telegram-bot"}[5m])) > 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: telegram-bot
|
||||
annotations:
|
||||
summary: "Telegram Bot high response time"
|
||||
description: "95th percentile response time is {{ $value }} seconds"
|
||||
|
||||
# AnonBot Health
|
||||
- alert: AnonBotDown
|
||||
expr: up{job="anon-bot"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: anon-bot
|
||||
annotations:
|
||||
summary: "AnonBot is down"
|
||||
description: "AnonBot has been down for more than 1 minute"
|
||||
runbook_url: "https://docs.example.com/runbooks/anon-bot-down"
|
||||
|
||||
- alert: AnonBotHighErrorRate
|
||||
expr: rate(http_requests_total{job="anon-bot",status=~"5.."}[5m]) > 0.1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
service: anon-bot
|
||||
annotations:
|
||||
summary: "AnonBot high error rate"
|
||||
description: "AnonBot error rate is {{ $value }} errors per second"
|
||||
|
||||
- alert: AnonBotHighResponseTime
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="anon-bot"}[5m])) > 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: anon-bot
|
||||
annotations:
|
||||
summary: "AnonBot high response time"
|
||||
description: "95th percentile response time is {{ $value }} seconds"
|
||||
|
||||
# Infrastructure Health Monitoring
|
||||
- name: infrastructure_health
|
||||
rules:
|
||||
# Prometheus Health
|
||||
- alert: PrometheusDown
|
||||
expr: up{job="prometheus"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: prometheus
|
||||
annotations:
|
||||
summary: "Prometheus is down"
|
||||
description: "Prometheus has been down for more than 1 minute"
|
||||
|
||||
- alert: PrometheusHighMemoryUsage
|
||||
expr: (prometheus_tsdb_head_series / prometheus_tsdb_head_series_limit) > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: prometheus
|
||||
annotations:
|
||||
summary: "Prometheus high memory usage"
|
||||
description: "Prometheus memory usage is {{ $value | humanizePercentage }} of limit"
|
||||
|
||||
# Grafana Health
|
||||
- alert: GrafanaDown
|
||||
expr: up{job="grafana"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: grafana
|
||||
annotations:
|
||||
summary: "Grafana is down"
|
||||
description: "Grafana has been down for more than 1 minute"
|
||||
|
||||
# Nginx Health
|
||||
- alert: NginxDown
|
||||
expr: up{job="nginx"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: nginx
|
||||
annotations:
|
||||
summary: "Nginx is down"
|
||||
description: "Nginx has been down for more than 1 minute"
|
||||
|
||||
- alert: NginxHighErrorRate
|
||||
expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
service: nginx
|
||||
annotations:
|
||||
summary: "Nginx high error rate"
|
||||
description: "Nginx error rate is {{ $value }} errors per second"
|
||||
|
||||
# System Resource Monitoring
|
||||
- name: system_resources
|
||||
rules:
|
||||
# High CPU Usage
|
||||
- alert: HighCPUUsage
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: system
|
||||
annotations:
|
||||
summary: "High CPU usage"
|
||||
description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
|
||||
|
||||
- alert: VeryHighCPUUsage
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: system
|
||||
annotations:
|
||||
summary: "Very high CPU usage"
|
||||
description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
|
||||
|
||||
# High Memory Usage
|
||||
- alert: HighMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: system
|
||||
annotations:
|
||||
summary: "High memory usage"
|
||||
description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"
|
||||
|
||||
- alert: VeryHighMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: system
|
||||
annotations:
|
||||
summary: "Very high memory usage"
|
||||
description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"
|
||||
|
||||
# Disk Space
|
||||
- alert: LowDiskSpace
|
||||
expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: system
|
||||
annotations:
|
||||
summary: "Low disk space"
|
||||
description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})"
|
||||
|
||||
- alert: VeryLowDiskSpace
|
||||
expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 95
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: system
|
||||
annotations:
|
||||
summary: "Very low disk space"
|
||||
description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})"
|
||||
|
||||
# Docker Container Monitoring
|
||||
- name: docker_containers
|
||||
rules:
|
||||
# Container Restart
|
||||
- alert: ContainerRestarting
|
||||
expr: rate(container_start_time_seconds[10m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
service: docker
|
||||
annotations:
|
||||
summary: "Container restarting"
|
||||
description: "Container {{ $labels.name }} is restarting frequently"
|
||||
|
||||
# Container High Memory Usage
|
||||
- alert: ContainerHighMemoryUsage
|
||||
expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: docker
|
||||
annotations:
|
||||
summary: "Container high memory usage"
|
||||
description: "Container {{ $labels.name }} memory usage is {{ $value }}%"
|
||||
|
||||
# Container High CPU Usage
|
||||
- alert: ContainerHighCPUUsage
|
||||
expr: (rate(container_cpu_usage_seconds_total[5m]) / container_spec_cpu_quota * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: docker
|
||||
annotations:
|
||||
summary: "Container high CPU usage"
|
||||
description: "Container {{ $labels.name }} CPU usage is {{ $value }}%"
|
||||
|
||||
# Database Monitoring
|
||||
- name: database_health
|
||||
rules:
|
||||
# Database Connection Issues
|
||||
- alert: DatabaseConnectionFailed
|
||||
expr: increase(database_connection_errors_total[5m]) > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: database
|
||||
annotations:
|
||||
summary: "Database connection failures"
|
||||
description: "{{ $value }} database connection failures in the last 5 minutes"
|
||||
|
||||
# Database High Query Time
|
||||
- alert: DatabaseHighQueryTime
|
||||
expr: histogram_quantile(0.95, rate(database_query_duration_seconds_bucket[5m])) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: database
|
||||
annotations:
|
||||
summary: "Database high query time"
|
||||
description: "95th percentile database query time is {{ $value }} seconds"
|
||||
Reference in New Issue
Block a user