feat: integrate Uptime Kuma and Alertmanager into Docker setup

- Add Uptime Kuma service for status monitoring with health checks.
- Introduce Alertmanager service for alert management and notifications.
- Update docker-compose.yml to include new services and their configurations.
- Enhance Makefile with commands for managing Uptime Kuma and Alertmanager logs.
- Modify Ansible playbook to install necessary packages and configure SSL for new services.
- Update Nginx configuration to route traffic to Uptime Kuma and Alertmanager.
- Adjust Prometheus configuration to include alert rules and external URLs.
This commit is contained in:
2025-09-16 21:50:56 +03:00
parent 5e10204137
commit 9ec3f02767
20 changed files with 2173 additions and 38 deletions

View File

@@ -0,0 +1,253 @@
# Prometheus Alert Rules
# This file defines alerting rules for monitoring the bot infrastructure.
# Severity convention: critical = page immediately, warning = investigate soon.
groups:
  # Bot Health Monitoring
  - name: bot_health
    rules:
      # Telegram Bot Health
      - alert: TelegramBotDown
        expr: up{job="telegram-bot"} == 0
        for: 1m
        labels:
          severity: critical
          service: telegram-bot
        annotations:
          summary: "Telegram Bot is down"
          description: "Telegram Bot has been down for more than 1 minute"
          runbook_url: "https://docs.example.com/runbooks/telegram-bot-down"
      - alert: TelegramBotHighErrorRate
        # 5xx responses per second over the last 5 minutes
        expr: rate(http_requests_total{job="telegram-bot",status=~"5.."}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          service: telegram-bot
        annotations:
          summary: "Telegram Bot high error rate"
          description: "Telegram Bot error rate is {{ $value }} errors per second"
      - alert: TelegramBotHighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="telegram-bot"}[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: telegram-bot
        annotations:
          summary: "Telegram Bot high response time"
          description: "95th percentile response time is {{ $value }} seconds"
      # AnonBot Health
      - alert: AnonBotDown
        expr: up{job="anon-bot"} == 0
        for: 1m
        labels:
          severity: critical
          service: anon-bot
        annotations:
          summary: "AnonBot is down"
          description: "AnonBot has been down for more than 1 minute"
          runbook_url: "https://docs.example.com/runbooks/anon-bot-down"
      - alert: AnonBotHighErrorRate
        expr: rate(http_requests_total{job="anon-bot",status=~"5.."}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          service: anon-bot
        annotations:
          summary: "AnonBot high error rate"
          description: "AnonBot error rate is {{ $value }} errors per second"
      - alert: AnonBotHighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="anon-bot"}[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: anon-bot
        annotations:
          summary: "AnonBot high response time"
          description: "95th percentile response time is {{ $value }} seconds"

  # Infrastructure Health Monitoring
  - name: infrastructure_health
    rules:
      # Prometheus Health
      - alert: PrometheusDown
        expr: up{job="prometheus"} == 0
        for: 1m
        labels:
          severity: critical
          service: prometheus
        annotations:
          summary: "Prometheus is down"
          description: "Prometheus has been down for more than 1 minute"
      - alert: PrometheusHighMemoryUsage
        # NOTE(review): prometheus_tsdb_head_series_limit is not a standard
        # Prometheus server metric — confirm it exists in this deployment,
        # otherwise this rule will never fire.
        expr: (prometheus_tsdb_head_series / prometheus_tsdb_head_series_limit) > 0.8
        for: 5m
        labels:
          severity: warning
          service: prometheus
        annotations:
          summary: "Prometheus high memory usage"
          description: "Prometheus memory usage is {{ $value | humanizePercentage }} of limit"
      # Grafana Health
      - alert: GrafanaDown
        expr: up{job="grafana"} == 0
        for: 1m
        labels:
          severity: critical
          service: grafana
        annotations:
          summary: "Grafana is down"
          description: "Grafana has been down for more than 1 minute"
      # Nginx Health
      - alert: NginxDown
        expr: up{job="nginx"} == 0
        for: 1m
        labels:
          severity: critical
          service: nginx
        annotations:
          summary: "Nginx is down"
          description: "Nginx has been down for more than 1 minute"
      - alert: NginxHighErrorRate
        expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          service: nginx
        annotations:
          summary: "Nginx high error rate"
          description: "Nginx error rate is {{ $value }} errors per second"

  # System Resource Monitoring
  - name: system_resources
    rules:
      # High CPU Usage: 100% minus the idle percentage, averaged per instance
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          service: system
        annotations:
          summary: "High CPU usage"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
      - alert: VeryHighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 2m
        labels:
          severity: critical
          service: system
        annotations:
          summary: "Very high CPU usage"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
      # High Memory Usage
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
          service: system
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"
      - alert: VeryHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
          service: system
        annotations:
          summary: "Very high memory usage"
          description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"
      # Disk Space: exclude pseudo-filesystems (tmpfs/overlay) that would
      # otherwise produce noisy, meaningless usage alerts
      - alert: LowDiskSpace
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
        for: 5m
        labels:
          severity: warning
          service: system
        annotations:
          summary: "Low disk space"
          description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})"
      - alert: VeryLowDiskSpace
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 95
        for: 2m
        labels:
          severity: critical
          service: system
        annotations:
          summary: "Very low disk space"
          description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})"

  # Docker Container Monitoring
  - name: docker_containers
    rules:
      # Container Restart: container_start_time_seconds is a gauge holding the
      # start timestamp, so changes() counts restarts within the window.
      # (rate() on this gauge is ~0 between restarts and would mis-fire.)
      - alert: ContainerRestarting
        expr: changes(container_start_time_seconds[10m]) > 0
        for: 0m
        labels:
          severity: warning
          service: docker
        annotations:
          summary: "Container restarting"
          description: "Container {{ $labels.name }} is restarting frequently"
      # Container High Memory Usage: guard against containers with no memory
      # limit (limit = 0 would make the ratio +Inf and always fire)
      - alert: ContainerHighMemoryUsage
        expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) * 100 > 80 and container_spec_memory_limit_bytes > 0
        for: 5m
        labels:
          severity: warning
          service: docker
        annotations:
          summary: "Container high memory usage"
          description: "Container {{ $labels.name }} memory usage is {{ $value }}%"
      # Container High CPU Usage: the CFS quota is expressed in CPU-time units
      # per scheduling period, so the allotted CPU share is quota / period —
      # dividing by the raw quota alone would understate usage drastically.
      - alert: ContainerHighCPUUsage
        expr: (rate(container_cpu_usage_seconds_total[5m]) / (container_spec_cpu_quota / container_spec_cpu_period)) * 100 > 80
        for: 5m
        labels:
          severity: warning
          service: docker
        annotations:
          summary: "Container high CPU usage"
          description: "Container {{ $labels.name }} CPU usage is {{ $value }}%"

  # Database Monitoring
  - name: database_health
    rules:
      # Database Connection Issues
      - alert: DatabaseConnectionFailed
        expr: increase(database_connection_errors_total[5m]) > 5
        for: 1m
        labels:
          severity: critical
          service: database
        annotations:
          summary: "Database connection failures"
          description: "{{ $value }} database connection failures in the last 5 minutes"
      # Database High Query Time
      - alert: DatabaseHighQueryTime
        expr: histogram_quantile(0.95, rate(database_query_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
          service: database
        annotations:
          summary: "Database high query time"
          description: "95th percentile database query time is {{ $value }} seconds"

View File

@@ -3,8 +3,7 @@ global:
evaluation_interval: 15s
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "alert_rules.yml"
scrape_configs:
- job_name: 'prometheus'
@@ -46,4 +45,4 @@ alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
- alertmanager:9093