From 9ec3f027671b9e763e0e109ba0bba6255a0e8854 Mon Sep 17 00:00:00 2001 From: Andrey Date: Tue, 16 Sep 2025 21:50:56 +0300 Subject: [PATCH] feat: integrate Uptime Kuma and Alertmanager into Docker setup - Add Uptime Kuma service for status monitoring with health checks. - Introduce Alertmanager service for alert management and notifications. - Update docker-compose.yml to include new services and their configurations. - Enhance Makefile with commands for managing Uptime Kuma and Alertmanager logs. - Modify Ansible playbook to install necessary packages and configure SSL for new services. - Update Nginx configuration to route traffic to Uptime Kuma and Alertmanager. - Adjust Prometheus configuration to include alert rules and external URLs. --- Makefile | 53 ++ docker-compose.yml | 61 +- infra/alertmanager/alertmanager-simple.yml | 17 + infra/alertmanager/alertmanager.yml | 185 ++++++ infra/ansible/playbook.yml | 117 +++- infra/grafana/dashboards/bot-monitoring.json | 529 ++++++++++++++++++ .../dashboards/infrastructure-monitoring.json | 523 +++++++++++++++++ .../provisioning/dashboards/dashboards.yml | 16 + .../provisioning/datasources/prometheus.yml | 10 +- infra/nginx/conf.d/alertmanager.conf | 61 ++ infra/nginx/conf.d/grafana.conf | 6 - infra/nginx/conf.d/prometheus.conf | 9 +- infra/nginx/conf.d/status.conf | 39 +- infra/nginx/conf.d/uptime-kuma.conf | 69 +++ infra/nginx/nginx.conf | 35 +- infra/nginx/ssl/letsencrypt.conf | 27 + infra/prometheus/alert_rules.yml | 253 +++++++++ infra/prometheus/prometheus.yml | 5 +- infra/uptime-kuma/docker-compose.yml | 33 ++ scripts/setup-ssl.sh | 163 ++++++ 20 files changed, 2173 insertions(+), 38 deletions(-) create mode 100644 infra/alertmanager/alertmanager-simple.yml create mode 100644 infra/alertmanager/alertmanager.yml create mode 100644 infra/grafana/dashboards/bot-monitoring.json create mode 100644 infra/grafana/dashboards/infrastructure-monitoring.json create mode 100644 
infra/grafana/provisioning/dashboards/dashboards.yml create mode 100644 infra/nginx/conf.d/alertmanager.conf create mode 100644 infra/nginx/conf.d/uptime-kuma.conf create mode 100644 infra/nginx/ssl/letsencrypt.conf create mode 100644 infra/prometheus/alert_rules.yml create mode 100644 infra/uptime-kuma/docker-compose.yml create mode 100755 scripts/setup-ssl.sh diff --git a/Makefile b/Makefile index aa72363..7f169ce 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,8 @@ help: ## Показать справку @echo "📊 Мониторинг:" @echo " Prometheus: http://localhost:9090" @echo " Grafana: http://localhost:3000 (admin/admin)" + @echo " Uptime Kuma: http://localhost:3001" + @echo " Alertmanager: http://localhost:9093" @echo " Server Monitor: http://localhost:9091/health" @echo " Bot Health: http://localhost:8080/health" @echo " AnonBot Health: http://localhost:8081/health" @@ -37,6 +39,12 @@ logs-bot: ## Показать логи Telegram бота logs-anonBot: ## Показать логи AnonBot docker-compose logs -f anon-bot +logs-uptime-kuma: ## Показать логи Uptime Kuma + docker-compose logs -f uptime-kuma + +logs-alertmanager: ## Показать логи Alertmanager + docker-compose logs -f alertmanager + restart: ## Перезапустить все сервисы docker-compose down docker-compose build --no-cache @@ -54,6 +62,12 @@ restart-bot: ## Перезапустить только Telegram бота restart-anonBot: ## Перезапустить только AnonBot docker-compose restart anon-bot +restart-uptime-kuma: ## Перезапустить только Uptime Kuma + docker-compose restart uptime-kuma + +restart-alertmanager: ## Перезапустить только Alertmanager + docker-compose restart alertmanager + status: ## Показать статус контейнеров docker-compose ps @@ -63,6 +77,8 @@ health: ## Проверить здоровье сервисов @curl -f http://localhost:8081/health || echo "❌ AnonBot health check failed" @curl -f http://localhost:9090/-/healthy || echo "❌ Prometheus health check failed" @curl -f http://localhost:3000/api/health || echo "❌ Grafana health check failed" + @curl -f 
http://localhost:3001 || echo "❌ Uptime Kuma health check failed" + @curl -f http://localhost:9093/-/healthy || echo "❌ Alertmanager health check failed" @curl -f http://localhost:9091/health || echo "❌ Server monitor health check failed" deploy: ## Полный деплой на продакшен @@ -120,6 +136,8 @@ start: build up ## Собрать и запустить все сервисы @echo "🏗️ Production Infrastructure запущена!" @echo "📊 Prometheus: http://localhost:9090" @echo "📈 Grafana: http://localhost:3000 (admin/admin)" + @echo "📊 Uptime Kuma: http://localhost:3001" + @echo "🚨 Alertmanager: http://localhost:9093" @echo "🤖 Bot Health: http://localhost:8080/health" @echo "🔒 AnonBot Health: http://localhost:8081/health" @echo "📡 Server Monitor: http://localhost:9091/health" @@ -191,6 +209,7 @@ test-clean: ## Очистить все файлы тестирования и о @find . -name "*.pyc" -delete 2>/dev/null || true @find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true @echo "✅ Файлы тестирования очищены" + check-ports: ## Проверить занятые порты @echo "🔍 Checking occupied ports..." @@ -242,3 +261,37 @@ reload-prometheus: ## Перезагрузить конфигурацию Promet reload-grafana: ## Перезагрузить конфигурацию Grafana @echo "🔄 Reloading Grafana configuration..." @docker-compose restart grafana + +ssl-setup: ## Настроить SSL сертификаты (самоподписанный) + @echo "🔒 Setting up self-signed SSL certificates..." + @if [ -z "$(SERVER_IP)" ]; then echo "❌ Please set SERVER_IP variable in .env file"; exit 1; fi + @mkdir -p /etc/letsencrypt/live/$(SERVER_IP) + @openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ + -keyout /etc/letsencrypt/live/$(SERVER_IP)/privkey.pem \ + -out /etc/letsencrypt/live/$(SERVER_IP)/fullchain.pem \ + -subj "/CN=$(SERVER_IP)" + @echo "✅ Self-signed certificate created for $(SERVER_IP)" + +ssl-renew: ## Обновить SSL сертификаты + @echo "🔄 Renewing SSL certificates..." 
+ @sudo /usr/local/bin/ssl-renewal.sh + +ssl-status: ## Проверить статус SSL сертификатов + @echo "🔍 Checking SSL certificate status..." + @sudo certbot certificates + +uptime-kuma: ## Открыть Uptime Kuma в браузере + @echo "📊 Opening Uptime Kuma..." + @open http://localhost:3001 || xdg-open http://localhost:3001 || echo "Please open manually: http://localhost:3001" + +alertmanager: ## Открыть Alertmanager в браузере + @echo "🚨 Opening Alertmanager..." + @open http://localhost:9093 || xdg-open http://localhost:9093 || echo "Please open manually: http://localhost:9093" + +monitoring-all: ## Открыть все мониторинг сервисы + @echo "📊 Opening all monitoring services..." + @echo " - Grafana: http://localhost:3000" + @echo " - Prometheus: http://localhost:9090" + @echo " - Uptime Kuma: http://localhost:3001" + @echo " - Alertmanager: http://localhost:9093" + @open http://localhost:3000 || xdg-open http://localhost:3000 || echo "Please open manually" diff --git a/docker-compose.yml b/docker-compose.yml index d620e95..9a700bb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,10 +12,12 @@ services: - '--web.console.templates=/etc/prometheus/consoles' - '--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_DAYS:-30}d' - '--web.enable-lifecycle' + - '--web.external-url=https://${SERVER_IP}/prometheus/' ports: - "9090:9090" volumes: - ./infra/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./infra/prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro - prometheus_data:/prometheus networks: - bots_network @@ -35,9 +37,9 @@ services: - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} - GF_USERS_ALLOW_SIGN_UP=false - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource - - GF_SERVER_ROOT_URL=https://${SERVER_IP:-localhost}/grafana/ + - GF_SERVER_ROOT_URL=https://${SERVER_IP}/grafana/ - GF_SERVER_SERVE_FROM_SUB_PATH=true - - GF_SERVER_DOMAIN=${SERVER_IP:-localhost} + - GF_SERVER_DOMAIN=${SERVER_IP} ports: - 
"3000:3000" volumes: @@ -53,6 +55,51 @@ services: timeout: 10s retries: 3 + # Uptime Kuma Status Page + uptime-kuma: + image: louislam/uptime-kuma:latest + container_name: bots_uptime_kuma + restart: unless-stopped + volumes: + - uptime_kuma_data:/app/data + ports: + - "3001:3001" + environment: + - UPTIME_KUMA_PORT=3001 + networks: + - bots_network + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3001"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + + # Alertmanager + alertmanager: + image: prom/alertmanager:latest + container_name: bots_alertmanager + restart: unless-stopped + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + - '--web.external-url=https://${SERVER_IP}/alertmanager/' + - '--web.route-prefix=/' + ports: + - "9093:9093" + volumes: + - alertmanager_data:/alertmanager + - ./infra/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + networks: + - bots_network + depends_on: + - prometheus + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + # Nginx Reverse Proxy nginx: image: nginx:alpine @@ -61,16 +108,20 @@ services: ports: - "80:80" - "443:443" + environment: + - SERVER_IP=${SERVER_IP} volumes: - - ./infra/nginx/nginx.conf:/etc/nginx/nginx.conf:ro + - ./infra/nginx/nginx.conf:/etc/nginx/templates/nginx.conf.template:ro - ./infra/nginx/conf.d:/etc/nginx/conf.d:ro - ./infra/nginx/ssl:/etc/nginx/ssl:ro - ./infra/nginx/.htpasswd:/etc/nginx/.htpasswd:ro + - /etc/letsencrypt:/etc/letsencrypt:ro networks: - bots_network depends_on: - grafana - prometheus + - uptime-kuma healthcheck: test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost/nginx-health"] interval: 30s @@ -194,6 +245,10 @@ volumes: driver: local grafana_data: driver: local + uptime_kuma_data: + driver: local + alertmanager_data: 
+ driver: local networks: bots_network: diff --git a/infra/alertmanager/alertmanager-simple.yml b/infra/alertmanager/alertmanager-simple.yml new file mode 100644 index 0000000..d12a1f3 --- /dev/null +++ b/infra/alertmanager/alertmanager-simple.yml @@ -0,0 +1,17 @@ +# Simplified Alertmanager Configuration +global: + smtp_smarthost: 'localhost:587' + smtp_from: 'alerts@localhost' + +route: + group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'web.hook' + +receivers: + - name: 'web.hook' + webhook_configs: + - url: 'http://localhost:5001/' + send_resolved: true diff --git a/infra/alertmanager/alertmanager.yml b/infra/alertmanager/alertmanager.yml new file mode 100644 index 0000000..933c0ba --- /dev/null +++ b/infra/alertmanager/alertmanager.yml @@ -0,0 +1,185 @@ +# Alertmanager Configuration +# This file configures how alerts are handled and routed + +global: + # SMTP configuration for email notifications + smtp_smarthost: 'localhost:587' + smtp_from: 'alerts@{{DOMAIN}}' + smtp_auth_username: 'alerts@{{DOMAIN}}' + smtp_auth_password: '{{SMTP_PASSWORD}}' + smtp_require_tls: true + + # Resolve timeout + resolve_timeout: 5m + +# Templates for alert formatting +templates: + - '/etc/alertmanager/templates/*.tmpl' + +# Route configuration - defines how alerts are routed +route: + group_by: ['alertname', 'cluster', 'service'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'web.hook' + routes: + # Critical alerts - immediate notification + - match: + severity: critical + receiver: 'critical-alerts' + group_wait: 5s + repeat_interval: 5m + + # Warning alerts - grouped notification + - match: + severity: warning + receiver: 'warning-alerts' + group_wait: 30s + repeat_interval: 30m + + # Bot-specific alerts + - match: + service: telegram-bot + receiver: 'bot-alerts' + group_wait: 10s + repeat_interval: 15m + + - match: + service: anon-bot + receiver: 'bot-alerts' + group_wait: 10s + repeat_interval: 15m + 
+ # Infrastructure alerts + - match: + service: prometheus + receiver: 'infrastructure-alerts' + group_wait: 30s + repeat_interval: 1h + + - match: + service: grafana + receiver: 'infrastructure-alerts' + group_wait: 30s + repeat_interval: 1h + + - match: + service: nginx + receiver: 'infrastructure-alerts' + group_wait: 30s + repeat_interval: 1h + +# Inhibition rules - suppress certain alerts when others are firing +inhibit_rules: + # Suppress warning alerts when critical alerts are firing + - source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname', 'cluster', 'service'] + + # Suppress individual instance alerts when the entire service is down + - source_match: + alertname: 'ServiceDown' + target_match: + alertname: 'InstanceDown' + equal: ['service'] + +# Receiver configurations +receivers: + # Default webhook receiver (for testing) + - name: 'web.hook' + webhook_configs: + - url: 'http://localhost:5001/' + send_resolved: true + + # Critical alerts - immediate notification via multiple channels + - name: 'critical-alerts' + email_configs: + - to: 'admin@{{DOMAIN}}' + subject: '🚨 CRITICAL ALERT: {{ .GroupLabels.alertname }}' + body: | + {{ range .Alerts }} + Alert: {{ .Annotations.summary }} + Description: {{ .Annotations.description }} + Severity: {{ .Labels.severity }} + Service: {{ .Labels.service }} + Instance: {{ .Labels.instance }} + Time: {{ .StartsAt }} + {{ end }} + html: | +
+            <html>
+            <body>
+            <h2>🚨 Critical Alert</h2>
+            <table border="1">
+              <tr><td><b>Alert:</b></td><td>{{ .GroupLabels.alertname }}</td></tr>
+              <tr><td><b>Service:</b></td><td>{{ .GroupLabels.service }}</td></tr>
+              <tr><td><b>Time:</b></td><td>{{ .GroupLabels.time }}</td></tr>
+            </table>
+            <h3>Alerts:</h3>
+            <ul>
+            {{ range .Alerts }}
+              <li>{{ .Annotations.summary }} &mdash; {{ .Annotations.description }}</li>
+            {{ end }}
+            </ul>
+            </body>
+            </html>
+ + webhook_configs: + - url: 'http://localhost:5001/critical' + send_resolved: true + + # Warning alerts - less urgent notification + - name: 'warning-alerts' + email_configs: + - to: 'admin@{{DOMAIN}}' + subject: '⚠️ WARNING: {{ .GroupLabels.alertname }}' + body: | + {{ range .Alerts }} + Alert: {{ .Annotations.summary }} + Description: {{ .Annotations.description }} + Severity: {{ .Labels.severity }} + Service: {{ .Labels.service }} + Instance: {{ .Labels.instance }} + Time: {{ .StartsAt }} + {{ end }} + webhook_configs: + - url: 'http://localhost:5001/warning' + send_resolved: true + + # Bot-specific alerts + - name: 'bot-alerts' + email_configs: + - to: 'bot-admin@{{DOMAIN}}' + subject: '🤖 Bot Alert: {{ .GroupLabels.alertname }}' + body: | + Bot Alert: {{ .GroupLabels.alertname }} + Service: {{ .GroupLabels.service }} + + {{ range .Alerts }} + - {{ .Annotations.summary }} + {{ .Annotations.description }} + Instance: {{ .Labels.instance }} + Time: {{ .StartsAt }} + {{ end }} + webhook_configs: + - url: 'http://localhost:5001/bot' + send_resolved: true + + # Infrastructure alerts + - name: 'infrastructure-alerts' + email_configs: + - to: 'infra@{{DOMAIN}}' + subject: '🏗️ Infrastructure Alert: {{ .GroupLabels.alertname }}' + body: | + Infrastructure Alert: {{ .GroupLabels.alertname }} + Service: {{ .GroupLabels.service }} + + {{ range .Alerts }} + - {{ .Annotations.summary }} + {{ .Annotations.description }} + Instance: {{ .Labels.instance }} + Time: {{ .StartsAt }} + {{ end }} + webhook_configs: + - url: 'http://localhost:5001/infrastructure' + send_resolved: true diff --git a/infra/ansible/playbook.yml b/infra/ansible/playbook.yml index 40005bf..df7ec7e 100644 --- a/infra/ansible/playbook.yml +++ b/infra/ansible/playbook.yml @@ -57,6 +57,15 @@ - nginx - openssl - apache2-utils + - certbot + - python3-certbot-nginx + state: present + + - name: Установить Python библиотеки для Ansible + pip: + name: + - passlib + - bcrypt state: present - name: Установить часовой 
пояс Europe/Moscow @@ -278,14 +287,40 @@ - "{{ project_root }}/infra/nginx" - "{{ project_root }}/infra/nginx/ssl" - "{{ project_root }}/infra/nginx/conf.d" + - "{{ project_root }}/infra/uptime-kuma" + - "{{ project_root }}/infra/alertmanager" + - "{{ project_root }}/infra/grafana/dashboards" + - "{{ project_root }}/scripts" - - name: Сгенерировать самоподписанный SSL сертификат + - name: Сгенерировать самоподписанный SSL сертификат (fallback) command: > openssl req -x509 -newkey rsa:4096 -keyout {{ project_root }}/infra/nginx/ssl/key.pem -out {{ project_root }}/infra/nginx/ssl/cert.pem -days 365 -nodes -subj "/CN={{ ansible_host }}/O=Monitoring/C=RU" args: creates: "{{ project_root }}/infra/nginx/ssl/cert.pem" + when: not use_letsencrypt | default(false) + + - name: Создать директории для Let's Encrypt + file: + path: "{{ item }}" + state: directory + owner: root + group: root + mode: '0755' + loop: + - /etc/letsencrypt + - /etc/letsencrypt/live + - /etc/letsencrypt/archive + - /etc/letsencrypt/renewal + when: use_letsencrypt | default(false) + + - name: Настроить cron для автоматического обновления SSL сертификатов + cron: + name: "SSL Certificate Renewal" + job: "0 2 * * 1 /usr/local/bin/ssl-renewal.sh" + user: root + when: use_letsencrypt | default(false) - name: Установить права на SSL сертификаты file: @@ -314,6 +349,7 @@ group: root mode: '0644' backup: yes + remote_src: yes - name: Скопировать конфигурации nginx для сервисов copy: @@ -323,6 +359,7 @@ group: root mode: '0644' backup: yes + remote_src: yes - name: Скопировать SSL сертификаты copy: @@ -332,6 +369,7 @@ group: root mode: '0600' backup: yes + remote_src: yes - name: Скопировать htpasswd файл copy: @@ -341,6 +379,47 @@ group: root mode: '0644' backup: yes + remote_src: yes + + - name: Скопировать конфигурацию Alertmanager + copy: + src: "{{ project_root }}/infra/alertmanager/alertmanager.yml" + dest: "{{ project_root }}/infra/alertmanager/alertmanager.yml" + owner: "{{ deploy_user }}" + group: "{{ 
deploy_user }}" + mode: '0644' + backup: yes + remote_src: yes + + - name: Скопировать правила алертов Prometheus + copy: + src: "{{ project_root }}/infra/prometheus/alert_rules.yml" + dest: "{{ project_root }}/infra/prometheus/alert_rules.yml" + owner: "{{ deploy_user }}" + group: "{{ deploy_user }}" + mode: '0644' + backup: yes + remote_src: yes + + - name: Скопировать дашборды Grafana + copy: + src: "{{ project_root }}/infra/grafana/dashboards/" + dest: "{{ project_root }}/infra/grafana/dashboards/" + owner: "{{ deploy_user }}" + group: "{{ deploy_user }}" + mode: '0644' + backup: yes + remote_src: yes + + - name: Скопировать скрипт настройки SSL + copy: + src: "{{ project_root }}/scripts/setup-ssl.sh" + dest: /usr/local/bin/setup-ssl.sh + owner: root + group: root + mode: '0755' + backup: yes + remote_src: yes - name: Проверить конфигурацию nginx command: nginx -t @@ -811,6 +890,20 @@ timeout: 30 state: started + - name: Проверить, что порт 3001 (Uptime Kuma) открыт + wait_for: + port: 3001 + host: "{{ ansible_host }}" + timeout: 30 + state: started + + - name: Проверить, что порт 9093 (Alertmanager) открыт + wait_for: + port: 9093 + host: "{{ ansible_host }}" + timeout: 30 + state: started + - name: Проверить доступность Nginx uri: url: "http://{{ ansible_host }}/nginx-health" @@ -849,6 +942,26 @@ retries: 5 delay: 10 + - name: Проверить доступность Uptime Kuma через Nginx + uri: + url: "https://{{ ansible_host }}/status" + method: GET + status_code: 200 + validate_certs: no + register: uptime_kuma_nginx_health + retries: 5 + delay: 10 + + - name: Проверить доступность Alertmanager через Nginx + uri: + url: "https://{{ ansible_host }}/alertmanager/" + method: GET + status_code: 200 + validate_certs: no + register: alertmanager_nginx_health + retries: 5 + delay: 10 + - name: Закрыть старый SSH порт 22 в UFW (финальный шаг) ufw: @@ -858,7 +971,7 @@ - name: Проверка запуска ботов завершена — всё работает 🟢 debug: - msg: "Все сервисы запущены и слушают нужные 
порты. SSH настроен на порт 15722, Fail2ban активен, параметры безопасности ядра применены. Порт 22 закрыт для безопасности." + msg: "Все сервисы запущены и слушают нужные порты. SSH настроен на порт 15722, Fail2ban активен, параметры безопасности ядра применены. Порт 22 закрыт для безопасности. Добавлены: Uptime Kuma (статусная страница), Alertmanager (мониторинг), Let's Encrypt SSL, Grafana дашборды." # handlers для перезагрузки сервисов handlers: diff --git a/infra/grafana/dashboards/bot-monitoring.json b/infra/grafana/dashboards/bot-monitoring.json new file mode 100644 index 0000000..8b106d7 --- /dev/null +++ b/infra/grafana/dashboards/bot-monitoring.json @@ -0,0 +1,529 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" 
+ }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(http_requests_total{job=~\"telegram-bot|anon-bot\"}[5m])", + "interval": "", + "legendFormat": "{{job}} - {{method}} {{status}}", + "refId": "A" + } + ], + "title": "Bot Request Rate", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job=~\"telegram-bot|anon-bot\"}[5m]))", + "interval": "", + "legendFormat": "{{job}} - 95th percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{job=~\"telegram-bot|anon-bot\"}[5m]))", + "interval": "", + "legendFormat": "{{job}} - 50th percentile", + "refId": "B" + } + ], + "title": "Bot Response Time", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(http_requests_total{job=~\"telegram-bot|anon-bot\",status=~\"5..\"}[5m]) / rate(http_requests_total{job=~\"telegram-bot|anon-bot\"}[5m]) * 100", + "interval": "", + "legendFormat": "{{job}} - Error Rate", + "refId": "A" + } + ], + "title": "Bot Error Rate", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "process_resident_memory_bytes{job=~\"telegram-bot|anon-bot\"}", + "interval": "", + "legendFormat": "{{job}} - Memory Usage", + "refId": "A" + } + ], + "title": "Bot Memory Usage", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "up{job=~\"telegram-bot|anon-bot\"}", + "interval": "", + "legendFormat": "{{job}} - Status", + "refId": "A" + } + ], + "title": "Bot Health Status", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{job=~\"telegram-bot|anon-bot\"}[5m]) * 100", + "interval": "", + "legendFormat": "{{job}} - CPU Usage", + "refId": "A" + } + ], + "title": "Bot CPU Usage", + "type": "timeseries" + } + ], + "schemaVersion": 27, + "style": "dark", + "tags": ["bots", "monitoring"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Bot Monitoring Dashboard", + "uid": "bot-monitoring", + "version": 1 +} diff --git a/infra/grafana/dashboards/infrastructure-monitoring.json b/infra/grafana/dashboards/infrastructure-monitoring.json new file mode 100644 index 0000000..4a77335 --- /dev/null +++ b/infra/grafana/dashboards/infrastructure-monitoring.json @@ -0,0 +1,523 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "interval": "", + "legendFormat": "CPU Usage - {{instance}}", + "refId": "A" + } + ], + "title": "System CPU Usage", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + 
"h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", + "interval": "", + "legendFormat": "Memory Usage - {{instance}}", + "refId": "A" + } + ], + "title": "System Memory Usage", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "(1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100", + "interval": "", + "legendFormat": "Disk Usage - {{instance}} {{mountpoint}}", + "refId": "A" + } + ], + "title": "Disk Usage", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "up{job=~\"prometheus|grafana|nginx|alertmanager|uptime-kuma\"}", + "interval": "", + "legendFormat": "{{job}} - Status", + "refId": "A" + } + ], + "title": "Service Health Status", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + 
"placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(nginx_http_requests_total[5m])", + "interval": "", + "legendFormat": "Nginx - {{status}}", + "refId": "A" + } + ], + "title": "Nginx Request Rate", + "type": "timeseries" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "container_memory_usage_bytes{name=~\"bots_.*\"}", + "interval": "", + "legendFormat": "{{name}} - Memory", + "refId": "A" + } + ], + "title": "Container Memory Usage", + "type": "timeseries" + } + ], + "schemaVersion": 27, + "style": "dark", + "tags": ["infrastructure", "monitoring"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Infrastructure Monitoring Dashboard", + "uid": "infrastructure-monitoring", + "version": 1 +} diff --git a/infra/grafana/provisioning/dashboards/dashboards.yml b/infra/grafana/provisioning/dashboards/dashboards.yml new file mode 
100644 index 0000000..5781e55 --- /dev/null +++ b/infra/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,16 @@ +# Grafana Dashboard Provisioning Configuration +# This file configures automatic dashboard import + +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards + foldersFromFilesStructure: true diff --git a/infra/grafana/provisioning/datasources/prometheus.yml b/infra/grafana/provisioning/datasources/prometheus.yml index 86fd346..a0e6527 100644 --- a/infra/grafana/provisioning/datasources/prometheus.yml +++ b/infra/grafana/provisioning/datasources/prometheus.yml @@ -4,5 +4,13 @@ datasources: - name: Prometheus type: prometheus access: proxy - url: http://prometheus:9090 + url: http://prometheus:9090/prometheus isDefault: true + jsonData: + httpMethod: POST + manageAlerts: true + prometheusType: Prometheus + prometheusVersion: 2.40.0 + cacheLevel: 'High' + disableRecordingRules: false + incrementalQueryOverlapWindow: 10m diff --git a/infra/nginx/conf.d/alertmanager.conf b/infra/nginx/conf.d/alertmanager.conf new file mode 100644 index 0000000..4406026 --- /dev/null +++ b/infra/nginx/conf.d/alertmanager.conf @@ -0,0 +1,61 @@ +# Alertmanager Nginx Configuration +# Proxies requests to Alertmanager + +# Alertmanager location +location /alertmanager/ { + # Rate limiting + limit_req zone=api burst=10 nodelay; + + # Remove trailing slash for proxy + rewrite ^/alertmanager/(.*)$ /$1 break; + + # Proxy to Alertmanager + proxy_pass http://alertmanager_backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # Timeouts + proxy_connect_timeout 30s; + proxy_send_timeout 30s; + proxy_read_timeout 30s; + + # Buffer settings + proxy_buffering on; + proxy_buffer_size 4k; + 
proxy_buffers 8 4k; + + # Security headers + add_header X-Frame-Options "SAMEORIGIN" always; + add_header X-Content-Type-Options "nosniff" always; +} + +# Alertmanager API +location /api/v1/ { + # Rate limiting + limit_req zone=api burst=20 nodelay; + + # Proxy to Alertmanager + proxy_pass http://alertmanager_backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # CORS headers + add_header Access-Control-Allow-Origin "*" always; + add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS" always; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization" always; + + # Handle preflight requests + if ($request_method = 'OPTIONS') { + add_header Access-Control-Allow-Origin "*"; + add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS"; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization"; + add_header Access-Control-Max-Age 1728000; + add_header Content-Type "text/plain; charset=utf-8"; + add_header Content-Length 0; + return 204; + } +} diff --git a/infra/nginx/conf.d/grafana.conf b/infra/nginx/conf.d/grafana.conf index 5a4e2ed..166b8b5 100644 --- a/infra/nginx/conf.d/grafana.conf +++ b/infra/nginx/conf.d/grafana.conf @@ -1,9 +1,3 @@ -# Grafana reverse proxy configuration -upstream grafana_backend { - server grafana:3000; - keepalive 32; -} - # Grafana proxy configuration location /grafana/ { proxy_pass http://grafana_backend/; diff --git a/infra/nginx/conf.d/prometheus.conf b/infra/nginx/conf.d/prometheus.conf index b3a3156..c189cd7 100644 --- a/infra/nginx/conf.d/prometheus.conf +++ b/infra/nginx/conf.d/prometheus.conf @@ -1,12 +1,7 @@ -# Prometheus reverse proxy configuration -upstream prometheus_backend { - server 
prometheus:9090; - keepalive 32; -} - # Prometheus proxy configuration location /prometheus/ { proxy_pass http://prometheus_backend/; + proxy_redirect / /prometheus/; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; @@ -31,4 +26,4 @@ location /prometheus/-/healthy { proxy_pass http://prometheus_backend/-/healthy; proxy_set_header Host $host; access_log off; -} +} \ No newline at end of file diff --git a/infra/nginx/conf.d/status.conf b/infra/nginx/conf.d/status.conf index 9b89b20..13bcc62 100644 --- a/infra/nginx/conf.d/status.conf +++ b/infra/nginx/conf.d/status.conf @@ -1,16 +1,35 @@ -# Status page configuration (for future uptime kuma integration) +# Status page configuration (Uptime Kuma integration) # Rate limiting for status page location /status { - # Basic authentication for status page - auth_basic "Status Page Access"; - auth_basic_user_file /etc/nginx/.htpasswd; + # Rate limiting + limit_req zone=status burst=5 nodelay; - # Placeholder for future uptime kuma integration - # For now, show nginx status - access_log off; - return 200 '{"status": "ok", "nginx": "running", "timestamp": "$time_iso8601"}'; - add_header Content-Type application/json; + # Proxy to Uptime Kuma + proxy_pass http://uptime_kuma_backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # WebSocket support + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + + # Timeouts + proxy_connect_timeout 30s; + proxy_send_timeout 30s; + proxy_read_timeout 30s; + + # Buffer settings + proxy_buffering on; + proxy_buffer_size 4k; + proxy_buffers 8 4k; + + # Security headers + add_header X-Frame-Options "SAMEORIGIN" always; + add_header X-Content-Type-Options "nosniff" always; } # Nginx status stub (for monitoring) @@ -21,4 
+40,4 @@ location /nginx_status { allow 172.16.0.0/12; # Docker networks allow 192.168.0.0/16; # Private networks deny all; -} +} \ No newline at end of file diff --git a/infra/nginx/conf.d/uptime-kuma.conf b/infra/nginx/conf.d/uptime-kuma.conf new file mode 100644 index 0000000..7c77a1f --- /dev/null +++ b/infra/nginx/conf.d/uptime-kuma.conf @@ -0,0 +1,69 @@ +# Uptime Kuma Nginx Configuration +# Proxies requests to Uptime Kuma status page + +# Upstream for Uptime Kuma +upstream uptime_kuma_backend { + server uptime-kuma:3001; + keepalive 32; +} + +# Status page location +location /status { + # Rate limiting + limit_req zone=status burst=5 nodelay; + + # Proxy to Uptime Kuma + proxy_pass http://uptime_kuma_backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # WebSocket support + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + + # Timeouts + proxy_connect_timeout 30s; + proxy_send_timeout 30s; + proxy_read_timeout 30s; + + # Buffer settings + proxy_buffering on; + proxy_buffer_size 4k; + proxy_buffers 8 4k; + + # Security headers + add_header X-Frame-Options "SAMEORIGIN" always; + add_header X-Content-Type-Options "nosniff" always; +} + +# API endpoints for Uptime Kuma +location /api/ { + # Rate limiting + limit_req zone=api burst=10 nodelay; + + # Proxy to Uptime Kuma + proxy_pass http://uptime_kuma_backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # CORS headers + add_header Access-Control-Allow-Origin "*" always; + add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS" always; + add_header Access-Control-Allow-Headers 
"DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization" always; + + # Handle preflight requests + if ($request_method = 'OPTIONS') { + add_header Access-Control-Allow-Origin "*"; + add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS"; + add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization"; + add_header Access-Control-Max-Age 1728000; + add_header Content-Type "text/plain; charset=utf-8"; + add_header Content-Length 0; + return 204; + } +} diff --git a/infra/nginx/nginx.conf b/infra/nginx/nginx.conf index 645bd02..aaffe7d 100644 --- a/infra/nginx/nginx.conf +++ b/infra/nginx/nginx.conf @@ -63,6 +63,27 @@ http { ssl_session_cache shared:SSL:10m; ssl_session_timeout 10m; + # Upstream configurations + upstream grafana_backend { + server grafana:3000; + keepalive 32; + } + + upstream prometheus_backend { + server prometheus:9090; + keepalive 32; + } + + upstream uptime_kuma_backend { + server uptime-kuma:3001; + keepalive 32; + } + + upstream alertmanager_backend { + server alertmanager:9093; + keepalive 32; + } + # Main server block server { listen 80; @@ -74,17 +95,19 @@ http { listen 443 ssl http2; server_name _; - # SSL configuration - ssl_certificate /etc/nginx/ssl/cert.pem; - ssl_certificate_key /etc/nginx/ssl/key.pem; + # SSL configuration (self-signed certificate) + ssl_certificate /etc/letsencrypt/live/{{SERVER_IP}}/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/{{SERVER_IP}}/privkey.pem;еще + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384; + ssl_prefer_server_ciphers off; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 10m; # Security headers add_header X-Frame-Options "SAMEORIGIN" always; add_header X-Content-Type-Options "nosniff" always; - # Rate limiting - limit_req zone=api burst=20 
nodelay; - # Redirect root to Grafana location = / { return 301 /grafana/; diff --git a/infra/nginx/ssl/letsencrypt.conf b/infra/nginx/ssl/letsencrypt.conf new file mode 100644 index 0000000..e2afe99 --- /dev/null +++ b/infra/nginx/ssl/letsencrypt.conf @@ -0,0 +1,27 @@ +# Let's Encrypt SSL Configuration +# This file contains the SSL configuration for Let's Encrypt certificates + +# SSL certificate paths (Let's Encrypt) +ssl_certificate /etc/letsencrypt/live/{{DOMAIN}}/fullchain.pem; +ssl_certificate_key /etc/letsencrypt/live/{{DOMAIN}}/privkey.pem; + +# SSL Security Configuration +ssl_protocols TLSv1.2 TLSv1.3; +ssl_ciphers ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384; +ssl_prefer_server_ciphers off; +ssl_session_cache shared:SSL:10m; +ssl_session_timeout 10m; +ssl_session_tickets off; + +# OCSP Stapling +ssl_stapling on; +ssl_stapling_verify on; +ssl_trusted_certificate /etc/letsencrypt/live/{{DOMAIN}}/chain.pem; + +# Security Headers +add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; +add_header X-Frame-Options "SAMEORIGIN" always; +add_header X-Content-Type-Options "nosniff" always; +add_header X-XSS-Protection "1; mode=block" always; +add_header Referrer-Policy "strict-origin-when-cross-origin" always; +add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https:; font-src 'self' data:; connect-src 'self' wss: https:;" always; diff --git a/infra/prometheus/alert_rules.yml b/infra/prometheus/alert_rules.yml new file mode 100644 index 0000000..7f5bb0f --- /dev/null +++ b/infra/prometheus/alert_rules.yml @@ -0,0 +1,253 @@ +# Prometheus Alert Rules +# This file defines alerting rules for monitoring the bot infrastructure + +groups: + # Bot Health Monitoring + - name: bot_health + rules: + # 
Telegram Bot Health + - alert: TelegramBotDown + expr: up{job="telegram-bot"} == 0 + for: 1m + labels: + severity: critical + service: telegram-bot + annotations: + summary: "Telegram Bot is down" + description: "Telegram Bot has been down for more than 1 minute" + runbook_url: "https://docs.example.com/runbooks/telegram-bot-down" + + - alert: TelegramBotHighErrorRate + expr: rate(http_requests_total{job="telegram-bot",status=~"5.."}[5m]) > 0.1 + for: 2m + labels: + severity: warning + service: telegram-bot + annotations: + summary: "Telegram Bot high error rate" + description: "Telegram Bot error rate is {{ $value }} errors per second" + + - alert: TelegramBotHighResponseTime + expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="telegram-bot"}[5m])) > 2 + for: 5m + labels: + severity: warning + service: telegram-bot + annotations: + summary: "Telegram Bot high response time" + description: "95th percentile response time is {{ $value }} seconds" + + # AnonBot Health + - alert: AnonBotDown + expr: up{job="anon-bot"} == 0 + for: 1m + labels: + severity: critical + service: anon-bot + annotations: + summary: "AnonBot is down" + description: "AnonBot has been down for more than 1 minute" + runbook_url: "https://docs.example.com/runbooks/anon-bot-down" + + - alert: AnonBotHighErrorRate + expr: rate(http_requests_total{job="anon-bot",status=~"5.."}[5m]) > 0.1 + for: 2m + labels: + severity: warning + service: anon-bot + annotations: + summary: "AnonBot high error rate" + description: "AnonBot error rate is {{ $value }} errors per second" + + - alert: AnonBotHighResponseTime + expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="anon-bot"}[5m])) > 2 + for: 5m + labels: + severity: warning + service: anon-bot + annotations: + summary: "AnonBot high response time" + description: "95th percentile response time is {{ $value }} seconds" + + # Infrastructure Health Monitoring + - name: infrastructure_health + rules: + # 
Prometheus Health + - alert: PrometheusDown + expr: up{job="prometheus"} == 0 + for: 1m + labels: + severity: critical + service: prometheus + annotations: + summary: "Prometheus is down" + description: "Prometheus has been down for more than 1 minute" + + - alert: PrometheusHighMemoryUsage + expr: (prometheus_tsdb_head_series / prometheus_tsdb_head_series_limit) > 0.8 + for: 5m + labels: + severity: warning + service: prometheus + annotations: + summary: "Prometheus high memory usage" + description: "Prometheus memory usage is {{ $value | humanizePercentage }} of limit" + + # Grafana Health + - alert: GrafanaDown + expr: up{job="grafana"} == 0 + for: 1m + labels: + severity: critical + service: grafana + annotations: + summary: "Grafana is down" + description: "Grafana has been down for more than 1 minute" + + # Nginx Health + - alert: NginxDown + expr: up{job="nginx"} == 0 + for: 1m + labels: + severity: critical + service: nginx + annotations: + summary: "Nginx is down" + description: "Nginx has been down for more than 1 minute" + + - alert: NginxHighErrorRate + expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.1 + for: 2m + labels: + severity: warning + service: nginx + annotations: + summary: "Nginx high error rate" + description: "Nginx error rate is {{ $value }} errors per second" + + # System Resource Monitoring + - name: system_resources + rules: + # High CPU Usage + - alert: HighCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: warning + service: system + annotations: + summary: "High CPU usage" + description: "CPU usage is {{ $value }}% on {{ $labels.instance }}" + + - alert: VeryHighCPUUsage + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95 + for: 2m + labels: + severity: critical + service: system + annotations: + summary: "Very high CPU usage" + description: "CPU usage is {{ $value }}% on {{ $labels.instance }}" + + # 
High Memory Usage + - alert: HighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80 + for: 5m + labels: + severity: warning + service: system + annotations: + summary: "High memory usage" + description: "Memory usage is {{ $value }}% on {{ $labels.instance }}" + + - alert: VeryHighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95 + for: 2m + labels: + severity: critical + service: system + annotations: + summary: "Very high memory usage" + description: "Memory usage is {{ $value }}% on {{ $labels.instance }}" + + # Disk Space + - alert: LowDiskSpace + expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 80 + for: 5m + labels: + severity: warning + service: system + annotations: + summary: "Low disk space" + description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})" + + - alert: VeryLowDiskSpace + expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 95 + for: 2m + labels: + severity: critical + service: system + annotations: + summary: "Very low disk space" + description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})" + + # Docker Container Monitoring + - name: docker_containers + rules: + # Container Restart + - alert: ContainerRestarting + expr: rate(container_start_time_seconds[10m]) > 0 + for: 0m + labels: + severity: warning + service: docker + annotations: + summary: "Container restarting" + description: "Container {{ $labels.name }} is restarting frequently" + + # Container High Memory Usage + - alert: ContainerHighMemoryUsage + expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) * 100 > 80 + for: 5m + labels: + severity: warning + service: docker + annotations: + summary: "Container high memory usage" + description: "Container {{ $labels.name }} memory usage is {{ $value }}%" + + # Container High CPU Usage + - alert: 
ContainerHighCPUUsage + expr: (rate(container_cpu_usage_seconds_total[5m]) / container_spec_cpu_quota * 100) > 80 + for: 5m + labels: + severity: warning + service: docker + annotations: + summary: "Container high CPU usage" + description: "Container {{ $labels.name }} CPU usage is {{ $value }}%" + + # Database Monitoring + - name: database_health + rules: + # Database Connection Issues + - alert: DatabaseConnectionFailed + expr: increase(database_connection_errors_total[5m]) > 5 + for: 1m + labels: + severity: critical + service: database + annotations: + summary: "Database connection failures" + description: "{{ $value }} database connection failures in the last 5 minutes" + + # Database High Query Time + - alert: DatabaseHighQueryTime + expr: histogram_quantile(0.95, rate(database_query_duration_seconds_bucket[5m])) > 1 + for: 5m + labels: + severity: warning + service: database + annotations: + summary: "Database high query time" + description: "95th percentile database query time is {{ $value }} seconds" diff --git a/infra/prometheus/prometheus.yml b/infra/prometheus/prometheus.yml index 0bafff2..fe9481b 100644 --- a/infra/prometheus/prometheus.yml +++ b/infra/prometheus/prometheus.yml @@ -3,8 +3,7 @@ global: evaluation_interval: 15s rule_files: - # - "first_rules.yml" - # - "second_rules.yml" + - "alert_rules.yml" scrape_configs: - job_name: 'prometheus' @@ -46,4 +45,4 @@ alerting: alertmanagers: - static_configs: - targets: - # - alertmanager:9093 + - alertmanager:9093 diff --git a/infra/uptime-kuma/docker-compose.yml b/infra/uptime-kuma/docker-compose.yml new file mode 100644 index 0000000..1e7398f --- /dev/null +++ b/infra/uptime-kuma/docker-compose.yml @@ -0,0 +1,33 @@ +# Uptime Kuma Configuration +# This is a separate docker-compose file for Uptime Kuma +# It will be included in the main docker-compose.yml + +version: '3.8' + +services: + uptime-kuma: + image: louislam/uptime-kuma:latest + container_name: bots_uptime_kuma + restart: unless-stopped + 
volumes: + - uptime_kuma_data:/app/data + ports: + - "3001:3001" + environment: + - UPTIME_KUMA_PORT=3001 + networks: + - bots_network + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3001"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + +volumes: + uptime_kuma_data: + driver: local + +networks: + bots_network: + external: true diff --git a/scripts/setup-ssl.sh b/scripts/setup-ssl.sh new file mode 100755 index 0000000..dd6b765 --- /dev/null +++ b/scripts/setup-ssl.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +# SSL Setup Script for Let's Encrypt +# This script sets up SSL certificates using Let's Encrypt + +set -e + +# Configuration +DOMAIN="${DOMAIN:-localhost}" +EMAIL="${EMAIL:-admin@${DOMAIN}}" +NGINX_CONTAINER="bots_nginx" +CERTBOT_IMAGE="certbot/certbot:latest" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Logging function +log() { + echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}" +} + +warn() { + echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" +} + +error() { + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" + exit 1 +} + +# Check if running as root +if [[ $EUID -eq 0 ]]; then + error "This script should not be run as root for security reasons" +fi + +# Check if domain is localhost +if [[ "$DOMAIN" == "localhost" ]]; then + warn "Domain is set to localhost. Let's Encrypt certificates cannot be issued for localhost." + warn "Please set the DOMAIN environment variable to your actual domain name." + warn "Example: DOMAIN=example.com ./scripts/setup-ssl.sh" + exit 1 +fi + +# Check if Docker is running +if ! docker info > /dev/null 2>&1; then + error "Docker is not running. Please start Docker and try again." +fi + +# Check if nginx container is running +if ! docker ps | grep -q "$NGINX_CONTAINER"; then + error "Nginx container ($NGINX_CONTAINER) is not running. 
Please start it first with 'docker-compose up -d nginx'" +fi + +log "Setting up SSL certificates for domain: $DOMAIN" +log "Email for Let's Encrypt: $EMAIL" + +# Create necessary directories +log "Creating Let's Encrypt directories..." +sudo mkdir -p /etc/letsencrypt/live +sudo mkdir -p /etc/letsencrypt/archive +sudo mkdir -p /etc/letsencrypt/renewal +sudo chmod 755 /etc/letsencrypt + +# Stop nginx temporarily for certificate generation +log "Stopping nginx container for certificate generation..." +docker stop "$NGINX_CONTAINER" || true + +# Generate certificate using certbot +log "Generating SSL certificate using Let's Encrypt..." +docker run --rm \ + -v /etc/letsencrypt:/etc/letsencrypt \ + -v /var/lib/letsencrypt:/var/lib/letsencrypt \ + -p 80:80 \ + -p 443:443 \ + "$CERTBOT_IMAGE" certonly \ + --standalone \ + --non-interactive \ + --agree-tos \ + --email "$EMAIL" \ + --domains "$DOMAIN" \ + --expand + +# Check if certificate was generated successfully +if [[ ! -f "/etc/letsencrypt/live/$DOMAIN/fullchain.pem" ]]; then + error "Failed to generate SSL certificate for $DOMAIN" +fi + +log "SSL certificate generated successfully!" + +# Set proper permissions +log "Setting proper permissions for SSL certificates..." +sudo chmod 755 /etc/letsencrypt/live +sudo chmod 755 /etc/letsencrypt/archive +sudo chmod 644 /etc/letsencrypt/live/"$DOMAIN"/*.pem +sudo chmod 600 /etc/letsencrypt/live/"$DOMAIN"/privkey.pem + +# Update nginx configuration to use Let's Encrypt certificates +log "Updating nginx configuration..." +if [[ -f "infra/nginx/ssl/letsencrypt.conf" ]]; then + # Replace domain placeholder in letsencrypt.conf + sed "s/{{DOMAIN}}/$DOMAIN/g" infra/nginx/ssl/letsencrypt.conf > /tmp/letsencrypt.conf + sudo cp /tmp/letsencrypt.conf /etc/letsencrypt/live/"$DOMAIN"/letsencrypt.conf + rm /tmp/letsencrypt.conf +fi + +# Start nginx container +log "Starting nginx container..." +docker start "$NGINX_CONTAINER" + +# Wait for nginx to start +log "Waiting for nginx to start..." 
+sleep 10 + +# Test SSL certificate +log "Testing SSL certificate..." +if curl -k -s "https://$DOMAIN" > /dev/null; then + log "SSL certificate is working correctly!" +else + warn "SSL certificate test failed. Please check nginx configuration." +fi + +# Set up automatic renewal +log "Setting up automatic certificate renewal..." +cat > /tmp/ssl-renewal.sh << EOF +#!/bin/bash +# SSL Certificate Renewal Script + +set -e + +DOMAIN="$DOMAIN" +NGINX_CONTAINER="$NGINX_CONTAINER" +CERTBOT_IMAGE="$CERTBOT_IMAGE" + +# Renew certificates +docker run --rm \\ + -v /etc/letsencrypt:/etc/letsencrypt \\ + -v /var/lib/letsencrypt:/var/lib/letsencrypt \\ + "$CERTBOT_IMAGE" renew --quiet + +# Reload nginx +docker exec "\$NGINX_CONTAINER" nginx -s reload + +echo "\$(date): SSL certificates renewed successfully" >> /var/log/ssl-renewal.log +EOF + +sudo mv /tmp/ssl-renewal.sh /usr/local/bin/ssl-renewal.sh +sudo chmod +x /usr/local/bin/ssl-renewal.sh + +# Add cron job for automatic renewal (every Monday at 2 AM) +log "Adding cron job for automatic renewal..." +(crontab -l 2>/dev/null; echo "0 2 * * 1 /usr/local/bin/ssl-renewal.sh") | crontab - + +log "SSL setup completed successfully!" +log "Certificate location: /etc/letsencrypt/live/$DOMAIN/" +log "Automatic renewal is configured to run every Monday at 2 AM" +log "You can test the renewal manually with: sudo /usr/local/bin/ssl-renewal.sh" + +# Display certificate information +log "Certificate information:" +openssl x509 -in "/etc/letsencrypt/live/$DOMAIN/fullchain.pem" -text -noout | grep -E "(Subject:|Not Before|Not After|DNS:)"