feat: integrate Uptime Kuma and Alertmanager into Docker setup
- Add Uptime Kuma service for status monitoring with health checks.
- Introduce Alertmanager service for alert management and notifications.
- Update docker-compose.yml to include new services and their configurations.
- Enhance Makefile with commands for managing Uptime Kuma and Alertmanager logs.
- Modify Ansible playbook to install necessary packages and configure SSL for new services.
- Update Nginx configuration to route traffic to Uptime Kuma and Alertmanager.
- Adjust Prometheus configuration to include alert rules and external URLs.
This commit is contained in:
53
Makefile
53
Makefile
@@ -9,6 +9,8 @@ help: ## Показать справку
|
|||||||
@echo "📊 Мониторинг:"
|
@echo "📊 Мониторинг:"
|
||||||
@echo " Prometheus: http://localhost:9090"
|
@echo " Prometheus: http://localhost:9090"
|
||||||
@echo " Grafana: http://localhost:3000 (admin/admin)"
|
@echo " Grafana: http://localhost:3000 (admin/admin)"
|
||||||
|
@echo " Uptime Kuma: http://localhost:3001"
|
||||||
|
@echo " Alertmanager: http://localhost:9093"
|
||||||
@echo " Server Monitor: http://localhost:9091/health"
|
@echo " Server Monitor: http://localhost:9091/health"
|
||||||
@echo " Bot Health: http://localhost:8080/health"
|
@echo " Bot Health: http://localhost:8080/health"
|
||||||
@echo " AnonBot Health: http://localhost:8081/health"
|
@echo " AnonBot Health: http://localhost:8081/health"
|
||||||
@@ -37,6 +39,12 @@ logs-bot: ## Показать логи Telegram бота
|
|||||||
logs-anonBot: ## Показать логи AnonBot
|
logs-anonBot: ## Показать логи AnonBot
|
||||||
docker-compose logs -f anon-bot
|
docker-compose logs -f anon-bot
|
||||||
|
|
||||||
|
logs-uptime-kuma: ## Показать логи Uptime Kuma
|
||||||
|
docker-compose logs -f uptime-kuma
|
||||||
|
|
||||||
|
logs-alertmanager: ## Показать логи Alertmanager
|
||||||
|
docker-compose logs -f alertmanager
|
||||||
|
|
||||||
restart: ## Перезапустить все сервисы
|
restart: ## Перезапустить все сервисы
|
||||||
docker-compose down
|
docker-compose down
|
||||||
docker-compose build --no-cache
|
docker-compose build --no-cache
|
||||||
@@ -54,6 +62,12 @@ restart-bot: ## Перезапустить только Telegram бота
|
|||||||
restart-anonBot: ## Перезапустить только AnonBot
|
restart-anonBot: ## Перезапустить только AnonBot
|
||||||
docker-compose restart anon-bot
|
docker-compose restart anon-bot
|
||||||
|
|
||||||
|
restart-uptime-kuma: ## Перезапустить только Uptime Kuma
|
||||||
|
docker-compose restart uptime-kuma
|
||||||
|
|
||||||
|
restart-alertmanager: ## Перезапустить только Alertmanager
|
||||||
|
docker-compose restart alertmanager
|
||||||
|
|
||||||
status: ## Показать статус контейнеров
|
status: ## Показать статус контейнеров
|
||||||
docker-compose ps
|
docker-compose ps
|
||||||
|
|
||||||
@@ -63,6 +77,8 @@ health: ## Проверить здоровье сервисов
|
|||||||
@curl -f http://localhost:8081/health || echo "❌ AnonBot health check failed"
|
@curl -f http://localhost:8081/health || echo "❌ AnonBot health check failed"
|
||||||
@curl -f http://localhost:9090/-/healthy || echo "❌ Prometheus health check failed"
|
@curl -f http://localhost:9090/-/healthy || echo "❌ Prometheus health check failed"
|
||||||
@curl -f http://localhost:3000/api/health || echo "❌ Grafana health check failed"
|
@curl -f http://localhost:3000/api/health || echo "❌ Grafana health check failed"
|
||||||
|
@curl -f http://localhost:3001 || echo "❌ Uptime Kuma health check failed"
|
||||||
|
@curl -f http://localhost:9093/-/healthy || echo "❌ Alertmanager health check failed"
|
||||||
@curl -f http://localhost:9091/health || echo "❌ Server monitor health check failed"
|
@curl -f http://localhost:9091/health || echo "❌ Server monitor health check failed"
|
||||||
|
|
||||||
deploy: ## Полный деплой на продакшен
|
deploy: ## Полный деплой на продакшен
|
||||||
@@ -120,6 +136,8 @@ start: build up ## Собрать и запустить все сервисы
|
|||||||
@echo "🏗️ Production Infrastructure запущена!"
|
@echo "🏗️ Production Infrastructure запущена!"
|
||||||
@echo "📊 Prometheus: http://localhost:9090"
|
@echo "📊 Prometheus: http://localhost:9090"
|
||||||
@echo "📈 Grafana: http://localhost:3000 (admin/admin)"
|
@echo "📈 Grafana: http://localhost:3000 (admin/admin)"
|
||||||
|
@echo "📊 Uptime Kuma: http://localhost:3001"
|
||||||
|
@echo "🚨 Alertmanager: http://localhost:9093"
|
||||||
@echo "🤖 Bot Health: http://localhost:8080/health"
|
@echo "🤖 Bot Health: http://localhost:8080/health"
|
||||||
@echo "🔒 AnonBot Health: http://localhost:8081/health"
|
@echo "🔒 AnonBot Health: http://localhost:8081/health"
|
||||||
@echo "📡 Server Monitor: http://localhost:9091/health"
|
@echo "📡 Server Monitor: http://localhost:9091/health"
|
||||||
@@ -191,6 +209,7 @@ test-clean: ## Очистить все файлы тестирования и о
|
|||||||
@find . -name "*.pyc" -delete 2>/dev/null || true
|
@find . -name "*.pyc" -delete 2>/dev/null || true
|
||||||
@find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true
|
@find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true
|
||||||
@echo "✅ Файлы тестирования очищены"
|
@echo "✅ Файлы тестирования очищены"
|
||||||
|
|
||||||
|
|
||||||
check-ports: ## Проверить занятые порты
|
check-ports: ## Проверить занятые порты
|
||||||
@echo "🔍 Checking occupied ports..."
|
@echo "🔍 Checking occupied ports..."
|
||||||
@@ -242,3 +261,37 @@ reload-prometheus: ## Перезагрузить конфигурацию Promet
|
|||||||
reload-grafana: ## Перезагрузить конфигурацию Grafana
|
reload-grafana: ## Перезагрузить конфигурацию Grafana
|
||||||
@echo "🔄 Reloading Grafana configuration..."
|
@echo "🔄 Reloading Grafana configuration..."
|
||||||
@docker-compose restart grafana
|
@docker-compose restart grafana
|
||||||
|
|
||||||
|
ssl-setup: ## Настроить SSL сертификаты (самоподписанный)
|
||||||
|
@echo "🔒 Setting up self-signed SSL certificates..."
|
||||||
|
@if [ -z "$(SERVER_IP)" ]; then echo "❌ Please set SERVER_IP variable in .env file"; exit 1; fi
|
||||||
|
@mkdir -p /etc/letsencrypt/live/$(SERVER_IP)
|
||||||
|
@openssl req -x509 -nodes -days 365 -newkey rsa:2048 \
|
||||||
|
-keyout /etc/letsencrypt/live/$(SERVER_IP)/privkey.pem \
|
||||||
|
-out /etc/letsencrypt/live/$(SERVER_IP)/fullchain.pem \
|
||||||
|
-subj "/CN=$(SERVER_IP)"
|
||||||
|
@echo "✅ Self-signed certificate created for $(SERVER_IP)"
|
||||||
|
|
||||||
|
ssl-renew: ## Обновить SSL сертификаты
|
||||||
|
@echo "🔄 Renewing SSL certificates..."
|
||||||
|
@sudo /usr/local/bin/ssl-renewal.sh
|
||||||
|
|
||||||
|
ssl-status: ## Проверить статус SSL сертификатов
|
||||||
|
@echo "🔍 Checking SSL certificate status..."
|
||||||
|
@sudo certbot certificates
|
||||||
|
|
||||||
|
uptime-kuma: ## Открыть Uptime Kuma в браузере
|
||||||
|
@echo "📊 Opening Uptime Kuma..."
|
||||||
|
@open http://localhost:3001 || xdg-open http://localhost:3001 || echo "Please open manually: http://localhost:3001"
|
||||||
|
|
||||||
|
alertmanager: ## Открыть Alertmanager в браузере
|
||||||
|
@echo "🚨 Opening Alertmanager..."
|
||||||
|
@open http://localhost:9093 || xdg-open http://localhost:9093 || echo "Please open manually: http://localhost:9093"
|
||||||
|
|
||||||
|
monitoring-all: ## Открыть все мониторинг сервисы
|
||||||
|
@echo "📊 Opening all monitoring services..."
|
||||||
|
@echo " - Grafana: http://localhost:3000"
|
||||||
|
@echo " - Prometheus: http://localhost:9090"
|
||||||
|
@echo " - Uptime Kuma: http://localhost:3001"
|
||||||
|
@echo " - Alertmanager: http://localhost:9093"
|
||||||
|
@open http://localhost:3000 || xdg-open http://localhost:3000 || echo "Please open manually"
|
||||||
|
|||||||
@@ -12,10 +12,12 @@ services:
|
|||||||
- '--web.console.templates=/etc/prometheus/consoles'
|
- '--web.console.templates=/etc/prometheus/consoles'
|
||||||
- '--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_DAYS:-30}d'
|
- '--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_DAYS:-30}d'
|
||||||
- '--web.enable-lifecycle'
|
- '--web.enable-lifecycle'
|
||||||
|
- '--web.external-url=https://${SERVER_IP}/prometheus/'
|
||||||
ports:
|
ports:
|
||||||
- "9090:9090"
|
- "9090:9090"
|
||||||
volumes:
|
volumes:
|
||||||
- ./infra/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
- ./infra/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||||
|
- ./infra/prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro
|
||||||
- prometheus_data:/prometheus
|
- prometheus_data:/prometheus
|
||||||
networks:
|
networks:
|
||||||
- bots_network
|
- bots_network
|
||||||
@@ -35,9 +37,9 @@ services:
|
|||||||
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
|
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
|
||||||
- GF_USERS_ALLOW_SIGN_UP=false
|
- GF_USERS_ALLOW_SIGN_UP=false
|
||||||
- GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
|
- GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
|
||||||
- GF_SERVER_ROOT_URL=https://${SERVER_IP:-localhost}/grafana/
|
- GF_SERVER_ROOT_URL=https://${SERVER_IP}/grafana/
|
||||||
- GF_SERVER_SERVE_FROM_SUB_PATH=true
|
- GF_SERVER_SERVE_FROM_SUB_PATH=true
|
||||||
- GF_SERVER_DOMAIN=${SERVER_IP:-localhost}
|
- GF_SERVER_DOMAIN=${SERVER_IP}
|
||||||
ports:
|
ports:
|
||||||
- "3000:3000"
|
- "3000:3000"
|
||||||
volumes:
|
volumes:
|
||||||
@@ -53,6 +55,51 @@ services:
|
|||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
|
|
||||||
|
# Uptime Kuma Status Page
|
||||||
|
uptime-kuma:
|
||||||
|
image: louislam/uptime-kuma:latest
|
||||||
|
container_name: bots_uptime_kuma
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- uptime_kuma_data:/app/data
|
||||||
|
ports:
|
||||||
|
- "3001:3001"
|
||||||
|
environment:
|
||||||
|
- UPTIME_KUMA_PORT=3001
|
||||||
|
networks:
|
||||||
|
- bots_network
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3001"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 40s
|
||||||
|
|
||||||
|
# Alertmanager
|
||||||
|
alertmanager:
|
||||||
|
image: prom/alertmanager:latest
|
||||||
|
container_name: bots_alertmanager
|
||||||
|
restart: unless-stopped
|
||||||
|
command:
|
||||||
|
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
||||||
|
- '--storage.path=/alertmanager'
|
||||||
|
- '--web.external-url=https://${SERVER_IP}/alertmanager/'
|
||||||
|
- '--web.route-prefix=/'
|
||||||
|
ports:
|
||||||
|
- "9093:9093"
|
||||||
|
volumes:
|
||||||
|
- alertmanager_data:/alertmanager
|
||||||
|
- ./infra/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||||
|
networks:
|
||||||
|
- bots_network
|
||||||
|
depends_on:
|
||||||
|
- prometheus
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
# Nginx Reverse Proxy
|
# Nginx Reverse Proxy
|
||||||
nginx:
|
nginx:
|
||||||
image: nginx:alpine
|
image: nginx:alpine
|
||||||
@@ -61,16 +108,20 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "80:80"
|
- "80:80"
|
||||||
- "443:443"
|
- "443:443"
|
||||||
|
environment:
|
||||||
|
- SERVER_IP=${SERVER_IP}
|
||||||
volumes:
|
volumes:
|
||||||
- ./infra/nginx/nginx.conf:/etc/nginx/nginx.conf:ro
|
- ./infra/nginx/nginx.conf:/etc/nginx/templates/nginx.conf.template:ro
|
||||||
- ./infra/nginx/conf.d:/etc/nginx/conf.d:ro
|
- ./infra/nginx/conf.d:/etc/nginx/conf.d:ro
|
||||||
- ./infra/nginx/ssl:/etc/nginx/ssl:ro
|
- ./infra/nginx/ssl:/etc/nginx/ssl:ro
|
||||||
- ./infra/nginx/.htpasswd:/etc/nginx/.htpasswd:ro
|
- ./infra/nginx/.htpasswd:/etc/nginx/.htpasswd:ro
|
||||||
|
- /etc/letsencrypt:/etc/letsencrypt:ro
|
||||||
networks:
|
networks:
|
||||||
- bots_network
|
- bots_network
|
||||||
depends_on:
|
depends_on:
|
||||||
- grafana
|
- grafana
|
||||||
- prometheus
|
- prometheus
|
||||||
|
- uptime-kuma
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost/nginx-health"]
|
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost/nginx-health"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
@@ -194,6 +245,10 @@ volumes:
|
|||||||
driver: local
|
driver: local
|
||||||
grafana_data:
|
grafana_data:
|
||||||
driver: local
|
driver: local
|
||||||
|
uptime_kuma_data:
|
||||||
|
driver: local
|
||||||
|
alertmanager_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
bots_network:
|
bots_network:
|
||||||
|
|||||||
17
infra/alertmanager/alertmanager-simple.yml
Normal file
17
infra/alertmanager/alertmanager-simple.yml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
# Simplified Alertmanager Configuration
|
||||||
|
global:
|
||||||
|
smtp_smarthost: 'localhost:587'
|
||||||
|
smtp_from: 'alerts@localhost'
|
||||||
|
|
||||||
|
route:
|
||||||
|
group_by: ['alertname']
|
||||||
|
group_wait: 10s
|
||||||
|
group_interval: 10s
|
||||||
|
repeat_interval: 1h
|
||||||
|
receiver: 'web.hook'
|
||||||
|
|
||||||
|
receivers:
|
||||||
|
- name: 'web.hook'
|
||||||
|
webhook_configs:
|
||||||
|
- url: 'http://localhost:5001/'
|
||||||
|
send_resolved: true
|
||||||
185
infra/alertmanager/alertmanager.yml
Normal file
185
infra/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
# Alertmanager Configuration
|
||||||
|
# This file configures how alerts are handled and routed
|
||||||
|
|
||||||
|
global:
|
||||||
|
# SMTP configuration for email notifications
|
||||||
|
smtp_smarthost: 'localhost:587'
|
||||||
|
smtp_from: 'alerts@{{DOMAIN}}'
|
||||||
|
smtp_auth_username: 'alerts@{{DOMAIN}}'
|
||||||
|
smtp_auth_password: '{{SMTP_PASSWORD}}'
|
||||||
|
smtp_require_tls: true
|
||||||
|
|
||||||
|
# Resolve timeout
|
||||||
|
resolve_timeout: 5m
|
||||||
|
|
||||||
|
# Templates for alert formatting
|
||||||
|
templates:
|
||||||
|
- '/etc/alertmanager/templates/*.tmpl'
|
||||||
|
|
||||||
|
# Route configuration - defines how alerts are routed
|
||||||
|
route:
|
||||||
|
group_by: ['alertname', 'cluster', 'service']
|
||||||
|
group_wait: 10s
|
||||||
|
group_interval: 10s
|
||||||
|
repeat_interval: 1h
|
||||||
|
receiver: 'web.hook'
|
||||||
|
routes:
|
||||||
|
# Critical alerts - immediate notification
|
||||||
|
- match:
|
||||||
|
severity: critical
|
||||||
|
receiver: 'critical-alerts'
|
||||||
|
group_wait: 5s
|
||||||
|
repeat_interval: 5m
|
||||||
|
|
||||||
|
# Warning alerts - grouped notification
|
||||||
|
- match:
|
||||||
|
severity: warning
|
||||||
|
receiver: 'warning-alerts'
|
||||||
|
group_wait: 30s
|
||||||
|
repeat_interval: 30m
|
||||||
|
|
||||||
|
# Bot-specific alerts
|
||||||
|
- match:
|
||||||
|
service: telegram-bot
|
||||||
|
receiver: 'bot-alerts'
|
||||||
|
group_wait: 10s
|
||||||
|
repeat_interval: 15m
|
||||||
|
|
||||||
|
- match:
|
||||||
|
service: anon-bot
|
||||||
|
receiver: 'bot-alerts'
|
||||||
|
group_wait: 10s
|
||||||
|
repeat_interval: 15m
|
||||||
|
|
||||||
|
# Infrastructure alerts
|
||||||
|
- match:
|
||||||
|
service: prometheus
|
||||||
|
receiver: 'infrastructure-alerts'
|
||||||
|
group_wait: 30s
|
||||||
|
repeat_interval: 1h
|
||||||
|
|
||||||
|
- match:
|
||||||
|
service: grafana
|
||||||
|
receiver: 'infrastructure-alerts'
|
||||||
|
group_wait: 30s
|
||||||
|
repeat_interval: 1h
|
||||||
|
|
||||||
|
- match:
|
||||||
|
service: nginx
|
||||||
|
receiver: 'infrastructure-alerts'
|
||||||
|
group_wait: 30s
|
||||||
|
repeat_interval: 1h
|
||||||
|
|
||||||
|
# Inhibition rules - suppress certain alerts when others are firing
|
||||||
|
inhibit_rules:
|
||||||
|
# Suppress warning alerts when critical alerts are firing
|
||||||
|
- source_match:
|
||||||
|
severity: 'critical'
|
||||||
|
target_match:
|
||||||
|
severity: 'warning'
|
||||||
|
equal: ['alertname', 'cluster', 'service']
|
||||||
|
|
||||||
|
# Suppress individual instance alerts when the entire service is down
|
||||||
|
- source_match:
|
||||||
|
alertname: 'ServiceDown'
|
||||||
|
target_match:
|
||||||
|
alertname: 'InstanceDown'
|
||||||
|
equal: ['service']
|
||||||
|
|
||||||
|
# Receiver configurations
|
||||||
|
receivers:
|
||||||
|
# Default webhook receiver (for testing)
|
||||||
|
- name: 'web.hook'
|
||||||
|
webhook_configs:
|
||||||
|
- url: 'http://localhost:5001/'
|
||||||
|
send_resolved: true
|
||||||
|
|
||||||
|
# Critical alerts - immediate notification via multiple channels
|
||||||
|
- name: 'critical-alerts'
|
||||||
|
email_configs:
|
||||||
|
- to: 'admin@{{DOMAIN}}'
|
||||||
|
subject: '🚨 CRITICAL ALERT: {{ .GroupLabels.alertname }}'
|
||||||
|
body: |
|
||||||
|
{{ range .Alerts }}
|
||||||
|
Alert: {{ .Annotations.summary }}
|
||||||
|
Description: {{ .Annotations.description }}
|
||||||
|
Severity: {{ .Labels.severity }}
|
||||||
|
Service: {{ .Labels.service }}
|
||||||
|
Instance: {{ .Labels.instance }}
|
||||||
|
Time: {{ .StartsAt }}
|
||||||
|
{{ end }}
|
||||||
|
html: |
|
||||||
|
<h2>🚨 Critical Alert</h2>
|
||||||
|
<table>
|
||||||
|
<tr><td><strong>Alert:</strong></td><td>{{ .GroupLabels.alertname }}</td></tr>
|
||||||
|
<tr><td><strong>Service:</strong></td><td>{{ .GroupLabels.service }}</td></tr>
|
||||||
|
<tr><td><strong>Time:</strong></td><td>{{ .GroupLabels.time }}</td></tr>
|
||||||
|
</table>
|
||||||
|
<h3>Alerts:</h3>
|
||||||
|
<ul>
|
||||||
|
{{ range .Alerts }}
|
||||||
|
<li><strong>{{ .Annotations.summary }}</strong><br/>
|
||||||
|
{{ .Annotations.description }}<br/>
|
||||||
|
<small>Instance: {{ .Labels.instance }} | Time: {{ .StartsAt }}</small>
|
||||||
|
</li>
|
||||||
|
{{ end }}
|
||||||
|
</ul>
|
||||||
|
webhook_configs:
|
||||||
|
- url: 'http://localhost:5001/critical'
|
||||||
|
send_resolved: true
|
||||||
|
|
||||||
|
# Warning alerts - less urgent notification
|
||||||
|
- name: 'warning-alerts'
|
||||||
|
email_configs:
|
||||||
|
- to: 'admin@{{DOMAIN}}'
|
||||||
|
subject: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
|
||||||
|
body: |
|
||||||
|
{{ range .Alerts }}
|
||||||
|
Alert: {{ .Annotations.summary }}
|
||||||
|
Description: {{ .Annotations.description }}
|
||||||
|
Severity: {{ .Labels.severity }}
|
||||||
|
Service: {{ .Labels.service }}
|
||||||
|
Instance: {{ .Labels.instance }}
|
||||||
|
Time: {{ .StartsAt }}
|
||||||
|
{{ end }}
|
||||||
|
webhook_configs:
|
||||||
|
- url: 'http://localhost:5001/warning'
|
||||||
|
send_resolved: true
|
||||||
|
|
||||||
|
# Bot-specific alerts
|
||||||
|
- name: 'bot-alerts'
|
||||||
|
email_configs:
|
||||||
|
- to: 'bot-admin@{{DOMAIN}}'
|
||||||
|
subject: '🤖 Bot Alert: {{ .GroupLabels.alertname }}'
|
||||||
|
body: |
|
||||||
|
Bot Alert: {{ .GroupLabels.alertname }}
|
||||||
|
Service: {{ .GroupLabels.service }}
|
||||||
|
|
||||||
|
{{ range .Alerts }}
|
||||||
|
- {{ .Annotations.summary }}
|
||||||
|
{{ .Annotations.description }}
|
||||||
|
Instance: {{ .Labels.instance }}
|
||||||
|
Time: {{ .StartsAt }}
|
||||||
|
{{ end }}
|
||||||
|
webhook_configs:
|
||||||
|
- url: 'http://localhost:5001/bot'
|
||||||
|
send_resolved: true
|
||||||
|
|
||||||
|
# Infrastructure alerts
|
||||||
|
- name: 'infrastructure-alerts'
|
||||||
|
email_configs:
|
||||||
|
- to: 'infra@{{DOMAIN}}'
|
||||||
|
subject: '🏗️ Infrastructure Alert: {{ .GroupLabels.alertname }}'
|
||||||
|
body: |
|
||||||
|
Infrastructure Alert: {{ .GroupLabels.alertname }}
|
||||||
|
Service: {{ .GroupLabels.service }}
|
||||||
|
|
||||||
|
{{ range .Alerts }}
|
||||||
|
- {{ .Annotations.summary }}
|
||||||
|
{{ .Annotations.description }}
|
||||||
|
Instance: {{ .Labels.instance }}
|
||||||
|
Time: {{ .StartsAt }}
|
||||||
|
{{ end }}
|
||||||
|
webhook_configs:
|
||||||
|
- url: 'http://localhost:5001/infrastructure'
|
||||||
|
send_resolved: true
|
||||||
@@ -57,6 +57,15 @@
|
|||||||
- nginx
|
- nginx
|
||||||
- openssl
|
- openssl
|
||||||
- apache2-utils
|
- apache2-utils
|
||||||
|
- certbot
|
||||||
|
- python3-certbot-nginx
|
||||||
|
state: present
|
||||||
|
|
||||||
|
- name: Установить Python библиотеки для Ansible
|
||||||
|
pip:
|
||||||
|
name:
|
||||||
|
- passlib
|
||||||
|
- bcrypt
|
||||||
state: present
|
state: present
|
||||||
|
|
||||||
- name: Установить часовой пояс Europe/Moscow
|
- name: Установить часовой пояс Europe/Moscow
|
||||||
@@ -278,14 +287,40 @@
|
|||||||
- "{{ project_root }}/infra/nginx"
|
- "{{ project_root }}/infra/nginx"
|
||||||
- "{{ project_root }}/infra/nginx/ssl"
|
- "{{ project_root }}/infra/nginx/ssl"
|
||||||
- "{{ project_root }}/infra/nginx/conf.d"
|
- "{{ project_root }}/infra/nginx/conf.d"
|
||||||
|
- "{{ project_root }}/infra/uptime-kuma"
|
||||||
|
- "{{ project_root }}/infra/alertmanager"
|
||||||
|
- "{{ project_root }}/infra/grafana/dashboards"
|
||||||
|
- "{{ project_root }}/scripts"
|
||||||
|
|
||||||
- name: Сгенерировать самоподписанный SSL сертификат
|
- name: Сгенерировать самоподписанный SSL сертификат (fallback)
|
||||||
command: >
|
command: >
|
||||||
openssl req -x509 -newkey rsa:4096 -keyout {{ project_root }}/infra/nginx/ssl/key.pem
|
openssl req -x509 -newkey rsa:4096 -keyout {{ project_root }}/infra/nginx/ssl/key.pem
|
||||||
-out {{ project_root }}/infra/nginx/ssl/cert.pem -days 365 -nodes
|
-out {{ project_root }}/infra/nginx/ssl/cert.pem -days 365 -nodes
|
||||||
-subj "/CN={{ ansible_host }}/O=Monitoring/C=RU"
|
-subj "/CN={{ ansible_host }}/O=Monitoring/C=RU"
|
||||||
args:
|
args:
|
||||||
creates: "{{ project_root }}/infra/nginx/ssl/cert.pem"
|
creates: "{{ project_root }}/infra/nginx/ssl/cert.pem"
|
||||||
|
when: not use_letsencrypt | default(false)
|
||||||
|
|
||||||
|
- name: Создать директории для Let's Encrypt
|
||||||
|
file:
|
||||||
|
path: "{{ item }}"
|
||||||
|
state: directory
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: '0755'
|
||||||
|
loop:
|
||||||
|
- /etc/letsencrypt
|
||||||
|
- /etc/letsencrypt/live
|
||||||
|
- /etc/letsencrypt/archive
|
||||||
|
- /etc/letsencrypt/renewal
|
||||||
|
when: use_letsencrypt | default(false)
|
||||||
|
|
||||||
|
- name: Настроить cron для автоматического обновления SSL сертификатов
|
||||||
|
cron:
|
||||||
|
name: "SSL Certificate Renewal"
|
||||||
|
job: "0 2 * * 1 /usr/local/bin/ssl-renewal.sh"
|
||||||
|
user: root
|
||||||
|
when: use_letsencrypt | default(false)
|
||||||
|
|
||||||
- name: Установить права на SSL сертификаты
|
- name: Установить права на SSL сертификаты
|
||||||
file:
|
file:
|
||||||
@@ -314,6 +349,7 @@
|
|||||||
group: root
|
group: root
|
||||||
mode: '0644'
|
mode: '0644'
|
||||||
backup: yes
|
backup: yes
|
||||||
|
remote_src: yes
|
||||||
|
|
||||||
- name: Скопировать конфигурации nginx для сервисов
|
- name: Скопировать конфигурации nginx для сервисов
|
||||||
copy:
|
copy:
|
||||||
@@ -323,6 +359,7 @@
|
|||||||
group: root
|
group: root
|
||||||
mode: '0644'
|
mode: '0644'
|
||||||
backup: yes
|
backup: yes
|
||||||
|
remote_src: yes
|
||||||
|
|
||||||
- name: Скопировать SSL сертификаты
|
- name: Скопировать SSL сертификаты
|
||||||
copy:
|
copy:
|
||||||
@@ -332,6 +369,7 @@
|
|||||||
group: root
|
group: root
|
||||||
mode: '0600'
|
mode: '0600'
|
||||||
backup: yes
|
backup: yes
|
||||||
|
remote_src: yes
|
||||||
|
|
||||||
- name: Скопировать htpasswd файл
|
- name: Скопировать htpasswd файл
|
||||||
copy:
|
copy:
|
||||||
@@ -341,6 +379,47 @@
|
|||||||
group: root
|
group: root
|
||||||
mode: '0644'
|
mode: '0644'
|
||||||
backup: yes
|
backup: yes
|
||||||
|
remote_src: yes
|
||||||
|
|
||||||
|
- name: Скопировать конфигурацию Alertmanager
|
||||||
|
copy:
|
||||||
|
src: "{{ project_root }}/infra/alertmanager/alertmanager.yml"
|
||||||
|
dest: "{{ project_root }}/infra/alertmanager/alertmanager.yml"
|
||||||
|
owner: "{{ deploy_user }}"
|
||||||
|
group: "{{ deploy_user }}"
|
||||||
|
mode: '0644'
|
||||||
|
backup: yes
|
||||||
|
remote_src: yes
|
||||||
|
|
||||||
|
- name: Скопировать правила алертов Prometheus
|
||||||
|
copy:
|
||||||
|
src: "{{ project_root }}/infra/prometheus/alert_rules.yml"
|
||||||
|
dest: "{{ project_root }}/infra/prometheus/alert_rules.yml"
|
||||||
|
owner: "{{ deploy_user }}"
|
||||||
|
group: "{{ deploy_user }}"
|
||||||
|
mode: '0644'
|
||||||
|
backup: yes
|
||||||
|
remote_src: yes
|
||||||
|
|
||||||
|
- name: Скопировать дашборды Grafana
|
||||||
|
copy:
|
||||||
|
src: "{{ project_root }}/infra/grafana/dashboards/"
|
||||||
|
dest: "{{ project_root }}/infra/grafana/dashboards/"
|
||||||
|
owner: "{{ deploy_user }}"
|
||||||
|
group: "{{ deploy_user }}"
|
||||||
|
mode: '0644'
|
||||||
|
backup: yes
|
||||||
|
remote_src: yes
|
||||||
|
|
||||||
|
- name: Скопировать скрипт настройки SSL
|
||||||
|
copy:
|
||||||
|
src: "{{ project_root }}/scripts/setup-ssl.sh"
|
||||||
|
dest: /usr/local/bin/setup-ssl.sh
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: '0755'
|
||||||
|
backup: yes
|
||||||
|
remote_src: yes
|
||||||
|
|
||||||
- name: Проверить конфигурацию nginx
|
- name: Проверить конфигурацию nginx
|
||||||
command: nginx -t
|
command: nginx -t
|
||||||
@@ -811,6 +890,20 @@
|
|||||||
timeout: 30
|
timeout: 30
|
||||||
state: started
|
state: started
|
||||||
|
|
||||||
|
- name: Проверить, что порт 3001 (Uptime Kuma) открыт
|
||||||
|
wait_for:
|
||||||
|
port: 3001
|
||||||
|
host: "{{ ansible_host }}"
|
||||||
|
timeout: 30
|
||||||
|
state: started
|
||||||
|
|
||||||
|
- name: Проверить, что порт 9093 (Alertmanager) открыт
|
||||||
|
wait_for:
|
||||||
|
port: 9093
|
||||||
|
host: "{{ ansible_host }}"
|
||||||
|
timeout: 30
|
||||||
|
state: started
|
||||||
|
|
||||||
- name: Проверить доступность Nginx
|
- name: Проверить доступность Nginx
|
||||||
uri:
|
uri:
|
||||||
url: "http://{{ ansible_host }}/nginx-health"
|
url: "http://{{ ansible_host }}/nginx-health"
|
||||||
@@ -849,6 +942,26 @@
|
|||||||
retries: 5
|
retries: 5
|
||||||
delay: 10
|
delay: 10
|
||||||
|
|
||||||
|
- name: Проверить доступность Uptime Kuma через Nginx
|
||||||
|
uri:
|
||||||
|
url: "https://{{ ansible_host }}/status"
|
||||||
|
method: GET
|
||||||
|
status_code: 200
|
||||||
|
validate_certs: no
|
||||||
|
register: uptime_kuma_nginx_health
|
||||||
|
retries: 5
|
||||||
|
delay: 10
|
||||||
|
|
||||||
|
- name: Проверить доступность Alertmanager через Nginx
|
||||||
|
uri:
|
||||||
|
url: "https://{{ ansible_host }}/alertmanager/"
|
||||||
|
method: GET
|
||||||
|
status_code: 200
|
||||||
|
validate_certs: no
|
||||||
|
register: alertmanager_nginx_health
|
||||||
|
retries: 5
|
||||||
|
delay: 10
|
||||||
|
|
||||||
|
|
||||||
- name: Закрыть старый SSH порт 22 в UFW (финальный шаг)
|
- name: Закрыть старый SSH порт 22 в UFW (финальный шаг)
|
||||||
ufw:
|
ufw:
|
||||||
@@ -858,7 +971,7 @@
|
|||||||
|
|
||||||
- name: Проверка запуска ботов завершена — всё работает 🟢
|
- name: Проверка запуска ботов завершена — всё работает 🟢
|
||||||
debug:
|
debug:
|
||||||
msg: "Все сервисы запущены и слушают нужные порты. SSH настроен на порт 15722, Fail2ban активен, параметры безопасности ядра применены. Порт 22 закрыт для безопасности."
|
msg: "Все сервисы запущены и слушают нужные порты. SSH настроен на порт 15722, Fail2ban активен, параметры безопасности ядра применены. Порт 22 закрыт для безопасности. Добавлены: Uptime Kuma (статусная страница), Alertmanager (мониторинг), Let's Encrypt SSL, Grafana дашборды."
|
||||||
|
|
||||||
# handlers для перезагрузки сервисов
|
# handlers для перезагрузки сервисов
|
||||||
handlers:
|
handlers:
|
||||||
|
|||||||
529
infra/grafana/dashboards/bot-monitoring.json
Normal file
529
infra/grafana/dashboards/bot-monitoring.json
Normal file
@@ -0,0 +1,529 @@
|
|||||||
|
{
|
||||||
|
"annotations": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"builtIn": 1,
|
||||||
|
"datasource": "-- Grafana --",
|
||||||
|
"enable": true,
|
||||||
|
"hide": true,
|
||||||
|
"iconColor": "rgba(0, 211, 255, 1)",
|
||||||
|
"name": "Annotations & Alerts",
|
||||||
|
"type": "dashboard"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"editable": true,
|
||||||
|
"gnetId": null,
|
||||||
|
"graphTooltip": 0,
|
||||||
|
"id": null,
|
||||||
|
"links": [],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "reqps"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"id": 1,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(http_requests_total{job=~\"telegram-bot|anon-bot\"}[5m])",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{job}} - {{method}} {{status}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Bot Request Rate",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "s"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"id": 2,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job=~\"telegram-bot|anon-bot\"}[5m]))",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{job}} - 95th percentile",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{job=~\"telegram-bot|anon-bot\"}[5m]))",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{job}} - 50th percentile",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Bot Response Time",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 8
|
||||||
|
},
|
||||||
|
"id": 3,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(http_requests_total{job=~\"telegram-bot|anon-bot\",status=~\"5..\"}[5m]) / rate(http_requests_total{job=~\"telegram-bot|anon-bot\"}[5m]) * 100",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{job}} - Error Rate",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Bot Error Rate",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "bytes"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 8
|
||||||
|
},
|
||||||
|
"id": 4,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "process_resident_memory_bytes{job=~\"telegram-bot|anon-bot\"}",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{job}} - Memory Usage",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Bot Memory Usage",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 16
|
||||||
|
},
|
||||||
|
"id": 5,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "up{job=~\"telegram-bot|anon-bot\"}",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{job}} - Status",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Bot Health Status",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 16
|
||||||
|
},
|
||||||
|
"id": 6,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(process_cpu_seconds_total{job=~\"telegram-bot|anon-bot\"}[5m]) * 100",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{job}} - CPU Usage",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Bot CPU Usage",
|
||||||
|
"type": "timeseries"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"schemaVersion": 27,
|
||||||
|
"style": "dark",
|
||||||
|
"tags": ["bots", "monitoring"],
|
||||||
|
"templating": {
|
||||||
|
"list": []
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"timepicker": {},
|
||||||
|
"timezone": "",
|
||||||
|
"title": "Bot Monitoring Dashboard",
|
||||||
|
"uid": "bot-monitoring",
|
||||||
|
"version": 1
|
||||||
|
}
|
||||||
523
infra/grafana/dashboards/infrastructure-monitoring.json
Normal file
523
infra/grafana/dashboards/infrastructure-monitoring.json
Normal file
@@ -0,0 +1,523 @@
|
|||||||
|
{
|
||||||
|
"annotations": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"builtIn": 1,
|
||||||
|
"datasource": "-- Grafana --",
|
||||||
|
"enable": true,
|
||||||
|
"hide": true,
|
||||||
|
"iconColor": "rgba(0, 211, 255, 1)",
|
||||||
|
"name": "Annotations & Alerts",
|
||||||
|
"type": "dashboard"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"editable": true,
|
||||||
|
"gnetId": null,
|
||||||
|
"graphTooltip": 0,
|
||||||
|
"id": null,
|
||||||
|
"links": [],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"id": 1,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "CPU Usage - {{instance}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "System CPU Usage",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"id": 2,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "Memory Usage - {{instance}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "System Memory Usage",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 8
|
||||||
|
},
|
||||||
|
"id": 3,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "(1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "Disk Usage - {{instance}} {{mountpoint}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Disk Usage",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 8
|
||||||
|
},
|
||||||
|
"id": 4,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "up{job=~\"prometheus|grafana|nginx|alertmanager|uptime-kuma\"}",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{job}} - Status",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Service Health Status",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "reqps"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 16
|
||||||
|
},
|
||||||
|
"id": 5,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(nginx_http_requests_total[5m])",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "Nginx - {{status}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Nginx Request Rate",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "Prometheus",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
},
|
||||||
|
"custom": {
|
||||||
|
"axisLabel": "",
|
||||||
|
"axisPlacement": "auto",
|
||||||
|
"barAlignment": 0,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"gradientMode": "none",
|
||||||
|
"hideFrom": {
|
||||||
|
"legend": false,
|
||||||
|
"tooltip": false,
|
||||||
|
"vis": false
|
||||||
|
},
|
||||||
|
"lineInterpolation": "linear",
|
||||||
|
"lineWidth": 1,
|
||||||
|
"pointSize": 5,
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "linear"
|
||||||
|
},
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": false,
|
||||||
|
"stacking": {
|
||||||
|
"group": "A",
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"thresholdsStyle": {
|
||||||
|
"mode": "off"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "bytes"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 16
|
||||||
|
},
|
||||||
|
"id": 6,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"calcs": [],
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "single"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "container_memory_usage_bytes{name=~\"bots_.*\"}",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{name}} - Memory",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Container Memory Usage",
|
||||||
|
"type": "timeseries"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"schemaVersion": 27,
|
||||||
|
"style": "dark",
|
||||||
|
"tags": ["infrastructure", "monitoring"],
|
||||||
|
"templating": {
|
||||||
|
"list": []
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"timepicker": {},
|
||||||
|
"timezone": "",
|
||||||
|
"title": "Infrastructure Monitoring Dashboard",
|
||||||
|
"uid": "infrastructure-monitoring",
|
||||||
|
"version": 1
|
||||||
|
}
|
||||||
16
infra/grafana/provisioning/dashboards/dashboards.yml
Normal file
16
infra/grafana/provisioning/dashboards/dashboards.yml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Grafana Dashboard Provisioning Configuration
|
||||||
|
# This file configures automatic dashboard import
|
||||||
|
|
||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
providers:
|
||||||
|
- name: 'default'
|
||||||
|
orgId: 1
|
||||||
|
folder: ''
|
||||||
|
type: file
|
||||||
|
disableDeletion: false
|
||||||
|
updateIntervalSeconds: 10
|
||||||
|
allowUiUpdates: true
|
||||||
|
options:
|
||||||
|
path: /etc/grafana/provisioning/dashboards
|
||||||
|
foldersFromFilesStructure: true
|
||||||
@@ -4,5 +4,13 @@ datasources:
|
|||||||
- name: Prometheus
|
- name: Prometheus
|
||||||
type: prometheus
|
type: prometheus
|
||||||
access: proxy
|
access: proxy
|
||||||
url: http://prometheus:9090
|
url: http://prometheus:9090/prometheus
|
||||||
isDefault: true
|
isDefault: true
|
||||||
|
jsonData:
|
||||||
|
httpMethod: POST
|
||||||
|
manageAlerts: true
|
||||||
|
prometheusType: Prometheus
|
||||||
|
prometheusVersion: 2.40.0
|
||||||
|
cacheLevel: 'High'
|
||||||
|
disableRecordingRules: false
|
||||||
|
incrementalQueryOverlapWindow: 10m
|
||||||
|
|||||||
61
infra/nginx/conf.d/alertmanager.conf
Normal file
61
infra/nginx/conf.d/alertmanager.conf
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# Alertmanager Nginx Configuration
|
||||||
|
# Proxies requests to Alertmanager
|
||||||
|
|
||||||
|
# Alertmanager location
|
||||||
|
location /alertmanager/ {
|
||||||
|
# Rate limiting
|
||||||
|
limit_req zone=api burst=10 nodelay;
|
||||||
|
|
||||||
|
# Strip the /alertmanager/ prefix before proxying to the backend
|
||||||
|
rewrite ^/alertmanager/(.*)$ /$1 break;
|
||||||
|
|
||||||
|
# Proxy to Alertmanager
|
||||||
|
proxy_pass http://alertmanager_backend;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# Timeouts
|
||||||
|
proxy_connect_timeout 30s;
|
||||||
|
proxy_send_timeout 30s;
|
||||||
|
proxy_read_timeout 30s;
|
||||||
|
|
||||||
|
# Buffer settings
|
||||||
|
proxy_buffering on;
|
||||||
|
proxy_buffer_size 4k;
|
||||||
|
proxy_buffers 8 4k;
|
||||||
|
|
||||||
|
# Security headers
|
||||||
|
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||||
|
add_header X-Content-Type-Options "nosniff" always;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Alertmanager API
|
||||||
|
location /api/v1/ {
|
||||||
|
# Rate limiting
|
||||||
|
limit_req zone=api burst=20 nodelay;
|
||||||
|
|
||||||
|
# Proxy to Alertmanager
|
||||||
|
proxy_pass http://alertmanager_backend;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# CORS headers
|
||||||
|
add_header Access-Control-Allow-Origin "*" always;
|
||||||
|
add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS" always;
|
||||||
|
add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization" always;
|
||||||
|
|
||||||
|
# Handle preflight requests
|
||||||
|
if ($request_method = 'OPTIONS') {
|
||||||
|
add_header Access-Control-Allow-Origin "*";
|
||||||
|
add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS";
|
||||||
|
add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization";
|
||||||
|
add_header Access-Control-Max-Age 1728000;
|
||||||
|
add_header Content-Type "text/plain; charset=utf-8";
|
||||||
|
add_header Content-Length 0;
|
||||||
|
return 204;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,9 +1,3 @@
|
|||||||
# Grafana reverse proxy configuration
|
|
||||||
upstream grafana_backend {
|
|
||||||
server grafana:3000;
|
|
||||||
keepalive 32;
|
|
||||||
}
|
|
||||||
|
|
||||||
# Grafana proxy configuration
|
# Grafana proxy configuration
|
||||||
location /grafana/ {
|
location /grafana/ {
|
||||||
proxy_pass http://grafana_backend/;
|
proxy_pass http://grafana_backend/;
|
||||||
|
|||||||
@@ -1,12 +1,7 @@
|
|||||||
# Prometheus reverse proxy configuration
|
|
||||||
upstream prometheus_backend {
|
|
||||||
server prometheus:9090;
|
|
||||||
keepalive 32;
|
|
||||||
}
|
|
||||||
|
|
||||||
# Prometheus proxy configuration
|
# Prometheus proxy configuration
|
||||||
location /prometheus/ {
|
location /prometheus/ {
|
||||||
proxy_pass http://prometheus_backend/;
|
proxy_pass http://prometheus_backend/;
|
||||||
|
proxy_redirect / /prometheus/;
|
||||||
proxy_set_header Host $host;
|
proxy_set_header Host $host;
|
||||||
proxy_set_header X-Real-IP $remote_addr;
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
@@ -31,4 +26,4 @@ location /prometheus/-/healthy {
|
|||||||
proxy_pass http://prometheus_backend/-/healthy;
|
proxy_pass http://prometheus_backend/-/healthy;
|
||||||
proxy_set_header Host $host;
|
proxy_set_header Host $host;
|
||||||
access_log off;
|
access_log off;
|
||||||
}
|
}
|
||||||
@@ -1,16 +1,35 @@
|
|||||||
# Status page configuration (for future uptime kuma integration)
|
# Status page configuration (Uptime Kuma integration)
|
||||||
|
|
||||||
# Rate limiting for status page
|
# Rate limiting for status page
|
||||||
location /status {
|
location /status {
|
||||||
# Basic authentication for status page
|
# Rate limiting
|
||||||
auth_basic "Status Page Access";
|
limit_req zone=status burst=5 nodelay;
|
||||||
auth_basic_user_file /etc/nginx/.htpasswd;
|
|
||||||
|
|
||||||
# Placeholder for future uptime kuma integration
|
# Proxy to Uptime Kuma
|
||||||
# For now, show nginx status
|
proxy_pass http://uptime_kuma_backend;
|
||||||
access_log off;
|
proxy_set_header Host $host;
|
||||||
return 200 '{"status": "ok", "nginx": "running", "timestamp": "$time_iso8601"}';
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
add_header Content-Type application/json;
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# WebSocket support
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection "upgrade";
|
||||||
|
|
||||||
|
# Timeouts
|
||||||
|
proxy_connect_timeout 30s;
|
||||||
|
proxy_send_timeout 30s;
|
||||||
|
proxy_read_timeout 30s;
|
||||||
|
|
||||||
|
# Buffer settings
|
||||||
|
proxy_buffering on;
|
||||||
|
proxy_buffer_size 4k;
|
||||||
|
proxy_buffers 8 4k;
|
||||||
|
|
||||||
|
# Security headers
|
||||||
|
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||||
|
add_header X-Content-Type-Options "nosniff" always;
|
||||||
}
|
}
|
||||||
|
|
||||||
# Nginx status stub (for monitoring)
|
# Nginx status stub (for monitoring)
|
||||||
@@ -21,4 +40,4 @@ location /nginx_status {
|
|||||||
allow 172.16.0.0/12; # Docker networks
|
allow 172.16.0.0/12; # Docker networks
|
||||||
allow 192.168.0.0/16; # Private networks
|
allow 192.168.0.0/16; # Private networks
|
||||||
deny all;
|
deny all;
|
||||||
}
|
}
|
||||||
69
infra/nginx/conf.d/uptime-kuma.conf
Normal file
69
infra/nginx/conf.d/uptime-kuma.conf
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
# Uptime Kuma Nginx Configuration
|
||||||
|
# Proxies requests to Uptime Kuma status page
|
||||||
|
|
||||||
|
# Upstream for Uptime Kuma
|
||||||
|
upstream uptime_kuma_backend {
|
||||||
|
server uptime-kuma:3001;
|
||||||
|
keepalive 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Status page location
|
||||||
|
location /status {
|
||||||
|
# Rate limiting
|
||||||
|
limit_req zone=status burst=5 nodelay;
|
||||||
|
|
||||||
|
# Proxy to Uptime Kuma
|
||||||
|
proxy_pass http://uptime_kuma_backend;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# WebSocket support
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection "upgrade";
|
||||||
|
|
||||||
|
# Timeouts
|
||||||
|
proxy_connect_timeout 30s;
|
||||||
|
proxy_send_timeout 30s;
|
||||||
|
proxy_read_timeout 30s;
|
||||||
|
|
||||||
|
# Buffer settings
|
||||||
|
proxy_buffering on;
|
||||||
|
proxy_buffer_size 4k;
|
||||||
|
proxy_buffers 8 4k;
|
||||||
|
|
||||||
|
# Security headers
|
||||||
|
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||||
|
add_header X-Content-Type-Options "nosniff" always;
|
||||||
|
}
|
||||||
|
|
||||||
|
# API endpoints for Uptime Kuma
|
||||||
|
location /api/ {
|
||||||
|
# Rate limiting
|
||||||
|
limit_req zone=api burst=10 nodelay;
|
||||||
|
|
||||||
|
# Proxy to Uptime Kuma
|
||||||
|
proxy_pass http://uptime_kuma_backend;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# CORS headers
|
||||||
|
add_header Access-Control-Allow-Origin "*" always;
|
||||||
|
add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS" always;
|
||||||
|
add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization" always;
|
||||||
|
|
||||||
|
# Handle preflight requests
|
||||||
|
if ($request_method = 'OPTIONS') {
|
||||||
|
add_header Access-Control-Allow-Origin "*";
|
||||||
|
add_header Access-Control-Allow-Methods "GET, POST, PUT, DELETE, OPTIONS";
|
||||||
|
add_header Access-Control-Allow-Headers "DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization";
|
||||||
|
add_header Access-Control-Max-Age 1728000;
|
||||||
|
add_header Content-Type "text/plain; charset=utf-8";
|
||||||
|
add_header Content-Length 0;
|
||||||
|
return 204;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -63,6 +63,27 @@ http {
|
|||||||
ssl_session_cache shared:SSL:10m;
|
ssl_session_cache shared:SSL:10m;
|
||||||
ssl_session_timeout 10m;
|
ssl_session_timeout 10m;
|
||||||
|
|
||||||
|
# Upstream configurations
|
||||||
|
upstream grafana_backend {
|
||||||
|
server grafana:3000;
|
||||||
|
keepalive 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
upstream prometheus_backend {
|
||||||
|
server prometheus:9090;
|
||||||
|
keepalive 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
upstream uptime_kuma_backend {
|
||||||
|
server uptime-kuma:3001;
|
||||||
|
keepalive 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
upstream alertmanager_backend {
|
||||||
|
server alertmanager:9093;
|
||||||
|
keepalive 32;
|
||||||
|
}
|
||||||
|
|
||||||
# Main server block
|
# Main server block
|
||||||
server {
|
server {
|
||||||
listen 80;
|
listen 80;
|
||||||
@@ -74,17 +95,19 @@ http {
|
|||||||
listen 443 ssl http2;
|
listen 443 ssl http2;
|
||||||
server_name _;
|
server_name _;
|
||||||
|
|
||||||
# SSL configuration
|
# SSL configuration (self-signed certificate)
|
||||||
ssl_certificate /etc/nginx/ssl/cert.pem;
|
ssl_certificate /etc/letsencrypt/live/{{SERVER_IP}}/fullchain.pem;
|
||||||
ssl_certificate_key /etc/nginx/ssl/key.pem;
|
ssl_certificate_key /etc/letsencrypt/live/{{SERVER_IP}}/privkey.pem;
|
||||||
|
ssl_protocols TLSv1.2 TLSv1.3;
|
||||||
|
ssl_ciphers ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384;
|
||||||
|
ssl_prefer_server_ciphers off;
|
||||||
|
ssl_session_cache shared:SSL:10m;
|
||||||
|
ssl_session_timeout 10m;
|
||||||
|
|
||||||
# Security headers
|
# Security headers
|
||||||
add_header X-Frame-Options "SAMEORIGIN" always;
|
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||||
add_header X-Content-Type-Options "nosniff" always;
|
add_header X-Content-Type-Options "nosniff" always;
|
||||||
|
|
||||||
# Rate limiting
|
|
||||||
limit_req zone=api burst=20 nodelay;
|
|
||||||
|
|
||||||
# Redirect root to Grafana
|
# Redirect root to Grafana
|
||||||
location = / {
|
location = / {
|
||||||
return 301 /grafana/;
|
return 301 /grafana/;
|
||||||
|
|||||||
27
infra/nginx/ssl/letsencrypt.conf
Normal file
27
infra/nginx/ssl/letsencrypt.conf
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# Let's Encrypt SSL Configuration
|
||||||
|
# This file contains the SSL configuration for Let's Encrypt certificates
|
||||||
|
|
||||||
|
# SSL certificate paths (Let's Encrypt)
|
||||||
|
ssl_certificate /etc/letsencrypt/live/{{DOMAIN}}/fullchain.pem;
|
||||||
|
ssl_certificate_key /etc/letsencrypt/live/{{DOMAIN}}/privkey.pem;
|
||||||
|
|
||||||
|
# SSL Security Configuration
|
||||||
|
ssl_protocols TLSv1.2 TLSv1.3;
|
||||||
|
ssl_ciphers ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384;
|
||||||
|
ssl_prefer_server_ciphers off;
|
||||||
|
ssl_session_cache shared:SSL:10m;
|
||||||
|
ssl_session_timeout 10m;
|
||||||
|
ssl_session_tickets off;
|
||||||
|
|
||||||
|
# OCSP Stapling
|
||||||
|
ssl_stapling on;
|
||||||
|
ssl_stapling_verify on;
|
||||||
|
ssl_trusted_certificate /etc/letsencrypt/live/{{DOMAIN}}/chain.pem;
|
||||||
|
|
||||||
|
# Security Headers
|
||||||
|
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
|
||||||
|
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||||
|
add_header X-Content-Type-Options "nosniff" always;
|
||||||
|
add_header X-XSS-Protection "1; mode=block" always;
|
||||||
|
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
|
||||||
|
add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https:; font-src 'self' data:; connect-src 'self' wss: https:;" always;
|
||||||
253
infra/prometheus/alert_rules.yml
Normal file
253
infra/prometheus/alert_rules.yml
Normal file
@@ -0,0 +1,253 @@
|
|||||||
|
# Prometheus Alert Rules
|
||||||
|
# This file defines alerting rules for monitoring the bot infrastructure
|
||||||
|
|
||||||
|
groups:
|
||||||
|
# Bot Health Monitoring
|
||||||
|
- name: bot_health
|
||||||
|
rules:
|
||||||
|
# Telegram Bot Health
|
||||||
|
- alert: TelegramBotDown
|
||||||
|
expr: up{job="telegram-bot"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: telegram-bot
|
||||||
|
annotations:
|
||||||
|
summary: "Telegram Bot is down"
|
||||||
|
description: "Telegram Bot has been down for more than 1 minute"
|
||||||
|
runbook_url: "https://docs.example.com/runbooks/telegram-bot-down"
|
||||||
|
|
||||||
|
- alert: TelegramBotHighErrorRate
|
||||||
|
expr: rate(http_requests_total{job="telegram-bot",status=~"5.."}[5m]) > 0.1
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: telegram-bot
|
||||||
|
annotations:
|
||||||
|
summary: "Telegram Bot high error rate"
|
||||||
|
description: "Telegram Bot error rate is {{ $value }} errors per second"
|
||||||
|
|
||||||
|
- alert: TelegramBotHighResponseTime
|
||||||
|
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="telegram-bot"}[5m])) > 2
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: telegram-bot
|
||||||
|
annotations:
|
||||||
|
summary: "Telegram Bot high response time"
|
||||||
|
description: "95th percentile response time is {{ $value }} seconds"
|
||||||
|
|
||||||
|
# AnonBot Health
|
||||||
|
- alert: AnonBotDown
|
||||||
|
expr: up{job="anon-bot"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: anon-bot
|
||||||
|
annotations:
|
||||||
|
summary: "AnonBot is down"
|
||||||
|
description: "AnonBot has been down for more than 1 minute"
|
||||||
|
runbook_url: "https://docs.example.com/runbooks/anon-bot-down"
|
||||||
|
|
||||||
|
- alert: AnonBotHighErrorRate
|
||||||
|
expr: rate(http_requests_total{job="anon-bot",status=~"5.."}[5m]) > 0.1
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: anon-bot
|
||||||
|
annotations:
|
||||||
|
summary: "AnonBot high error rate"
|
||||||
|
description: "AnonBot error rate is {{ $value }} errors per second"
|
||||||
|
|
||||||
|
- alert: AnonBotHighResponseTime
|
||||||
|
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="anon-bot"}[5m])) > 2
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: anon-bot
|
||||||
|
annotations:
|
||||||
|
summary: "AnonBot high response time"
|
||||||
|
description: "95th percentile response time is {{ $value }} seconds"
|
||||||
|
|
||||||
|
# Infrastructure Health Monitoring
|
||||||
|
- name: infrastructure_health
|
||||||
|
rules:
|
||||||
|
# Prometheus Health
|
||||||
|
- alert: PrometheusDown
|
||||||
|
expr: up{job="prometheus"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: prometheus
|
||||||
|
annotations:
|
||||||
|
summary: "Prometheus is down"
|
||||||
|
description: "Prometheus has been down for more than 1 minute"
|
||||||
|
|
||||||
|
- alert: PrometheusHighMemoryUsage
|
||||||
|
expr: (prometheus_tsdb_head_series / prometheus_tsdb_head_series_limit) > 0.8
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: prometheus
|
||||||
|
annotations:
|
||||||
|
summary: "Prometheus high memory usage"
|
||||||
|
description: "Prometheus memory usage is {{ $value | humanizePercentage }} of limit"
|
||||||
|
|
||||||
|
# Grafana Health
|
||||||
|
- alert: GrafanaDown
|
||||||
|
expr: up{job="grafana"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: grafana
|
||||||
|
annotations:
|
||||||
|
summary: "Grafana is down"
|
||||||
|
description: "Grafana has been down for more than 1 minute"
|
||||||
|
|
||||||
|
# Nginx Health
|
||||||
|
- alert: NginxDown
|
||||||
|
expr: up{job="nginx"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: nginx
|
||||||
|
annotations:
|
||||||
|
summary: "Nginx is down"
|
||||||
|
description: "Nginx has been down for more than 1 minute"
|
||||||
|
|
||||||
|
- alert: NginxHighErrorRate
|
||||||
|
expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.1
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: nginx
|
||||||
|
annotations:
|
||||||
|
summary: "Nginx high error rate"
|
||||||
|
description: "Nginx error rate is {{ $value }} errors per second"
|
||||||
|
|
||||||
|
# System Resource Monitoring
|
||||||
|
- name: system_resources
|
||||||
|
rules:
|
||||||
|
# High CPU Usage
|
||||||
|
- alert: HighCPUUsage
|
||||||
|
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: system
|
||||||
|
annotations:
|
||||||
|
summary: "High CPU usage"
|
||||||
|
description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
|
||||||
|
|
||||||
|
- alert: VeryHighCPUUsage
|
||||||
|
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: system
|
||||||
|
annotations:
|
||||||
|
summary: "Very high CPU usage"
|
||||||
|
description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
|
||||||
|
|
||||||
|
# High Memory Usage
|
||||||
|
- alert: HighMemoryUsage
|
||||||
|
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: system
|
||||||
|
annotations:
|
||||||
|
summary: "High memory usage"
|
||||||
|
description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"
|
||||||
|
|
||||||
|
- alert: VeryHighMemoryUsage
|
||||||
|
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: system
|
||||||
|
annotations:
|
||||||
|
summary: "Very high memory usage"
|
||||||
|
description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"
|
||||||
|
|
||||||
|
# Disk Space
|
||||||
|
- alert: LowDiskSpace
|
||||||
|
expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 80
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: system
|
||||||
|
annotations:
|
||||||
|
summary: "Low disk space"
|
||||||
|
description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})"
|
||||||
|
|
||||||
|
- alert: VeryLowDiskSpace
|
||||||
|
expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 95
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: system
|
||||||
|
annotations:
|
||||||
|
summary: "Very low disk space"
|
||||||
|
description: "Disk usage is {{ $value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})"
|
||||||
|
|
||||||
|
# Docker Container Monitoring
|
||||||
|
- name: docker_containers
|
||||||
|
rules:
|
||||||
|
# Container Restart
|
||||||
|
- alert: ContainerRestarting
|
||||||
|
expr: rate(container_start_time_seconds[10m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: docker
|
||||||
|
annotations:
|
||||||
|
summary: "Container restarting"
|
||||||
|
description: "Container {{ $labels.name }} is restarting frequently"
|
||||||
|
|
||||||
|
# Container High Memory Usage
|
||||||
|
- alert: ContainerHighMemoryUsage
|
||||||
|
expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) * 100 > 80
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: docker
|
||||||
|
annotations:
|
||||||
|
summary: "Container high memory usage"
|
||||||
|
description: "Container {{ $labels.name }} memory usage is {{ $value }}%"
|
||||||
|
|
||||||
|
# Container High CPU Usage
|
||||||
|
- alert: ContainerHighCPUUsage
|
||||||
|
expr: (rate(container_cpu_usage_seconds_total[5m]) / container_spec_cpu_quota * 100) > 80
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: docker
|
||||||
|
annotations:
|
||||||
|
summary: "Container high CPU usage"
|
||||||
|
description: "Container {{ $labels.name }} CPU usage is {{ $value }}%"
|
||||||
|
|
||||||
|
# Database Monitoring
|
||||||
|
- name: database_health
|
||||||
|
rules:
|
||||||
|
# Database Connection Issues
|
||||||
|
- alert: DatabaseConnectionFailed
|
||||||
|
expr: increase(database_connection_errors_total[5m]) > 5
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: database
|
||||||
|
annotations:
|
||||||
|
summary: "Database connection failures"
|
||||||
|
description: "{{ $value }} database connection failures in the last 5 minutes"
|
||||||
|
|
||||||
|
# Database High Query Time
|
||||||
|
- alert: DatabaseHighQueryTime
|
||||||
|
expr: histogram_quantile(0.95, rate(database_query_duration_seconds_bucket[5m])) > 1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: database
|
||||||
|
annotations:
|
||||||
|
summary: "Database high query time"
|
||||||
|
description: "95th percentile database query time is {{ $value }} seconds"
|
||||||
@@ -3,8 +3,7 @@ global:
|
|||||||
evaluation_interval: 15s
|
evaluation_interval: 15s
|
||||||
|
|
||||||
rule_files:
|
rule_files:
|
||||||
# - "first_rules.yml"
|
- "alert_rules.yml"
|
||||||
# - "second_rules.yml"
|
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
- job_name: 'prometheus'
|
- job_name: 'prometheus'
|
||||||
@@ -46,4 +45,4 @@ alerting:
|
|||||||
alertmanagers:
|
alertmanagers:
|
||||||
- static_configs:
|
- static_configs:
|
||||||
- targets:
|
- targets:
|
||||||
# - alertmanager:9093
|
- alertmanager:9093
|
||||||
|
|||||||
33
infra/uptime-kuma/docker-compose.yml
Normal file
33
infra/uptime-kuma/docker-compose.yml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Uptime Kuma Configuration
|
||||||
|
# This is a separate docker-compose file for Uptime Kuma
|
||||||
|
# It will be included in the main docker-compose.yml
|
||||||
|
|
||||||
|
version: '3.8'
|
||||||
|
|
||||||
|
services:
|
||||||
|
uptime-kuma:
|
||||||
|
image: louislam/uptime-kuma:latest
|
||||||
|
container_name: bots_uptime_kuma
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- uptime_kuma_data:/app/data
|
||||||
|
ports:
|
||||||
|
- "3001:3001"
|
||||||
|
environment:
|
||||||
|
- UPTIME_KUMA_PORT=3001
|
||||||
|
networks:
|
||||||
|
- bots_network
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3001"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 40s
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
uptime_kuma_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
|
networks:
|
||||||
|
bots_network:
|
||||||
|
external: true
|
||||||
163
scripts/setup-ssl.sh
Executable file
163
scripts/setup-ssl.sh
Executable file
@@ -0,0 +1,163 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# SSL Setup Script for Let's Encrypt
|
||||||
|
# This script sets up SSL certificates using Let's Encrypt
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
DOMAIN="${DOMAIN:-localhost}"
|
||||||
|
EMAIL="${EMAIL:-admin@${DOMAIN}}"
|
||||||
|
NGINX_CONTAINER="bots_nginx"
|
||||||
|
CERTBOT_IMAGE="certbot/certbot:latest"
|
||||||
|
|
||||||
|
# Colors for output
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
# Logging function
|
||||||
|
log() {
|
||||||
|
echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
|
||||||
|
}
|
||||||
|
|
||||||
|
warn() {
|
||||||
|
echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}"
|
||||||
|
}
|
||||||
|
|
||||||
|
error() {
|
||||||
|
echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# Check if running as root
|
||||||
|
if [[ $EUID -eq 0 ]]; then
|
||||||
|
error "This script should not be run as root for security reasons"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check if domain is localhost
|
||||||
|
if [[ "$DOMAIN" == "localhost" ]]; then
|
||||||
|
warn "Domain is set to localhost. Let's Encrypt certificates cannot be issued for localhost."
|
||||||
|
warn "Please set the DOMAIN environment variable to your actual domain name."
|
||||||
|
warn "Example: DOMAIN=example.com ./scripts/setup-ssl.sh"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check if Docker is running
|
||||||
|
if ! docker info > /dev/null 2>&1; then
|
||||||
|
error "Docker is not running. Please start Docker and try again."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check if nginx container is running
|
||||||
|
if ! docker ps | grep -q "$NGINX_CONTAINER"; then
|
||||||
|
error "Nginx container ($NGINX_CONTAINER) is not running. Please start it first with 'docker-compose up -d nginx'"
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "Setting up SSL certificates for domain: $DOMAIN"
|
||||||
|
log "Email for Let's Encrypt: $EMAIL"
|
||||||
|
|
||||||
|
# Create necessary directories
|
||||||
|
log "Creating Let's Encrypt directories..."
|
||||||
|
sudo mkdir -p /etc/letsencrypt/live
|
||||||
|
sudo mkdir -p /etc/letsencrypt/archive
|
||||||
|
sudo mkdir -p /etc/letsencrypt/renewal
|
||||||
|
sudo chmod 755 /etc/letsencrypt
|
||||||
|
|
||||||
|
# Stop nginx temporarily for certificate generation
|
||||||
|
log "Stopping nginx container for certificate generation..."
|
||||||
|
docker stop "$NGINX_CONTAINER" || true
|
||||||
|
|
||||||
|
# Generate certificate using certbot
|
||||||
|
log "Generating SSL certificate using Let's Encrypt..."
|
||||||
|
docker run --rm \
|
||||||
|
-v /etc/letsencrypt:/etc/letsencrypt \
|
||||||
|
-v /var/lib/letsencrypt:/var/lib/letsencrypt \
|
||||||
|
-p 80:80 \
|
||||||
|
-p 443:443 \
|
||||||
|
"$CERTBOT_IMAGE" certonly \
|
||||||
|
--standalone \
|
||||||
|
--non-interactive \
|
||||||
|
--agree-tos \
|
||||||
|
--email "$EMAIL" \
|
||||||
|
--domains "$DOMAIN" \
|
||||||
|
--expand
|
||||||
|
|
||||||
|
# Check if certificate was generated successfully
|
||||||
|
if [[ ! -f "/etc/letsencrypt/live/$DOMAIN/fullchain.pem" ]]; then
|
||||||
|
error "Failed to generate SSL certificate for $DOMAIN"
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "SSL certificate generated successfully!"
|
||||||
|
|
||||||
|
# Set proper permissions
|
||||||
|
log "Setting proper permissions for SSL certificates..."
|
||||||
|
sudo chmod 755 /etc/letsencrypt/live
|
||||||
|
sudo chmod 755 /etc/letsencrypt/archive
|
||||||
|
sudo chmod 644 /etc/letsencrypt/live/"$DOMAIN"/*.pem
|
||||||
|
sudo chmod 600 /etc/letsencrypt/live/"$DOMAIN"/privkey.pem
|
||||||
|
|
||||||
|
# Update nginx configuration to use Let's Encrypt certificates
|
||||||
|
log "Updating nginx configuration..."
|
||||||
|
if [[ -f "infra/nginx/ssl/letsencrypt.conf" ]]; then
|
||||||
|
# Replace domain placeholder in letsencrypt.conf
|
||||||
|
sed "s/{{DOMAIN}}/$DOMAIN/g" infra/nginx/ssl/letsencrypt.conf > /tmp/letsencrypt.conf
|
||||||
|
sudo cp /tmp/letsencrypt.conf /etc/letsencrypt/live/"$DOMAIN"/letsencrypt.conf
|
||||||
|
rm /tmp/letsencrypt.conf
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Start nginx container
|
||||||
|
log "Starting nginx container..."
|
||||||
|
docker start "$NGINX_CONTAINER"
|
||||||
|
|
||||||
|
# Wait for nginx to start
|
||||||
|
log "Waiting for nginx to start..."
|
||||||
|
sleep 10
|
||||||
|
|
||||||
|
# Test SSL certificate
|
||||||
|
log "Testing SSL certificate..."
|
||||||
|
if curl -k -s "https://$DOMAIN" > /dev/null; then
|
||||||
|
log "SSL certificate is working correctly!"
|
||||||
|
else
|
||||||
|
warn "SSL certificate test failed. Please check nginx configuration."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Set up automatic renewal
|
||||||
|
log "Setting up automatic certificate renewal..."
|
||||||
|
cat > /tmp/ssl-renewal.sh << EOF
|
||||||
|
#!/bin/bash
|
||||||
|
# SSL Certificate Renewal Script
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
DOMAIN="$DOMAIN"
|
||||||
|
NGINX_CONTAINER="$NGINX_CONTAINER"
|
||||||
|
CERTBOT_IMAGE="$CERTBOT_IMAGE"
|
||||||
|
|
||||||
|
# Renew certificates
|
||||||
|
docker run --rm \\
|
||||||
|
-v /etc/letsencrypt:/etc/letsencrypt \\
|
||||||
|
-v /var/lib/letsencrypt:/var/lib/letsencrypt \\
|
||||||
|
"$CERTBOT_IMAGE" renew --quiet
|
||||||
|
|
||||||
|
# Reload nginx
|
||||||
|
docker exec "\$NGINX_CONTAINER" nginx -s reload
|
||||||
|
|
||||||
|
echo "\$(date): SSL certificates renewed successfully" >> /var/log/ssl-renewal.log
|
||||||
|
EOF
|
||||||
|
|
||||||
|
sudo mv /tmp/ssl-renewal.sh /usr/local/bin/ssl-renewal.sh
|
||||||
|
sudo chmod +x /usr/local/bin/ssl-renewal.sh
|
||||||
|
|
||||||
|
# Add cron job for automatic renewal (every Monday at 2 AM)
|
||||||
|
log "Adding cron job for automatic renewal..."
|
||||||
|
(crontab -l 2>/dev/null; echo "0 2 * * 1 /usr/local/bin/ssl-renewal.sh") | crontab -
|
||||||
|
|
||||||
|
log "SSL setup completed successfully!"
|
||||||
|
log "Certificate location: /etc/letsencrypt/live/$DOMAIN/"
|
||||||
|
log "Automatic renewal is configured to run every Monday at 2 AM"
|
||||||
|
log "You can test the renewal manually with: sudo /usr/local/bin/ssl-renewal.sh"
|
||||||
|
|
||||||
|
# Display certificate information
|
||||||
|
log "Certificate information:"
|
||||||
|
openssl x509 -in "/etc/letsencrypt/live/$DOMAIN/fullchain.pem" -text -noout | grep -E "(Subject:|Not Before|Not After|DNS:)"
|
||||||
Reference in New Issue
Block a user