From b34da5015d7cf670ac67440938642f87d9e49033 Mon Sep 17 00:00:00 2001 From: Andrey Date: Mon, 8 Sep 2025 23:17:24 +0300 Subject: [PATCH] Implement AnonBot integration and monitoring enhancements - Added AnonBot service to docker-compose with resource limits and environment variables. - Updated Makefile to include commands for AnonBot logs, restart, and dependency checks. - Enhanced Grafana dashboards with AnonBot health metrics and database connection statistics. - Implemented AnonBot status retrieval in the message sender for improved monitoring. - Updated Prometheus configuration to scrape metrics from AnonBot service. --- Makefile | 35 +- docker-compose.yml | 55 +- .../anonbot-overview-dashboard.json | 874 +++++++++++++++++ .../anonbot-performance-dashboard.json | 877 ++++++++++++++++++ .../dashboards/server-dashboard.json | 278 ++++++ infra/monitoring/README_PID_MANAGER.md | 188 ---- infra/monitoring/message_sender.py | 41 +- infra/monitoring/metrics_collector.py | 33 +- infra/prometheus/prometheus.yml | 12 + 9 files changed, 2187 insertions(+), 206 deletions(-) create mode 100644 infra/grafana/provisioning/dashboards/anonbot-overview-dashboard.json create mode 100644 infra/grafana/provisioning/dashboards/anonbot-performance-dashboard.json delete mode 100644 infra/monitoring/README_PID_MANAGER.md diff --git a/Makefile b/Makefile index 905c3db..a7c921a 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help build up down logs clean restart status deploy backup restore update clean-monitoring monitoring check-deps check-bot-deps +.PHONY: help build up down logs clean restart status deploy backup restore update clean-monitoring monitoring check-deps check-bot-deps check-anonBot-deps help: ## Показать справку @echo "🏗️ Production Infrastructure - Доступные команды:" @@ -11,6 +11,7 @@ help: ## Показать справку @echo " Grafana: http://localhost:3000 (admin/admin)" @echo " Server Monitor: http://localhost:9091/health" @echo " Bot Health: 
http://localhost:8080/health" + @echo " AnonBot Health: http://localhost:8081/health" build: ## Собрать все контейнеры docker-compose build @@ -36,6 +37,9 @@ logs-grafana: ## Показать логи Grafana logs-bot: ## Показать логи Telegram бота docker-compose logs -f telegram-bot +logs-anonBot: ## Показать логи AnonBot + docker-compose logs -f anon-bot + restart: ## Перезапустить все сервисы docker-compose down docker-compose build --no-cache @@ -53,12 +57,16 @@ restart-grafana: ## Перезапустить только Grafana restart-bot: ## Перезапустить только Telegram бота docker-compose restart telegram-bot +restart-anonBot: ## Перезапустить только AnonBot + docker-compose restart anon-bot + status: ## Показать статус контейнеров docker-compose ps health: ## Проверить здоровье сервисов @echo "🏥 Checking service health..." @curl -f http://localhost:8080/health || echo "❌ Bot health check failed" + @curl -f http://localhost:8081/health || echo "❌ AnonBot health check failed" @curl -f http://localhost:9090/-/healthy || echo "❌ Prometheus health check failed" @curl -f http://localhost:3000/api/health || echo "❌ Grafana health check failed" @curl -f http://localhost:9091/health || echo "❌ Server monitor health check failed" @@ -120,22 +128,26 @@ start: build up ## Собрать и запустить все сервисы @echo "📊 Prometheus: http://localhost:9090" @echo "📈 Grafana: http://localhost:3000 (admin/admin)" @echo "🤖 Bot Health: http://localhost:8080/health" + @echo "🔒 AnonBot Health: http://localhost:8081/health" @echo "📡 Server Monitor: http://localhost:9091/health" @echo "📝 Логи: make logs" stop: down ## Остановить все сервисы @echo "🛑 Все сервисы остановлены" -test: check-deps check-bot-deps ## Запустить все тесты в проекте +test: check-deps check-bot-deps check-anonBot-deps ## Запустить все тесты в проекте @echo "🧪 Запускаю все тесты в проекте..." @echo "📊 Тесты инфраструктуры..." @python3 -m pytest tests/infra/ -q --tb=no @echo "🤖 Тесты Telegram бота..." 
@cd bots/telegram-helper-bot && source .venv/bin/activate && python3 -m pytest tests/ -q --tb=no + @echo "🔒 Тесты AnonBot..." + @cd bots/AnonBot && python3 -m pytest tests/ -q --tb=no @echo "✅ Все тесты завершены!" @echo "📈 Общая статистика:" @echo " - Инфраструктура: $(shell python3 count_tests.py | head -1) тестов" @echo " - Telegram бот: $(shell python3 count_tests.py | head -2 | tail -1) тестов" + @echo " - AnonBot: $(shell python3 count_tests.py | head -3 | tail -1) тестов" @echo " - Всего: $(shell python3 count_tests.py | tail -1) тестов" test-all: ## Запустить все тесты в одном процессе (только для разработчиков) @@ -152,16 +164,23 @@ test-bot: check-bot-deps ## Запустить тесты Telegram бота @echo "🤖 Запускаю тесты Telegram бота..." @cd bots/telegram-helper-bot && source .venv/bin/activate && python3 -m pytest tests/ -v -test-coverage: check-deps check-bot-deps ## Запустить все тесты с отчетом о покрытии +test-anonBot: check-anonBot-deps ## Запустить тесты AnonBot + @echo "🔒 Запускаю тесты AnonBot..." + @cd bots/AnonBot && python3 -m pytest tests/ -v + +test-coverage: check-deps check-bot-deps check-anonBot-deps ## Запустить все тесты с отчетом о покрытии @echo "📊 Запускаю все тесты с отчетом о покрытии..." @echo "📈 Покрытие для инфраструктуры..." @python3 -m pytest tests/infra/ --cov=infra --cov-report=term-missing --cov-report=html:htmlcov/infra @echo "🤖 Покрытие для Telegram бота..." @cd bots/telegram-helper-bot && source .venv/bin/activate && python3 -m pytest tests/ --cov=helper_bot --cov-report=term-missing --cov-report=html:htmlcov/bot + @echo "🔒 Покрытие для AnonBot..." + @cd bots/AnonBot && python3 -m pytest tests/ --cov=. 
--cov-report=term-missing --cov-report=html:htmlcov/anonbot @echo "📊 Отчеты о покрытии сохранены в htmlcov/" @echo "📈 Общая статистика:" @echo " - Инфраструктура: $(shell python3 count_tests.py | head -1) тестов" @echo " - Telegram бот: $(shell python3 count_tests.py | head -2 | tail -1) тестов" + @echo " - AnonBot: $(shell python3 count_tests.py | head -3 | tail -1) тестов" @echo " - Всего: $(shell python3 count_tests.py | tail -1) тестов" test-clean: ## Очистить все файлы тестирования и отчеты @@ -173,6 +192,9 @@ test-clean: ## Очистить все файлы тестирования и о @rm -rf bots/telegram-helper-bot/.pytest_cache/ @rm -rf bots/telegram-helper-bot/htmlcov/ @rm -rf bots/telegram-helper-bot/.coverage + @rm -rf bots/AnonBot/.pytest_cache/ + @rm -rf bots/AnonBot/htmlcov/ + @rm -rf bots/AnonBot/.coverage @find . -name "*.pyc" -delete 2>/dev/null || true @find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true @echo "✅ Файлы тестирования очищены" @@ -187,6 +209,8 @@ check-ports: ## Проверить занятые порты @lsof -i :9091 2>/dev/null || echo " Free" @echo "Port 8080 (Telegram Bot):" @lsof -i :8080 2>/dev/null || echo " Free" + @echo "Port 8081 (AnonBot):" + @lsof -i :8081 2>/dev/null || echo " Free" check-grafana: ## Проверить состояние Grafana @echo "📊 Checking Grafana status..." @@ -202,6 +226,11 @@ check-bot-deps: ## Проверить зависимости Telegram бота @cd bots/telegram-helper-bot && source .venv/bin/activate && python3 -c "import aiogram, aiosqlite, pytest" 2>/dev/null || (echo "❌ Отсутствуют зависимости бота. Установите: cd bots/telegram-helper-bot && source .venv/bin/activate && pip install -r requirements.txt" && exit 1) @echo "✅ Зависимости Telegram бота установлены" +check-anonBot-deps: ## Проверить зависимости AnonBot + @echo "🔍 Проверяю зависимости AnonBot..." + @cd bots/AnonBot && python3 -c "import aiogram, aiosqlite, pytest, loguru, pydantic" 2>/dev/null || (echo "❌ Отсутствуют зависимости AnonBot. 
Установите: cd bots/AnonBot && pip install -r requirements.txt" && exit 1) + @echo "✅ Зависимости AnonBot установлены" + logs-tail: ## Показать последние логи всех сервисов @echo "📝 Recent logs from all services:" @docker-compose logs --tail=50 diff --git a/docker-compose.yml b/docker-compose.yml index cda9395..fd7465e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -130,11 +130,64 @@ services: deploy: resources: limits: - memory: 512M + memory: 256M cpus: '0.5' reservations: + memory: 128M + cpus: '0.25' + + # AnonBot - Anonymous Q&A Bot + anon-bot: + build: + context: ./bots/AnonBot + dockerfile: Dockerfile + container_name: bots_anon_bot + restart: unless-stopped + env_file: + - ./bots/AnonBot/.env + ports: + - "8081:8081" + environment: + - PYTHONPATH=/app + - PYTHONUNBUFFERED=1 + - DOCKER_CONTAINER=true + - LOG_LEVEL=${LOG_LEVEL:-INFO} + # AnonBot settings + - ANON_BOT_TOKEN=${BOT_TOKEN} + - ANON_BOT_ADMINS=${ADMINS} + - ANON_BOT_DATABASE_PATH=/app/database/anon_qna.db + - ANON_BOT_DEBUG=${DEBUG:-false} + - ANON_BOT_MAX_QUESTION_LENGTH=${MAX_QUESTION_LENGTH:-1000} + - ANON_BOT_MAX_ANSWER_LENGTH=${MAX_ANSWER_LENGTH:-2000} + # Rate limiting settings + - RATE_LIMIT_ENV=${RATE_LIMIT_ENV:-production} + - RATE_LIMIT_MESSAGES_PER_SECOND=${RATE_LIMIT_MESSAGES_PER_SECOND:-0.5} + - RATE_LIMIT_BURST_LIMIT=${RATE_LIMIT_BURST_LIMIT:-2} + - RATE_LIMIT_RETRY_MULTIPLIER=${RATE_LIMIT_RETRY_MULTIPLIER:-1.5} + - RATE_LIMIT_MAX_RETRY_DELAY=${RATE_LIMIT_MAX_RETRY_DELAY:-30.0} + - RATE_LIMIT_MAX_RETRIES=${RATE_LIMIT_MAX_RETRIES:-3} + volumes: + - ./bots/AnonBot/database:/app/database:rw + - ./bots/AnonBot/logs:/app/logs:rw + - ./bots/AnonBot/.env:/app/.env:ro + networks: + - bots_network + depends_on: + - prometheus + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8081/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + deploy: + resources: + limits: memory: 256M cpus: '0.25' + reservations: + memory: 128M + cpus: '0.1' volumes: 
prometheus_data: diff --git a/infra/grafana/provisioning/dashboards/anonbot-overview-dashboard.json b/infra/grafana/provisioning/dashboards/anonbot-overview-dashboard.json new file mode 100644 index 0000000..f1d483c --- /dev/null +++ b/infra/grafana/provisioning/dashboards/anonbot-overview-dashboard.json @@ -0,0 +1,874 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "expr": "anon_bot_active_users", + "interval": "", + "legendFormat": "Active Users", + "refId": "A" + } + ], + "title": "Active Users", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, 
+ "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "expr": "anon_bot_active_questions", + "interval": "", + "legendFormat": "Active Questions", + "refId": "A" + } + ], + "title": "Active Questions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "rate(anon_bot_questions_total{status=\"created\"}[5m]) * 60", + "interval": "", + "legendFormat": "Questions Created/min", + "refId": "A" + }, + { + "expr": "rate(anon_bot_questions_total{status=\"processed\"}[5m]) * 60", + "interval": "", + "legendFormat": "Questions Processed/min", + "refId": "B" + }, + { + "expr": "rate(anon_bot_questions_total{status=\"rejected\"}[5m]) * 60", + 
"interval": "", + "legendFormat": "Questions Rejected/min", + "refId": "C" + }, + { + "expr": "rate(anon_bot_questions_total{status=\"deleted\"}[5m]) * 60", + "interval": "", + "legendFormat": "Questions Deleted/min", + "refId": "D" + } + ], + "title": "Questions Flow (per minute)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "rate(anon_bot_answers_total{status=\"sent\"}[5m]) * 60", + "interval": "", + "legendFormat": "Answers Sent/min", + "refId": "A" + }, + { + "expr": "rate(anon_bot_answers_total{status=\"delivered\"}[5m]) * 60", + "interval": "", + "legendFormat": "Answers Delivered/min", + "refId": "B" + }, + { + "expr": "rate(anon_bot_answers_total{status=\"delivery_failed\"}[5m]) * 60", + "interval": "", + "legendFormat": "Delivery Failed/min", + "refId": "C" + } + ], + "title": "Answers Flow (per minute)", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "rate(anon_bot_users_total{action=\"created\"}[5m]) * 60", + "interval": "", + "legendFormat": "New Users/min", + "refId": "A" + }, + { + "expr": "rate(anon_bot_users_total{action=\"updated\"}[5m]) * 60", + "interval": "", + "legendFormat": "Updated Users/min", + "refId": "B" + } + ], + "title": "User Activity (per minute)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, 
+ "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum(rate(anon_bot_messages_total[1d])) by (message_type)", + "interval": "", + "legendFormat": "{{message_type}} (daily)", + "refId": "A" + } + ], + "title": "Daily Trends - Messages", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum(rate(anon_bot_questions_total[1d])) by (status)", + "interval": "", + 
"legendFormat": "{{status}} (daily)", + "refId": "A" + } + ], + "title": "Daily Trends - Questions", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 24 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "expr": "anon_bot_active_users", + "interval": "", + "legendFormat": "Live Active Users", + "refId": "A" + } + ], + "title": "Live Activity - Active Users", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 24 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "expr": "rate(anon_bot_messages_total[1m]) * 60", + "interval": "", + "legendFormat": "Messages/min", + "refId": "A" + } + ], + "title": "Messages per Minute", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "expr": "rate(anon_bot_questions_total[1h]) * 3600", + "interval": "", + "legendFormat": "Questions/hour", + "refId": "A" + } + ], + "title": "Questions per Hour", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 24 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "expr": "rate(anon_bot_answers_total[1m]) * 60", + "interval": "", + "legendFormat": "Answers/min", + "refId": "A" + } + ], + "title": "Answers per Minute", + "type": "stat" + } + ], + "refresh": "5s", + "schemaVersion": 30, + "style": "dark", + "tags": [ + "anonbot", + "overview", + "monitoring" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": 
"", + "title": "AnonBot Overview", + "uid": "anonbot-overview", + "version": 1, + "weekStart": "" +} diff --git a/infra/grafana/provisioning/dashboards/anonbot-performance-dashboard.json b/infra/grafana/provisioning/dashboards/anonbot-performance-dashboard.json new file mode 100644 index 0000000..00af822 --- /dev/null +++ b/infra/grafana/provisioning/dashboards/anonbot-performance-dashboard.json @@ -0,0 +1,877 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, 
rate(anon_bot_message_processing_seconds_bucket[5m]))", + "interval": "", + "legendFormat": "Message Processing 95th percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, rate(anon_bot_question_processing_seconds_bucket[5m]))", + "interval": "", + "legendFormat": "Question Processing 95th percentile", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.95, rate(anon_bot_answer_processing_seconds_bucket[5m]))", + "interval": "", + "legendFormat": "Answer Processing 95th percentile", + "refId": "C" + } + ], + "title": "Response Time - 95th Percentile", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "rate(anon_bot_message_processing_seconds_bucket[5m])", + "interval": "", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Latency Heatmap - Message Processing", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(anon_bot_db_query_duration_seconds_bucket[5m]))", + "interval": "", + "legendFormat": "DB Query 95th percentile - {{operation}}/{{table}}", + "refId": "A" + } + ], + "title": "Database Performance - Query Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": 
"auto" + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "expr": "anon_bot_db_connections_active", + "interval": "", + "legendFormat": "Active DB Connections", + "refId": "A" + } + ], + "title": "Database Connections - Active", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.0", + "targets": [ + { + "expr": "anon_bot_db_pool_utilization_percent", + "interval": "", + "legendFormat": "Pool Utilization %", + "refId": "A" + } + ], + "title": "DB Pool Utilization", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + 
"value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum(rate(anon_bot_messages_total{status=\"success\"}[5m])) / sum(rate(anon_bot_messages_total[5m])) * 100", + "interval": "", + "legendFormat": "Messages Success Rate", + "refId": "A" + }, + { + "expr": "sum(rate(anon_bot_questions_total{status=\"processed\"}[5m])) / sum(rate(anon_bot_questions_total[5m])) * 100", + "interval": "", + "legendFormat": "Questions Success Rate", + "refId": "B" + }, + { + "expr": "sum(rate(anon_bot_answers_total{status=\"sent\"}[5m])) / sum(rate(anon_bot_answers_total[5m])) * 100", + "interval": "", + "legendFormat": "Answers Success Rate", + "refId": "C" + } + ], + "title": "Success/Error Rates", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + 
}, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "rate(anon_bot_errors_total[5m])", + "interval": "", + "legendFormat": "{{component}} - {{error_type}}", + "refId": "A" + } + ], + "title": "Error Rate by Component", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum(rate(anon_bot_errors_total[5m])) by (error_type)", + "interval": "", + "legendFormat": "{{error_type}}", + "refId": "A" + } + ], + "title": "Error Types Distribution", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + 
}, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "rate(anon_bot_db_queries_total{status=\"error\"}[5m])", + "interval": "", + "legendFormat": "{{operation}}/{{table}}", + "refId": "A" + } + ], + "title": "Database Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + 
"mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "rate(anon_bot_pagination_errors_total[5m])", + "interval": "", + "legendFormat": "{{entity_type}} - {{error_type}}", + "refId": "A" + } + ], + "title": "Pagination Errors", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 30, + "style": "dark", + "tags": [ + "anonbot", + "performance", + "monitoring" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Performance AnonBot", + "uid": "anonbot-performance", + "version": 1, + "weekStart": "" +} diff --git a/infra/grafana/provisioning/dashboards/server-dashboard.json b/infra/grafana/provisioning/dashboards/server-dashboard.json index 5ebc127..0ec0790 100644 --- a/infra/grafana/provisioning/dashboards/server-dashboard.json +++ b/infra/grafana/provisioning/dashboards/server-dashboard.json @@ -361,6 +361,284 @@ } }, "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8} + }, + { + "id": 11, + "title": "AnonBot Health Status", + "type": "timeseries", + "targets": [ + { + "expr": "rate(anon_bot_errors_total[5m])", + "legendFormat": "{{component}} - {{error_type}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "short" + } + }, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 
0, "y": 24} + }, + { + "id": 12, + "title": "AnonBot Database Connections", + "type": "timeseries", + "targets": [ + { + "expr": "anon_bot_db_connections_active", + "legendFormat": "Active Connections" + }, + { + "expr": "rate(anon_bot_db_connections_total[5m])", + "legendFormat": "Connections/sec - {{status}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "short" + } + }, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24} + }, + { + "id": 13, + "title": "AnonBot System Health", + "type": "stat", + "targets": [ + { + "expr": "anon_bot_active_users", + "legendFormat": "Active Users" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 10}, + {"color": "red", "value": 50} + ] + }, + "unit": "short" + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "gridPos": {"h": 8, "w": 6, "x": 0, "y": 32} + }, + { + "id": 14, + "title": "AnonBot Active Questions", + "type": "stat", + "targets": [ + { + "expr": "anon_bot_active_questions", + "legendFormat": "Active
Questions" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 20}, + {"color": "red", "value": 100} + ] + }, + "unit": "short" + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "gridPos": {"h": 8, "w": 6, "x": 6, "y": 32} + }, + { + "id": 15, + "title": "AnonBot Message Rate", + "type": "stat", + "targets": [ + { + "expr": "rate(anon_bot_messages_total[1m]) * 60", + "legendFormat": "Messages/min" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 10}, + {"color": "red", "value": 50} + ] + }, + "unit": "short" + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "gridPos": {"h": 8, "w": 6, "x": 12, "y": 32} + }, + { + "id": 16, + "title": "AnonBot Error Rate", + "type": "stat", + "targets": [ + { + "expr": "rate(anon_bot_errors_total[5m])", + "legendFormat": "Errors/sec" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 5} + ] + }, + "unit": "short" + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "gridPos": {"h": 8, "w": 6, "x": 18, "y": 32} } ], "time": { diff --git
a/infra/monitoring/README_PID_MANAGER.md b/infra/monitoring/README_PID_MANAGER.md deleted file mode 100644 index 6d2eace..0000000 --- a/infra/monitoring/README_PID_MANAGER.md +++ /dev/null @@ -1,188 +0,0 @@ -# PID Manager - Управление процессами ботов - -## Описание - -`pid_manager.py` - это общий модуль для управления PID файлами всех ботов в проекте. Он обеспечивает создание, отслеживание и очистку PID файлов для мониторинга состояния процессов. - -## Использование - -### Для новых ботов - -Чтобы добавить PID мониторинг в новый бот, выполните следующие шаги: - -1. **Импортируйте PID менеджер в ваш скрипт запуска:** - -```python -import sys -import os - -# Добавляем путь к инфраструктуре в sys.path -infra_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'infra', 'monitoring') -if infra_path not in sys.path: - sys.path.insert(0, infra_path) - -from pid_manager import get_bot_pid_manager -``` - -2. **Создайте PID менеджер в начале main функции:** - -```python -async def main(): - # Создаем PID менеджер для отслеживания процесса (если доступен) - pid_manager = None - if get_bot_pid_manager: - pid_manager = get_bot_pid_manager("your_bot_name") # Замените на имя вашего бота - if not pid_manager.create_pid_file(): - logger.error("Не удалось создать PID файл, завершаем работу") - return - else: - logger.info("PID менеджер недоступен, запуск без PID файла") - - # Ваш код запуска бота... -``` - -3. **Очистите PID файл при завершении:** - -```python -try: - # Ваш код работы бота... 
-finally: - # Очищаем PID файл (если PID менеджер доступен) - if pid_manager: - pid_manager.cleanup_pid_file() -``` - -### Для мониторинга - -Чтобы добавить новый бот в систему мониторинга: - -```python -from infra.monitoring.metrics_collector import MetricsCollector - -# Создаем экземпляр коллектора метрик -collector = MetricsCollector() - -# Добавляем новый бот в мониторинг -collector.add_bot_to_monitoring("your_bot_name") - -# Теперь можно проверять статус -status, uptime = collector.check_process_status("your_bot_name") -``` - -## Структура файлов - -``` -prod/ -├── infra/ -│ └── monitoring/ -│ ├── pid_manager.py # Основной модуль -│ ├── metrics_collector.py # Мониторинг процессов -│ └── README_PID_MANAGER.md # Эта документация -├── bots/ -│ ├── telegram-helper-bot/ -│ │ └── run_helper.py # Использует PID менеджер -│ └── your-new-bot/ -│ └── run_your_bot.py # Будет использовать PID менеджер -├── helper_bot.pid # PID файл helper_bot -├── your_bot.pid # PID файл вашего бота -└── .gitignore # Содержит *.pid -``` - -## API - -### PIDManager - -- `create_pid_file()` - Создает PID файл -- `cleanup_pid_file()` - Удаляет PID файл -- `is_running()` - Проверяет, запущен ли процесс -- `get_pid()` - Получает PID из файла - -### Функции - -- `get_bot_pid_manager(bot_name)` - Создает PID менеджер для бота -- `create_pid_manager(process_name, project_root)` - Создает PID менеджер с настройками - -## Примеры - -### Простой бот - -```python -import asyncio -from pid_manager import get_bot_pid_manager - -async def main(): - # Создаем PID менеджер - pid_manager = get_bot_pid_manager("simple_bot") - if not pid_manager.create_pid_file(): - print("Не удалось создать PID файл") - return - - try: - # Ваш код бота - print("Бот запущен...") - await asyncio.sleep(3600) # Работаем час - finally: - # Очищаем PID файл - pid_manager.cleanup_pid_file() - -if __name__ == '__main__': - asyncio.run(main()) -``` - -### Бот с обработкой сигналов - -```python -import asyncio -import signal -from 
pid_manager import get_bot_pid_manager - -async def main(): - pid_manager = get_bot_pid_manager("advanced_bot") - if not pid_manager.create_pid_file(): - return - - # Флаг для корректного завершения - shutdown_event = asyncio.Event() - - def signal_handler(signum, frame): - print(f"Получен сигнал {signum}, завершаем работу...") - shutdown_event.set() - - # Регистрируем обработчики сигналов - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - # Ваш код бота - await shutdown_event.wait() - finally: - pid_manager.cleanup_pid_file() - -if __name__ == '__main__': - asyncio.run(main()) -``` - -## Примечания - -- PID файлы создаются в корне проекта -- Все PID файлы автоматически игнорируются Git (см. `.gitignore`) -- PID менеджер автоматически обрабатывает сигналы SIGTERM и SIGINT -- При завершении процесса PID файл автоматически удаляется -- Система мониторинга автоматически находит PID файлы в корне проекта - -## Изолированный запуск - -При запуске бота изолированно (без доступа к основному проекту): - -- PID менеджер автоматически не создается -- Бот запускается без PID файла -- В логах появляется сообщение "PID менеджер недоступен (изолированный запуск), PID файл не создается" -- Это позволяет запускать бота в любой среде без ошибок - -## Автоматическое определение - -Система автоматически определяет доступность PID менеджера: - -1. **В основном проекте**: PID менеджер доступен, создается PID файл для мониторинга -2. **Изолированно**: PID менеджер недоступен, бот работает без PID файла -3. 
**Fallback**: Если PID менеджер недоступен, бот продолжает работать нормально diff --git a/infra/monitoring/message_sender.py b/infra/monitoring/message_sender.py index 27fa026..6669fc0 100644 --- a/infra/monitoring/message_sender.py +++ b/infra/monitoring/message_sender.py @@ -64,6 +64,39 @@ class MessageSender: logger.error(f"Ошибка при отправке сообщения в Telegram: {e}") return False + async def get_anonbot_status(self) -> Tuple[str, str]: + """Получение статуса AnonBot через HTTP API""" + try: + async with aiohttp.ClientSession() as session: + # AnonBot доступен через Docker network + url = "http://bots_anon_bot:8081/status" + + async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as response: + if response.status == 200: + data = await response.json() + status = data.get('status', 'unknown') + uptime = data.get('uptime', 'unknown') + + # Форматируем статус с эмодзи + if status == 'running': + status_emoji = "✅" + elif status == 'stopped': + status_emoji = "❌" + else: + status_emoji = "⚠️" + + return f"{status_emoji}", uptime + else: + logger.warning(f"AnonBot API вернул статус {response.status}") + return "⚠️ AnonBot", "API недоступен" + + except aiohttp.ClientError as e: + logger.warning(f"Ошибка подключения к AnonBot API: {e}") + return "❌", "Недоступен" + except Exception as e: + logger.error(f"Неожиданная ошибка при получении статуса AnonBot: {e}") + return "⚠️", "Ошибка" + def should_send_status(self) -> bool: """Проверка, нужно ли отправить статус (каждые N минут)""" now = datetime.now() @@ -147,11 +180,14 @@ class MessageSender: else: return "🚨" - def get_status_message(self, system_info: Dict) -> str: + async def get_status_message(self, system_info: Dict) -> str: """Формирование сообщения со статусом сервера""" try: helper_bot_status, helper_bot_uptime = self.metrics_collector.check_process_status('helper_bot') + # Получаем статус AnonBot + anonbot_status, anonbot_uptime = await self.get_anonbot_status() + # Получаем эмодзи для всех 
метрик cpu_emoji = self._get_cpu_emoji(system_info['cpu_percent']) ram_emoji = self._get_memory_emoji(system_info['ram_percent']) @@ -183,6 +219,7 @@ Read: {system_info['disk_read_speed']} | Write: {system_info['disk_wri **🤖 Процессы:** {helper_bot_status} helper-bot - {helper_bot_uptime} +{anonbot_status} AnonBot - {anonbot_uptime} --------------------------------- ⏰ Uptime сервера: {system_info['system_uptime']} 🔍 Уровень мониторинга: {level_text} ({monitoring_level})""" @@ -259,7 +296,7 @@ Read: {system_info['disk_read_speed']} | Write: {system_info['disk_wri logger.error("Не удалось получить информацию о системе") return False - status_message = self.get_status_message(system_info) + status_message = await self.get_status_message(system_info) success = await self.send_telegram_message(self.group_for_logs, status_message) # Обновляем время последней отправки только при успешной отправке diff --git a/infra/monitoring/metrics_collector.py b/infra/monitoring/metrics_collector.py index 5805c36..08e201d 100644 --- a/infra/monitoring/metrics_collector.py +++ b/infra/monitoring/metrics_collector.py @@ -590,14 +590,17 @@ class MetricsCollector: alerts.append(('cpu', system_info['cpu_percent'], f"Нагрузка за 1 мин: {system_info['load_avg_1m']}")) logger.warning(f"CPU ALERT: {system_info['cpu_percent']:.1f}% > {self.threshold}% (задержка {self.alert_delays['cpu']}s)") else: - # CPU ниже порога - сбрасываем состояние + # CPU ниже порога - сбрасываем состояние только если был активный алерт if self.alert_states['cpu']: self.alert_states['cpu'] = False recoveries.append(('cpu', system_info['cpu_percent'])) logger.info(f"CPU RECOVERY: {system_info['cpu_percent']:.1f}% < {self.recovery_threshold}%") - - # Сбрасываем время начала превышения - self.alert_start_times['cpu'] = None + # Сбрасываем время начала превышения только после отправки алерта + self.alert_start_times['cpu'] = None + elif system_info['cpu_percent'] < self.recovery_threshold and self.alert_start_times['cpu'] 
is not None: + # Если CPU опустился значительно ниже порога, сбрасываем время начала превышения + logger.debug(f"CPU значительно ниже порога {self.recovery_threshold}%: {system_info['cpu_percent']:.1f}% - сбрасываем время начала превышения") + self.alert_start_times['cpu'] = None # Проверка RAM с задержкой if system_info['ram_percent'] > self.threshold: @@ -613,14 +616,17 @@ class MetricsCollector: alerts.append(('ram', system_info['ram_percent'], f"Используется: {system_info['ram_used']} GB из {system_info['ram_total']} GB")) logger.warning(f"RAM ALERT: {system_info['ram_percent']:.1f}% > {self.threshold}% (задержка {self.alert_delays['ram']}s)") else: - # RAM ниже порога - сбрасываем состояние + # RAM ниже порога - сбрасываем состояние только если был активный алерт if self.alert_states['ram']: self.alert_states['ram'] = False recoveries.append(('ram', system_info['ram_percent'])) logger.info(f"RAM RECOVERY: {system_info['ram_percent']:.1f}% < {self.recovery_threshold}%") - - # Сбрасываем время начала превышения - self.alert_start_times['ram'] = None + # Сбрасываем время начала превышения только после отправки алерта + self.alert_start_times['ram'] = None + elif system_info['ram_percent'] < self.recovery_threshold and self.alert_start_times['ram'] is not None: + # Если RAM опустился значительно ниже порога, сбрасываем время начала превышения + logger.debug(f"RAM значительно ниже порога {self.recovery_threshold}%: {system_info['ram_percent']:.1f}% - сбрасываем время начала превышения") + self.alert_start_times['ram'] = None # Проверка диска с задержкой if system_info['disk_percent'] > self.threshold: @@ -636,14 +642,17 @@ class MetricsCollector: alerts.append(('disk', system_info['disk_percent'], f"Свободно: {system_info['disk_free']} GB на /")) logger.warning(f"DISK ALERT: {system_info['disk_percent']:.1f}% > {self.threshold}% (задержка {self.alert_delays['disk']}s)") else: - # Диск ниже порога - сбрасываем состояние + # Диск ниже порога - сбрасываем состояние 
только если был активный алерт if self.alert_states['disk']: self.alert_states['disk'] = False recoveries.append(('disk', system_info['disk_percent'])) logger.info(f"DISK RECOVERY: {system_info['disk_percent']:.1f}% < {self.recovery_threshold}%") - - # Сбрасываем время начала превышения - self.alert_start_times['disk'] = None + # Сбрасываем время начала превышения только после отправки алерта + self.alert_start_times['disk'] = None + elif system_info['disk_percent'] < self.recovery_threshold and self.alert_start_times['disk'] is not None: + # Если диск опустился значительно ниже порога, сбрасываем время начала превышения + logger.debug(f"Disk значительно ниже порога {self.recovery_threshold}%: {system_info['disk_percent']:.1f}% - сбрасываем время начала превышения") + self.alert_start_times['disk'] = None return alerts, recoveries diff --git a/infra/prometheus/prometheus.yml b/infra/prometheus/prometheus.yml index 5952a1c..fd2ee72 100644 --- a/infra/prometheus/prometheus.yml +++ b/infra/prometheus/prometheus.yml @@ -32,6 +32,18 @@ scrape_configs: scrape_timeout: 10s honor_labels: true + - job_name: 'anon-bot' + static_configs: + - targets: ['bots_anon_bot:8081'] # AnonBot на порту 8081 + labels: + bot_name: 'anon-bot' + environment: 'production' + service: 'anon-bot' + metrics_path: '/metrics' + scrape_interval: 15s + scrape_timeout: 10s + honor_labels: true + alerting: alertmanagers: - static_configs: