commit 7378179d98ac0c454cdcf85feb0f5986c7a50e59 Author: Andrey Date: Sun Aug 31 17:55:55 2025 +0300 Initial commit: Add infrastructure and bot project diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1c0d305 --- /dev/null +++ b/.gitignore @@ -0,0 +1,63 @@ +# Environment files +.env +.env.local +.env.*.local + +# Logs +logs/ +*.log + +# Docker volumes +prometheus_data/ +grafana_data/ + +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# IDE and editor files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Python cache (if any Python scripts are added later) +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.so +.pytest_cache/ +.coverage +htmlcov/ +.tox/ +.cache/ +.mypy_cache/ + +# Virtual environments +.venv/ +venv/ +env/ +ENV/ +env.bak/ +venv.bak/ + +# Temporary files +*.tmp +*.temp +*.pid + +# Node modules (if any Node.js tools are added later) +node_modules/ + +# Build artifacts +*.tar.gz +dist/ +build/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..4dd7590 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.9-slim + +# Установка системных зависимостей +RUN apt-get update && apt-get install -y \ + procps \ + && rm -rf /var/lib/apt/lists/* + +# Установка рабочей директории +WORKDIR /app + +# Копирование файлов зависимостей +COPY requirements.txt . + +# Установка Python зависимостей +RUN pip install --no-cache-dir -r requirements.txt + +# Копирование исходного кода +COPY . . + +# Создание пользователя для безопасности +RUN groupadd -g 1000 monitor && \ + useradd -m -u 1000 -g monitor monitor && \ + chown -R 1000:1000 /app +USER 1000 + +# Команда по умолчанию для запуска мониторинга +CMD ["python", "infra/monitoring/main.py"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..9b79bf5 --- /dev/null +++ b/README.md @@ -0,0 +1,133 @@ +# Production Environment + +Проект для управления ботами и мониторинга инфраструктуры. 
+ +## Структура проекта + +``` +prod/ +├── bots/ # Боты и их конфигурации +├── infra/ # Инфраструктура +│ ├── grafana/ # Дашборды Grafana +│ ├── monitoring/ # Модуль мониторинга сервера +│ └── prometheus/ # Конфигурация Prometheus +├── scripts/ # Скрипты развертывания +├── docker-compose.yml # Docker Compose конфигурация +├── env.template # Шаблон переменных окружения +└── README.md # Этот файл +``` + +## 🚀 Быстрый запуск + +### 1. Настройка переменных окружения + +Скопируйте шаблон и настройте переменные: + +```bash +cp env.template .env +``` + +Отредактируйте `.env` файл, добавив реальные значения: + +```env +# Telegram Bot Configuration +TELEGRAM_MONITORING_BOT_TOKEN=your_bot_token_here +GROUP_MONITORING_FOR_LOGS=your_telegram_group_id_here +IMPORTANT_MONITORING_LOGS=your_important_logs_channel_id_here + +# Monitoring Configuration +THRESHOLD=80.0 +RECOVERY_THRESHOLD=75.0 + +# Prometheus Configuration +PROMETHEUS_RETENTION_DAYS=30 + +# Grafana Configuration +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=admin +``` + +### 2. Запуск всех сервисов + +```bash +docker-compose up -d +``` + +### 3. 
Проверка статуса + +```bash +docker-compose ps +``` + +## 📊 Сервисы + +- **Prometheus** (порт 9090) - сбор метрик +- **Grafana** (порт 3000) - дашборды +- **Server Monitor** - мониторинг системы + Telegram уведомления + +## 🌐 Доступные адреса + +| Сервис | Адрес | Описание | +|--------|-------|----------| +| **Grafana** | http://localhost:3000 | Дашборды мониторинга (admin/admin) | +| **Prometheus** | http://localhost:9090 | API метрик и веб-интерфейс | +| **Метрики сервера** | http://localhost:9091/metrics | Endpoint для Prometheus | +| **Health check** | http://localhost:9091/health | Проверка состояния мониторинга | + +## 🔧 Модуль мониторинга + +Модуль автоматически: +- Собирает метрики CPU, RAM, диска каждые 30 секунд +- Отправляет статусы каждые 30 минут в Telegram +- Отправляет алерты при превышении пороговых значений +- Интегрирован с Prometheus/Grafana + +### 📈 Собираемые метрики + +- **CPU**: использование, load average (1m, 5m, 15m) +- **RAM**: использование оперативной памяти +- **Disk**: использование диска, I/O активность +- **Swap**: использование swap +- **System**: uptime системы и мониторинга + +## 📝 Логи + +```bash +# Все сервисы +docker-compose logs + +# Только мониторинг +docker-compose logs -f server_monitor + +# Prometheus +docker logs bots_prometheus + +# Grafana +docker logs bots_grafana +``` + +## 🔍 Проверка статуса + +### Автоматическая проверка +```bash +cd infra/monitoring +python3 check_grafana.py +``` + +### Ручная проверка +```bash +# Проверка метрик +curl http://localhost:9091/metrics + +# Проверка Prometheus targets +curl http://localhost:9090/api/v1/targets + +# Проверка Grafana +curl http://localhost:3000/api/health +``` + +## 🛑 Остановка + +```bash +docker-compose down +``` diff --git a/bots/.gitkeep b/bots/.gitkeep new file mode 100644 index 0000000..9d5dfc1 --- /dev/null +++ b/bots/.gitkeep @@ -0,0 +1 @@ +# This file ensures the bots directory is tracked by git diff --git a/docker-compose.yml b/docker-compose.yml new file mode 
100644 index 0000000..0147f63 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,145 @@ + +services: + # Prometheus Monitoring + prometheus: + image: prom/prometheus:latest + container_name: bots_prometheus + restart: unless-stopped + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_DAYS:-30}d' + - '--web.enable-lifecycle' + ports: + - "9090:9090" + volumes: + - ./infra/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data:/prometheus + networks: + - bots_network + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + + # Grafana Dashboard + grafana: + image: grafana/grafana:latest + container_name: bots_grafana + restart: unless-stopped + environment: + - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} + - GF_USERS_ALLOW_SIGN_UP=false + - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource + ports: + - "3000:3000" + volumes: + - grafana_data:/var/lib/grafana + - ./infra/grafana/provisioning:/etc/grafana/provisioning:ro + networks: + - bots_network + depends_on: + - prometheus + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + + # Server Monitoring Service + server_monitor: + build: . 
+ container_name: bots_server_monitor + restart: unless-stopped + environment: + - TELEGRAM_BOT_TOKEN=${TELEGRAM_MONITORING_BOT_TOKEN} + - GROUP_FOR_LOGS=${GROUP_MONITORING_FOR_LOGS} + - IMPORTANT_LOGS=${IMPORTANT_MONITORING_LOGS} + - THRESHOLD=${THRESHOLD:-80.0} + - RECOVERY_THRESHOLD=${RECOVERY_THRESHOLD:-75.0} + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /var/run:/host/var/run:ro + networks: + - bots_network + depends_on: + - prometheus + healthcheck: + test: ["CMD-SHELL", "ps aux | grep python | grep server_monitor || exit 1"] + interval: 60s + timeout: 10s + retries: 3 + + # Telegram Helper Bot + telegram-bot: + build: + context: ./bots/telegram-helper-bot + dockerfile: Dockerfile.bot + container_name: bots_telegram_bot + restart: unless-stopped + ports: + - "8080:8080" + environment: + - PYTHONPATH=/app + - DOCKER_CONTAINER=true + - LOG_LEVEL=${LOG_LEVEL:-INFO} + - LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-30} + - METRICS_HOST=${METRICS_HOST:-0.0.0.0} + - METRICS_PORT=${METRICS_PORT:-8080} + # Telegram settings + - TELEGRAM_BOT_TOKEN=${BOT_TOKEN} + - TELEGRAM_LISTEN_BOT_TOKEN=${LISTEN_BOT_TOKEN} + - TELEGRAM_TEST_BOT_TOKEN=${TEST_BOT_TOKEN} + - TELEGRAM_PREVIEW_LINK=${PREVIEW_LINK:-false} + - TELEGRAM_MAIN_PUBLIC=${MAIN_PUBLIC} + - TELEGRAM_GROUP_FOR_POSTS=${GROUP_FOR_POSTS} + - TELEGRAM_GROUP_FOR_MESSAGE=${GROUP_FOR_MESSAGE} + - TELEGRAM_GROUP_FOR_LOGS=${GROUP_FOR_LOGS} + - TELEGRAM_IMPORTANT_LOGS=${IMPORTANT_LOGS} + - TELEGRAM_ARCHIVE=${ARCHIVE} + - TELEGRAM_TEST_GROUP=${TEST_GROUP} + # Bot settings + - SETTINGS_LOGS=${LOGS:-false} + - SETTINGS_TEST=${TEST:-false} + # Database + - DATABASE_PATH=${DATABASE_PATH:-database/tg-bot-database.db} + volumes: + - ./bots/telegram-helper-bot/database:/app/database:rw + - ./bots/telegram-helper-bot/logs:/app/logs:rw + - ./bots/telegram-helper-bot/.env:/app/.env:ro + networks: + - bots_network + depends_on: + - prometheus + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8080/health"] + 
interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + deploy: + resources: + limits: + memory: 512M + cpus: '0.5' + reservations: + memory: 256M + cpus: '0.25' + +volumes: + prometheus_data: + driver: local + grafana_data: + driver: local + +networks: + bots_network: + driver: bridge + ipam: + config: + - subnet: 192.168.100.0/24 diff --git a/env.template b/env.template new file mode 100644 index 0000000..b307c7b --- /dev/null +++ b/env.template @@ -0,0 +1,15 @@ +# Telegram Bot Configuration +TELEGRAM_MONITORING_BOT_TOKEN=your_bot_token_here +GROUP_MONITORING_FOR_LOGS=your_telegram_group_id_here +IMPORTANT_MONITORING_LOGS=your_important_logs_channel_id_here + +# Monitoring Configuration +THRESHOLD=80.0 +RECOVERY_THRESHOLD=75.0 + +# Prometheus Configuration +PROMETHEUS_RETENTION_DAYS=30 + +# Grafana Configuration +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=admin diff --git a/infra/grafana/provisioning/dashboards/all-dashboards.yml b/infra/grafana/provisioning/dashboards/all-dashboards.yml new file mode 100644 index 0000000..9c41a99 --- /dev/null +++ b/infra/grafana/provisioning/dashboards/all-dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'Infrastructure Dashboards' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards diff --git a/infra/grafana/provisioning/dashboards/server-dashboard.json b/infra/grafana/provisioning/dashboards/server-dashboard.json new file mode 100644 index 0000000..62ce6e4 --- /dev/null +++ b/infra/grafana/provisioning/dashboards/server-dashboard.json @@ -0,0 +1,224 @@ +{ + "id": null, + "title": "Server Monitoring", + "tags": ["monitoring", "server"], + "style": "dark", + "timezone": "browser", + "panels": [ + { + "id": 1, + "title": "CPU Usage", + "type": "stat", + "targets": [ + { + "expr": "cpu_usage_percent", + "legendFormat": "CPU %" + } + ], + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 90} + ] + }, + "unit": "percent" + } + }, + "gridPos": {"h": 8, "w": 6, "x": 0, "y": 0} + }, + { + "id": 2, + "title": "RAM Usage", + "type": "stat", + "targets": [ + { + "expr": "ram_usage_percent", + "legendFormat": "RAM %" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 90} + ] + }, + "unit": "percent" + } + }, + "gridPos": {"h": 8, "w": 6, "x": 6, "y": 0} + }, + { + "id": 3, + "title": "Disk Usage", + "type": "stat", + "targets": [ + { + "expr": "disk_usage_percent", + "legendFormat": "Disk %" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 80}, + {"color": "red", "value": 95} + ] + }, + "unit": "percent" + } + }, + "gridPos": {"h": 8, "w": 6, "x": 12, "y": 0} + }, + { + "id": 4, + "title": "Load Average", + "type": "timeseries", + "targets": [ + { + "expr": "load_average_1m", + "legendFormat": "1m" + }, + { + "expr": "load_average_5m", + "legendFormat": "5m" + }, + { + "expr": "load_average_15m", + "legendFormat": "15m" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + 
} + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8} + }, + { + "id": 5, + "title": "System Uptime", + "type": "stat", + "targets": [ + { + "expr": "system_uptime_seconds", + "legendFormat": "Uptime" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "unit": "s" + } + }, + "gridPos": {"h": 8, "w": 6, "x": 12, "y": 8} + }, + { + "id": 6, + "title": "Disk I/O Usage", + "type": "stat", + "targets": [ + { + "expr": "disk_io_percent", + "legendFormat": "Disk I/O %" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 50}, + {"color": "red", "value": 80} + ] + }, + "unit": "percent" + } + }, + "gridPos": {"h": 8, "w": 6, "x": 0, "y": 16} + }, + { + "id": 7, + "title": "Swap Usage", + "type": "stat", + "targets": [ + { + "expr": "swap_usage_percent", + "legendFormat": "Swap %" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 50}, + {"color": "red", "value": 80} + ] + }, + "unit": "percent" + } + }, + "gridPos": {"h": 8, "w": 6, "x": 6, "y": 16} + } + ], + "time": { + "from": "now-1h", + "to": "now" + }, + "refresh": "30s" +} diff --git a/infra/grafana/provisioning/dashboards/telegram-bot-dashboards.json b/infra/grafana/provisioning/dashboards/telegram-bot-dashboards.json new file mode 100644 index 0000000..f6f6e18 --- /dev/null +++ b/infra/grafana/provisioning/dashboards/telegram-bot-dashboards.json @@ -0,0 +1,1012 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], 
+ "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(rate(bot_commands_total[5m]))", + "refId": "A" + } + ], + "title": "Commands per Second", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": 
"off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "histogram_quantile(0.95, rate(method_duration_seconds_bucket[5m]))", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "histogram_quantile(0.99, rate(method_duration_seconds_bucket[5m]))", + "refId": "B" + } + ], + "title": "Method Response Time (P95, P99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + 
}, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(rate(errors_total[5m]))", + "refId": "A" + } + ], + "title": "Errors per Second", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(active_users)", + "refId": "A" + } + ], + "title": "Active Users", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "histogram_quantile(0.95, rate(db_query_duration_seconds_bucket[5m]))", + "refId": "A" + } + ], + "title": "Database Query Time (P95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, 
+ "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(rate(messages_processed_total[5m]))", + "refId": "A" + } + ], + "title": "Messages Processed per Second", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum by(query_type) (rate(db_queries_total[5m]))", + "refId": "A" + } + ], + "title": "Database Queries by Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "rate(db_errors_total[5m])", + "refId": "A" + } + ], + "title": "Database Errors per Second", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": 
"none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum by(command) (rate(bot_commands_total[5m]))", + "refId": "A" + } + ], + "title": "Commands by Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum by(status) (rate(bot_commands_total[5m]))", + "refId": "A" + } + ], + "title": "Commands by Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "topk(5, sum by(command) (rate(bot_commands_total[5m])))", + "refId": "A" + } + ], + "title": "Top Commands", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "telegram", + "bot", + "monitoring" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Telegram Bot Dashboard", + "uid": "telegram-bot", + "version": 1, + "weekStart": "" +} diff --git a/infra/grafana/provisioning/datasources/prometheus.yml b/infra/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 0000000..86fd346 --- /dev/null +++ b/infra/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,8 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true diff --git a/infra/monitoring/__init__.py b/infra/monitoring/__init__.py new file mode 100644 index 0000000..62a9270 --- /dev/null +++ b/infra/monitoring/__init__.py @@ -0,0 +1,7 @@ +# Infrastructure Monitoring Module + +from .metrics_collector import MetricsCollector +from .message_sender import MessageSender 
+from .server_monitor import ServerMonitor + +__all__ = ['MetricsCollector', 'MessageSender', 'ServerMonitor'] diff --git a/infra/monitoring/check_grafana.py b/infra/monitoring/check_grafana.py new file mode 100644 index 0000000..452c248 --- /dev/null +++ b/infra/monitoring/check_grafana.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +""" +Скрипт для проверки статуса Grafana и дашбордов +""" + +import requests +import json +import sys +from datetime import datetime + +def check_grafana_status(): + """Проверка статуса Grafana""" + try: + response = requests.get("http://localhost:3000/api/health", timeout=5) + if response.status_code == 200: + data = response.json() + print(f"✅ Grafana работает (версия: {data.get('version', 'unknown')})") + return True + else: + print(f"❌ Grafana: HTTP {response.status_code}") + return False + except Exception as e: + print(f"❌ Grafana: ошибка подключения - {e}") + return False + +def check_prometheus_connection(): + """Проверка подключения Grafana к Prometheus""" + try: + # Проверяем, что Prometheus доступен + response = requests.get("http://localhost:9090/api/v1/targets", timeout=5) + if response.status_code == 200: + print("✅ Prometheus доступен для Grafana") + return True + else: + print(f"❌ Prometheus: HTTP {response.status_code}") + return False + except Exception as e: + print(f"❌ Prometheus: ошибка подключения - {e}") + return False + +def check_metrics_availability(): + """Проверка доступности метрик""" + try: + response = requests.get("http://localhost:9091/metrics", timeout=5) + if response.status_code == 200: + content = response.text + if "cpu_usage_percent" in content and "ram_usage_percent" in content: + print("✅ Метрики доступны и содержат данные") + return True + else: + print("⚠️ Метрики доступны, но данные неполные") + return False + else: + print(f"❌ Метрики: HTTP {response.status_code}") + return False + except Exception as e: + print(f"❌ Метрики: ошибка подключения - {e}") + return False + +def 
check_prometheus_targets(): + """Проверка статуса targets в Prometheus""" + try: + response = requests.get("http://localhost:9090/api/v1/targets", timeout=5) + if response.status_code == 200: + data = response.json() + targets = data.get('data', {}).get('activeTargets', []) + + print("\n📊 Статус targets в Prometheus:") + for target in targets: + job = target.get('labels', {}).get('job', 'unknown') + instance = target.get('labels', {}).get('instance', 'unknown') + health = target.get('health', 'unknown') + last_error = target.get('lastError', '') + + status_emoji = "✅" if health == "up" else "❌" + print(f" {status_emoji} {job} ({instance}): {health}") + + if last_error: + print(f" Ошибка: {last_error}") + + return True + else: + print(f"❌ Prometheus API: HTTP {response.status_code}") + return False + except Exception as e: + print(f"❌ Prometheus API: ошибка подключения - {e}") + return False + +def main(): + """Основная функция проверки""" + print(f"🔍 Проверка Grafana и системы мониторинга - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print("=" * 70) + + # Проверяем все компоненты + all_ok = True + + if not check_grafana_status(): + all_ok = False + + if not check_prometheus_connection(): + all_ok = False + + if not check_metrics_availability(): + all_ok = False + + if not check_prometheus_targets(): + all_ok = False + + print("\n" + "=" * 70) + if all_ok: + print("🎉 Все компоненты работают корректно!") + print("\n📋 Доступные адреса:") + print(" • Grafana: http://localhost:3000 (admin/admin)") + print(" • Prometheus: http://localhost:9090") + print(" • Метрики: http://localhost:9091/metrics") + print("\n📊 Дашборды должны быть доступны в Grafana:") + print(" • Server Monitoring") + print(" • Server Monitoring Dashboard") + print("\n💡 Если дашборды не видны, используйте ручную настройку:") + print(" • См. 
файл: GRAFANA_MANUAL_SETUP.md") + else: + print("⚠️ Обнаружены проблемы в системе мониторинга") + print(" Проверьте логи и настройки") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/infra/monitoring/main.py b/infra/monitoring/main.py new file mode 100644 index 0000000..afb7ebc --- /dev/null +++ b/infra/monitoring/main.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +""" +Основной скрипт для запуска модуля мониторинга сервера +""" + +import asyncio +import logging +import os +import sys + +# Добавляем корневую папку проекта в путь +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + +from dotenv import load_dotenv +from infra.monitoring.server_monitor import ServerMonitor + +# Загружаем переменные окружения из .env файла +load_dotenv() + +# Настройка логирования +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +logger = logging.getLogger(__name__) + + +async def main(): + """Основная функция запуска мониторинга""" + try: + # Создаем экземпляр мониторинга + monitor = ServerMonitor() + + # Отправляем статус при запуске + await monitor.send_startup_status() + + # Запускаем основной цикл мониторинга + await monitor.monitor_loop() + + except KeyboardInterrupt: + logger.info("Мониторинг остановлен пользователем") + except Exception as e: + logger.error(f"Критическая ошибка в мониторинге: {e}") + raise + + +if __name__ == "__main__": + # Запускаем асинхронную функцию + asyncio.run(main()) diff --git a/infra/monitoring/message_sender.py b/infra/monitoring/message_sender.py new file mode 100644 index 0000000..a622855 --- /dev/null +++ b/infra/monitoring/message_sender.py @@ -0,0 +1,252 @@ +import os +import aiohttp +import logging +from datetime import datetime +from typing import Dict, List, Tuple +try: + from .metrics_collector import MetricsCollector +except ImportError: + from metrics_collector import MetricsCollector + +logger = logging.getLogger(__name__) + + +class 
MessageSender: + def __init__(self): + # Получаем переменные окружения + self.telegram_bot_token = os.getenv('TELEGRAM_MONITORING_BOT_TOKEN') + self.group_for_logs = os.getenv('GROUP_MONITORING_FOR_LOGS') + self.important_logs = os.getenv('IMPORTANT_MONITORING_LOGS') + + # Создаем экземпляр сборщика метрик + self.metrics_collector = MetricsCollector() + + # Время последней отправки статуса + self.last_status_time = None + + if not self.telegram_bot_token: + logger.warning("TELEGRAM_MONITORING_BOT_TOKEN не установлен в переменных окружения") + if not self.group_for_logs: + logger.warning("GROUP_MONITORING_FOR_LOGS не установлен в переменных окружения") + if not self.important_logs: + logger.warning("IMPORTANT_MONITORING_LOGS не установлен в переменных окружения") + + async def send_telegram_message(self, chat_id: str, message: str) -> bool: + """Отправка сообщения в Telegram через прямое обращение к API""" + if not self.telegram_bot_token: + logger.error("TELEGRAM_MONITORING_BOT_TOKEN не установлен") + return False + + try: + async with aiohttp.ClientSession() as session: + url = f"https://api.telegram.org/bot{self.telegram_bot_token}/sendMessage" + payload = { + "chat_id": chat_id, + "text": message, + "parse_mode": "HTML" + } + + async with session.post(url, json=payload) as response: + if response.status == 200: + logger.info(f"Сообщение успешно отправлено в чат {chat_id}") + return True + else: + response_text = await response.text() + logger.error(f"Ошибка отправки в Telegram: {response.status} - {response_text}") + return False + + except Exception as e: + logger.error(f"Ошибка при отправке сообщения в Telegram: {e}") + return False + + def should_send_status(self) -> bool: + """Проверка, нужно ли отправить статус (каждые 30 минут в 00 и 30 минут часа)""" + now = datetime.now() + + # Проверяем, что сейчас 00 или 30 минут часа + if now.minute in [0, 30]: + # Проверяем, не отправляли ли мы уже статус в эту минуту + if (self.last_status_time is None or + 
self.last_status_time.hour != now.hour or + self.last_status_time.minute != now.minute): + self.last_status_time = now + return True + + return False + + def should_send_startup_status(self) -> bool: + """Проверка, нужно ли отправить статус при запуске""" + return self.last_status_time is None + + def _get_disk_space_emoji(self, disk_percent: float) -> str: + """Получение эмодзи для дискового пространства""" + if disk_percent < 60: + return "🟢" + elif disk_percent < 90: + return "⚠️" + else: + return "🚨" + + def get_status_message(self, system_info: Dict) -> str: + """Формирование сообщения со статусом сервера""" + try: + voice_bot_status, voice_bot_uptime = self.metrics_collector.check_process_status('voice_bot') + helper_bot_status, helper_bot_uptime = self.metrics_collector.check_process_status('helper_bot') + + # Получаем эмодзи для дискового пространства + disk_emoji = self._get_disk_space_emoji(system_info['disk_percent']) + + message = f"""🖥 **Статус Сервера** | {system_info['current_time']} +--------------------------------- +**📊 Общая нагрузка:** +CPU: {system_info['cpu_percent']}% | LA: {system_info['load_avg_1m']} / {system_info['cpu_count']} | IO Wait: {system_info['disk_percent']}% + +**💾 Память:** +RAM: {system_info['ram_used']}/{system_info['ram_total']} GB ({system_info['ram_percent']}%) +Swap: {system_info['swap_used']}/{system_info['swap_total']} GB ({system_info['swap_percent']}%) + +**🗂️ Дисковое пространство:** +Диск (/): {system_info['disk_used']}/{system_info['disk_total']} GB ({system_info['disk_percent']}%) {disk_emoji} + +**💿 Диск I/O:** +Read: {system_info['disk_read_speed']} | Write: {system_info['disk_write_speed']} +Диск загружен: {system_info['disk_io_percent']}% + +**🤖 Процессы:** +{voice_bot_status} voice-bot - {voice_bot_uptime} +{helper_bot_status} helper-bot - {helper_bot_uptime} +--------------------------------- +⏰ Uptime сервера: {system_info['system_uptime']}""" + + return message + + except Exception as e: + 
logger.error(f"Ошибка при формировании статуса сервера: {e}") + return f"Ошибка при получении статуса сервера: {e}" + + def get_alert_message(self, metric_name: str, current_value: float, details: str) -> str: + """Формирование сообщения об алерте""" + try: + message = f"""🚨 **ALERT: Высокая нагрузка на сервере!** +--------------------------------- +**Показатель:** {metric_name} +**Текущее значение:** {current_value}% ⚠️ +**Пороговое значение:** 80% + +**Детали:** +{details} + +**Сервер:** `{self.metrics_collector.os_type.upper()}` +**Время:** `{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}` +---------------------------------""" + + return message + + except Exception as e: + logger.error(f"Ошибка при формировании алерта: {e}") + return f"Ошибка при формировании алерта: {e}" + + def get_recovery_message(self, metric_name: str, current_value: float, peak_value: float) -> str: + """Формирование сообщения о восстановлении""" + try: + message = f"""✅ **RECOVERY: Нагрузка нормализовалась** +--------------------------------- +**Показатель:** {metric_name} +**Текущее значение:** {current_value}% ✔️ +**Было превышение:** До {peak_value}% + +**Сервер:** `{self.metrics_collector.os_type.upper()}` +**Время:** `{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}` +---------------------------------""" + + return message + + except Exception as e: + logger.error(f"Ошибка при формировании сообщения о восстановлении: {e}") + return f"Ошибка при формировании сообщения о восстановлении: {e}" + + async def send_status_message(self) -> bool: + """Отправка статуса сервера в группу логов""" + if not self.group_for_logs: + logger.warning("GROUP_MONITORING_FOR_LOGS не установлен, пропускаем отправку статуса") + return False + + try: + system_info = self.metrics_collector.get_system_info() + if not system_info: + logger.error("Не удалось получить информацию о системе") + return False + + status_message = self.get_status_message(system_info) + return await 
self.send_telegram_message(self.group_for_logs, status_message) + + except Exception as e: + logger.error(f"Ошибка при отправке статуса: {e}") + return False + + async def send_alert_message(self, metric_type: str, current_value: float, details: str) -> bool: + """Отправка сообщения об алерте в важные логи""" + if not self.important_logs: + logger.warning("IMPORTANT_MONITORING_LOGS не установлен, пропускаем отправку алерта") + return False + + try: + metric_names = { + 'cpu': 'Использование CPU', + 'ram': 'Использование оперативной памяти', + 'disk': 'Заполнение диска (/)' + } + + metric_name = metric_names.get(metric_type, metric_type) + alert_message = self.get_alert_message(metric_name, current_value, details) + return await self.send_telegram_message(self.important_logs, alert_message) + + except Exception as e: + logger.error(f"Ошибка при отправке алерта: {e}") + return False + + async def send_recovery_message(self, metric_type: str, current_value: float, peak_value: float) -> bool: + """Отправка сообщения о восстановлении в важные логи""" + if not self.important_logs: + logger.warning("IMPORTANT_MONITORING_LOGS не установлен, пропускаем отправку сообщения о восстановлении") + return False + + try: + metric_names = { + 'cpu': 'Использование CPU', + 'ram': 'Использование оперативной памяти', + 'disk': 'Заполнение диска (/)' + } + + metric_name = metric_names.get(metric_type, metric_type) + recovery_message = self.get_recovery_message(metric_name, current_value, peak_value) + return await self.send_telegram_message(self.important_logs, recovery_message) + + except Exception as e: + logger.error(f"Ошибка при отправке сообщения о восстановлении: {e}") + return False + + async def process_alerts_and_recoveries(self) -> None: + """Обработка алертов и восстановлений""" + try: + system_info = self.metrics_collector.get_system_info() + if not system_info: + return + + # Проверка алертов + alerts, recoveries = self.metrics_collector.check_alerts(system_info) + + # 
Отправка алертов + for metric_type, value, details in alerts: + await self.send_alert_message(metric_type, value, details) + logger.warning(f"ALERT отправлен: {metric_type} - {value}% - {details}") + + # Отправка сообщений о восстановлении + for metric_type, value in recoveries: + # Находим пиковое значение для сообщения о восстановлении + peak_value = self.metrics_collector.threshold + await self.send_recovery_message(metric_type, value, peak_value) + logger.info(f"RECOVERY отправлен: {metric_type} - {value}%") + + except Exception as e: + logger.error(f"Ошибка при обработке алертов и восстановлений: {e}") diff --git a/infra/monitoring/metrics_collector.py b/infra/monitoring/metrics_collector.py new file mode 100644 index 0000000..2deba8f --- /dev/null +++ b/infra/monitoring/metrics_collector.py @@ -0,0 +1,495 @@ +import os +import psutil +import time +import platform +from datetime import datetime +from typing import Dict, Optional, Tuple +import logging + +logger = logging.getLogger(__name__) + + +class MetricsCollector: + def __init__(self): + # Определяем ОС + self.os_type = self._detect_os() + logger.info(f"Обнаружена ОС: {self.os_type}") + + # Пороговые значения для алертов + self.threshold = float(os.getenv('THRESHOLD', '80.0')) + self.recovery_threshold = float(os.getenv('RECOVERY_THRESHOLD', '75.0')) + + # Состояние алертов для предотвращения спама + self.alert_states = { + 'cpu': False, + 'ram': False, + 'disk': False + } + + # PID файлы для отслеживания процессов + self.pid_files = { + 'voice_bot': 'voice_bot.pid', + 'helper_bot': 'helper_bot.pid' + } + + # Для расчета скорости диска + self.last_disk_io = None + self.last_disk_io_time = None + + # Для расчета процента загрузки диска (отдельные переменные) + self.last_disk_io_for_percent = None + self.last_disk_io_time_for_percent = None + + # Инициализируем базовые значения для скорости диска при первом вызове + self._initialize_disk_io() + + + + # Время запуска мониторинга для расчета uptime + 
self.monitor_start_time = time.time() + + def _detect_os(self) -> str: + """Определение типа операционной системы""" + system = platform.system().lower() + if system == "darwin": + return "macos" + elif system == "linux": + return "ubuntu" + else: + return "unknown" + + def _initialize_disk_io(self): + """Инициализация базовых значений для расчета скорости диска""" + try: + disk_io = self._get_disk_io_counters() + if disk_io: + self.last_disk_io = disk_io + self.last_disk_io_time = time.time() + logger.debug("Инициализированы базовые значения для расчета скорости диска") + except Exception as e: + logger.error(f"Ошибка при инициализации диска I/O: {e}") + + def _get_disk_path(self) -> str: + """Получение пути к диску в зависимости от ОС""" + if self.os_type == "macos": + return "/" + elif self.os_type == "ubuntu": + return "/" + else: + return "/" + + def _get_disk_usage(self) -> Optional[object]: + """Получение информации о диске с учетом ОС""" + try: + if self.os_type == "macos": + # На macOS используем diskutil для получения реального использования диска + return self._get_macos_disk_usage() + else: + disk_path = self._get_disk_path() + return psutil.disk_usage(disk_path) + except Exception as e: + logger.error(f"Ошибка при получении информации о диске: {e}") + return None + + def _get_macos_disk_usage(self) -> Optional[object]: + """Получение информации о диске на macOS через diskutil""" + try: + import subprocess + import re + + # Получаем информацию о диске через diskutil + result = subprocess.run(['diskutil', 'info', '/'], capture_output=True, text=True) + if result.returncode != 0: + # Fallback к psutil + return psutil.disk_usage('/') + + output = result.stdout + + # Извлекаем размеры из вывода diskutil + total_match = re.search(r'Container Total Space:\s+(\d+\.\d+)\s+GB', output) + free_match = re.search(r'Container Free Space:\s+(\d+\.\d+)\s+GB', output) + + if total_match and free_match: + total_gb = float(total_match.group(1)) + free_gb = 
float(free_match.group(1)) + used_gb = total_gb - free_gb + + # Создаем объект, похожий на результат psutil.disk_usage + class DiskUsage: + def __init__(self, total, used, free): + self.total = total * (1024**3) # Конвертируем в байты + self.used = used * (1024**3) + self.free = free * (1024**3) + + return DiskUsage(total_gb, used_gb, free_gb) + else: + # Fallback к psutil + return psutil.disk_usage('/') + + except Exception as e: + logger.error(f"Ошибка при получении информации о диске macOS: {e}") + # Fallback к psutil + return psutil.disk_usage('/') + + def _get_disk_io_counters(self): + """Получение статистики диска с учетом ОС""" + try: + if self.os_type == "macos": + # На macOS может быть несколько дисков, берем основной + return psutil.disk_io_counters(perdisk=False) + elif self.os_type == "ubuntu": + # На Ubuntu обычно один диск + return psutil.disk_io_counters(perdisk=False) + else: + return psutil.disk_io_counters() + except Exception as e: + logger.error(f"Ошибка при получении статистики диска: {e}") + return None + + def _get_system_uptime(self) -> float: + """Получение uptime системы с учетом ОС""" + try: + if self.os_type == "macos": + # На macOS используем boot_time + boot_time = psutil.boot_time() + return time.time() - boot_time + elif self.os_type == "ubuntu": + # На Ubuntu также используем boot_time + boot_time = psutil.boot_time() + return time.time() - boot_time + else: + boot_time = psutil.boot_time() + return time.time() - boot_time + except Exception as e: + logger.error(f"Ошибка при получении uptime системы: {e}") + return 0.0 + + def get_monitor_uptime(self) -> str: + """Получение uptime мониторинга""" + uptime_seconds = time.time() - self.monitor_start_time + return self._format_uptime(uptime_seconds) + + def get_system_info(self) -> Dict: + """Получение информации о системе""" + try: + # CPU + cpu_percent = psutil.cpu_percent(interval=1) + load_avg = psutil.getloadavg() + cpu_count = psutil.cpu_count() + + # Память + memory = 
psutil.virtual_memory() + swap = psutil.swap_memory() + + # Используем единый расчет для всех ОС: used / total для получения процента занятой памяти + # Это обеспечивает консистентность между macOS и Ubuntu + ram_percent = (memory.used / memory.total) * 100 + + # Диск + disk = self._get_disk_usage() + disk_io = self._get_disk_io_counters() + + if disk is None: + logger.error("Не удалось получить информацию о диске") + return {} + + # Сначала рассчитываем процент загрузки диска (до обновления last_disk_io_time) + disk_io_percent = self._calculate_disk_io_percent() + + # Затем рассчитываем скорость диска (это обновит last_disk_io_time) + disk_read_speed, disk_write_speed = self._calculate_disk_speed(disk_io) + + # Диагностика диска для отладки + if disk_io: + logger.debug(f"Диск I/O статистика: read_count={disk_io.read_count}, write_count={disk_io.write_count}, " + f"read_bytes={disk_io.read_bytes}, write_bytes={disk_io.write_bytes}") + + # Система + system_uptime = self._get_system_uptime() + + # Получаем имя хоста в зависимости от ОС + if self.os_type == "macos": + hostname = os.uname().nodename + elif self.os_type == "ubuntu": + hostname = os.uname().nodename + else: + hostname = "unknown" + + return { + 'cpu_percent': cpu_percent, + 'load_avg_1m': round(load_avg[0], 2), + 'load_avg_5m': round(load_avg[1], 2), + 'load_avg_15m': round(load_avg[2], 2), + 'cpu_count': cpu_count, + 'ram_used': round(memory.used / (1024**3), 2), + 'ram_total': round(memory.total / (1024**3), 2), + 'ram_percent': round(ram_percent, 1), # Исправленный процент занятой памяти + 'swap_used': round(swap.used / (1024**3), 2), + 'swap_total': round(swap.total / (1024**3), 2), + 'swap_percent': swap.percent, + 'disk_used': round(disk.used / (1024**3), 2), + 'disk_total': round(disk.total / (1024**3), 2), + 'disk_percent': round((disk.used / disk.total) * 100, 1), + 'disk_free': round(disk.free / (1024**3), 2), + 'disk_read_speed': disk_read_speed, + 'disk_write_speed': disk_write_speed, + 
'disk_io_percent': disk_io_percent, + 'system_uptime': self._format_uptime(system_uptime), + 'monitor_uptime': self.get_monitor_uptime(), + 'server_hostname': hostname, + 'current_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + except Exception as e: + logger.error(f"Ошибка при получении информации о системе: {e}") + return {} + + def _format_bytes(self, bytes_value: int) -> str: + """Форматирование байтов в человекочитаемый вид""" + if bytes_value == 0: + return "0 B" + + size_names = ["B", "KB", "MB", "GB", "TB"] + i = 0 + while bytes_value >= 1024 and i < len(size_names) - 1: + bytes_value /= 1024.0 + i += 1 + + return f"{bytes_value:.1f} {size_names[i]}" + + def _format_uptime(self, seconds: float) -> str: + """Форматирование времени работы системы""" + days = int(seconds // 86400) + hours = int((seconds % 86400) // 3600) + minutes = int((seconds % 3600) // 60) + + if days > 0: + return f"{days}д {hours}ч {minutes}м" + elif hours > 0: + return f"{hours}ч {minutes}м" + else: + return f"{minutes}м" + + def check_process_status(self, process_name: str) -> Tuple[str, str]: + """Проверка статуса процесса и возврат статуса с uptime""" + try: + # Сначала проверяем по PID файлу + pid_file = self.pid_files.get(process_name) + if pid_file and os.path.exists(pid_file): + try: + with open(pid_file, 'r') as f: + content = f.read().strip() + if content and content != '# Этот файл будет автоматически обновляться при запуске бота': + pid = int(content) + if psutil.pid_exists(pid): + # Получаем uptime процесса + try: + proc = psutil.Process(pid) + proc_uptime = time.time() - proc.create_time() + uptime_str = self._format_uptime(proc_uptime) + return "✅", f"Uptime {uptime_str}" + except: + return "✅", "Uptime неизвестно" + except (ValueError, FileNotFoundError): + pass + + # Проверяем по имени процесса более точно + for proc in psutil.process_iter(['pid', 'name', 'cmdline']): + try: + proc_name = proc.info['name'].lower() + cmdline = ' 
'.join(proc.info['cmdline']).lower() if proc.info['cmdline'] else '' + + # Более точная проверка для каждого бота + if process_name == 'voice_bot': + # Проверяем voice_bot + if ('voice_bot' in proc_name or + 'voice_bot' in cmdline or + 'voice_bot_v2.py' in cmdline): + # Получаем uptime процесса + try: + proc_uptime = time.time() - proc.create_time() + uptime_str = self._format_uptime(proc_uptime) + return "✅", f"Uptime {uptime_str}" + except: + return "✅", "Uptime неизвестно" + elif process_name == 'helper_bot': + # Проверяем helper_bot + if ('helper_bot' in proc_name or + 'helper_bot' in cmdline or + 'run_helper.py' in cmdline or + 'python' in proc_name and 'helper_bot' in cmdline): + # Получаем uptime процесса + try: + proc_uptime = time.time() - proc.create_time() + uptime_str = self._format_uptime(proc_uptime) + return "✅", f"Uptime {uptime_str}" + except: + return "✅", "Uptime неизвестно" + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + + return "❌", "Выключен" + except Exception as e: + logger.error(f"Ошибка при проверке процесса {process_name}: {e}") + return "❌", "Выключен" + + def _calculate_disk_speed(self, current_disk_io) -> Tuple[str, str]: + """Расчет скорости чтения/записи диска""" + current_time = time.time() + + if self.last_disk_io is None or self.last_disk_io_time is None: + self.last_disk_io = current_disk_io + self.last_disk_io_time = current_time + return "0 B/s", "0 B/s" + + time_diff = current_time - self.last_disk_io_time + if time_diff < 1: # Минимальный интервал 1 секунда + return "0 B/s", "0 B/s" + + read_diff = current_disk_io.read_bytes - self.last_disk_io.read_bytes + write_diff = current_disk_io.write_bytes - self.last_disk_io.write_bytes + + read_speed = read_diff / time_diff + write_speed = write_diff / time_diff + + # Обновляем предыдущие значения + self.last_disk_io = current_disk_io + self.last_disk_io_time = current_time + + return self._format_bytes(read_speed) + "/s", self._format_bytes(write_speed) + "/s" + 
+ def _calculate_disk_io_percent(self) -> int: + """Расчет процента загрузки диска на основе реальной скорости I/O""" + try: + # Получаем текущую статистику диска + current_disk_io = self._get_disk_io_counters() + if current_disk_io is None: + return 0 + + current_time = time.time() + + # Если это первое измерение, инициализируем + if self.last_disk_io_for_percent is None or self.last_disk_io_time_for_percent is None: + logger.debug("Первое измерение диска для процента, инициализируем базовые значения") + self.last_disk_io_for_percent = current_disk_io + self.last_disk_io_time_for_percent = current_time + return 0 + + # Рассчитываем время между измерениями + time_diff = current_time - self.last_disk_io_time_for_percent + if time_diff < 0.1: # Минимальный интервал 0.1 секунды для более точных измерений + logger.debug(f"Интервал между измерениями слишком мал: {time_diff:.3f}s, возвращаем 0%") + return 0 + + # Рассчитываем скорость операций в секунду + read_ops_diff = current_disk_io.read_count - self.last_disk_io_for_percent.read_count + write_ops_diff = current_disk_io.write_count - self.last_disk_io_for_percent.write_count + + read_ops_per_sec = read_ops_diff / time_diff + write_ops_per_sec = write_ops_diff / time_diff + total_ops_per_sec = read_ops_per_sec + write_ops_per_sec + + # Рассчитываем скорость передачи данных в байтах в секунду + read_bytes_diff = current_disk_io.read_bytes - self.last_disk_io_for_percent.read_bytes + write_bytes_diff = current_disk_io.write_bytes - self.last_disk_io_for_percent.write_bytes + + read_bytes_per_sec = read_bytes_diff / time_diff + write_bytes_per_sec = write_bytes_diff / time_diff + total_bytes_per_sec = read_bytes_per_sec + write_bytes_per_sec + + # Обновляем предыдущие значения для процента + self.last_disk_io_for_percent = current_disk_io + self.last_disk_io_time_for_percent = current_time + + # Определяем максимальную производительность диска в зависимости от ОС + if self.os_type == "macos": + # macOS обычно имеет SSD с 
высокой производительностью + max_ops_per_sec = 50000 # Операций в секунду + max_bytes_per_sec = 3 * (1024**3) # 3 GB/s + elif self.os_type == "ubuntu": + # Ubuntu может быть на разных типах дисков + max_ops_per_sec = 30000 # Операций в секунду + max_bytes_per_sec = 2 * (1024**3) # 2 GB/s + else: + max_ops_per_sec = 40000 + max_bytes_per_sec = 2.5 * (1024**3) + + # Рассчитываем процент загрузки на основе операций и байтов + # Защита от деления на ноль + if max_ops_per_sec > 0: + ops_percent = min(100, (total_ops_per_sec / max_ops_per_sec) * 100) + else: + ops_percent = 0 + + if max_bytes_per_sec > 0: + bytes_percent = min(100, (total_bytes_per_sec / max_bytes_per_sec) * 100) + else: + bytes_percent = 0 + + # Взвешенный средний процент (операции важнее для большинства случаев) + final_percent = (ops_percent * 0.7) + (bytes_percent * 0.3) + + # Логируем для отладки (только при высоких значениях) + if final_percent > 10: + logger.debug(f"Диск I/O: {total_ops_per_sec:.1f} ops/s, {total_bytes_per_sec/(1024**2):.1f} MB/s, " + f"Загрузка: {final_percent:.1f}% (ops: {ops_percent:.1f}%, bytes: {bytes_percent:.1f}%)") + + # Округляем до целого числа + return round(final_percent) + + except Exception as e: + logger.error(f"Ошибка при расчете процента загрузки диска: {e}") + return 0 + + def get_metrics_data(self) -> Dict: + """Получение данных для метрик Prometheus""" + system_info = self.get_system_info() + if not system_info: + return {} + + return { + 'cpu_usage_percent': system_info.get('cpu_percent', 0), + 'ram_usage_percent': system_info.get('ram_percent', 0), + 'disk_usage_percent': system_info.get('disk_percent', 0), + 'load_average_1m': system_info.get('load_avg_1m', 0), + 'load_average_5m': system_info.get('load_avg_5m', 0), + 'load_average_15m': system_info.get('load_avg_15m', 0), + 'swap_usage_percent': system_info.get('swap_percent', 0), + 'disk_io_percent': system_info.get('disk_io_percent', 0), + 'system_uptime_seconds': self._get_system_uptime(), + 
'monitor_uptime_seconds': time.time() - self.monitor_start_time + } + + def check_alerts(self, system_info: Dict) -> Tuple[bool, Optional[str]]: + """Проверка необходимости отправки алертов""" + alerts = [] + + # Проверка CPU + if system_info['cpu_percent'] > self.threshold and not self.alert_states['cpu']: + self.alert_states['cpu'] = True + alerts.append(('cpu', system_info['cpu_percent'], f"Нагрузка за 1 мин: {system_info['load_avg_1m']}")) + + # Проверка RAM + if system_info['ram_percent'] > self.threshold and not self.alert_states['ram']: + self.alert_states['ram'] = True + alerts.append(('ram', system_info['ram_percent'], f"Используется: {system_info['ram_used']} GB из {system_info['ram_total']} GB")) + + # Проверка диска + if system_info['disk_percent'] > self.threshold and not self.alert_states['disk']: + self.alert_states['disk'] = True + alerts.append(('disk', system_info['disk_percent'], f"Свободно: {system_info['disk_free']} GB на /")) + + # Проверка восстановления + recoveries = [] + if system_info['cpu_percent'] < self.recovery_threshold and self.alert_states['cpu']: + self.alert_states['cpu'] = False + recoveries.append(('cpu', system_info['cpu_percent'])) + + if system_info['ram_percent'] < self.recovery_threshold and self.alert_states['ram']: + self.alert_states['ram'] = False + recoveries.append(('ram', system_info['ram_percent'])) + + if system_info['disk_percent'] < self.recovery_threshold and self.alert_states['disk']: + self.alert_states['disk'] = False + recoveries.append(('disk', system_info['disk_percent'])) + + return alerts, recoveries diff --git a/infra/monitoring/prometheus_server.py b/infra/monitoring/prometheus_server.py new file mode 100644 index 0000000..cf00562 --- /dev/null +++ b/infra/monitoring/prometheus_server.py @@ -0,0 +1,143 @@ +import asyncio +import logging +from aiohttp import web +try: + from .metrics_collector import MetricsCollector +except ImportError: + from metrics_collector import MetricsCollector + +logger = 
logging.getLogger(__name__) + + +class PrometheusServer: + def __init__(self, host='0.0.0.0', port=9091): + self.host = host + self.port = port + self.metrics_collector = MetricsCollector() + self.app = web.Application() + self.setup_routes() + + def setup_routes(self): + """Настройка маршрутов для Prometheus""" + self.app.router.add_get('/', self.root_handler) + self.app.router.add_get('/metrics', self.metrics_handler) + self.app.router.add_get('/health', self.health_handler) + + async def root_handler(self, request): + """Главная страница""" + return web.Response( + text="Prometheus Metrics Server\n\n" + "Available endpoints:\n" + "- /metrics - Prometheus metrics\n" + "- /health - Health check", + content_type='text/plain' + ) + + async def health_handler(self, request): + """Health check endpoint""" + return web.Response( + text="OK", + content_type='text/plain' + ) + + async def metrics_handler(self, request): + """Endpoint для Prometheus метрик""" + try: + metrics_data = self.metrics_collector.get_metrics_data() + prometheus_metrics = self._format_prometheus_metrics(metrics_data) + + return web.Response( + text=prometheus_metrics, + content_type='text/plain' + ) + + except Exception as e: + logger.error(f"Ошибка при получении метрик: {e}") + return web.Response( + text=f"Error: {str(e)}", + status=500, + content_type='text/plain' + ) + + def _format_prometheus_metrics(self, metrics_data: dict) -> str: + """Форматирование метрик в Prometheus формат""" + lines = [] + + # Системная информация + lines.append("# HELP system_info System information") + lines.append("# TYPE system_info gauge") + lines.append(f"system_info{{os=\"{self.metrics_collector.os_type}\"}} 1") + + # CPU метрики + if 'cpu_usage_percent' in metrics_data: + lines.append("# HELP cpu_usage_percent CPU usage percentage") + lines.append("# TYPE cpu_usage_percent gauge") + lines.append(f"cpu_usage_percent {metrics_data['cpu_usage_percent']}") + + if 'load_average_1m' in metrics_data: + 
lines.append("# HELP load_average_1m 1 minute load average") + lines.append("# TYPE load_average_1m gauge") + lines.append(f"load_average_1m {metrics_data['load_average_1m']}") + + if 'load_average_5m' in metrics_data: + lines.append("# HELP load_average_5m 5 minute load average") + lines.append("# TYPE load_average_5m gauge") + lines.append(f"load_average_5m {metrics_data['load_average_5m']}") + + if 'load_average_15m' in metrics_data: + lines.append("# HELP load_average_15m 15 minute load average") + lines.append("# TYPE load_average_15m gauge") + lines.append(f"load_average_15m {metrics_data['load_average_15m']}") + + # RAM метрики + if 'ram_usage_percent' in metrics_data: + lines.append("# HELP ram_usage_percent RAM usage percentage") + lines.append("# TYPE ram_usage_percent gauge") + lines.append(f"ram_usage_percent {metrics_data['ram_usage_percent']}") + + # Disk метрики + if 'disk_usage_percent' in metrics_data: + lines.append("# HELP disk_usage_percent Disk usage percentage") + lines.append("# TYPE disk_usage_percent gauge") + lines.append(f"disk_usage_percent {metrics_data['disk_usage_percent']}") + + if 'disk_io_percent' in metrics_data: + lines.append("# HELP disk_io_percent Disk I/O usage percentage") + lines.append("# TYPE disk_io_percent gauge") + lines.append(f"disk_io_percent {metrics_data['disk_io_percent']}") + + # Swap метрики + if 'swap_usage_percent' in metrics_data: + lines.append("# HELP swap_usage_percent Swap usage percentage") + lines.append("# TYPE swap_usage_percent gauge") + lines.append(f"swap_usage_percent {metrics_data['swap_usage_percent']}") + + # Uptime метрики + if 'system_uptime_seconds' in metrics_data: + lines.append("# HELP system_uptime_seconds System uptime in seconds") + lines.append("# TYPE system_uptime_seconds gauge") + lines.append(f"system_uptime_seconds {metrics_data['system_uptime_seconds']}") + + if 'monitor_uptime_seconds' in metrics_data: + lines.append("# HELP monitor_uptime_seconds Monitor uptime in seconds") + 
import asyncio
import logging

# Support running both as a package module and as a standalone script.
try:
    from .metrics_collector import MetricsCollector
    from .message_sender import MessageSender
    from .prometheus_server import PrometheusServer
except ImportError:
    from metrics_collector import MetricsCollector
    from message_sender import MessageSender
    from prometheus_server import PrometheusServer

logger = logging.getLogger(__name__)


class ServerMonitor:
    """Facade that wires together metrics collection, alert messaging and
    the Prometheus exporter, and drives the periodic monitoring loop.
    """

    # Seconds between monitoring iterations; also used as the back-off
    # delay after an error inside the loop (was a repeated literal 30).
    CHECK_INTERVAL = 30

    def __init__(self):
        # Instantiate the collaborating modules.
        self.metrics_collector = MetricsCollector()
        self.message_sender = MessageSender()
        self.prometheus_server = PrometheusServer()

        logger.info(f"Модуль мониторинга сервера запущен на {self.metrics_collector.os_type.upper()}")

    async def monitor_loop(self):
        """Main monitoring loop.

        Starts the Prometheus HTTP server, then periodically processes
        alerts/recoveries and sends status messages until cancelled.
        The exporter is always stopped on the way out.
        """
        # NOTE(review): the original logged the startup message here a
        # second time, duplicating the one already emitted by __init__;
        # it is now logged only once.
        prometheus_runner = await self.prometheus_server.start()

        try:
            while True:
                try:
                    # Check for new alerts and recoveries.
                    await self.message_sender.process_alerts_and_recoveries()

                    # Send a periodic status message when one is due.
                    if self.message_sender.should_send_status():
                        await self.message_sender.send_status_message()

                    await asyncio.sleep(self.CHECK_INTERVAL)
                except Exception as e:
                    # Keep the loop alive on transient errors; CancelledError
                    # is a BaseException and still propagates to 'finally'.
                    logger.error(f"Ошибка в цикле мониторинга: {e}")
                    await asyncio.sleep(self.CHECK_INTERVAL)
        finally:
            # Always stop the Prometheus server on shutdown/cancellation.
            await self.prometheus_server.stop(prometheus_runner)

    async def send_startup_status(self):
        """Send a status message once at startup, if the sender allows it."""
        if self.message_sender.should_send_startup_status():
            await self.message_sender.send_status_message()

    def get_system_info(self):
        """Return current system info (kept for backward compatibility)."""
        return self.metrics_collector.get_system_info()

    def get_metrics_data(self):
        """Return Prometheus metrics data (kept for backward compatibility)."""
        return self.metrics_collector.get_metrics_data()
{system_info.get('disk_percent', 'N/A')}%") + print(f" Хост: {system_info.get('server_hostname', 'N/A')}") + print(f" ОС: {monitor.os_type}") + else: + print("❌ Не удалось получить информацию о системе") + return + + # Проверяем статус процессов + print("\n🤖 Проверка статуса процессов...") + voice_status, voice_uptime = monitor.check_process_status('voice_bot') + helper_status, helper_uptime = monitor.check_process_status('helper_bot') + + print(f" Voice Bot: {voice_status} - {voice_uptime}") + print(f" Helper Bot: {helper_status} - {helper_uptime}") + + # Получаем метрики для Prometheus + print("\n📈 Получение метрик для Prometheus...") + metrics = monitor.get_metrics_data() + + if metrics: + print("✅ Метрики получены успешно") + for key, value in metrics.items(): + print(f" {key}: {value}") + else: + print("❌ Не удалось получить метрики") + + # Проверяем алерты + print("\n🚨 Проверка алертов...") + alerts, recoveries = monitor.check_alerts(system_info) + + if alerts: + print(f" Найдено алертов: {len(alerts)}") + for alert_type, value, details in alerts: + print(f" {alert_type}: {value}% - {details}") + else: + print(" Алертов не найдено") + + if recoveries: + print(f" Найдено восстановлений: {len(recoveries)}") + for recovery_type, value in recoveries: + print(f" {recovery_type}: {value}%") + + # Получаем сообщение о статусе + print("\n💬 Формирование сообщения о статусе...") + status_message = monitor.get_status_message(system_info) + if status_message: + print("✅ Сообщение о статусе сформировано") + print(" Первые 200 символов:") + print(f" {status_message[:200]}...") + else: + print("❌ Не удалось сформировать сообщение о статусе") + + print("\n🎉 Тестирование завершено успешно!") + + except Exception as e: + print(f"❌ Ошибка при тестировании: {e}") + logging.error(f"Ошибка при тестировании: {e}", exc_info=True) + return 1 + + return 0 + +if __name__ == "__main__": + exit(main()) diff --git a/infra/prometheus/prometheus.yml b/infra/prometheus/prometheus.yml new 
file mode 100644 index 0000000..5aef2e9 --- /dev/null +++ b/infra/prometheus/prometheus.yml @@ -0,0 +1,38 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Job для мониторинга инфраструктуры + - job_name: 'infrastructure' + static_configs: + - targets: ['host.docker.internal:9091'] # Порт для метрик сервера мониторинга + metrics_path: '/metrics' + scrape_interval: 30s + scrape_timeout: 10s + honor_labels: true + + - job_name: 'telegram-helper-bot' + static_configs: + - targets: ['telegram-helper-bot:8080'] # Или IP адрес сервера с ботом + metrics_path: '/metrics' + scrape_interval: 15s + scrape_timeout: 10s + honor_labels: true + labels: + bot_name: 'telegram-helper-bot' + environment: 'production' + service: 'telegram-bot' +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..651a9ed --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +psutil>=5.9.0 +asyncio +aiohttp>=3.8.0 +python-dotenv>=1.0.0 diff --git a/scripts/deploy.sh b/scripts/deploy.sh new file mode 100755 index 0000000..a2d1f13 --- /dev/null +++ b/scripts/deploy.sh @@ -0,0 +1,221 @@ +#!/bin/bash + +# Bots Infrastructure Deployment Script +# This script deploys the complete bots infrastructure using Docker Compose + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Configuration +COMPOSE_FILE="docker-compose.yml" +ENV_FILE=".env" +LOG_DIR="logs" + +# Function to print colored output +print_status() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to check prerequisites +check_prerequisites() { + print_status "Checking 
# Function to check prerequisites (docker, docker-compose, env file).
check_prerequisites() {
    print_status "Checking prerequisites..."

    if ! command -v docker &> /dev/null; then
        print_error "Docker is not installed. Please install Docker first."
        exit 1
    fi

    if ! command -v docker-compose &> /dev/null; then
        print_error "Docker Compose is not installed. Please install Docker Compose first."
        exit 1
    fi

    if [ ! -f "$ENV_FILE" ]; then
        # The repository ships 'env.template' (see README), not '.env.example'.
        print_error "Environment file $ENV_FILE not found. Please create it from env.template"
        exit 1
    fi

    print_status "Prerequisites check passed"
}

# Function to create necessary directories
create_directories() {
    print_status "Creating necessary directories..."

    mkdir -p "$LOG_DIR"/telegram-helper-bot
    mkdir -p "$LOG_DIR"/voice-bot
    mkdir -p infra/nginx/ssl

    print_status "Directories created"
}

# Function to load environment variables from $ENV_FILE into the environment.
load_env() {
    print_status "Loading environment variables..."

    if [ -f "$ENV_FILE" ]; then
        # 'set -a' marks every assignment for export while the file is
        # sourced. Unlike 'export $(cat file | grep -v ^# | xargs)' this
        # preserves quoted values and values containing spaces or '='.
        # The './' prefix stops '.' from searching $PATH for the file.
        set -a
        # shellcheck disable=SC1090
        . "./$ENV_FILE"
        set +a
        print_status "Environment variables loaded"
    else
        print_error "Environment file not found"
        exit 1
    fi
}

# Function to validate that the required environment variables are set.
validate_env() {
    print_status "Validating environment variables..."

    local required_vars=("BOT_TOKEN_1" "BOT_TOKEN_2" "DB_PASSWORD" "REDIS_PASSWORD")
    local missing_vars=()
    local var

    for var in "${required_vars[@]}"; do
        # ${!var} — indirect expansion: the value of the variable named by $var.
        if [ -z "${!var:-}" ]; then
            missing_vars+=("$var")
        fi
    done

    if [ ${#missing_vars[@]} -ne 0 ]; then
        print_error "Missing required environment variables: ${missing_vars[*]}"
        exit 1
    fi

    print_status "Environment variables validation passed"
}

# Function to stop existing services, if any are running.
stop_services() {
    print_status "Stopping existing services..."

    # 'ps -q' prints container IDs; 'grep -q .' tests for any output.
    if docker-compose -f "$COMPOSE_FILE" ps -q | grep -q .; then
        docker-compose -f "$COMPOSE_FILE" down
        print_status "Existing services stopped"
    else
        print_status "No existing services to stop"
    fi
}
# Function to deploy (build + start) all services.
deploy_services() {
    print_status "Deploying services..."

    # Under 'set -e' a failing docker-compose would abort the script before
    # the old '[ $? -eq 0 ]' check ever ran; test the command directly.
    if docker-compose -f "$COMPOSE_FILE" up -d --build; then
        print_status "Services deployed successfully"
    else
        print_error "Failed to deploy services"
        exit 1
    fi
}

# Function to wait for services to become healthy (polls up to 30 times).
wait_for_services() {
    print_status "Waiting for services to be healthy..."

    local max_attempts=30
    local attempt=1
    local unhealthy_services

    while [ "$attempt" -le "$max_attempts" ]; do
        # Declaration is separate from assignment so a docker-compose
        # failure is not masked by 'local'. 'grep -c' exits 1 on zero
        # matches, hence '|| true'. '-E' makes the alternation portable
        # (BRE '\|' is a GNU extension, absent on BSD/macOS grep).
        unhealthy_services=$(docker-compose -f "$COMPOSE_FILE" ps | grep -cE "unhealthy|starting" || true)

        if [ "$unhealthy_services" -eq 0 ]; then
            print_status "All services are healthy"
            break
        fi

        if [ "$attempt" -eq "$max_attempts" ]; then
            print_warning "Some services may not be fully healthy after $max_attempts attempts"
            break
        fi

        print_status "Waiting for services to be healthy... (attempt $attempt/$max_attempts)"
        sleep 10
        # 'attempt=$((attempt+1))' always succeeds, unlike '((attempt++))'
        # which returns non-zero (tripping 'set -e') when the value is 0.
        attempt=$((attempt + 1))
    done
}

# Function to show service status and a tail of each service's logs.
show_status() {
    print_status "Service status:"
    docker-compose -f "$COMPOSE_FILE" ps

    echo ""
    print_status "Service logs (last 10 lines):"
    docker-compose -f "$COMPOSE_FILE" logs --tail=10
}

# Function to show access information for the deployed services.
show_access_info() {
    echo ""
    print_status "Access Information:"
    echo "Grafana Dashboard: http://localhost:3000 (admin/admin)"
    echo "Prometheus: http://localhost:9090"
    echo "PostgreSQL: localhost:5432"
    echo "Redis: localhost:6379"
    echo ""
    print_status "Check logs with: docker-compose logs -f [service_name]"
}

# Main deployment function: full check → build → wait → report pipeline.
main() {
    print_status "Starting bots infrastructure deployment..."

    check_prerequisites
    create_directories
    load_env
    validate_env
    stop_services
    deploy_services
    wait_for_services
    show_status
    show_access_info

    print_status "Deployment completed successfully!"
}

# Handle command line arguments; no argument runs the full deployment.
case "${1:-}" in
    "stop")
        print_status "Stopping services..."
        docker-compose -f "$COMPOSE_FILE" down
        print_status "Services stopped"
        ;;
    "restart")
        print_status "Restarting services..."
        docker-compose -f "$COMPOSE_FILE" restart
        print_status "Services restarted"
        ;;
    "logs")
        docker-compose -f "$COMPOSE_FILE" logs -f "${2:-}"
        ;;
    "status")
        show_status
        ;;
    "help"|"-h"|"--help")
        echo "Usage: $0 [command]"
        echo "Commands:"
        echo "  (no args)  - Deploy the infrastructure"
        echo "  stop       - Stop all services"
        echo "  restart    - Restart all services"
        echo "  logs       - Show logs (optionally specify service name)"
        echo "  status     - Show service status"
        echo "  help       - Show this help message"
        ;;
    *)
        main
        ;;
esac