Initial commit: Add infrastructure and bot project

2025-08-31 17:55:55 +03:00
commit 7378179d98
21 changed files with 3139 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,63 @@
 # Environment files
 .env
 .env.local
 .env.*.local
 # Logs
 logs/
 *.log
 # Docker volumes
 prometheus_data/
 grafana_data/
 # OS generated files
 .DS_Store
 .DS_Store?
 ._*
 .Spotlight-V100
 .Trashes
 ehthumbs.db
 Thumbs.db
 # IDE and editor files
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 # Python cache (if any Python scripts are added later)
 __pycache__/
 *.pyc
 *.pyo
 *.pyd
 .Python
 *.so
 .pytest_cache/
 .coverage
 htmlcov/
 .tox/
 .cache/
 .mypy_cache/
 # Virtual environments
 .venv/
 venv/
 env/
 ENV/
 env.bak/
 venv.bak/
 # Temporary files
 *.tmp
 *.temp
 *.pid
 # Node modules (if any Node.js tools are added later)
 node_modules/
 # Build artifacts
 *.tar.gz
 dist/
 build/
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,27 @@
 FROM python:3.9-slim
 # Install system dependencies; procps provides ps/pgrep for container health checks.
 # --no-install-recommends keeps optional packages out of the image.
 RUN apt-get update && apt-get install -y --no-install-recommends \
    procps \
    && rm -rf /var/lib/apt/lists/*
 # Create the unprivileged runtime user and an /app directory it owns
 RUN groupadd -g 1000 monitor && \
    useradd -m -u 1000 -g monitor monitor && \
    mkdir -p /app && \
    chown 1000:1000 /app
 # Set the working directory
 WORKDIR /app
 # Copy the dependency manifest first so the pip layer is reused when only code changes
 COPY --chown=1000:1000 requirements.txt .
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy the source tree already owned by the runtime user
 # (a recursive chown after COPY would duplicate every file in an extra layer)
 # NOTE(review): no .dockerignore is visible here, so a local .env would be copied into the image - confirm
 COPY --chown=1000:1000 . .
 # Drop root privileges
 USER 1000
 # Default command: start the monitoring service
 CMD ["python", "infra/monitoring/main.py"]
--- a/README.md
+++ b/README.md
@@ -0,0 +1,133 @@
 # Production Environment
 Проект для управления ботами и мониторинга инфраструктуры.
 ## Структура проекта
 ```
 prod/
 ├── bots/                    # Боты и их конфигурации
 ├── infra/                   # Инфраструктура
 │   ├── grafana/            # Дашборды Grafana
 │   ├── monitoring/         # Модуль мониторинга сервера
 │   └── prometheus/         # Конфигурация Prometheus
 ├── scripts/                 # Скрипты развертывания
 ├── docker-compose.yml       # Docker Compose конфигурация
 ├── env.template             # Шаблон переменных окружения
 └── README.md               # Этот файл
 ```
 ## 🚀 Быстрый запуск
 ### 1. Настройка переменных окружения
 Скопируйте шаблон и настройте переменные:
 ```bash
 cp env.template .env
 ```
 Отредактируйте `.env` файл, добавив реальные значения:
 ```env
 # Telegram Bot Configuration
 TELEGRAM_MONITORING_BOT_TOKEN=your_bot_token_here
 GROUP_MONITORING_FOR_LOGS=your_telegram_group_id_here
 IMPORTANT_MONITORING_LOGS=your_important_logs_channel_id_here
 # Monitoring Configuration
 THRESHOLD=80.0
 RECOVERY_THRESHOLD=75.0
 # Prometheus Configuration
 PROMETHEUS_RETENTION_DAYS=30
 # Grafana Configuration
 GRAFANA_ADMIN_USER=admin
 GRAFANA_ADMIN_PASSWORD=admin
 ```
 ### 2. Запуск всех сервисов
 ```bash
 docker-compose up -d
 ```
 ### 3. Проверка статуса
 ```bash
 docker-compose ps
 ```
 ## 📊 Сервисы
 - **Prometheus** (порт 9090) - сбор метрик
 - **Grafana** (порт 3000) - дашборды
 - **Server Monitor** - мониторинг системы + Telegram уведомления
 ## 🌐 Доступные адреса
 | Сервис | Адрес | Описание |
 |--------|-------|----------|
 | **Grafana** | http://localhost:3000 | Дашборды мониторинга (admin/admin) |
 | **Prometheus** | http://localhost:9090 | API метрик и веб-интерфейс |
 | **Метрики сервера** | http://localhost:9091/metrics | Endpoint для Prometheus |
 | **Health check** | http://localhost:9091/health | Проверка состояния мониторинга |
 ## 🔧 Модуль мониторинга
 Модуль автоматически:
 - Собирает метрики CPU, RAM, диска каждые 30 секунд
 - Отправляет статусы каждые 30 минут в Telegram
 - Отправляет алерты при превышении пороговых значений
 - Интегрирован с Prometheus/Grafana
 ### 📈 Собираемые метрики
 - **CPU**: использование, load average (1m, 5m, 15m)
 - **RAM**: использование оперативной памяти
 - **Disk**: использование диска, I/O активность
 - **Swap**: использование swap
 - **System**: uptime системы и мониторинга
 ## 📝 Логи
 ```bash
 # Все сервисы
 docker-compose logs
 # Только мониторинг
 docker-compose logs -f server_monitor
 # Prometheus
 docker logs bots_prometheus
 # Grafana
 docker logs bots_grafana
 ```
 ## 🔍 Проверка статуса
 ### Автоматическая проверка
 ```bash
 cd infra/monitoring
 python3 check_grafana.py
 ```
 ### Ручная проверка
 ```bash
 # Проверка метрик
 curl http://localhost:9091/metrics
 # Проверка Prometheus targets
 curl http://localhost:9090/api/v1/targets
 # Проверка Grafana
 curl http://localhost:3000/api/health
 ```
 ## 🛑 Остановка
 ```bash
 docker-compose down
 ```
--- a/bots/.gitkeep
+++ b/bots/.gitkeep
@@ -0,0 +1 @@
 # This file ensures the bots directory is tracked by git
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,145 @@
 services:
  # Prometheus Monitoring
  prometheus:
    image: prom/prometheus:latest
    container_name: bots_prometheus
    restart: unless-stopped
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_DAYS:-30}d'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    volumes:
      - ./infra/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    networks:
      - bots_network
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
  # Grafana Dashboard
  grafana:
    image: grafana/grafana:latest
    container_name: bots_grafana
    restart: unless-stopped
    environment:
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./infra/grafana/provisioning:/etc/grafana/provisioning:ro
    networks:
      - bots_network
    depends_on:
      - prometheus
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
  # Server Monitoring Service
  server_monitor:
    build: .
    container_name: bots_server_monitor
    restart: unless-stopped
    environment:
      # Names read by infra/monitoring/message_sender.py via os.getenv()
      - TELEGRAM_MONITORING_BOT_TOKEN=${TELEGRAM_MONITORING_BOT_TOKEN}
      - GROUP_MONITORING_FOR_LOGS=${GROUP_MONITORING_FOR_LOGS}
      - IMPORTANT_MONITORING_LOGS=${IMPORTANT_MONITORING_LOGS}
      # Short names kept for backward compatibility with anything that still reads them
      - TELEGRAM_BOT_TOKEN=${TELEGRAM_MONITORING_BOT_TOKEN}
      - GROUP_FOR_LOGS=${GROUP_MONITORING_FOR_LOGS}
      - IMPORTANT_LOGS=${IMPORTANT_MONITORING_LOGS}
      - THRESHOLD=${THRESHOLD:-80.0}
      - RECOVERY_THRESHOLD=${RECOVERY_THRESHOLD:-75.0}
    ports:
      # README and check_grafana.py use http://localhost:9091/metrics and /health
      # NOTE(review): assumes the monitor listens on 9091 inside the container - confirm
      - "9091:9091"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /var/run:/host/var/run:ro
    networks:
      - bots_network
    depends_on:
      - prometheus
    healthcheck:
      # The container runs "python infra/monitoring/main.py", so match that command line.
      # Exec form: pgrep (from procps) never matches itself and there is no wrapping shell to self-match.
      test: ["CMD", "pgrep", "-f", "infra/monitoring/main.py"]
      interval: 60s
      timeout: 10s
      retries: 3
  # Telegram Helper Bot
  telegram-bot:
    build:
      context: ./bots/telegram-helper-bot
      dockerfile: Dockerfile.bot
    container_name: bots_telegram_bot
    restart: unless-stopped
    ports:
      - "8080:8080"
    environment:
      - PYTHONPATH=/app
      - DOCKER_CONTAINER=true
      - LOG_LEVEL=${LOG_LEVEL:-INFO}
      - LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-30}
      - METRICS_HOST=${METRICS_HOST:-0.0.0.0}
      - METRICS_PORT=${METRICS_PORT:-8080}
      # Telegram settings
      - TELEGRAM_BOT_TOKEN=${BOT_TOKEN}
      - TELEGRAM_LISTEN_BOT_TOKEN=${LISTEN_BOT_TOKEN}
      - TELEGRAM_TEST_BOT_TOKEN=${TEST_BOT_TOKEN}
      - TELEGRAM_PREVIEW_LINK=${PREVIEW_LINK:-false}
      - TELEGRAM_MAIN_PUBLIC=${MAIN_PUBLIC}
      - TELEGRAM_GROUP_FOR_POSTS=${GROUP_FOR_POSTS}
      - TELEGRAM_GROUP_FOR_MESSAGE=${GROUP_FOR_MESSAGE}
      - TELEGRAM_GROUP_FOR_LOGS=${GROUP_FOR_LOGS}
      - TELEGRAM_IMPORTANT_LOGS=${IMPORTANT_LOGS}
      - TELEGRAM_ARCHIVE=${ARCHIVE}
      - TELEGRAM_TEST_GROUP=${TEST_GROUP}
      # Bot settings
      - SETTINGS_LOGS=${LOGS:-false}
      - SETTINGS_TEST=${TEST:-false}
      # Database
      - DATABASE_PATH=${DATABASE_PATH:-database/tg-bot-database.db}
    volumes:
      - ./bots/telegram-helper-bot/database:/app/database:rw
      - ./bots/telegram-helper-bot/logs:/app/logs:rw
      - ./bots/telegram-helper-bot/.env:/app/.env:ro
    networks:
      - bots_network
    depends_on:
      - prometheus
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    deploy:
      resources:
        limits:
          memory: 512M
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'
 volumes:
  prometheus_data:
    driver: local
  grafana_data:
    driver: local
 networks:
  bots_network:
    driver: bridge
    ipam:
      config:
        - subnet: 192.168.100.0/24
--- a/env.template
+++ b/env.template
@@ -0,0 +1,15 @@
 # Telegram Bot Configuration
 TELEGRAM_MONITORING_BOT_TOKEN=your_bot_token_here
 GROUP_MONITORING_FOR_LOGS=your_telegram_group_id_here
 IMPORTANT_MONITORING_LOGS=your_important_logs_channel_id_here
 # Monitoring Configuration
 THRESHOLD=80.0
 RECOVERY_THRESHOLD=75.0
 # Prometheus Configuration
 PROMETHEUS_RETENTION_DAYS=30
 # Grafana Configuration
 GRAFANA_ADMIN_USER=admin
 GRAFANA_ADMIN_PASSWORD=admin
--- a/infra/grafana/provisioning/dashboards/all-dashboards.yml
+++ b/infra/grafana/provisioning/dashboards/all-dashboards.yml
@@ -0,0 +1,12 @@
 apiVersion: 1
 providers:
  - name: 'Infrastructure Dashboards'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards
--- a/infra/grafana/provisioning/dashboards/server-dashboard.json
+++ b/infra/grafana/provisioning/dashboards/server-dashboard.json
@@ -0,0 +1,224 @@
 {
  "id": null,
  "title": "Server Monitoring",
  "tags": ["monitoring", "server"],
  "style": "dark",
  "timezone": "browser",
  "panels": [
    {
      "id": 1,
      "title": "CPU Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "cpu_usage_percent",
          "legendFormat": "CPU %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 70},
              {"color": "red", "value": 90}
            ]
          },
          "unit": "percent"
        }
      },
      "gridPos": {"h": 8, "w": 6, "x": 0, "y": 0}
    },
    {
      "id": 2,
      "title": "RAM Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "ram_usage_percent",
          "legendFormat": "RAM %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 70},
              {"color": "red", "value": 90}
            ]
          },
          "unit": "percent"
        }
      },
      "gridPos": {"h": 8, "w": 6, "x": 6, "y": 0}
    },
    {
      "id": 3,
      "title": "Disk Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "disk_usage_percent",
          "legendFormat": "Disk %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 80},
              {"color": "red", "value": 95}
            ]
          },
          "unit": "percent"
        }
      },
      "gridPos": {"h": 8, "w": 6, "x": 12, "y": 0}
    },
    {
      "id": 4,
      "title": "Load Average",
      "type": "timeseries",
      "targets": [
        {
          "expr": "load_average_1m",
          "legendFormat": "1m"
        },
        {
          "expr": "load_average_5m",
          "legendFormat": "5m"
        },
        {
          "expr": "load_average_15m",
          "legendFormat": "15m"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 10,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "vis": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          }
        }
      },
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
    },
    {
      "id": 5,
      "title": "System Uptime",
      "type": "stat",
      "targets": [
        {
          "expr": "system_uptime_seconds",
          "legendFormat": "Uptime"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "unit": "s"
        }
      },
      "gridPos": {"h": 8, "w": 6, "x": 12, "y": 8}
    },
    {
      "id": 6,
      "title": "Disk I/O Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "disk_io_percent",
          "legendFormat": "Disk I/O %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 50},
              {"color": "red", "value": 80}
            ]
          },
          "unit": "percent"
        }
      },
      "gridPos": {"h": 8, "w": 6, "x": 0, "y": 16}
    },
    {
      "id": 7,
      "title": "Swap Usage",
      "type": "stat",
      "targets": [
        {
          "expr": "swap_usage_percent",
          "legendFormat": "Swap %"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 50},
              {"color": "red", "value": 80}
            ]
          },
          "unit": "percent"
        }
      },
      "gridPos": {"h": 8, "w": 6, "x": 6, "y": 16}
    }
  ],
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "refresh": "30s"
 }
--- a/infra/grafana/provisioning/dashboards/telegram-bot-dashboards.json
+++ b/infra/grafana/provisioning/dashboards/telegram-bot-dashboards.json
--- a/infra/grafana/provisioning/datasources/prometheus.yml
+++ b/infra/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,8 @@
 apiVersion: 1
 datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
--- a/infra/monitoring/__init__.py
+++ b/infra/monitoring/__init__.py
@@ -0,0 +1,7 @@
 # Infrastructure Monitoring Module
 from .metrics_collector import MetricsCollector
 from .message_sender import MessageSender
 from .server_monitor import ServerMonitor
 __all__ = ['MetricsCollector', 'MessageSender', 'ServerMonitor']
--- a/infra/monitoring/check_grafana.py
+++ b/infra/monitoring/check_grafana.py
@@ -0,0 +1,127 @@
 #!/usr/bin/env python3
 """
 Скрипт для проверки статуса Grafana и дашбордов
 """
 import requests
 import json
 import sys
 from datetime import datetime
 def check_grafana_status():
    """Probe Grafana's health endpoint on localhost:3000 and print the outcome.

    Returns:
        True when the endpoint answers HTTP 200; False for any other status
        code or when the request / JSON decoding raises.
    """
    try:
        reply = requests.get("http://localhost:3000/api/health", timeout=5)
        # Guard clause: anything but 200 is reported as a failure
        if reply.status_code != 200:
            print(f"❌ Grafana: HTTP {reply.status_code}")
            return False
        health = reply.json()
        print(f"✅ Grafana работает (версия: {health.get('version', 'unknown')})")
        return True
    except Exception as err:
        print(f"❌ Grafana: ошибка подключения - {err}")
        return False
 def check_prometheus_connection():
    """Check that the Prometheus HTTP API on localhost:9090 answers.

    Requests /api/v1/targets and prints the outcome. Only the status code is
    inspected here; the response body is not parsed.

    Returns:
        True on HTTP 200; False on any other status code or request error.
    """
    try:
        # Make sure Prometheus is reachable
        reply = requests.get("http://localhost:9090/api/v1/targets", timeout=5)
        if reply.status_code != 200:
            print(f"❌ Prometheus: HTTP {reply.status_code}")
            return False
        print("✅ Prometheus доступен для Grafana")
        return True
    except Exception as err:
        print(f"❌ Prometheus: ошибка подключения - {err}")
        return False
 def check_metrics_availability():
    """Check that the metrics endpoint on localhost:9091 serves the expected metrics.

    The response text must mention both ``cpu_usage_percent`` and
    ``ram_usage_percent`` for the check to pass.

    Returns:
        True when the endpoint answers HTTP 200 and both metric names are
        present; False otherwise (including request errors).
    """
    try:
        reply = requests.get("http://localhost:9091/metrics", timeout=5)
        if reply.status_code != 200:
            print(f"❌ Метрики: HTTP {reply.status_code}")
            return False
        body = reply.text
        required_metrics = ("cpu_usage_percent", "ram_usage_percent")
        if all(metric in body for metric in required_metrics):
            print("✅ Метрики доступны и содержат данные")
            return True
        print("⚠️  Метрики доступны, но данные неполные")
        return False
    except Exception as err:
        print(f"❌ Метрики: ошибка подключения - {err}")
        return False
 def check_prometheus_targets():
    """Print the health of every active Prometheus scrape target.

    Reads /api/v1/targets on localhost:9090 and prints one line per entry of
    ``data.activeTargets`` (job, instance, health), plus the target's last
    error when it is non-empty.

    Returns:
        True when the API answered HTTP 200 (regardless of individual target
        health); False on any other status code or request error.
    """
    try:
        reply = requests.get("http://localhost:9090/api/v1/targets", timeout=5)
        if reply.status_code != 200:
            print(f"❌ Prometheus API: HTTP {reply.status_code}")
            return False
        payload = reply.json()
        active_targets = payload.get('data', {}).get('activeTargets', [])
        print("\n📊 Статус targets в Prometheus:")
        for entry in active_targets:
            job_name = entry.get('labels', {}).get('job', 'unknown')
            instance_name = entry.get('labels', {}).get('instance', 'unknown')
            state = entry.get('health', 'unknown')
            error_text = entry.get('lastError', '')
            marker = "❌" if state != "up" else "✅"
            print(f"  {marker} {job_name} ({instance_name}): {state}")
            if error_text:
                print(f"    Ошибка: {error_text}")
        return True
    except Exception as err:
        print(f"❌ Prometheus API: ошибка подключения - {err}")
        return False
 def main():
    """Run every monitoring check, print a summary, and exit non-zero on failure.

    All four checks are always executed, even when an earlier one fails.
    When any check fails the process exits with status 1 via ``sys.exit``.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    separator = "=" * 70
    print(f"🔍 Проверка Grafana и системы мониторинга - {stamp}")
    print(separator)
    # Build a list (not a generator) so that no check is short-circuited
    results = [
        check_grafana_status(),
        check_prometheus_connection(),
        check_metrics_availability(),
        check_prometheus_targets(),
    ]
    print("\n" + separator)
    if not all(results):
        print("⚠️  Обнаружены проблемы в системе мониторинга")
        print("   Проверьте логи и настройки")
        sys.exit(1)
    success_lines = (
        "🎉 Все компоненты работают корректно!",
        "\n📋 Доступные адреса:",
        "  • Grafana: http://localhost:3000 (admin/admin)",
        "  • Prometheus: http://localhost:9090",
        "  • Метрики: http://localhost:9091/metrics",
        "\n📊 Дашборды должны быть доступны в Grafana:",
        "  • Server Monitoring",
        "  • Server Monitoring Dashboard",
        "\n💡 Если дашборды не видны, используйте ручную настройку:",
        "  • См. файл: GRAFANA_MANUAL_SETUP.md",
    )
    for line in success_lines:
        print(line)
 if __name__ == "__main__":
    main()
--- a/infra/monitoring/main.py
+++ b/infra/monitoring/main.py
@@ -0,0 +1,50 @@
 #!/usr/bin/env python3
 """
 Основной скрипт для запуска модуля мониторинга сервера
 """
 import asyncio
 import logging
 import os
 import sys
 # Добавляем корневую папку проекта в путь
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
 from dotenv import load_dotenv
 from infra.monitoring.server_monitor import ServerMonitor
 # Load environment variables from a .env file
 load_dotenv()
 # Configure logging: INFO level, with timestamp, logger name and level in every record
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 async def main():
    """Start server monitoring: send the startup status, then run the monitoring loop.

    A KeyboardInterrupt is logged and swallowed; any other exception is
    logged and re-raised to the caller.
    """
    try:
        # Build the monitor, announce startup, then hand control to its loop
        server_monitor = ServerMonitor()
        await server_monitor.send_startup_status()
        await server_monitor.monitor_loop()
    except KeyboardInterrupt:
        logger.info("Мониторинг остановлен пользователем")
    except Exception as exc:
        logger.error(f"Критическая ошибка в мониторинге: {exc}")
        raise
 if __name__ == "__main__":
    # Запускаем асинхронную функцию
    asyncio.run(main())
--- a/infra/monitoring/message_sender.py
+++ b/infra/monitoring/message_sender.py
@@ -0,0 +1,252 @@
 import os
 import aiohttp
 import logging
 from datetime import datetime
 from typing import Dict, List, Tuple
 try:
    from .metrics_collector import MetricsCollector
 except ImportError:
    from metrics_collector import MetricsCollector
 logger = logging.getLogger(__name__)
 class MessageSender:
    """Formats server status, alert and recovery messages and sends them via the Telegram Bot API.

    Configuration is read from environment variables: the bot token
    (``TELEGRAM_MONITORING_BOT_TOKEN``), the chat for periodic status messages
    (``GROUP_MONITORING_FOR_LOGS``) and the chat for alerts
    (``IMPORTANT_MONITORING_LOGS``). The short names ``TELEGRAM_BOT_TOKEN``,
    ``GROUP_FOR_LOGS`` and ``IMPORTANT_LOGS`` are accepted as fallbacks.

    All messages are sent with ``parse_mode="HTML"``, so they are formatted
    with HTML tags and free-form text is HTML-escaped.
    """
    # Display titles for the metric keys used in alerts and recoveries
    _METRIC_NAMES = {
        'cpu': 'Использование CPU',
        'ram': 'Использование оперативной памяти',
        'disk': 'Заполнение диска (/)'
    }
    # Upper bound, in seconds, for one Telegram API request
    _REQUEST_TIMEOUT_SECONDS = 15
    _SEPARATOR = "---------------------------------"

    def __init__(self):
        # Read configuration from the environment (long names first, short names as fallback)
        self.telegram_bot_token = self._env('TELEGRAM_MONITORING_BOT_TOKEN', 'TELEGRAM_BOT_TOKEN')
        self.group_for_logs = self._env('GROUP_MONITORING_FOR_LOGS', 'GROUP_FOR_LOGS')
        self.important_logs = self._env('IMPORTANT_MONITORING_LOGS', 'IMPORTANT_LOGS')
        # Metrics collector used to obtain system information and alert state
        self.metrics_collector = MetricsCollector()
        # Time the last periodic status was sent (None until the first one)
        self.last_status_time = None
        if not self.telegram_bot_token:
            logger.warning("TELEGRAM_MONITORING_BOT_TOKEN не установлен в переменных окружения")
        if not self.group_for_logs:
            logger.warning("GROUP_MONITORING_FOR_LOGS не установлен в переменных окружения")
        if not self.important_logs:
            logger.warning("IMPORTANT_MONITORING_LOGS не установлен в переменных окружения")

    @staticmethod
    def _env(*names):
        """Return the first non-empty environment variable among ``names``, or None."""
        for name in names:
            value = os.getenv(name)
            if value:
                return value
        return None

    @staticmethod
    def _esc(value) -> str:
        """HTML-escape ``value`` so it cannot break Telegram's HTML parsing."""
        import html  # local import keeps the file's import block untouched
        return html.escape(str(value), quote=False)

    async def send_telegram_message(self, chat_id: str, message: str) -> bool:
        """Send ``message`` (HTML parse mode) to ``chat_id`` through the Bot API.

        Returns:
            True when Telegram answers HTTP 200; False when the token is
            missing, Telegram answers with another status, or the request fails.
        """
        if not self.telegram_bot_token:
            logger.error("TELEGRAM_MONITORING_BOT_TOKEN не установлен")
            return False
        url = f"https://api.telegram.org/bot{self.telegram_bot_token}/sendMessage"
        payload = {
            "chat_id": chat_id,
            "text": message,
            "parse_mode": "HTML"
        }
        try:
            # Explicit timeout so a stalled request cannot block the monitoring loop for minutes
            timeout = aiohttp.ClientTimeout(total=self._REQUEST_TIMEOUT_SECONDS)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(url, json=payload) as response:
                    if response.status == 200:
                        logger.info(f"Сообщение успешно отправлено в чат {chat_id}")
                        return True
                    response_text = await response.text()
                    logger.error(f"Ошибка отправки в Telegram: {response.status} - {response_text}")
                    return False
        except Exception as e:
            logger.error(f"Ошибка при отправке сообщения в Telegram: {e!r}")
            return False

    def should_send_status(self) -> bool:
        """Tell whether a periodic status is due (at minute 00 and 30 of every hour).

        Returns True at most once per calendar minute and records the time in
        ``last_status_time`` when it does.
        """
        now = datetime.now()
        if now.minute not in (0, 30):
            return False
        # Compare full minute-truncated timestamps so the same hh:mm on another day still fires
        current_slot = now.replace(second=0, microsecond=0)
        last = self.last_status_time
        if last is not None and last.replace(second=0, microsecond=0) == current_slot:
            return False
        self.last_status_time = now
        return True

    def should_send_startup_status(self) -> bool:
        """Return True while no status has been recorded yet (``last_status_time`` is None)."""
        return self.last_status_time is None

    def _get_disk_space_emoji(self, disk_percent: float) -> str:
        """Map disk usage to an emoji: below 60 -> green, below 90 -> warning, otherwise alarm."""
        if disk_percent < 60:
            return "🟢"
        elif disk_percent < 90:
            return "⚠️"
        else:
            return "🚨"

    def get_status_message(self, system_info: Dict) -> str:
        """Build the HTML status message from ``system_info``.

        Args:
            system_info: mapping with the keys read below (``current_time``,
                ``cpu_percent``, ``ram_used`` ...).

        Returns:
            The formatted message, or an error text when building it fails.
        """
        try:
            voice_bot_status, voice_bot_uptime = self.metrics_collector.check_process_status('voice_bot')
            helper_bot_status, helper_bot_uptime = self.metrics_collector.check_process_status('helper_bot')
            # Emoji reflecting how full the disk is
            disk_emoji = self._get_disk_space_emoji(system_info['disk_percent'])
            # NOTE(review): "IO Wait" is filled from 'disk_percent' (disk fill level), which looks
            # like a copy-paste slip; the correct key is not visible here, so it is left unchanged.
            lines = [
                f"🖥 <b>Статус Сервера</b> | <code>{system_info['current_time']}</code>",
                self._SEPARATOR,
                "<b>📊 Общая нагрузка:</b>",
                f"CPU: <b>{system_info['cpu_percent']}%</b> | LA: <b>{system_info['load_avg_1m']} / {system_info['cpu_count']}</b> | IO Wait: <b>{system_info['disk_percent']}%</b>",
                "<b>💾 Память:</b>",
                f"RAM: <b>{system_info['ram_used']}/{system_info['ram_total']} GB</b> ({system_info['ram_percent']}%)",
                f"Swap: <b>{system_info['swap_used']}/{system_info['swap_total']} GB</b> ({system_info['swap_percent']}%)",
                "<b>🗂️ Дисковое пространство:</b>",
                f"Диск (/): <b>{system_info['disk_used']}/{system_info['disk_total']} GB</b> ({system_info['disk_percent']}%) {disk_emoji}",
                "<b>💿 Диск I/O:</b>",
                f"Read: <b>{system_info['disk_read_speed']}</b> | Write: <b>{system_info['disk_write_speed']}</b>",
                f"Диск загружен: <b>{system_info['disk_io_percent']}%</b>",
                "<b>🤖 Процессы:</b>",
                f"{voice_bot_status} voice-bot - {voice_bot_uptime}",
                f"{helper_bot_status} helper-bot - {helper_bot_uptime}",
                self._SEPARATOR,
                f"⏰ Uptime сервера: {system_info['system_uptime']}",
            ]
            return "\n".join(lines)
        except Exception as e:
            logger.error(f"Ошибка при формировании статуса сервера: {e}")
            return f"Ошибка при получении статуса сервера: {self._esc(e)}"

    def get_alert_message(self, metric_name: str, current_value: float, details: str) -> str:
        """Build the HTML alert message for a metric that exceeded the threshold.

        The threshold shown is the collector's configured ``threshold``.
        """
        try:
            threshold = self.metrics_collector.threshold
            lines = [
                "🚨  <b>ALERT: Высокая нагрузка на сервере!</b>",
                self._SEPARATOR,
                f"<b>Показатель:</b> {self._esc(metric_name)}",
                f"<b>Текущее значение:</b> <b>{current_value}%</b> ⚠️",
                f"<b>Пороговое значение:</b> {threshold:g}%",
                "<b>Детали:</b>",
                self._esc(details),
                f"<b>Сервер:</b> <code>{self.metrics_collector.os_type.upper()}</code>",
                f"<b>Время:</b> <code>{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</code>",
                self._SEPARATOR,
            ]
            return "\n".join(lines)
        except Exception as e:
            logger.error(f"Ошибка при формировании алерта: {e}")
            return f"Ошибка при формировании алерта: {self._esc(e)}"

    def get_recovery_message(self, metric_name: str, current_value: float, peak_value: float) -> str:
        """Build the HTML message announcing that a metric returned to normal."""
        try:
            lines = [
                "✅  <b>RECOVERY: Нагрузка нормализовалась</b>",
                self._SEPARATOR,
                f"<b>Показатель:</b> {self._esc(metric_name)}",
                f"<b>Текущее значение:</b> <b>{current_value}%</b> ✔️",
                f"<b>Было превышение:</b> До {peak_value}%",
                f"<b>Сервер:</b> <code>{self.metrics_collector.os_type.upper()}</code>",
                f"<b>Время:</b> <code>{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</code>",
                self._SEPARATOR,
            ]
            return "\n".join(lines)
        except Exception as e:
            logger.error(f"Ошибка при формировании сообщения о восстановлении: {e}")
            return f"Ошибка при формировании сообщения о восстановлении: {self._esc(e)}"

    async def send_status_message(self) -> bool:
        """Collect system information and send the status message to the logs chat."""
        if not self.group_for_logs:
            logger.warning("GROUP_MONITORING_FOR_LOGS не установлен, пропускаем отправку статуса")
            return False
        try:
            system_info = self.metrics_collector.get_system_info()
            if not system_info:
                logger.error("Не удалось получить информацию о системе")
                return False
            status_message = self.get_status_message(system_info)
            return await self.send_telegram_message(self.group_for_logs, status_message)
        except Exception as e:
            logger.error(f"Ошибка при отправке статуса: {e}")
            return False

    async def send_alert_message(self, metric_type: str, current_value: float, details: str) -> bool:
        """Send an alert for ``metric_type`` to the important-logs chat."""
        if not self.important_logs:
            logger.warning("IMPORTANT_MONITORING_LOGS не установлен, пропускаем отправку алерта")
            return False
        try:
            # Unknown metric keys are shown as-is
            metric_name = self._METRIC_NAMES.get(metric_type, metric_type)
            alert_message = self.get_alert_message(metric_name, current_value, details)
            return await self.send_telegram_message(self.important_logs, alert_message)
        except Exception as e:
            logger.error(f"Ошибка при отправке алерта: {e}")
            return False

    async def send_recovery_message(self, metric_type: str, current_value: float, peak_value: float) -> bool:
        """Send a recovery notice for ``metric_type`` to the important-logs chat."""
        if not self.important_logs:
            logger.warning("IMPORTANT_MONITORING_LOGS не установлен, пропускаем отправку сообщения о восстановлении")
            return False
        try:
            metric_name = self._METRIC_NAMES.get(metric_type, metric_type)
            recovery_message = self.get_recovery_message(metric_name, current_value, peak_value)
            return await self.send_telegram_message(self.important_logs, recovery_message)
        except Exception as e:
            logger.error(f"Ошибка при отправке сообщения о восстановлении: {e}")
            return False

    async def process_alerts_and_recoveries(self) -> None:
        """Evaluate current metrics and send any resulting alerts and recovery notices."""
        try:
            system_info = self.metrics_collector.get_system_info()
            if not system_info:
                return
            alerts, recoveries = self.metrics_collector.check_alerts(system_info)
            for metric_type, value, details in alerts:
                # Log the real delivery outcome instead of always claiming success
                if await self.send_alert_message(metric_type, value, details):
                    logger.warning(f"ALERT отправлен: {metric_type} - {value}% - {details}")
                else:
                    logger.error(f"ALERT не отправлен: {metric_type} - {value}% - {details}")
            for metric_type, value in recoveries:
                # The actual peak is not tracked; the configured threshold is reported instead
                peak_value = self.metrics_collector.threshold
                if await self.send_recovery_message(metric_type, value, peak_value):
                    logger.info(f"RECOVERY отправлен: {metric_type} - {value}%")
                else:
                    logger.error(f"RECOVERY не отправлен: {metric_type} - {value}%")
        except Exception as e:
            logger.error(f"Ошибка при обработке алертов и восстановлений: {e}")
--- a/infra/monitoring/metrics_collector.py
+++ b/infra/monitoring/metrics_collector.py
@@ -0,0 +1,495 @@
 import os
 import psutil
 import time
 import platform
 from datetime import datetime
 from typing import Dict, Optional, Tuple
 import logging
 logger = logging.getLogger(__name__)
 class MetricsCollector:
    def __init__(self):
        # Определяем ОС
        self.os_type = self._detect_os()
        logger.info(f"Обнаружена ОС: {self.os_type}")
        # Пороговые значения для алертов
        self.threshold = float(os.getenv('THRESHOLD', '80.0'))
        self.recovery_threshold = float(os.getenv('RECOVERY_THRESHOLD', '75.0'))
        # Состояние алертов для предотвращения спама
        self.alert_states = {
            'cpu': False,
            'ram': False,
            'disk': False
        }
        # PID файлы для отслеживания процессов
        self.pid_files = {
            'voice_bot': 'voice_bot.pid',
            'helper_bot': 'helper_bot.pid'
        }
        # Для расчета скорости диска
        self.last_disk_io = None
        self.last_disk_io_time = None
        # Для расчета процента загрузки диска (отдельные переменные)
        self.last_disk_io_for_percent = None
        self.last_disk_io_time_for_percent = None
        # Инициализируем базовые значения для скорости диска при первом вызове
        self._initialize_disk_io()
        # Время запуска мониторинга для расчета uptime
        self.monitor_start_time = time.time()
    def _detect_os(self) -> str:
        """Определение типа операционной системы"""
        system = platform.system().lower()
        if system == "darwin":
            return "macos"
        elif system == "linux":
            return "ubuntu"
        else:
            return "unknown"
    def _initialize_disk_io(self):
        """Инициализация базовых значений для расчета скорости диска"""
        try:
            disk_io = self._get_disk_io_counters()
            if disk_io:
                self.last_disk_io = disk_io
                self.last_disk_io_time = time.time()
                logger.debug("Инициализированы базовые значения для расчета скорости диска")
        except Exception as e:
            logger.error(f"Ошибка при инициализации диска I/O: {e}")
    def _get_disk_path(self) -> str:
        """Получение пути к диску в зависимости от ОС"""
        if self.os_type == "macos":
            return "/"
        elif self.os_type == "ubuntu":
            return "/"
        else:
            return "/"
    def _get_disk_usage(self) -> Optional[object]:
        """Получение информации о диске с учетом ОС"""
        try:
            if self.os_type == "macos":
                # На macOS используем diskutil для получения реального использования диска
                return self._get_macos_disk_usage()
            else:
                disk_path = self._get_disk_path()
                return psutil.disk_usage(disk_path)
        except Exception as e:
            logger.error(f"Ошибка при получении информации о диске: {e}")
            return None
    def _get_macos_disk_usage(self) -> Optional[object]:
        """Получение информации о диске на macOS через diskutil"""
        try:
            import subprocess
            import re
            # Получаем информацию о диске через diskutil
            result = subprocess.run(['diskutil', 'info', '/'], capture_output=True, text=True)
            if result.returncode != 0:
                # Fallback к psutil
                return psutil.disk_usage('/')
            output = result.stdout
            # Извлекаем размеры из вывода diskutil
            total_match = re.search(r'Container Total Space:\s+(\d+\.\d+)\s+GB', output)
            free_match = re.search(r'Container Free Space:\s+(\d+\.\d+)\s+GB', output)
            if total_match and free_match:
                total_gb = float(total_match.group(1))
                free_gb = float(free_match.group(1))
                used_gb = total_gb - free_gb
                # Создаем объект, похожий на результат psutil.disk_usage
                class DiskUsage:
                    def __init__(self, total, used, free):
                        self.total = total * (1024**3)  # Конвертируем в байты
                        self.used = used * (1024**3)
                        self.free = free * (1024**3)
                return DiskUsage(total_gb, used_gb, free_gb)
            else:
                # Fallback к psutil
                return psutil.disk_usage('/')
        except Exception as e:
            logger.error(f"Ошибка при получении информации о диске macOS: {e}")
            # Fallback к psutil
            return psutil.disk_usage('/')
    def _get_disk_io_counters(self):
        """Получение статистики диска с учетом ОС"""
        try:
            if self.os_type == "macos":
                # На macOS может быть несколько дисков, берем основной
                return psutil.disk_io_counters(perdisk=False)
            elif self.os_type == "ubuntu":
                # На Ubuntu обычно один диск
                return psutil.disk_io_counters(perdisk=False)
            else:
                return psutil.disk_io_counters()
        except Exception as e:
            logger.error(f"Ошибка при получении статистики диска: {e}")
            return None
    def _get_system_uptime(self) -> float:
        """Получение uptime системы с учетом ОС"""
        try:
            if self.os_type == "macos":
                # На macOS используем boot_time
                boot_time = psutil.boot_time()
                return time.time() - boot_time
            elif self.os_type == "ubuntu":
                # На Ubuntu также используем boot_time
                boot_time = psutil.boot_time()
                return time.time() - boot_time
            else:
                boot_time = psutil.boot_time()
                return time.time() - boot_time
        except Exception as e:
            logger.error(f"Ошибка при получении uptime системы: {e}")
            return 0.0
    def get_monitor_uptime(self) -> str:
        """Получение uptime мониторинга"""
        uptime_seconds = time.time() - self.monitor_start_time
        return self._format_uptime(uptime_seconds)
    def get_system_info(self) -> Dict:
        """Получение информации о системе"""
        try:
            # CPU
            cpu_percent = psutil.cpu_percent(interval=1)
            load_avg = psutil.getloadavg()
            cpu_count = psutil.cpu_count()
            # Память
            memory = psutil.virtual_memory()
            swap = psutil.swap_memory()
            # Используем единый расчет для всех ОС: used / total для получения процента занятой памяти
            # Это обеспечивает консистентность между macOS и Ubuntu
            ram_percent = (memory.used / memory.total) * 100
            # Диск
            disk = self._get_disk_usage()
            disk_io = self._get_disk_io_counters()
            if disk is None:
                logger.error("Не удалось получить информацию о диске")
                return {}
            # Сначала рассчитываем процент загрузки диска (до обновления last_disk_io_time)
            disk_io_percent = self._calculate_disk_io_percent()
            # Затем рассчитываем скорость диска (это обновит last_disk_io_time)
            disk_read_speed, disk_write_speed = self._calculate_disk_speed(disk_io)
            # Диагностика диска для отладки
            if disk_io:
                logger.debug(f"Диск I/O статистика: read_count={disk_io.read_count}, write_count={disk_io.write_count}, "
                           f"read_bytes={disk_io.read_bytes}, write_bytes={disk_io.write_bytes}")
            # Система
            system_uptime = self._get_system_uptime()
            # Получаем имя хоста в зависимости от ОС
            if self.os_type == "macos":
                hostname = os.uname().nodename
            elif self.os_type == "ubuntu":
                hostname = os.uname().nodename
            else:
                hostname = "unknown"
            return {
                'cpu_percent': cpu_percent,
                'load_avg_1m': round(load_avg[0], 2),
                'load_avg_5m': round(load_avg[1], 2),
                'load_avg_15m': round(load_avg[2], 2),
                'cpu_count': cpu_count,
                'ram_used': round(memory.used / (1024**3), 2),
                'ram_total': round(memory.total / (1024**3), 2),
                'ram_percent': round(ram_percent, 1),  # Исправленный процент занятой памяти
                'swap_used': round(swap.used / (1024**3), 2),
                'swap_total': round(swap.total / (1024**3), 2),
                'swap_percent': swap.percent,
                'disk_used': round(disk.used / (1024**3), 2),
                'disk_total': round(disk.total / (1024**3), 2),
                'disk_percent': round((disk.used / disk.total) * 100, 1),
                'disk_free': round(disk.free / (1024**3), 2),
                'disk_read_speed': disk_read_speed,
                'disk_write_speed': disk_write_speed,
                'disk_io_percent': disk_io_percent,
                'system_uptime': self._format_uptime(system_uptime),
                'monitor_uptime': self.get_monitor_uptime(),
                'server_hostname': hostname,
                'current_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
        except Exception as e:
            logger.error(f"Ошибка при получении информации о системе: {e}")
            return {}
    def _format_bytes(self, bytes_value: int) -> str:
        """Форматирование байтов в человекочитаемый вид"""
        if bytes_value == 0:
            return "0 B"
        size_names = ["B", "KB", "MB", "GB", "TB"]
        i = 0
        while bytes_value >= 1024 and i < len(size_names) - 1:
            bytes_value /= 1024.0
            i += 1
        return f"{bytes_value:.1f} {size_names[i]}"
    def _format_uptime(self, seconds: float) -> str:
        """Форматирование времени работы системы"""
        days = int(seconds // 86400)
        hours = int((seconds % 86400) // 3600)
        minutes = int((seconds % 3600) // 60)
        if days > 0:
            return f"{days}д {hours}ч {minutes}м"
        elif hours > 0:
            return f"{hours}ч {minutes}м"
        else:
            return f"{minutes}м"
    def check_process_status(self, process_name: str) -> Tuple[str, str]:
        """Проверка статуса процесса и возврат статуса с uptime"""
        try:
            # Сначала проверяем по PID файлу
            pid_file = self.pid_files.get(process_name)
            if pid_file and os.path.exists(pid_file):
                try:
                    with open(pid_file, 'r') as f:
                        content = f.read().strip()
                        if content and content != '# Этот файл будет автоматически обновляться при запуске бота':
                            pid = int(content)
                            if psutil.pid_exists(pid):
                                # Получаем uptime процесса
                                try:
                                    proc = psutil.Process(pid)
                                    proc_uptime = time.time() - proc.create_time()
                                    uptime_str = self._format_uptime(proc_uptime)
                                    return "✅", f"Uptime {uptime_str}"
                                except:
                                    return "✅", "Uptime неизвестно"
                except (ValueError, FileNotFoundError):
                    pass
            # Проверяем по имени процесса более точно
            for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
                try:
                    proc_name = proc.info['name'].lower()
                    cmdline = ' '.join(proc.info['cmdline']).lower() if proc.info['cmdline'] else ''
                    # Более точная проверка для каждого бота
                    if process_name == 'voice_bot':
                        # Проверяем voice_bot
                        if ('voice_bot' in proc_name or 
                            'voice_bot' in cmdline or
                            'voice_bot_v2.py' in cmdline):
                            # Получаем uptime процесса
                            try:
                                proc_uptime = time.time() - proc.create_time()
                                uptime_str = self._format_uptime(proc_uptime)
                                return "✅", f"Uptime {uptime_str}"
                            except:
                                return "✅", "Uptime неизвестно"
                    elif process_name == 'helper_bot':
                        # Проверяем helper_bot
                        if ('helper_bot' in proc_name or 
                            'helper_bot' in cmdline or
                            'run_helper.py' in cmdline or
                            'python' in proc_name and 'helper_bot' in cmdline):
                            # Получаем uptime процесса
                            try:
                                proc_uptime = time.time() - proc.create_time()
                                uptime_str = self._format_uptime(proc_uptime)
                                return "✅", f"Uptime {uptime_str}"
                            except:
                                return "✅", "Uptime неизвестно"
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    continue
            return "❌", "Выключен"
        except Exception as e:
            logger.error(f"Ошибка при проверке процесса {process_name}: {e}")
            return "❌", "Выключен"
    def _calculate_disk_speed(self, current_disk_io) -> Tuple[str, str]:
        """Расчет скорости чтения/записи диска"""
        current_time = time.time()
        if self.last_disk_io is None or self.last_disk_io_time is None:
            self.last_disk_io = current_disk_io
            self.last_disk_io_time = current_time
            return "0 B/s", "0 B/s"
        time_diff = current_time - self.last_disk_io_time
        if time_diff < 1:  # Минимальный интервал 1 секунда
            return "0 B/s", "0 B/s"
        read_diff = current_disk_io.read_bytes - self.last_disk_io.read_bytes
        write_diff = current_disk_io.write_bytes - self.last_disk_io.write_bytes
        read_speed = read_diff / time_diff
        write_speed = write_diff / time_diff
        # Обновляем предыдущие значения
        self.last_disk_io = current_disk_io
        self.last_disk_io_time = current_time
        return self._format_bytes(read_speed) + "/s", self._format_bytes(write_speed) + "/s"
    def _calculate_disk_io_percent(self) -> int:
        """Расчет процента загрузки диска на основе реальной скорости I/O"""
        try:
            # Получаем текущую статистику диска
            current_disk_io = self._get_disk_io_counters()
            if current_disk_io is None:
                return 0
            current_time = time.time()
            # Если это первое измерение, инициализируем
            if self.last_disk_io_for_percent is None or self.last_disk_io_time_for_percent is None:
                logger.debug("Первое измерение диска для процента, инициализируем базовые значения")
                self.last_disk_io_for_percent = current_disk_io
                self.last_disk_io_time_for_percent = current_time
                return 0
            # Рассчитываем время между измерениями
            time_diff = current_time - self.last_disk_io_time_for_percent
            if time_diff < 0.1:  # Минимальный интервал 0.1 секунды для более точных измерений
                logger.debug(f"Интервал между измерениями слишком мал: {time_diff:.3f}s, возвращаем 0%")
                return 0
            # Рассчитываем скорость операций в секунду
            read_ops_diff = current_disk_io.read_count - self.last_disk_io_for_percent.read_count
            write_ops_diff = current_disk_io.write_count - self.last_disk_io_for_percent.write_count
            read_ops_per_sec = read_ops_diff / time_diff
            write_ops_per_sec = write_ops_diff / time_diff
            total_ops_per_sec = read_ops_per_sec + write_ops_per_sec
            # Рассчитываем скорость передачи данных в байтах в секунду
            read_bytes_diff = current_disk_io.read_bytes - self.last_disk_io_for_percent.read_bytes
            write_bytes_diff = current_disk_io.write_bytes - self.last_disk_io_for_percent.write_bytes
            read_bytes_per_sec = read_bytes_diff / time_diff
            write_bytes_per_sec = write_bytes_diff / time_diff
            total_bytes_per_sec = read_bytes_per_sec + write_bytes_per_sec
            # Обновляем предыдущие значения для процента
            self.last_disk_io_for_percent = current_disk_io
            self.last_disk_io_time_for_percent = current_time
            # Определяем максимальную производительность диска в зависимости от ОС
            if self.os_type == "macos":
                # macOS обычно имеет SSD с высокой производительностью
                max_ops_per_sec = 50000  # Операций в секунду
                max_bytes_per_sec = 3 * (1024**3)  # 3 GB/s
            elif self.os_type == "ubuntu":
                # Ubuntu может быть на разных типах дисков
                max_ops_per_sec = 30000  # Операций в секунду
                max_bytes_per_sec = 2 * (1024**3)  # 2 GB/s
            else:
                max_ops_per_sec = 40000
                max_bytes_per_sec = 2.5 * (1024**3)
            # Рассчитываем процент загрузки на основе операций и байтов
            # Защита от деления на ноль
            if max_ops_per_sec > 0:
                ops_percent = min(100, (total_ops_per_sec / max_ops_per_sec) * 100)
            else:
                ops_percent = 0
            if max_bytes_per_sec > 0:
                bytes_percent = min(100, (total_bytes_per_sec / max_bytes_per_sec) * 100)
            else:
                bytes_percent = 0
            # Взвешенный средний процент (операции важнее для большинства случаев)
            final_percent = (ops_percent * 0.7) + (bytes_percent * 0.3)
            # Логируем для отладки (только при высоких значениях)
            if final_percent > 10:
                logger.debug(f"Диск I/O: {total_ops_per_sec:.1f} ops/s, {total_bytes_per_sec/(1024**2):.1f} MB/s, "
                           f"Загрузка: {final_percent:.1f}% (ops: {ops_percent:.1f}%, bytes: {bytes_percent:.1f}%)")
            # Округляем до целого числа
            return round(final_percent)
        except Exception as e:
            logger.error(f"Ошибка при расчете процента загрузки диска: {e}")
            return 0
    def get_metrics_data(self) -> Dict:
        """Получение данных для метрик Prometheus"""
        system_info = self.get_system_info()
        if not system_info:
            return {}
        return {
            'cpu_usage_percent': system_info.get('cpu_percent', 0),
            'ram_usage_percent': system_info.get('ram_percent', 0),
            'disk_usage_percent': system_info.get('disk_percent', 0),
            'load_average_1m': system_info.get('load_avg_1m', 0),
            'load_average_5m': system_info.get('load_avg_5m', 0),
            'load_average_15m': system_info.get('load_avg_15m', 0),
            'swap_usage_percent': system_info.get('swap_percent', 0),
            'disk_io_percent': system_info.get('disk_io_percent', 0),
            'system_uptime_seconds': self._get_system_uptime(),
            'monitor_uptime_seconds': time.time() - self.monitor_start_time
        }
    def check_alerts(self, system_info: Dict) -> Tuple[bool, Optional[str]]:
        """Проверка необходимости отправки алертов"""
        alerts = []
        # Проверка CPU
        if system_info['cpu_percent'] > self.threshold and not self.alert_states['cpu']:
            self.alert_states['cpu'] = True
            alerts.append(('cpu', system_info['cpu_percent'], f"Нагрузка за 1 мин: {system_info['load_avg_1m']}"))
        # Проверка RAM
        if system_info['ram_percent'] > self.threshold and not self.alert_states['ram']:
            self.alert_states['ram'] = True
            alerts.append(('ram', system_info['ram_percent'], f"Используется: {system_info['ram_used']} GB из {system_info['ram_total']} GB"))
        # Проверка диска
        if system_info['disk_percent'] > self.threshold and not self.alert_states['disk']:
            self.alert_states['disk'] = True
            alerts.append(('disk', system_info['disk_percent'], f"Свободно: {system_info['disk_free']} GB на /"))
        # Проверка восстановления
        recoveries = []
        if system_info['cpu_percent'] < self.recovery_threshold and self.alert_states['cpu']:
            self.alert_states['cpu'] = False
            recoveries.append(('cpu', system_info['cpu_percent']))
        if system_info['ram_percent'] < self.recovery_threshold and self.alert_states['ram']:
            self.alert_states['ram'] = False
            recoveries.append(('ram', system_info['ram_percent']))
        if system_info['disk_percent'] < self.recovery_threshold and self.alert_states['disk']:
            self.alert_states['disk'] = False
            recoveries.append(('disk', system_info['disk_percent']))
        return alerts, recoveries
--- a/infra/monitoring/prometheus_server.py
+++ b/infra/monitoring/prometheus_server.py
@@ -0,0 +1,143 @@
 import asyncio
 import logging
 from aiohttp import web
 try:
    from .metrics_collector import MetricsCollector
 except ImportError:
    from metrics_collector import MetricsCollector
 logger = logging.getLogger(__name__)
 class PrometheusServer:
    def __init__(self, host='0.0.0.0', port=9091):
        self.host = host
        self.port = port
        self.metrics_collector = MetricsCollector()
        self.app = web.Application()
        self.setup_routes()
    def setup_routes(self):
        """Настройка маршрутов для Prometheus"""
        self.app.router.add_get('/', self.root_handler)
        self.app.router.add_get('/metrics', self.metrics_handler)
        self.app.router.add_get('/health', self.health_handler)
    async def root_handler(self, request):
        """Главная страница"""
        return web.Response(
            text="Prometheus Metrics Server\n\n"
                 "Available endpoints:\n"
                 "- /metrics - Prometheus metrics\n"
                 "- /health - Health check",
            content_type='text/plain'
        )
    async def health_handler(self, request):
        """Health check endpoint"""
        return web.Response(
            text="OK",
            content_type='text/plain'
        )
    async def metrics_handler(self, request):
        """Endpoint для Prometheus метрик"""
        try:
            metrics_data = self.metrics_collector.get_metrics_data()
            prometheus_metrics = self._format_prometheus_metrics(metrics_data)
            return web.Response(
                text=prometheus_metrics,
                content_type='text/plain'
            )
        except Exception as e:
            logger.error(f"Ошибка при получении метрик: {e}")
            return web.Response(
                text=f"Error: {str(e)}",
                status=500,
                content_type='text/plain'
            )
    def _format_prometheus_metrics(self, metrics_data: dict) -> str:
        """Форматирование метрик в Prometheus формат"""
        lines = []
        # Системная информация
        lines.append("# HELP system_info System information")
        lines.append("# TYPE system_info gauge")
        lines.append(f"system_info{{os=\"{self.metrics_collector.os_type}\"}} 1")
        # CPU метрики
        if 'cpu_usage_percent' in metrics_data:
            lines.append("# HELP cpu_usage_percent CPU usage percentage")
            lines.append("# TYPE cpu_usage_percent gauge")
            lines.append(f"cpu_usage_percent {metrics_data['cpu_usage_percent']}")
        if 'load_average_1m' in metrics_data:
            lines.append("# HELP load_average_1m 1 minute load average")
            lines.append("# TYPE load_average_1m gauge")
            lines.append(f"load_average_1m {metrics_data['load_average_1m']}")
        if 'load_average_5m' in metrics_data:
            lines.append("# HELP load_average_5m 5 minute load average")
            lines.append("# TYPE load_average_5m gauge")
            lines.append(f"load_average_5m {metrics_data['load_average_5m']}")
        if 'load_average_15m' in metrics_data:
            lines.append("# HELP load_average_15m 15 minute load average")
            lines.append("# TYPE load_average_15m gauge")
            lines.append(f"load_average_15m {metrics_data['load_average_15m']}")
        # RAM метрики
        if 'ram_usage_percent' in metrics_data:
            lines.append("# HELP ram_usage_percent RAM usage percentage")
            lines.append("# TYPE ram_usage_percent gauge")
            lines.append(f"ram_usage_percent {metrics_data['ram_usage_percent']}")
        # Disk метрики
        if 'disk_usage_percent' in metrics_data:
            lines.append("# HELP disk_usage_percent Disk usage percentage")
            lines.append("# TYPE disk_usage_percent gauge")
            lines.append(f"disk_usage_percent {metrics_data['disk_usage_percent']}")
        if 'disk_io_percent' in metrics_data:
            lines.append("# HELP disk_io_percent Disk I/O usage percentage")
            lines.append("# TYPE disk_io_percent gauge")
            lines.append(f"disk_io_percent {metrics_data['disk_io_percent']}")
        # Swap метрики
        if 'swap_usage_percent' in metrics_data:
            lines.append("# HELP swap_usage_percent Swap usage percentage")
            lines.append("# TYPE swap_usage_percent gauge")
            lines.append(f"swap_usage_percent {metrics_data['swap_usage_percent']}")
        # Uptime метрики
        if 'system_uptime_seconds' in metrics_data:
            lines.append("# HELP system_uptime_seconds System uptime in seconds")
            lines.append("# TYPE system_uptime_seconds gauge")
            lines.append(f"system_uptime_seconds {metrics_data['system_uptime_seconds']}")
        if 'monitor_uptime_seconds' in metrics_data:
            lines.append("# HELP monitor_uptime_seconds Monitor uptime in seconds")
            lines.append("# TYPE monitor_uptime_seconds gauge")
            lines.append(f"monitor_uptime_seconds {metrics_data['monitor_uptime_seconds']}")
        return '\n'.join(lines)
    async def start(self):
        """Запуск HTTP сервера"""
        runner = web.AppRunner(self.app)
        await runner.setup()
        site = web.TCPSite(runner, self.host, self.port)
        await site.start()
        logger.info(f"Prometheus сервер запущен на http://{self.host}:{self.port}")
        return runner
    async def stop(self, runner):
        """Остановка HTTP сервера"""
        await runner.cleanup()
        logger.info("Prometheus сервер остановлен")
--- a/infra/monitoring/server_monitor.py
+++ b/infra/monitoring/server_monitor.py
@@ -0,0 +1,62 @@
 import asyncio
 import logging
 try:
    from .metrics_collector import MetricsCollector
    from .message_sender import MessageSender
    from .prometheus_server import PrometheusServer
 except ImportError:
    from metrics_collector import MetricsCollector
    from message_sender import MessageSender
    from prometheus_server import PrometheusServer
 logger = logging.getLogger(__name__)
 class ServerMonitor:
    def __init__(self):
        # Создаем экземпляры модулей
        self.metrics_collector = MetricsCollector()
        self.message_sender = MessageSender()
        self.prometheus_server = PrometheusServer()
        logger.info(f"Модуль мониторинга сервера запущен на {self.metrics_collector.os_type.upper()}")
    async def monitor_loop(self):
        """Основной цикл мониторинга"""
        logger.info(f"Модуль мониторинга сервера запущен на {self.metrics_collector.os_type.upper()}")
        # Запускаем Prometheus сервер
        prometheus_runner = await self.prometheus_server.start()
        try:
            while True:
                try:
                    # Проверка алертов и восстановлений
                    await self.message_sender.process_alerts_and_recoveries()
                    # Проверка необходимости отправки статуса
                    if self.message_sender.should_send_status():
                        await self.message_sender.send_status_message()
                    # Пауза между проверками (30 секунд)
                    await asyncio.sleep(30)
                except Exception as e:
                    logger.error(f"Ошибка в цикле мониторинга: {e}")
                    await asyncio.sleep(30)
        finally:
            # Останавливаем Prometheus сервер при завершении
            await self.prometheus_server.stop(prometheus_runner)
    async def send_startup_status(self):
        """Отправка статуса при запуске"""
        if self.message_sender.should_send_startup_status():
            await self.message_sender.send_status_message()
    def get_system_info(self):
        """Получение информации о системе (для обратной совместимости)"""
        return self.metrics_collector.get_system_info()
    def get_metrics_data(self):
        """Получение данных для метрик Prometheus (для обратной совместимости)"""
        return self.metrics_collector.get_metrics_data()
--- a/infra/monitoring/test_monitor.py
+++ b/infra/monitoring/test_monitor.py
@@ -0,0 +1,100 @@
 #!/usr/bin/env python3
 """
 Тестовый скрипт для проверки работы модуля мониторинга
 """
 import sys
 import os
 import logging
 # Добавляем текущую директорию в путь для импорта
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from server_monitor import ServerMonitor
 # Настройка логирования
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 def main():
    """Smoke-test the monitoring module and print the results.

    Returns:
        0 when every check ran, 1 when the system snapshot could not be
        collected or an exception was raised.
    """
    print("🚀 Тестирование модуля мониторинга сервера")
    print("=" * 50)
    try:
        monitor = ServerMonitor()
        # ServerMonitor itself only exposes get_system_info() and
        # get_metrics_data(); the OS type, process checks and alert checks
        # live on its MetricsCollector.
        collector = monitor.metrics_collector

        # System snapshot
        print("📊 Получение информации о системе...")
        system_info = monitor.get_system_info()
        if system_info:
            print("✅ Информация о системе получена успешно")
            print(f"   CPU: {system_info.get('cpu_percent', 'N/A')}%")
            print(f"   RAM: {system_info.get('ram_percent', 'N/A')}%")
            print(f"   Диск: {system_info.get('disk_percent', 'N/A')}%")
            print(f"   Хост: {system_info.get('server_hostname', 'N/A')}")
            print(f"   ОС: {collector.os_type}")
        else:
            print("❌ Не удалось получить информацию о системе")
            # Report the failure through the exit code as well.
            return 1

        # Bot process status
        print("\n🤖 Проверка статуса процессов...")
        voice_status, voice_uptime = collector.check_process_status('voice_bot')
        helper_status, helper_uptime = collector.check_process_status('helper_bot')
        print(f"   Voice Bot: {voice_status} - {voice_uptime}")
        print(f"   Helper Bot: {helper_status} - {helper_uptime}")

        # Prometheus metrics
        print("\n📈 Получение метрик для Prometheus...")
        metrics = monitor.get_metrics_data()
        if metrics:
            print("✅ Метрики получены успешно")
            for key, value in metrics.items():
                print(f"   {key}: {value}")
        else:
            print("❌ Не удалось получить метрики")

        # Alerts and recoveries
        print("\n🚨 Проверка алертов...")
        alerts, recoveries = collector.check_alerts(system_info)
        if alerts:
            print(f"   Найдено алертов: {len(alerts)}")
            for alert_type, value, details in alerts:
                print(f"     {alert_type}: {value}% - {details}")
        else:
            print("   Алертов не найдено")
        if recoveries:
            print(f"   Найдено восстановлений: {len(recoveries)}")
            for recovery_type, value in recoveries:
                print(f"     {recovery_type}: {value}%")

        # Status message
        # NOTE(review): get_status_message is not defined on ServerMonitor;
        # presumably it lives on MessageSender - confirm.
        print("\n💬 Формирование сообщения о статусе...")
        build_status = (
            getattr(monitor, 'get_status_message', None)
            or getattr(monitor.message_sender, 'get_status_message', None)
        )
        status_message = build_status(system_info) if build_status else None
        if status_message:
            print("✅ Сообщение о статусе сформировано")
            print("   Первые 200 символов:")
            print(f"   {status_message[:200]}...")
        else:
            print("❌ Не удалось сформировать сообщение о статусе")

        print("\n🎉 Тестирование завершено успешно!")
    except Exception as e:
        print(f"❌ Ошибка при тестировании: {e}")
        logging.error(f"Ошибка при тестировании: {e}", exc_info=True)
        return 1
    return 0
 # Use sys.exit(): the bare exit() helper is installed by the site module and
 # is not guaranteed to exist (e.g. when Python runs with -S).
 if __name__ == "__main__":
    sys.exit(main())
--- a/infra/prometheus/prometheus.yml
+++ b/infra/prometheus/prometheus.yml
@@ -0,0 +1,38 @@
 # Defaults applied to every scrape job below unless the job overrides them.
 global:
  scrape_interval: 15s
  evaluation_interval: 15s
 # No rule files are loaded; the entries below are commented-out placeholders.
 rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
 scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  # Job for infrastructure monitoring
  - job_name: 'infrastructure'
    static_configs:
      - targets: ['host.docker.internal:9091']  # Port for the monitoring server's metrics
    metrics_path: '/metrics'
    # Overrides the global 15s interval for this job only.
    scrape_interval: 30s
    scrape_timeout: 10s
    # On a label-name conflict, keep the label values exposed by the target.
    honor_labels: true
  # Scrapes the Telegram helper bot's metrics endpoint.
  - job_name: 'telegram-helper-bot'
    static_configs:
      - targets: ['telegram-helper-bot:8080']  # Or the IP address of the server running the bot
        # Static labels must be attached to the static_configs entry:
        # a scrape config has no top-level `labels` field, and Prometheus
        # refuses to load a configuration containing unknown fields.
        labels:
          bot_name: 'telegram-helper-bot'
          environment: 'production'
          service: 'telegram-bot'
    metrics_path: '/metrics'
    scrape_interval: 15s
    scrape_timeout: 10s
    # On a label-name conflict, keep the label values exposed by the target.
    honor_labels: true
 # Alertmanager integration. The only target below is commented out, so no
 # Alertmanager endpoint is configured by this file.
 alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,4 @@
 psutil>=5.9.0
 # asyncio is intentionally NOT listed: it is part of the Python standard
 # library (3.4+). The PyPI package named "asyncio" is an obsolete backport
 # that must not be installed on a modern interpreter.
 aiohttp>=3.8.0
 python-dotenv>=1.0.0
--- a/scripts/deploy.sh
+++ b/scripts/deploy.sh
@@ -0,0 +1,221 @@
 #!/bin/bash
 # Bots Infrastructure Deployment Script
 # This script deploys the complete bots infrastructure using Docker Compose
 #
 # Usage: deploy.sh [stop|restart|logs [service]|status|help]
 # With no argument the full deployment sequence is executed.

 # Abort on the first command that exits with a non-zero status.
 set -e
 # Colors for output (ANSI escape sequences, interpreted by `echo -e` in the
 # print_* helpers)
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 NC='\033[0m' # No Color
 # Configuration
 # All three paths are relative, so they are resolved against the directory
 # the script is invoked from.
 COMPOSE_FILE="docker-compose.yml"
 ENV_FILE=".env"
 LOG_DIR="logs"
 # Print an informational message on stdout with a green "[INFO]" tag.
 #   $1 - message text (backslash escapes are interpreted by `echo -e`)
 print_status() {
    local message="$1"
    echo -e "${GREEN}[INFO]${NC} ${message}"
 }
 # Print a warning with a yellow "[WARNING]" tag.
 # Written to stderr so diagnostics stay separate from regular output when
 # stdout is piped or redirected.
 #   $1 - message text (backslash escapes are interpreted by `echo -e`)
 print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1" >&2
 }
 # Print an error with a red "[ERROR]" tag.
 # Written to stderr so error text never ends up in captured or piped
 # standard output.
 #   $1 - message text (backslash escapes are interpreted by `echo -e`)
 print_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
 }
 # Verify that the tools and files a deployment depends on are available:
 # the `docker` and `docker-compose` commands and the environment file.
 # Exits the script with status 1 on the first missing prerequisite.
 check_prerequisites() {
    print_status "Checking prerequisites..."

    # Each entry is "<command>:<human-readable name>".
    local entry tool label
    for entry in "docker:Docker" "docker-compose:Docker Compose"; do
        tool="${entry%%:*}"
        label="${entry#*:}"
        if ! command -v "$tool" > /dev/null 2>&1; then
            print_error "$label is not installed. Please install $label first."
            exit 1
        fi
    done

    if [[ ! -f "$ENV_FILE" ]]; then
        print_error "Environment file $ENV_FILE not found. Please create it from .env.example"
        exit 1
    fi

    print_status "Prerequisites check passed"
 }
 # Create the per-bot log directories under $LOG_DIR and the nginx SSL
 # directory. Directories that already exist are left untouched (`mkdir -p`).
 create_directories() {
    print_status "Creating necessary directories..."
    local dir
    for dir in \
        "$LOG_DIR"/telegram-helper-bot \
        "$LOG_DIR"/voice-bot \
        infra/nginx/ssl
    do
        mkdir -p "$dir"
    done
    print_status "Directories created"
 }
 # Load KEY=VALUE pairs from $ENV_FILE and export them into the environment.
 #
 # The file is parsed line by line instead of being expanded unquoted through
 # `export $(... | xargs)`: that form word-splits and glob-expands every value,
 # so values containing spaces or `*` (passwords, URLs) were silently corrupted.
 #
 # Parsing rules:
 #   - blank lines and lines whose first non-blank character is `#` are skipped
 #   - an optional leading `export ` keyword is accepted
 #   - everything after the first `=` is the value; trailing whitespace and a
 #     trailing carriage return are removed
 #   - one pair of matching surrounding single or double quotes is stripped
 #   - lines without `=` or with an invalid variable name are skipped
 #   - inline `# comments` after a value are NOT stripped
 # Exits with status 1 when the file does not exist.
 load_env() {
    print_status "Loading environment variables..."
    if [ ! -f "$ENV_FILE" ]; then
        print_error "Environment file not found"
        exit 1
    fi

    local line key value
    # `|| [ -n "$line" ]` keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [ -n "$line" ]; do
        line="${line%$'\r'}"                       # tolerate CRLF line endings
        line="${line#"${line%%[![:space:]]*}"}"    # trim leading whitespace
        case "$line" in
            ''|'#'*) continue ;;
        esac
        line="${line#export }"
        case "$line" in
            *=*) ;;
            *) continue ;;
        esac

        key="${line%%=*}"
        value="${line#*=}"
        key="${key%"${key##*[![:space:]]}"}"        # trim trailing whitespace
        value="${value%"${value##*[![:space:]]}"}"  # trim trailing whitespace

        if [[ ! "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
            print_warning "Skipping invalid variable name in $ENV_FILE: $key"
            continue
        fi

        # Strip one pair of matching surrounding quotes.
        case "$value" in
            \"*\") value="${value#\"}"; value="${value%\"}" ;;
            \'*\') value="${value#\'}"; value="${value%\'}" ;;
        esac

        export "$key=$value"
    done < "$ENV_FILE"

    print_status "Environment variables loaded"
 }
 # Ensure every required environment variable is set to a non-empty value.
 # Prints the names of all missing variables and exits with status 1 when at
 # least one is missing.
 validate_env() {
    print_status "Validating environment variables..."

    local name
    local -a absent=()
    for name in BOT_TOKEN_1 BOT_TOKEN_2 DB_PASSWORD REDIS_PASSWORD; do
        # ${!name} is indirect expansion: the value of the variable whose
        # name is stored in $name.
        [[ -n "${!name}" ]] || absent+=("$name")
    done

    if (( ${#absent[@]} > 0 )); then
        print_error "Missing required environment variables: ${absent[*]}"
        exit 1
    fi

    print_status "Environment variables validation passed"
 }
 # Bring the compose project down, but only when `docker-compose ps -q`
 # reports at least one container for it.
 stop_services() {
    print_status "Stopping existing services..."

    # The query stays inside the test so that a failing `ps` is treated as
    # "nothing to stop" instead of aborting the script under `set -e`.
    if [[ -z "$(docker-compose -f "$COMPOSE_FILE" ps -q)" ]]; then
        print_status "No existing services to stop"
        return
    fi

    docker-compose -f "$COMPOSE_FILE" down
    print_status "Existing services stopped"
 }
 # Build the images and start every service of the compose project in
 # detached mode. Exits with status 1 when `docker-compose up` fails.
 deploy_services() {
    print_status "Deploying services..."
    # The command is used directly as the `if` condition. The script runs
    # under `set -e`, so a failing stand-alone command would terminate the
    # script immediately and a later `$?` check (and its error message) would
    # never be reached.
    if docker-compose -f "$COMPOSE_FILE" up -d --build; then
        print_status "Services deployed successfully"
    else
        print_error "Failed to deploy services"
        exit 1
    fi
 }
 # Poll `docker-compose ps` until no line mentions "unhealthy" or "starting".
 # Makes at most 30 checks, sleeping 10 seconds between them; if services are
 # still not healthy on the last check a warning is printed and the function
 # returns normally (the deployment is not aborted).
 wait_for_services() {
    print_status "Waiting for services to be healthy..."

    local max_attempts=30
    local attempt pending
    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
        # `grep -c` exits non-zero when it counts 0 matches; `|| true` keeps
        # that from being treated as a failure.
        pending=$(docker-compose -f "$COMPOSE_FILE" ps | grep -c "unhealthy\|starting" || true)

        if [ "$pending" -eq 0 ]; then
            print_status "All services are healthy"
            return 0
        fi

        if [ "$attempt" -eq "$max_attempts" ]; then
            print_warning "Some services may not be fully healthy after $max_attempts attempts"
            return 0
        fi

        print_status "Waiting for services to be healthy... (attempt $attempt/$max_attempts)"
        sleep 10
    done
 }
 # Print the compose project's container status followed by the last
 # 10 log lines of its services.
 show_status() {
    local -a compose=(docker-compose -f "$COMPOSE_FILE")

    print_status "Service status:"
    "${compose[@]}" ps
    echo
    print_status "Service logs (last 10 lines):"
    "${compose[@]}" logs --tail=10
 }
 # Print the local endpoints of the deployed services and a hint on how to
 # follow their logs.
 show_access_info() {
    echo
    print_status "Access Information:"
    # One argument per output line; the final empty argument emits the
    # blank separator line.
    printf '%s\n' \
        "Grafana Dashboard: http://localhost:3000 (admin/admin)" \
        "Prometheus: http://localhost:9090" \
        "PostgreSQL: localhost:5432" \
        "Redis: localhost:6379" \
        ""
    print_status "Check logs with: docker-compose logs -f [service_name]"
 }
 # Run the full deployment: each step below is executed once, in order.
 main() {
    print_status "Starting bots infrastructure deployment..."

    local step
    for step in \
        check_prerequisites \
        create_directories \
        load_env \
        validate_env \
        stop_services \
        deploy_services \
        wait_for_services \
        show_status \
        show_access_info
    do
        "$step"
    done

    print_status "Deployment completed successfully!"
 }
 # Handle command line arguments.
 # The first argument selects the action. Note that anything not listed
 # below (including no argument at all) falls through to the full deployment.
 case "${1:-}" in
    "stop")
        print_status "Stopping services..."
        docker-compose -f "$COMPOSE_FILE" down
        print_status "Services stopped"
        ;;
    "restart")
        print_status "Restarting services..."
        docker-compose -f "$COMPOSE_FILE" restart
        print_status "Services restarted"
        ;;
    "logs")
        # Forward the service name only when one was supplied.
        # "${2:-}" would pass an empty-string argument when it is omitted,
        # which docker-compose treats as a (non-existent) service name;
        # ${2:+"$2"} expands to nothing at all in that case.
        docker-compose -f "$COMPOSE_FILE" logs -f ${2:+"$2"}
        ;;
    "status")
        show_status
        ;;
    "help"|"-h"|"--help")
        echo "Usage: $0 [command]"
        echo "Commands:"
        echo "  (no args)  - Deploy the infrastructure"
        echo "  stop       - Stop all services"
        echo "  restart    - Restart all services"
        echo "  logs       - Show logs (optionally specify service name)"
        echo "  status     - Show service status"
        echo "  help       - Show this help message"
        ;;
    *)
        main
        ;;
 esac
		`@@ -0,0 +1 @@`
							`# This file ensures the bots directory is tracked by git`