diff --git a/CHANGES_SUMMARY.md b/CHANGES_SUMMARY.md deleted file mode 100644 index 0519ecb..0000000 --- a/CHANGES_SUMMARY.md +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/Dockerfile.bot b/Dockerfile.bot index a4c9aba..ca2dc1d 100644 --- a/Dockerfile.bot +++ b/Dockerfile.bot @@ -52,10 +52,10 @@ USER deploy # Health check HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD curl -f http://localhost:8000/health || exit 1 + CMD curl -f http://localhost:8080/health || exit 1 # Expose metrics port -EXPOSE 8000 +EXPOSE 8080 # Graceful shutdown STOPSIGNAL SIGTERM diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index cb4580c..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,130 +0,0 @@ -version: '3.8' - -services: - telegram-bot: - build: - context: . - dockerfile: Dockerfile.bot - container_name: telegram-bot - restart: unless-stopped - expose: - - "8000" - environment: - - PYTHONPATH=/app - - DOCKER_CONTAINER=true - - LOG_LEVEL=${LOG_LEVEL:-INFO} - - LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-30} - - METRICS_HOST=${METRICS_HOST:-0.0.0.0} - - METRICS_PORT=${METRICS_PORT:-8000} - # Telegram settings - - TELEGRAM_BOT_TOKEN=${BOT_TOKEN} - - TELEGRAM_LISTEN_BOT_TOKEN=${LISTEN_BOT_TOKEN} - - TELEGRAM_TEST_BOT_TOKEN=${TEST_BOT_TOKEN} - - TELEGRAM_PREVIEW_LINK=${PREVIEW_LINK:-false} - - TELEGRAM_MAIN_PUBLIC=${MAIN_PUBLIC} - - TELEGRAM_GROUP_FOR_POSTS=${GROUP_FOR_POSTS} - - TELEGRAM_GROUP_FOR_MESSAGE=${GROUP_FOR_MESSAGE} - - TELEGRAM_GROUP_FOR_LOGS=${GROUP_FOR_LOGS} - - TELEGRAM_IMPORTANT_LOGS=${IMPORTANT_LOGS} - - TELEGRAM_ARCHIVE=${ARCHIVE} - - TELEGRAM_TEST_GROUP=${TEST_GROUP} - # Bot settings - - SETTINGS_LOGS=${LOGS:-false} - - SETTINGS_TEST=${TEST:-false} - # Database - - DATABASE_PATH=${DATABASE_PATH:-database/tg-bot-database.db} - volumes: - - ./database:/app/database:rw - - ./logs:/app/logs:rw - - ./.env:/app/.env:ro - networks: - - bot-internal - depends_on: - - prometheus - - grafana - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - deploy: - resources: - limits: - memory: 512M - cpus: '0.5' - reservations: - memory: 256M - cpus: '0.25' - - prometheus: - image: prom/prometheus:latest - container_name: prometheus - expose: - - "9090" - volumes: - - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro - - prometheus_data:/prometheus - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--web.console.libraries=/etc/prometheus/console_libraries' - - '--web.console.templates=/etc/prometheus/consoles' - - '--storage.tsdb.retention.time=200h' - - '--web.enable-lifecycle' - restart: unless-stopped - networks: - - bot-internal - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"] - interval: 30s - timeout: 10s - retries: 3 - deploy: - resources: - limits: - memory: 256M - cpus: '0.25' - - grafana: - image: grafana/grafana:latest - container_name: grafana - ports: - - "3000:3000" # Grafana доступна извне - environment: - - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin} - - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} - - GF_USERS_ALLOW_SIGN_UP=false - - GF_SERVER_ROOT_URL=http://localhost:3000 - volumes: - - grafana_data:/var/lib/grafana - - ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro - - ./grafana/datasources:/etc/grafana/provisioning/datasources:ro - restart: unless-stopped - networks: - - bot-internal - depends_on: - - prometheus - healthcheck: - test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"] - interval: 30s - timeout: 10s - retries: 3 - deploy: - resources: - limits: - memory: 256M - cpus: '0.25' - -volumes: - prometheus_data: - driver: local - grafana_data: - driver: local - -networks: - bot-internal: - driver: bridge - ipam: - config: - - subnet: 172.20.0.0/16 diff --git a/env.example b/env.example index 588a34f..bb48ef3 100644 --- a/env.example +++ b/env.example @@ -20,9 +20,9 @@ TEST=false # Database DATABASE_PATH=database/tg-bot-database.db -# Monitoring +# Monitoring (Centralized Prometheus) METRICS_HOST=0.0.0.0 -METRICS_PORT=8000 +METRICS_PORT=8080 # Logging LOG_LEVEL=INFO diff --git a/grafana/dashboards/dashboards.yml b/grafana/dashboards/dashboards.yml deleted file mode 100644 index 304cbc9..0000000 --- a/grafana/dashboards/dashboards.yml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: 1 - -providers: - - name: 'Telegram Bot Dashboards' - orgId: 1 - folder: '' - type: file - disableDeletion: false - updateIntervalSeconds: 10 - allowUiUpdates: true - options: - path: /etc/grafana/provisioning/dashboards diff --git a/grafana/dashboards/telegram-bot-dashboard.json b/grafana/dashboards/telegram-bot-dashboard.json deleted file mode 100644 index f6f6e18..0000000 --- a/grafana/dashboards/telegram-bot-dashboard.json +++ /dev/null @@ -1,1012 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "liveNow": false, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 1, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "sum(rate(bot_commands_total[5m]))", - "refId": "A" - } - ], - "title": "Commands per Second", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "histogram_quantile(0.95, rate(method_duration_seconds_bucket[5m]))", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "histogram_quantile(0.99, rate(method_duration_seconds_bucket[5m]))", - "refId": "B" - } - ], - "title": "Method Response Time (P95, P99)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "id": 3, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "sum(rate(errors_total[5m]))", - "refId": "A" - } - ], - "title": "Errors per Second", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "id": 4, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "sum(active_users)", - "refId": "A" - } - ], - "title": "Active Users", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - }, - "id": 5, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "histogram_quantile(0.95, rate(db_query_duration_seconds_bucket[5m]))", - "refId": "A" - } - ], - "title": "Database Query Time (P95)", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "id": 6, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "sum(rate(messages_processed_total[5m]))", - "refId": "A" - } - ], - "title": "Messages Processed per Second", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "sum by(query_type) (rate(db_queries_total[5m]))", - "refId": "A" - } - ], - "title": "Database Queries by Type", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 24 - }, - "id": 8, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "rate(db_errors_total[5m])", - "refId": "A" - } - ], - "title": "Database Errors per Second", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 32 - }, - "id": 9, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "sum by(command) (rate(bot_commands_total[5m]))", - "refId": "A" - } - ], - "title": "Commands by Type", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 32 - }, - "id": 10, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "sum by(status) (rate(bot_commands_total[5m]))", - "refId": "A" - } - ], - "title": "Commands by Status", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "vis": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 40 - }, - "id": 11, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "topk(5, sum by(command) (rate(bot_commands_total[5m])))", - "refId": "A" - } - ], - "title": "Top Commands", - "type": "timeseries" - } - ], - "refresh": "5s", - "schemaVersion": 38, - "style": "dark", - "tags": [ - "telegram", - "bot", - "monitoring" - ], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Telegram Bot Dashboard", - "uid": "telegram-bot", - "version": 1, - "weekStart": "" -} diff --git a/grafana/datasources/prometheus.yml b/grafana/datasources/prometheus.yml deleted file mode 100644 index 86fd346..0000000 --- a/grafana/datasources/prometheus.yml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: 1 - -datasources: - - name: Prometheus - type: prometheus - access: proxy - url: http://prometheus:9090 - isDefault: true diff --git a/helper_bot/__init__.py b/helper_bot/__init__.py index 3ed7b11..e69de29 100644 --- a/helper_bot/__init__.py +++ b/helper_bot/__init__.py @@ -1 +0,0 @@ -from . import server_monitor diff --git a/helper_bot/main.py b/helper_bot/main.py index da740dd..3b109ea 100644 --- a/helper_bot/main.py +++ b/helper_bot/main.py @@ -2,6 +2,8 @@ from aiogram import Bot, Dispatcher from aiogram.client.default import DefaultBotProperties from aiogram.fsm.storage.memory import MemoryStorage from aiogram.fsm.strategy import FSMStrategy +import asyncio +import logging from helper_bot.handlers.admin import admin_router from helper_bot.handlers.callback import callback_router @@ -10,6 +12,7 @@ from helper_bot.handlers.private import private_router from helper_bot.middlewares.dependencies_middleware import DependenciesMiddleware from helper_bot.middlewares.blacklist_middleware import BlacklistMiddleware from helper_bot.middlewares.metrics_middleware import MetricsMiddleware, ErrorMetricsMiddleware +from helper_bot.server_prometheus import start_metrics_server, stop_metrics_server async def start_bot(bdf): @@ -33,4 +36,21 @@ async def start_bot(bdf): dp.include_routers(admin_router, private_router, callback_router, group_router) await bot.delete_webhook(drop_pending_updates=True) - await dp.start_polling(bot, skip_updates=True) + + # Запускаем HTTP сервер для метрик параллельно с ботом + metrics_host = bdf.settings.get('Metrics', {}).get('host', '0.0.0.0') + metrics_port = bdf.settings.get('Metrics', {}).get('port', 8080) + + try: + # Запускаем метрики сервер + await start_metrics_server(metrics_host, metrics_port) + + # Запускаем бота + await dp.start_polling(bot, skip_updates=True) + + except Exception as e: + logging.error(f"Error in bot startup: {e}") + raise + finally: + # Останавливаем метрики сервер при завершении + await stop_metrics_server() diff --git a/helper_bot/server_monitor.py b/helper_bot/server_monitor.py deleted file mode 100644 index d568b2c..0000000 --- a/helper_bot/server_monitor.py +++ /dev/null @@ -1,623 +0,0 @@ -import asyncio -import os -import psutil -import time -import platform -from datetime import datetime, timedelta -from typing import Dict, Optional, Tuple -import logging - -logger = logging.getLogger(__name__) - - -class ServerMonitor: - def __init__(self, bot, group_for_logs: str, important_logs: str): - self.bot = bot - self.group_for_logs = group_for_logs - self.important_logs = important_logs - - # Определяем ОС - self.os_type = self._detect_os() - logger.info(f"Обнаружена ОС: {self.os_type}") - - # Пороговые значения для алертов - self.threshold = 80.0 - self.recovery_threshold = 75.0 - - # Состояние алертов для предотвращения спама - self.alert_states = { - 'cpu': False, - 'ram': False, - 'disk': False - } - - # PID файлы для отслеживания процессов - self.pid_files = { - 'voice_bot': 'voice_bot.pid', - 'helper_bot': 'helper_bot.pid' - } - - # Время последней отправки статуса - self.last_status_time = None - - # Для расчета скорости диска - self.last_disk_io = None - self.last_disk_io_time = None - - # Время запуска бота для расчета uptime - self.bot_start_time = time.time() - - def _detect_os(self) -> str: - """Определение типа операционной системы""" - system = platform.system().lower() - if system == "darwin": - return "macos" - elif system == "linux": - return "ubuntu" - else: - return "unknown" - - def _get_disk_path(self) -> str: - """Получение пути к диску в зависимости от ОС""" - if self.os_type == "macos": - return "/" - elif self.os_type == "ubuntu": - return "/" - else: - return "/" - - def _get_disk_usage(self) -> Optional[object]: - """Получение информации о диске с учетом ОС""" - try: - if self.os_type == "macos": - # На macOS используем diskutil для получения реального использования диска - return self._get_macos_disk_usage() - else: - disk_path = self._get_disk_path() - return psutil.disk_usage(disk_path) - except Exception as e: - logger.error(f"Ошибка при получении информации о диске: {e}") - return None - - def _get_macos_disk_usage(self) -> Optional[object]: - """Получение информации о диске на macOS через diskutil""" - try: - import subprocess - import re - - # Получаем информацию о диске через diskutil - result = subprocess.run(['diskutil', 'info', '/'], capture_output=True, text=True) - if result.returncode != 0: - # Fallback к psutil - return psutil.disk_usage('/') - - output = result.stdout - - # Извлекаем размеры из вывода diskutil - total_match = re.search(r'Container Total Space:\s+(\d+\.\d+)\s+GB', output) - free_match = re.search(r'Container Free Space:\s+(\d+\.\d+)\s+GB', output) - - if total_match and free_match: - total_gb = float(total_match.group(1)) - free_gb = float(free_match.group(1)) - used_gb = total_gb - free_gb - - # Создаем объект, похожий на результат psutil.disk_usage - class DiskUsage: - def __init__(self, total, used, free): - self.total = total * (1024**3) # Конвертируем в байты - self.used = used * (1024**3) - self.free = free * (1024**3) - - return DiskUsage(total_gb, used_gb, free_gb) - else: - # Fallback к psutil - return psutil.disk_usage('/') - - except Exception as e: - logger.error(f"Ошибка при получении информации о диске macOS: {e}") - # Fallback к psutil - return psutil.disk_usage('/') - - def _get_disk_io_counters(self): - """Получение статистики диска с учетом ОС""" - try: - if self.os_type == "macos": - # На macOS может быть несколько дисков, берем основной - return psutil.disk_io_counters(perdisk=False) - elif self.os_type == "ubuntu": - # На Ubuntu обычно один диск - return psutil.disk_io_counters(perdisk=False) - else: - return psutil.disk_io_counters() - except Exception as e: - logger.error(f"Ошибка при получении статистики диска: {e}") - return None - - def _get_system_uptime(self) -> float: - """Получение uptime системы с учетом ОС""" - try: - if self.os_type == "macos": - # На macOS используем boot_time - boot_time = psutil.boot_time() - return time.time() - boot_time - elif self.os_type == "ubuntu": - # На Ubuntu также используем boot_time - boot_time = psutil.boot_time() - return time.time() - boot_time - else: - boot_time = psutil.boot_time() - return time.time() - boot_time - except Exception as e: - logger.error(f"Ошибка при получении uptime системы: {e}") - return 0.0 - - def get_bot_uptime(self) -> str: - """Получение uptime бота""" - uptime_seconds = time.time() - self.bot_start_time - return self._format_uptime(uptime_seconds) - - def get_system_info(self) -> Dict: - """Получение информации о системе""" - try: - # CPU - cpu_percent = psutil.cpu_percent(interval=1) - load_avg = psutil.getloadavg() - cpu_count = psutil.cpu_count() - - # Память - memory = psutil.virtual_memory() - swap = psutil.swap_memory() - - # Используем единый расчет для всех ОС: used / total для получения процента занятой памяти - # Это обеспечивает консистентность между macOS и Ubuntu - ram_percent = (memory.used / memory.total) * 100 - - # Диск - disk = self._get_disk_usage() - disk_io = self._get_disk_io_counters() - - if disk is None: - logger.error("Не удалось получить информацию о диске") - return {} - - # Расчет скорости диска - disk_read_speed, disk_write_speed = self._calculate_disk_speed(disk_io) - - # Система - system_uptime = self._get_system_uptime() - - # Получаем имя хоста в зависимости от ОС - if self.os_type == "macos": - hostname = os.uname().nodename - elif self.os_type == "ubuntu": - hostname = os.uname().nodename - else: - hostname = "unknown" - - return { - 'cpu_percent': cpu_percent, - 'load_avg_1m': round(load_avg[0], 2), - 'load_avg_5m': round(load_avg[1], 2), - 'load_avg_15m': round(load_avg[2], 2), - 'cpu_count': cpu_count, - 'ram_used': round(memory.used / (1024**3), 2), - 'ram_total': round(memory.total / (1024**3), 2), - 'ram_percent': round(ram_percent, 1), # Исправленный процент занятой памяти - 'swap_used': round(swap.used / (1024**3), 2), - 'swap_total': round(swap.total / (1024**3), 2), - 'swap_percent': swap.percent, - 'disk_used': round(disk.used / (1024**3), 2), - 'disk_total': round(disk.total / (1024**3), 2), - 'disk_percent': round((disk.used / disk.total) * 100, 1), - 'disk_free': round(disk.free / (1024**3), 2), - 'disk_read_speed': disk_read_speed, - 'disk_write_speed': disk_write_speed, - 'disk_io_percent': self._calculate_disk_io_percent(), - 'system_uptime': self._format_uptime(system_uptime), - 'bot_uptime': self.get_bot_uptime(), - 'server_hostname': hostname, - 'current_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S') - } - except Exception as e: - logger.error(f"Ошибка при получении информации о системе: {e}") - return {} - - def _get_disk_space_emoji(self, disk_percent: float) -> str: - """Получение эмодзи для дискового пространства""" - if disk_percent < 60: - return "🟢" - elif disk_percent < 90: - return "⚠️" - else: - return "🚨" - - def _format_bytes(self, bytes_value: int) -> str: - """Форматирование байтов в человекочитаемый вид""" - if bytes_value == 0: - return "0 B" - - size_names = ["B", "KB", "MB", "GB", "TB"] - i = 0 - while bytes_value >= 1024 and i < len(size_names) - 1: - bytes_value /= 1024.0 - i += 1 - - return f"{bytes_value:.1f} {size_names[i]}" - - def _format_uptime(self, seconds: float) -> str: - """Форматирование времени работы системы""" - days = int(seconds // 86400) - hours = int((seconds % 86400) // 3600) - minutes = int((seconds % 3600) // 60) - - if days > 0: - return f"{days}д {hours}ч {minutes}м" - elif hours > 0: - return f"{hours}ч {minutes}м" - else: - return f"{minutes}м" - - def check_process_status(self, process_name: str) -> Tuple[str, str]: - """Проверка статуса процесса и возврат статуса с uptime""" - try: - # Сначала проверяем по PID файлу - pid_file = self.pid_files.get(process_name) - if pid_file and os.path.exists(pid_file): - try: - with open(pid_file, 'r') as f: - content = f.read().strip() - if content and content != '# Этот файл будет автоматически обновляться при запуске бота': - pid = int(content) - if psutil.pid_exists(pid): - # Получаем uptime процесса - try: - proc = psutil.Process(pid) - proc_uptime = time.time() - proc.create_time() - uptime_str = self._format_uptime(proc_uptime) - return "✅", f"Uptime {uptime_str}" - except: - return "✅", "Uptime неизвестно" - except (ValueError, FileNotFoundError): - pass - - # Проверяем по имени процесса более точно - for proc in psutil.process_iter(['pid', 'name', 'cmdline']): - try: - proc_name = proc.info['name'].lower() - cmdline = ' '.join(proc.info['cmdline']).lower() if proc.info['cmdline'] else '' - - # Более точная проверка для каждого бота - if process_name == 'voice_bot': - # Проверяем voice_bot - if ('voice_bot' in proc_name or - 'voice_bot' in cmdline or - 'voice_bot_v2.py' in cmdline): - # Получаем uptime процесса - try: - proc_uptime = time.time() - proc.create_time() - uptime_str = self._format_uptime(proc_uptime) - return "✅", f"Uptime {uptime_str}" - except: - return "✅", "Uptime неизвестно" - elif process_name == 'helper_bot': - # Проверяем helper_bot - if ('helper_bot' in proc_name or - 'helper_bot' in cmdline or - 'run_helper.py' in cmdline or - 'python' in proc_name and 'helper_bot' in cmdline): - # Получаем uptime процесса - try: - proc_uptime = time.time() - proc.create_time() - uptime_str = self._format_uptime(proc_uptime) - return "✅", f"Uptime {uptime_str}" - except: - return "✅", "Uptime неизвестно" - except (psutil.NoSuchProcess, psutil.AccessDenied): - continue - - return "❌", "Выключен" - except Exception as e: - logger.error(f"Ошибка при проверке процесса {process_name}: {e}") - return "❌", "Выключен" - - def should_send_status(self) -> bool: - """Проверка, нужно ли отправить статус (каждые 30 минут в 00 и 30 минут часа)""" - now = datetime.now() - - # Проверяем, что сейчас 00 или 30 минут часа - if now.minute in [0, 30]: - # Проверяем, не отправляли ли мы уже статус в эту минуту - if (self.last_status_time is None or - self.last_status_time.hour != now.hour or - self.last_status_time.minute != now.minute): - self.last_status_time = now - return True - - return False - - def _calculate_disk_speed(self, current_disk_io) -> Tuple[str, str]: - """Расчет скорости чтения/записи диска""" - current_time = time.time() - - if self.last_disk_io is None or self.last_disk_io_time is None: - self.last_disk_io = current_disk_io - self.last_disk_io_time = current_time - return "0 B/s", "0 B/s" - - time_diff = current_time - self.last_disk_io_time - if time_diff < 1: # Минимальный интервал 1 секунда - return "0 B/s", "0 B/s" - - read_diff = current_disk_io.read_bytes - self.last_disk_io.read_bytes - write_diff = current_disk_io.write_bytes - self.last_disk_io.write_bytes - - read_speed = read_diff / time_diff - write_speed = write_diff / time_diff - - # Обновляем предыдущие значения - self.last_disk_io = current_disk_io - self.last_disk_io_time = current_time - - return self._format_bytes(read_speed) + "/s", self._format_bytes(write_speed) + "/s" - - def _calculate_disk_io_percent(self) -> int: - """Расчет процента загрузки диска на основе IOPS""" - try: - # Получаем статистику диска - disk_io = self._get_disk_io_counters() - if disk_io is None: - return 0 - - # Простая эвристика: считаем общее количество операций - total_ops = disk_io.read_count + disk_io.write_count - - # Нормализуем к проценту (это приблизительная оценка) - # На macOS обычно нормальная нагрузка до 1000-5000 операций в секунду - if total_ops < 1000: - return 10 - elif total_ops < 5000: - return 30 - elif total_ops < 10000: - return 50 - elif total_ops < 20000: - return 70 - else: - return 90 - except: - return 0 - - def should_send_startup_status(self) -> bool: - """Проверка, нужно ли отправить статус при запуске""" - return self.last_status_time is None - - async def send_startup_message(self): - """Отправка сообщения о запуске бота""" - try: - message = f"""🚀 **Бот запущен!** ---------------------------------- -**Время запуска:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} -**Сервер:** `{psutil.os.uname().nodename}` -**Система:** {psutil.os.uname().sysname} {psutil.os.uname().release} -**ОС:** {self.os_type.upper()} - -✅ Мониторинг сервера активирован -✅ Статус будет отправляться каждые 30 минут (в 00 и 30 минут часа) -✅ Алерты будут отправляться при превышении пороговых значений ----------------------------------""" - - await self.bot.send_message( - chat_id=self.important_logs, - text=message, - parse_mode='HTML' - ) - logger.info("Сообщение о запуске бота отправлено") - - except Exception as e: - logger.error(f"Ошибка при отправке сообщения о запуске: {e}") - - async def send_shutdown_message(self): - """Отправка сообщения об отключении бота""" - try: - # Получаем финальную информацию о системе - system_info = self.get_system_info() - if not system_info: - system_info = { - 'current_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - 'server_hostname': psutil.os.uname().nodename - } - - message = f"""🛑 **Бот отключен!** ---------------------------------- -**Время отключения:** {system_info['current_time']} -**Сервер:** `{system_info['server_hostname']}` - -❌ Мониторинг сервера остановлен -❌ Статус больше не будет отправляться -❌ Алерты отключены - -⚠️ **Внимание:** Проверьте состояние сервера! ----------------------------------""" - - await self.bot.send_message( - chat_id=self.important_logs, - text=message, - parse_mode='HTML' - ) - logger.info("Сообщение об отключении бота отправлено") - - except Exception as e: - logger.error(f"Ошибка при отправке сообщения об отключении: {e}") - - def check_alerts(self, system_info: Dict) -> Tuple[bool, Optional[str]]: - """Проверка необходимости отправки алертов""" - alerts = [] - - # Проверка CPU - if system_info['cpu_percent'] > self.threshold and not self.alert_states['cpu']: - self.alert_states['cpu'] = True - alerts.append(('cpu', system_info['cpu_percent'], f"Нагрузка за 1 мин: {system_info['load_avg_1m']}")) - - # Проверка RAM - if system_info['ram_percent'] > self.threshold and not self.alert_states['ram']: - self.alert_states['ram'] = True - alerts.append(('ram', system_info['ram_percent'], f"Используется: {system_info['ram_used']} GB из {system_info['ram_total']} GB")) - - # Проверка диска - if system_info['disk_percent'] > self.threshold and not self.alert_states['disk']: - self.alert_states['disk'] = True - alerts.append(('disk', system_info['disk_percent'], f"Свободно: {system_info['disk_free']} GB на /")) - - # Проверка восстановления - recoveries = [] - if system_info['cpu_percent'] < self.recovery_threshold and self.alert_states['cpu']: - self.alert_states['cpu'] = False - recoveries.append(('cpu', system_info['cpu_percent'])) - - if system_info['ram_percent'] < self.recovery_threshold and self.alert_states['ram']: - self.alert_states['ram'] = False - recoveries.append(('ram', system_info['ram_percent'])) - - if system_info['disk_percent'] < self.recovery_threshold and self.alert_states['disk']: - self.alert_states['disk'] = False - recoveries.append(('disk', system_info['disk_percent'])) - - return alerts, recoveries - - async def send_status_message(self, system_info: Dict): - """Отправка сообщения со статусом сервера""" - try: - voice_bot_status, voice_bot_uptime = self.check_process_status('voice_bot') - helper_bot_status, helper_bot_uptime = self.check_process_status('helper_bot') - - # Получаем эмодзи для дискового пространства - disk_emoji = self._get_disk_space_emoji(system_info['disk_percent']) - - message = f"""🖥 **Статус Сервера** | {system_info['current_time']} ---------------------------------- -**📊 Общая нагрузка:** -CPU: {system_info['cpu_percent']}% | LA: {system_info['load_avg_1m']} / {system_info['cpu_count']} | IO Wait: {system_info['disk_percent']}% - -**💾 Память:** -RAM: {system_info['ram_used']}/{system_info['ram_total']} GB ({system_info['ram_percent']}%) -Swap: {system_info['swap_used']}/{system_info['swap_total']} GB ({system_info['swap_percent']}%) - -**🗂️ Дисковое пространство:** -Диск (/): {system_info['disk_used']}/{system_info['disk_total']} GB ({system_info['disk_percent']}%) {disk_emoji} - -**💿 Диск I/O:** -Read: {system_info['disk_read_speed']} | Write: {system_info['disk_write_speed']} -Диск загружен: {system_info['disk_io_percent']}% - -**🤖 Процессы:** -{voice_bot_status} voice-bot - {voice_bot_uptime} -{helper_bot_status} helper-bot - {helper_bot_uptime} ---------------------------------- -⏰ Uptime сервера: {system_info['system_uptime']}""" - - await self.bot.send_message( - chat_id=self.group_for_logs, - text=message, - parse_mode='HTML' - ) - logger.info("Статус сервера отправлен") - - except Exception as e: - logger.error(f"Ошибка при отправке статуса сервера: {e}") - - async def send_alert_message(self, metric_name: str, current_value: float, details: str): - """Отправка сообщения об алерте""" - try: - message = f"""🚨 **ALERT: Высокая нагрузка на сервере!** ---------------------------------- -**Показатель:** {metric_name} -**Текущее значение:** {current_value}% ⚠️ -**Пороговое значение:** 80% - -**Детали:** -{details} - -**Сервер:** `{psutil.os.uname().nodename}` -**Время:** `{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}` ----------------------------------""" - - await self.bot.send_message( - chat_id=self.important_logs, - text=message, - parse_mode='HTML' - ) - logger.warning(f"Алерт отправлен: {metric_name} - {current_value}%") - - except Exception as e: - logger.error(f"Ошибка при отправке алерта: {e}") - - async def send_recovery_message(self, metric_name: str, current_value: float, peak_value: float): - """Отправка сообщения о восстановлении""" - try: - message = f"""✅ **RECOVERY: Нагрузка нормализовалась** ---------------------------------- -**Показатель:** {metric_name} -**Текущее значение:** {current_value}% ✔️ -**Было превышение:** До {peak_value}% - -**Сервер:** `{psutil.os.uname().nodename}` -**Время:** `{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}` ----------------------------------""" - - await self.bot.send_message( - chat_id=self.important_logs, - text=message, - parse_mode='HTML' - ) - logger.info(f"Сообщение о восстановлении отправлено: {metric_name}") - - except Exception as e: - logger.error(f"Ошибка при отправке сообщения о восстановлении: {e}") - - async def monitor_loop(self): - """Основной цикл мониторинга""" - logger.info(f"Модуль мониторинга сервера запущен на {self.os_type.upper()}") - - # Отправляем сообщение о запуске при первом запуске - if self.should_send_startup_status(): - await self.send_startup_message() - - while True: - try: - system_info = self.get_system_info() - if not system_info: - await asyncio.sleep(60) - continue - - # Проверка алертов - alerts, recoveries = self.check_alerts(system_info) - - # Отправка алертов - for metric_type, value, details in alerts: - metric_names = { - 'cpu': 'Использование CPU', - 'ram': 'Использование оперативной памяти', - 'disk': 'Заполнение диска (/)' - } - await self.send_alert_message(metric_names[metric_type], value, details) - - # Отправка сообщений о восстановлении - for metric_type, value in recoveries: - metric_names = { - 'cpu': 'Использование CPU', - 'ram': 'Использование оперативной памяти', - 'disk': 'Заполнение диска (/)' - } - # Находим пиковое значение (используем 80% как пример) - await self.send_recovery_message(metric_names[metric_type], value, 80.0) - - # Отправка статуса каждые 30 минут в 00 и 30 минут часа - if self.should_send_status(): - await self.send_status_message(system_info) - - # Пауза между проверками (1 минута) - await asyncio.sleep(60) - - except Exception as e: - logger.error(f"Ошибка в цикле мониторинга: {e}") - await asyncio.sleep(60) diff --git a/helper_bot/server_prometheus.py b/helper_bot/server_prometheus.py new file mode 100644 index 0000000..9aa140a --- /dev/null +++ b/helper_bot/server_prometheus.py @@ -0,0 +1,126 @@ + +""" +HTTP server for metrics endpoint integration with centralized Prometheus monitoring. +Provides /metrics endpoint and health check for the bot. +""" + +import asyncio +import logging +from aiohttp import web +from typing import Optional +from .utils.metrics import metrics + + +class MetricsServer: + """HTTP server for Prometheus metrics and health checks.""" + + def __init__(self, host: str = '0.0.0.0', port: int = 8080): + self.host = host + self.port = port + self.app = web.Application() + self.runner: Optional[web.AppRunner] = None + self.site: Optional[web.TCPSite] = None + self.logger = logging.getLogger(__name__) + + # Настраиваем роуты + self.app.router.add_get('/metrics', self.metrics_handler) + self.app.router.add_get('/health', self.health_handler) + + async def metrics_handler(self, request: web.Request) -> web.Response: + """Handle /metrics endpoint for Prometheus scraping.""" + try: + self.logger.info("Generating metrics...") + + # Проверяем, что metrics доступен + if not metrics: + self.logger.error("Metrics object is not available") + return web.Response( + text="Metrics not available", + status=500 + ) + + # Генерируем метрики в формате Prometheus + self.logger.info("Calling metrics.get_metrics()...") + metrics_data = metrics.get_metrics() + self.logger.info(f"Generated metrics: {len(metrics_data)} bytes") + + return web.Response( + body=metrics_data, + content_type='text/plain; version=0.0.4' + ) + except Exception as e: + self.logger.error(f"Error generating metrics: {e}") + import traceback + self.logger.error(f"Traceback: {traceback.format_exc()}") + return web.Response( + text=f"Error generating metrics: {e}", + status=500 + ) + + async def health_handler(self, request: web.Request) -> web.Response: + """Handle /health endpoint for health checks.""" + return web.Response( + text="OK", + content_type='text/plain' + ) + + async def start(self) -> None: + """Start the HTTP server.""" + try: + self.runner = web.AppRunner(self.app) + await self.runner.setup() + + self.site = web.TCPSite(self.runner, self.host, self.port) + await self.site.start() + + self.logger.info(f"Metrics server started on {self.host}:{self.port}") + self.logger.info("Available endpoints:") + self.logger.info(f" - /metrics - Prometheus metrics") + self.logger.info(f" - /health - Health check") + + except Exception as e: + self.logger.error(f"Failed to start metrics server: {e}") + raise + + async def stop(self) -> None: + """Stop the HTTP server.""" + try: + if self.site: + await self.site.stop() + self.logger.info("Metrics server site stopped") + + if self.runner: + await self.runner.cleanup() + self.logger.info("Metrics server runner cleaned up") + + except Exception as e: + self.logger.error(f"Error stopping metrics server: {e}") + + async def __aenter__(self): + """Async context manager entry.""" + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit.""" + await self.stop() + + +# Глобальный экземпляр сервера для использования в main.py +metrics_server: Optional[MetricsServer] = None + + +async def start_metrics_server(host: str = '0.0.0.0', port: int = 8080) -> MetricsServer: + """Start metrics server and return instance.""" + global metrics_server + metrics_server = MetricsServer(host, port) + await metrics_server.start() + return metrics_server + + +async def stop_metrics_server() -> None: + """Stop metrics server if running.""" + global metrics_server + if metrics_server: + await metrics_server.stop() + metrics_server = None diff --git a/prometheus.yml b/prometheus.yml deleted file mode 100644 index fd60240..0000000 --- a/prometheus.yml +++ /dev/null @@ -1,26 +0,0 @@ -global: - scrape_interval: 15s - evaluation_interval: 15s - -rule_files: - # - "first_rules.yml" - # - "second_rules.yml" - -scrape_configs: - - job_name: 'telegram-bot' - static_configs: - - targets: ['telegram-bot:8000'] - metrics_path: '/metrics' - scrape_interval: 10s - scrape_timeout: 10s - honor_labels: true - - - job_name: 'prometheus' - static_configs: - - targets: ['localhost:9090'] - -alerting: - alertmanagers: - - static_configs: - - targets: - # - alertmanager:9093 diff --git a/run_helper.py b/run_helper.py index 82a960a..0bb12ff 100644 --- a/run_helper.py +++ b/run_helper.py @@ -10,40 +10,26 @@ if CURRENT_DIR not in sys.path: from helper_bot.main import start_bot from helper_bot.utils.base_dependency_factory import get_global_instance -from helper_bot.server_monitor import ServerMonitor from helper_bot.utils.auto_unban_scheduler import get_auto_unban_scheduler -async def start_monitoring(bdf, bot): - """Запуск модуля мониторинга сервера""" - monitor = ServerMonitor( - bot=bot, - group_for_logs=bdf.settings['Telegram']['group_for_logs'], - important_logs=bdf.settings['Telegram']['important_logs'] - ) - return monitor - - async def main(): """Основная функция запуска""" bdf = get_global_instance() - # Создаем бота для мониторинга + # Создаем бота для автоматического разбана from aiogram import Bot from aiogram.client.default import DefaultBotProperties - monitor_bot = Bot( + auto_unban_bot = Bot( token=bdf.settings['Telegram']['bot_token'], default=DefaultBotProperties(parse_mode='HTML'), timeout=30.0 ) - # Создаем экземпляр монитора - monitor = await start_monitoring(bdf, monitor_bot) - # Инициализируем планировщик автоматического разбана auto_unban_scheduler = get_auto_unban_scheduler() - auto_unban_scheduler.set_bot(monitor_bot) + auto_unban_scheduler.set_bot(auto_unban_bot) auto_unban_scheduler.start_scheduler() # Инициализируем метрики ПОСЛЕ импорта всех модулей @@ -63,9 +49,8 @@ async def main(): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) - # Запускаем бота, мониторинг и метрики + # Запускаем бота и метрики bot_task = asyncio.create_task(start_bot(bdf)) - monitor_task = asyncio.create_task(monitor.monitor_loop()) metrics_task = asyncio.create_task(metrics_manager.start()) try: @@ -76,13 +61,6 @@ async def main(): except KeyboardInterrupt: print("Получен сигнал завершения...") finally: - print("Отправляем сообщение об отключении...") - try: - # Отправляем сообщение об отключении - await monitor.send_shutdown_message() - except Exception as e: - print(f"Ошибка при отправке сообщения об отключении: {e}") - print("Останавливаем планировщик автоматического разбана...") auto_unban_scheduler.stop_scheduler() @@ -92,17 +70,16 @@ async def main(): print("Останавливаем задачи...") # Отменяем задачи bot_task.cancel() - monitor_task.cancel() metrics_task.cancel() # Ждем завершения задач try: - await asyncio.gather(bot_task, monitor_task, metrics_task, return_exceptions=True) + await asyncio.gather(bot_task, metrics_task, return_exceptions=True) except Exception as e: print(f"Ошибка при остановке задач: {e}") # Закрываем сессию бота - await monitor_bot.session.close() + await auto_unban_bot.session.close() print("Бот корректно остановлен") diff --git a/tests/test_monitor.py b/tests/test_monitor.py deleted file mode 100644 index 41f39b5..0000000 --- a/tests/test_monitor.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3 -""" -Тестовый скрипт для проверки модуля мониторинга сервера -""" -import pytest -import asyncio -import sys -import os - -# Добавляем путь к проекту -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - -from helper_bot.server_monitor import ServerMonitor - - -class MockBot: - """Мок объект бота для тестирования""" - - async def send_message(self, chat_id, text, parse_mode=None): - print(f"\n{'='*60}") - print(f"Отправка в чат: {chat_id}") - print(f"Текст сообщения:") - print(text) - print(f"{'='*60}\n") - - -@pytest.mark.asyncio -async def test_monitor(): - """Тестирование модуля мониторинга""" - print("🧪 Тестирование модуля мониторинга сервера") - print("=" * 60) - - # Создаем мок бота - mock_bot = MockBot() - - # Создаем монитор - monitor = ServerMonitor( - bot=mock_bot, - group_for_logs="-123456789", - important_logs="-987654321" - ) - - print("📊 Получение информации о системе...") - system_info = monitor.get_system_info() - - if system_info: - print("✅ Информация о системе получена успешно") - print(f"CPU: {system_info['cpu_percent']}%") - print(f"RAM: {system_info['ram_percent']}%") - print(f"Disk: {system_info['disk_percent']}%") - print(f"Uptime: {system_info['system_uptime']}") - - print("\n🤖 Проверка статуса процессов...") - voice_status, voice_uptime = monitor.check_process_status('voice_bot') - helper_status, helper_uptime = monitor.check_process_status('helper_bot') - print(f"Voice Bot: {voice_status} - {voice_uptime}") - print(f"Helper Bot: {helper_status} - {helper_uptime}") - - print("\n📝 Тестирование отправки статуса...") - await monitor.send_status_message(system_info) - - print("\n🚨 Тестирование отправки алерта...") - await monitor.send_alert_message( - "Использование CPU", - 85.5, - "Нагрузка за 1 мин: 2.5" - ) - - print("\n✅ Тестирование отправки сообщения о восстановлении...") - await monitor.send_recovery_message( - "Использование CPU", - 70.0, - 85.5 - ) - - else: - print("❌ Не удалось получить информацию о системе") - - print("\n🎯 Тестирование завершено!") - - -if __name__ == "__main__": - asyncio.run(test_monitor())