Files
telegram-helper-bot/helper_bot/utils/metrics.py
Andrey e2b1353408 feat: добавлена система миграций БД и CI/CD пайплайны
- Создана система отслеживания миграций (MigrationRepository, таблица migrations)
- Добавлен скрипт apply_migrations.py для автоматического применения миграций
- Созданы CI/CD пайплайны (.github/workflows/ci.yml, deploy.yml)
- Обновлена документация по миграциям в database-patterns.md
- Миграции применяются автоматически при деплое в продакшн
2026-01-25 23:17:09 +03:00

707 lines
26 KiB
Python

"""
Metrics module for Telegram bot monitoring with Prometheus.
Provides predefined metrics for bot commands, errors, performance, and user activity.
"""
import asyncio
import os
import time
from contextlib import asynccontextmanager
from functools import wraps
from typing import Any, Dict, Optional
from prometheus_client import (CONTENT_TYPE_LATEST, Counter, Gauge, Histogram,
generate_latest)
from prometheus_client.core import CollectorRegistry
# Метрики rate limiter теперь создаются в основном классе
class BotMetrics:
"""Central class for managing all bot metrics."""
def __init__(self):
self.registry = CollectorRegistry()
# Создаем метрики rate limiter в том же registry
self._create_rate_limit_metrics()
# Bot commands counter
self.bot_commands_total = Counter(
'bot_commands_total',
'Total number of bot commands processed',
['command', 'status', 'handler_type', 'user_type'],
registry=self.registry
)
# Method execution time histogram
self.method_duration_seconds = Histogram(
'method_duration_seconds',
'Time spent executing methods',
['method_name', 'handler_type', 'status'],
# Оптимизированные buckets для Telegram API (обычно < 1 сек)
buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
registry=self.registry
)
# Errors counter
self.errors_total = Counter(
'errors_total',
'Total number of errors',
['error_type', 'handler_type', 'method_name'],
registry=self.registry
)
# Active users gauge
self.active_users = Gauge(
'active_users',
'Number of currently active users',
['user_type'],
registry=self.registry
)
# Total users gauge (отдельная метрика)
self.total_users = Gauge(
'total_users',
'Total number of users in database',
registry=self.registry
)
# Database query metrics
self.db_query_duration_seconds = Histogram(
'db_query_duration_seconds',
'Time spent executing database queries',
['query_type', 'table_name', 'operation'],
# Оптимизированные buckets для SQLite/PostgreSQL
buckets=[0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5],
registry=self.registry
)
# Database queries counter
self.db_queries_total = Counter(
'db_queries_total',
'Total number of database queries executed',
['query_type', 'table_name', 'operation'],
registry=self.registry
)
# Database errors counter
self.db_errors_total = Counter(
'db_errors_total',
'Total number of database errors',
['error_type', 'query_type', 'table_name', 'operation'],
registry=self.registry
)
# Message processing metrics
self.messages_processed_total = Counter(
'messages_processed_total',
'Total number of messages processed',
['message_type', 'chat_type', 'handler_type'],
registry=self.registry
)
# Middleware execution metrics
self.middleware_duration_seconds = Histogram(
'middleware_duration_seconds',
'Time spent in middleware execution',
['middleware_name', 'status'],
# Middleware должен быть быстрым
buckets=[0.001, 0.005, 0.01, 0.05, 0.1, 0.25],
registry=self.registry
)
# Rate limiting metrics
self.rate_limit_hits_total = Counter(
'rate_limit_hits_total',
'Total number of rate limit hits',
['limit_type', 'user_id', 'action'],
registry=self.registry
)
# User activity metrics
self.user_activity_total = Counter(
'user_activity_total',
'Total user activity events',
['activity_type', 'user_type', 'chat_type'],
registry=self.registry
)
# File download metrics
self.file_downloads_total = Counter(
'file_downloads_total',
'Total number of file downloads',
['content_type', 'status'],
registry=self.registry
)
self.file_download_duration_seconds = Histogram(
'file_download_duration_seconds',
'Time spent downloading files',
['content_type'],
buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0],
registry=self.registry
)
self.file_download_size_bytes = Histogram(
'file_download_size_bytes',
'Size of downloaded files in bytes',
['content_type'],
buckets=[1024, 10240, 102400, 1048576, 10485760, 104857600, 1073741824],
registry=self.registry
)
# Media processing metrics
self.media_processing_total = Counter(
'media_processing_total',
'Total number of media processing operations',
['content_type', 'status'],
registry=self.registry
)
self.media_processing_duration_seconds = Histogram(
'media_processing_duration_seconds',
'Time spent processing media',
['content_type'],
buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0],
registry=self.registry
)
def _create_rate_limit_metrics(self):
"""Создает метрики rate limiter в основном registry"""
try:
# Создаем метрики rate limiter в том же registry
self.rate_limit_requests_total = Counter(
'rate_limit_requests_total',
'Total number of rate limited requests',
['chat_id', 'status', 'error_type'],
registry=self.registry
)
self.rate_limit_errors_total = Counter(
'rate_limit_errors_total',
'Total number of rate limit errors',
['error_type', 'chat_id'],
registry=self.registry
)
self.rate_limit_wait_duration_seconds = Histogram(
'rate_limit_wait_duration_seconds',
'Time spent waiting due to rate limiting',
['chat_id'],
buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0],
registry=self.registry
)
self.rate_limit_active_chats = Gauge(
'rate_limit_active_chats',
'Number of active chats with rate limiting',
registry=self.registry
)
self.rate_limit_success_rate = Gauge(
'rate_limit_success_rate',
'Success rate of rate limited requests',
['chat_id'],
registry=self.registry
)
self.rate_limit_requests_per_minute = Gauge(
'rate_limit_requests_per_minute',
'Requests per minute',
['chat_id'],
registry=self.registry
)
self.rate_limit_total_requests = Gauge(
'rate_limit_total_requests',
'Total number of requests',
['chat_id'],
registry=self.registry
)
self.rate_limit_total_errors = Gauge(
'rate_limit_total_errors',
'Total number of errors',
['chat_id', 'error_type'],
registry=self.registry
)
self.rate_limit_avg_wait_time_seconds = Gauge(
'rate_limit_avg_wait_time_seconds',
'Average wait time in seconds',
['chat_id'],
registry=self.registry
)
except Exception as e:
# Логируем ошибку, но не прерываем инициализацию
import logging
logging.warning(f"Failed to create rate limit metrics: {e}")
def record_command(self, command_type: str, handler_type: str = "unknown", user_type: str = "unknown", status: str = "success"):
"""Record a bot command execution."""
self.bot_commands_total.labels(
command=command_type,
status=status,
handler_type=handler_type,
user_type=user_type
).inc()
def record_error(self, error_type: str, handler_type: str = "unknown", method_name: str = "unknown"):
"""Record an error occurrence."""
self.errors_total.labels(
error_type=error_type,
handler_type=handler_type,
method_name=method_name
).inc()
def record_method_duration(self, method_name: str, duration: float, handler_type: str = "unknown", status: str = "success"):
"""Record method execution duration."""
self.method_duration_seconds.labels(
method_name=method_name,
handler_type=handler_type,
status=status
).observe(duration)
def set_active_users(self, count: int, user_type: str = "daily"):
"""Set the number of active users for a specific type."""
self.active_users.labels(user_type=user_type).set(count)
def set_total_users(self, count: int):
"""Set the total number of users in database."""
self.total_users.set(count)
def record_db_query(self, query_type: str, duration: float, table_name: str = "unknown", operation: str = "unknown"):
"""Record database query duration."""
self.db_query_duration_seconds.labels(
query_type=query_type,
table_name=table_name,
operation=operation
).observe(duration)
self.db_queries_total.labels(
query_type=query_type,
table_name=table_name,
operation=operation
).inc()
def record_message(self, message_type: str, chat_type: str = "unknown", handler_type: str = "unknown"):
"""Record a processed message."""
self.messages_processed_total.labels(
message_type=message_type,
chat_type=chat_type,
handler_type=handler_type
).inc()
def record_middleware(self, middleware_name: str, duration: float, status: str = "success"):
"""Record middleware execution duration."""
self.middleware_duration_seconds.labels(
middleware_name=middleware_name,
status=status
).observe(duration)
def record_file_download(self, content_type: str, file_size: int, duration: float):
"""Record file download metrics."""
self.file_downloads_total.labels(
content_type=content_type,
status="success"
).inc()
self.file_download_duration_seconds.labels(
content_type=content_type
).observe(duration)
self.file_download_size_bytes.labels(
content_type=content_type
).observe(file_size)
def record_file_download_error(self, content_type: str, error_message: str):
"""Record file download error metrics."""
self.file_downloads_total.labels(
content_type=content_type,
status="error"
).inc()
self.errors_total.labels(
error_type="file_download_error",
handler_type="media_processing",
method_name="download_file"
).inc()
def record_media_processing(self, content_type: str, duration: float, success: bool):
"""Record media processing metrics."""
status = "success" if success else "error"
self.media_processing_total.labels(
content_type=content_type,
status=status
).inc()
self.media_processing_duration_seconds.labels(
content_type=content_type
).observe(duration)
if not success:
self.errors_total.labels(
error_type="media_processing_error",
handler_type="media_processing",
method_name="add_in_db_media"
).inc()
def record_db_error(self, error_type: str, query_type: str = "unknown", table_name: str = "unknown", operation: str = "unknown"):
"""Record database error occurrence."""
self.db_errors_total.labels(
error_type=error_type,
query_type=query_type,
table_name=table_name,
operation=operation
).inc()
def record_rate_limit_request(self, chat_id: int, success: bool, wait_time: float = 0.0, error_type: str = None):
"""Record rate limit request metrics."""
try:
# Определяем статус
status = "success" if success else "error"
# Записываем счетчик запросов
self.rate_limit_requests_total.labels(
chat_id=str(chat_id),
status=status,
error_type=error_type or "none"
).inc()
# Записываем время ожидания
if wait_time > 0:
self.rate_limit_wait_duration_seconds.labels(
chat_id=str(chat_id)
).observe(wait_time)
# Записываем ошибки
if not success and error_type:
self.rate_limit_errors_total.labels(
error_type=error_type,
chat_id=str(chat_id)
).inc()
except Exception as e:
import logging
logging.warning(f"Failed to record rate limit request: {e}")
def update_rate_limit_gauges(self):
"""Update rate limit gauge metrics."""
try:
from .rate_limit_monitor import rate_limit_monitor
# Обновляем количество активных чатов
self.rate_limit_active_chats.set(len(rate_limit_monitor.stats))
# Обновляем метрики для каждого чата
for chat_id, chat_stats in rate_limit_monitor.stats.items():
chat_id_str = str(chat_id)
# Процент успеха
self.rate_limit_success_rate.labels(
chat_id=chat_id_str
).set(chat_stats.success_rate)
# Запросов в минуту
self.rate_limit_requests_per_minute.labels(
chat_id=chat_id_str
).set(chat_stats.requests_per_minute)
# Общее количество запросов
self.rate_limit_total_requests.labels(
chat_id=chat_id_str
).set(chat_stats.total_requests)
# Среднее время ожидания
self.rate_limit_avg_wait_time_seconds.labels(
chat_id=chat_id_str
).set(chat_stats.average_wait_time)
# Количество ошибок по типам
if chat_stats.retry_after_errors > 0:
self.rate_limit_total_errors.labels(
chat_id=chat_id_str,
error_type="RetryAfter"
).set(chat_stats.retry_after_errors)
if chat_stats.other_errors > 0:
self.rate_limit_total_errors.labels(
chat_id=chat_id_str,
error_type="Other"
).set(chat_stats.other_errors)
except Exception as e:
import logging
logging.warning(f"Failed to update rate limit gauges: {e}")
def get_metrics(self) -> bytes:
"""Generate metrics in Prometheus format."""
# Обновляем gauge метрики rate limiter перед генерацией
self.update_rate_limit_gauges()
return generate_latest(self.registry)
# Global metrics instance
metrics = BotMetrics()
# Decorators for easy metric collection
def track_time(method_name: str = None, handler_type: str = "unknown"):
"""Decorator to track execution time of functions."""
def decorator(func):
@wraps(func)
async def async_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = await func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_method_duration(
method_name or func.__name__,
duration,
handler_type,
"success"
)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_method_duration(
method_name or func.__name__,
duration,
handler_type,
"error"
)
metrics.record_error(
type(e).__name__,
handler_type,
method_name or func.__name__
)
raise
@wraps(func)
def sync_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_method_duration(
method_name or func.__name__,
duration,
handler_type,
"success"
)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_method_duration(
method_name or func.__name__,
duration,
handler_type,
"error"
)
metrics.record_error(
type(e).__name__,
handler_type,
method_name or func.__name__
)
raise
if asyncio.iscoroutinefunction(func):
return async_wrapper
return sync_wrapper
return decorator
def track_errors(handler_type: str = "unknown", method_name: str = None):
"""Decorator to track errors in functions."""
def decorator(func):
@wraps(func)
async def async_wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
metrics.record_error(
type(e).__name__,
handler_type,
method_name or func.__name__
)
raise
@wraps(func)
def sync_wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as e:
metrics.record_error(
type(e).__name__,
handler_type,
method_name or func.__name__
)
raise
if asyncio.iscoroutinefunction(func):
return async_wrapper
return sync_wrapper
return decorator
def db_query_time(query_type: str = "unknown", table_name: str = "unknown", operation: str = "unknown"):
"""Decorator to track database query execution time."""
def decorator(func):
@wraps(func)
async def async_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = await func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_db_query(query_type, duration, table_name, operation)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_db_query(query_type, duration, table_name, operation)
metrics.record_db_error(
type(e).__name__,
query_type,
table_name,
operation
)
metrics.record_error(
type(e).__name__,
"database",
func.__name__
)
raise
@wraps(func)
def sync_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_db_query(query_type, duration, table_name, operation)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_db_query(query_type, duration, table_name, operation)
metrics.record_db_error(
type(e).__name__,
query_type,
table_name,
operation
)
metrics.record_error(
type(e).__name__,
"database",
func.__name__
)
raise
if asyncio.iscoroutinefunction(func):
return async_wrapper
return sync_wrapper
return decorator
@asynccontextmanager
async def track_middleware(middleware_name: str):
"""Context manager to track middleware execution time."""
start_time = time.time()
try:
yield
duration = time.time() - start_time
metrics.record_middleware(middleware_name, duration, "success")
except Exception as e:
duration = time.time() - start_time
metrics.record_middleware(middleware_name, duration, "error")
metrics.record_error(
type(e).__name__,
"middleware",
middleware_name
)
raise
def track_media_processing(content_type: str = "unknown"):
"""Decorator to track media processing operations."""
def decorator(func):
@wraps(func)
async def async_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = await func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_media_processing(content_type, duration, True)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_media_processing(content_type, duration, False)
raise
@wraps(func)
def sync_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_media_processing(content_type, duration, True)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_media_processing(content_type, duration, False)
raise
if asyncio.iscoroutinefunction(func):
return async_wrapper
return sync_wrapper
return decorator
def track_file_operations(content_type: str = "unknown"):
"""Decorator to track file download/upload operations."""
def decorator(func):
@wraps(func)
async def async_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = await func(*args, **kwargs)
duration = time.time() - start_time
# Получаем размер файла из результата
file_size = 0
if result and isinstance(result, str) and os.path.exists(result):
file_size = os.path.getsize(result)
# Записываем метрики
metrics.record_file_download(content_type, file_size, duration)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_file_download_error(content_type, str(e))
raise
@wraps(func)
def sync_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = func(*args, **kwargs)
duration = time.time() - start_time
# Получаем размер файла из результата
file_size = 0
if result and isinstance(result, str) and os.path.exists(result):
file_size = os.path.getsize(result)
# Записываем метрики
metrics.record_file_download(content_type, file_size, duration)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_file_download_error(content_type, str(e))
raise
if asyncio.iscoroutinefunction(func):
return async_wrapper
return sync_wrapper
return decorator