Files
telegram-helper-bot/helper_bot/utils/metrics.py
Andrey fc0517c011 Enhance bot functionality with new features and improvements
- Added a new `/status` endpoint in `server_prometheus.py` to provide process status information, including uptime and resource usage metrics.
- Implemented a PID manager in `run_helper.py` to track the bot's process, improving monitoring capabilities.
- Introduced a method to delete audio moderation records in `audio_repository.py`, enhancing database management.
- Updated voice message handling in callback handlers to ensure proper deletion of audio moderation records.
- Improved error handling and logging in various services, ensuring better tracking of media processing and file downloads.
- Refactored media handling functions to streamline operations and improve code readability.
- Enhanced metrics tracking for file downloads and media processing, providing better insights into bot performance.
2025-09-04 00:46:45 +03:00

452 lines
16 KiB
Python

"""
Metrics module for Telegram bot monitoring with Prometheus.
Provides predefined metrics for bot commands, errors, performance, and user activity.
"""
from typing import Dict, Any, Optional
from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST
from prometheus_client.core import CollectorRegistry
import time
from functools import wraps
import asyncio
from contextlib import asynccontextmanager
class BotMetrics:
"""Central class for managing all bot metrics."""
def __init__(self):
self.registry = CollectorRegistry()
# Bot commands counter
self.bot_commands_total = Counter(
'bot_commands_total',
'Total number of bot commands processed',
['command', 'status', 'handler_type', 'user_type'],
registry=self.registry
)
# Method execution time histogram
self.method_duration_seconds = Histogram(
'method_duration_seconds',
'Time spent executing methods',
['method_name', 'handler_type', 'status'],
# Оптимизированные buckets для Telegram API (обычно < 1 сек)
buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
registry=self.registry
)
# Errors counter
self.errors_total = Counter(
'errors_total',
'Total number of errors',
['error_type', 'handler_type', 'method_name'],
registry=self.registry
)
# Active users gauge
self.active_users = Gauge(
'active_users',
'Number of currently active users',
['user_type'],
registry=self.registry
)
# Total users gauge (отдельная метрика)
self.total_users = Gauge(
'total_users',
'Total number of users in database',
registry=self.registry
)
# Database query metrics
self.db_query_duration_seconds = Histogram(
'db_query_duration_seconds',
'Time spent executing database queries',
['query_type', 'table_name', 'operation'],
# Оптимизированные buckets для SQLite/PostgreSQL
buckets=[0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5],
registry=self.registry
)
# Database queries counter
self.db_queries_total = Counter(
'db_queries_total',
'Total number of database queries executed',
['query_type', 'table_name', 'operation'],
registry=self.registry
)
# Database errors counter
self.db_errors_total = Counter(
'db_errors_total',
'Total number of database errors',
['error_type', 'query_type', 'table_name', 'operation'],
registry=self.registry
)
# Message processing metrics
self.messages_processed_total = Counter(
'messages_processed_total',
'Total number of messages processed',
['message_type', 'chat_type', 'handler_type'],
registry=self.registry
)
# Middleware execution metrics
self.middleware_duration_seconds = Histogram(
'middleware_duration_seconds',
'Time spent in middleware execution',
['middleware_name', 'status'],
# Middleware должен быть быстрым
buckets=[0.001, 0.005, 0.01, 0.05, 0.1, 0.25],
registry=self.registry
)
# Rate limiting metrics
self.rate_limit_hits_total = Counter(
'rate_limit_hits_total',
'Total number of rate limit hits',
['limit_type', 'user_id', 'action'],
registry=self.registry
)
# User activity metrics
self.user_activity_total = Counter(
'user_activity_total',
'Total user activity events',
['activity_type', 'user_type', 'chat_type'],
registry=self.registry
)
# File download metrics
self.file_downloads_total = Counter(
'file_downloads_total',
'Total number of file downloads',
['content_type', 'status'],
registry=self.registry
)
self.file_download_duration_seconds = Histogram(
'file_download_duration_seconds',
'Time spent downloading files',
['content_type'],
buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0],
registry=self.registry
)
self.file_download_size_bytes = Histogram(
'file_download_size_bytes',
'Size of downloaded files in bytes',
['content_type'],
buckets=[1024, 10240, 102400, 1048576, 10485760, 104857600, 1073741824],
registry=self.registry
)
# Media processing metrics
self.media_processing_total = Counter(
'media_processing_total',
'Total number of media processing operations',
['content_type', 'status'],
registry=self.registry
)
self.media_processing_duration_seconds = Histogram(
'media_processing_duration_seconds',
'Time spent processing media',
['content_type'],
buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0],
registry=self.registry
)
def record_command(self, command_type: str, handler_type: str = "unknown", user_type: str = "unknown", status: str = "success"):
"""Record a bot command execution."""
self.bot_commands_total.labels(
command=command_type,
status=status,
handler_type=handler_type,
user_type=user_type
).inc()
def record_error(self, error_type: str, handler_type: str = "unknown", method_name: str = "unknown"):
"""Record an error occurrence."""
self.errors_total.labels(
error_type=error_type,
handler_type=handler_type,
method_name=method_name
).inc()
def record_method_duration(self, method_name: str, duration: float, handler_type: str = "unknown", status: str = "success"):
"""Record method execution duration."""
self.method_duration_seconds.labels(
method_name=method_name,
handler_type=handler_type,
status=status
).observe(duration)
def set_active_users(self, count: int, user_type: str = "daily"):
"""Set the number of active users for a specific type."""
self.active_users.labels(user_type=user_type).set(count)
def set_total_users(self, count: int):
"""Set the total number of users in database."""
self.total_users.set(count)
def record_db_query(self, query_type: str, duration: float, table_name: str = "unknown", operation: str = "unknown"):
"""Record database query duration."""
self.db_query_duration_seconds.labels(
query_type=query_type,
table_name=table_name,
operation=operation
).observe(duration)
self.db_queries_total.labels(
query_type=query_type,
table_name=table_name,
operation=operation
).inc()
def record_message(self, message_type: str, chat_type: str = "unknown", handler_type: str = "unknown"):
"""Record a processed message."""
self.messages_processed_total.labels(
message_type=message_type,
chat_type=chat_type,
handler_type=handler_type
).inc()
def record_middleware(self, middleware_name: str, duration: float, status: str = "success"):
"""Record middleware execution duration."""
self.middleware_duration_seconds.labels(
middleware_name=middleware_name,
status=status
).observe(duration)
def record_file_download(self, content_type: str, file_size: int, duration: float):
"""Record file download metrics."""
self.file_downloads_total.labels(
content_type=content_type,
status="success"
).inc()
self.file_download_duration_seconds.labels(
content_type=content_type
).observe(duration)
self.file_download_size_bytes.labels(
content_type=content_type
).observe(file_size)
def record_file_download_error(self, content_type: str, error_message: str):
"""Record file download error metrics."""
self.file_downloads_total.labels(
content_type=content_type,
status="error"
).inc()
self.errors_total.labels(
error_type="file_download_error",
handler_type="media_processing",
method_name="download_file"
).inc()
def record_media_processing(self, content_type: str, duration: float, success: bool):
"""Record media processing metrics."""
status = "success" if success else "error"
self.media_processing_total.labels(
content_type=content_type,
status=status
).inc()
self.media_processing_duration_seconds.labels(
content_type=content_type
).observe(duration)
if not success:
self.errors_total.labels(
error_type="media_processing_error",
handler_type="media_processing",
method_name="add_in_db_media"
).inc()
def get_metrics(self) -> bytes:
"""Generate metrics in Prometheus format."""
return generate_latest(self.registry)
# Global metrics instance
metrics = BotMetrics()
# Decorators for easy metric collection
def track_time(method_name: str = None, handler_type: str = "unknown"):
"""Decorator to track execution time of functions."""
def decorator(func):
@wraps(func)
async def async_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = await func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_method_duration(
method_name or func.__name__,
duration,
handler_type,
"success"
)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_method_duration(
method_name or func.__name__,
duration,
handler_type,
"error"
)
metrics.record_error(
type(e).__name__,
handler_type,
method_name or func.__name__
)
raise
@wraps(func)
def sync_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_method_duration(
method_name or func.__name__,
duration,
handler_type,
"success"
)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_method_duration(
method_name or func.__name__,
duration,
handler_type,
"error"
)
metrics.record_error(
type(e).__name__,
handler_type,
method_name or func.__name__
)
raise
if asyncio.iscoroutinefunction(func):
return async_wrapper
return sync_wrapper
return decorator
def track_errors(handler_type: str = "unknown", method_name: str = None):
"""Decorator to track errors in functions."""
def decorator(func):
@wraps(func)
async def async_wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
metrics.record_error(
type(e).__name__,
handler_type,
method_name or func.__name__
)
raise
@wraps(func)
def sync_wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as e:
metrics.record_error(
type(e).__name__,
handler_type,
method_name or func.__name__
)
raise
if asyncio.iscoroutinefunction(func):
return async_wrapper
return sync_wrapper
return decorator
def db_query_time(query_type: str = "unknown", table_name: str = "unknown", operation: str = "unknown"):
"""Decorator to track database query execution time."""
def decorator(func):
@wraps(func)
async def async_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = await func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_db_query(query_type, duration, table_name, operation)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_db_query(query_type, duration, table_name, operation)
metrics.record_db_error(
type(e).__name__,
query_type,
table_name,
operation
)
metrics.record_error(
type(e).__name__,
"database",
func.__name__
)
raise
@wraps(func)
def sync_wrapper(*args, **kwargs):
start_time = time.time()
try:
result = func(*args, **kwargs)
duration = time.time() - start_time
metrics.record_db_query(query_type, duration, table_name, operation)
return result
except Exception as e:
duration = time.time() - start_time
metrics.record_db_query(query_type, duration, table_name, operation)
metrics.record_db_error(
type(e).__name__,
query_type,
table_name,
operation
)
metrics.record_error(
type(e).__name__,
"database",
func.__name__
)
raise
if asyncio.iscoroutinefunction(func):
return async_wrapper
return sync_wrapper
return decorator
@asynccontextmanager
async def track_middleware(middleware_name: str):
"""Context manager to track middleware execution time."""
start_time = time.time()
try:
yield
duration = time.time() - start_time
metrics.record_middleware(middleware_name, duration, "success")
except Exception as e:
duration = time.time() - start_time
metrics.record_middleware(middleware_name, duration, "error")
metrics.record_error(
type(e).__name__,
"middleware",
middleware_name
)
raise