Refactor metrics handling and improve logging

- Removed the MetricsManager initialization from `run_helper.py` to avoid duplication, as metrics are now handled in `main.py`.
- Updated logging levels in `server_prometheus.py` and `metrics_middleware.py` to use debug instead of info for less critical messages.
- Added metrics configuration to `BaseDependencyFactory` for better management of metrics settings.
- Deleted the obsolete `metrics_exporter.py` file to streamline the codebase.
- Updated various tests to reflect changes in the metrics handling and ensure proper functionality.
This commit is contained in:
2025-09-03 00:33:20 +03:00
parent 6fcecff97c
commit c8c7d50cbb
19 changed files with 402 additions and 605 deletions

View File

@@ -47,21 +47,18 @@ class MetricsMiddleware(BaseMiddleware):
) -> Any:
"""Process event and collect metrics."""
# Добавляем логирование для диагностики
self.logger.info(f"📊 MetricsMiddleware called for event type: {type(event).__name__}")
# Extract command info before execution
command_info = None
if isinstance(event, Message):
self.logger.info(f"📊 Processing Message event")
self.logger.debug(f"📊 Processing Message event")
await self._record_message_metrics(event)
command_info = self._extract_command_info(event)
elif isinstance(event, CallbackQuery):
self.logger.info(f"📊 Processing CallbackQuery event")
self.logger.debug(f"📊 Processing CallbackQuery event")
await self._record_callback_metrics(event)
command_info = self._extract_callback_command_info(event)
else:
self.logger.info(f"📊 Processing unknown event type: {type(event).__name__}")
self.logger.debug(f"📊 Processing unknown event type: {type(event).__name__}")
# Execute handler with timing
start_time = time.time()
@@ -71,7 +68,7 @@ class MetricsMiddleware(BaseMiddleware):
# Record successful execution
handler_name = self._get_handler_name(handler)
self.logger.info(f"📊 Recording successful execution: {handler_name}")
self.logger.debug(f"📊 Recording successful execution: {handler_name}")
metrics.record_method_duration(
handler_name,
duration,
@@ -95,7 +92,7 @@ class MetricsMiddleware(BaseMiddleware):
# Record error and timing
handler_name = self._get_handler_name(handler)
self.logger.error(f"📊 Recording error execution: {handler_name}, error: {type(e).__name__}")
self.logger.debug(f"📊 Recording error execution: {handler_name}, error: {type(e).__name__}")
metrics.record_method_duration(
handler_name,
duration,

View File

@@ -29,7 +29,7 @@ class MetricsServer:
async def metrics_handler(self, request: web.Request) -> web.Response:
"""Handle /metrics endpoint for Prometheus scraping."""
try:
self.logger.info("Generating metrics...")
self.logger.debug("Generating metrics...")
# Проверяем, что metrics доступен
if not metrics:
@@ -40,9 +40,8 @@ class MetricsServer:
)
# Генерируем метрики в формате Prometheus
self.logger.info("Calling metrics.get_metrics()...")
metrics_data = metrics.get_metrics()
self.logger.info(f"Generated metrics: {len(metrics_data)} bytes")
self.logger.debug(f"Generated metrics: {len(metrics_data)} bytes")
return web.Response(
body=metrics_data,

View File

@@ -43,6 +43,11 @@ class BaseDependencyFactory:
'test': self._parse_bool(os.getenv('TEST', 'false'))
}
self.settings['Metrics'] = {
'host': os.getenv('METRICS_HOST', '0.0.0.0'),
'port': self._parse_int(os.getenv('METRICS_PORT', '8080'))
}
def _parse_bool(self, value: str) -> bool:
"""Парсит строковое значение в boolean."""
return value.lower() in ('true', '1', 'yes', 'on')

View File

@@ -1,290 +0,0 @@
"""
Metrics exporter for Prometheus.
Provides HTTP endpoint for metrics collection and background metrics collection.
"""
import asyncio
import logging
from aiohttp import web
from typing import Optional, Dict, Any, Protocol
from .metrics import metrics
import time
class DatabaseProvider(Protocol):
"""Protocol for database operations."""
async def fetch_one(self, query: str, params: tuple = ()) -> Optional[Dict[str, Any]]:
"""Execute query and return single result."""
...
class MetricsCollector(Protocol):
"""Protocol for metrics collection operations."""
async def collect_user_metrics(self, db: DatabaseProvider) -> None:
"""Collect user-related metrics."""
...
class UserMetricsCollector:
"""Concrete implementation of user metrics collection."""
def __init__(self, logger: logging.Logger):
self.logger = logger
async def collect_user_metrics(self, db: DatabaseProvider) -> None:
"""Collect user-related metrics from database."""
try:
# Проверяем, есть ли метод fetch_one (асинхронная БД)
if hasattr(db, 'fetch_one'):
# Используем UNIX timestamp для сравнения с date_changed
current_timestamp = int(time.time())
one_day_ago = current_timestamp - (24 * 60 * 60) # 24 часа назад
active_users_query = """
SELECT COUNT(DISTINCT user_id) as active_users
FROM our_users
WHERE date_changed > ?
"""
result = await db.fetch_one(active_users_query, (one_day_ago,))
if result:
metrics.set_active_users(result['active_users'], 'daily')
self.logger.debug(f"Updated active users: {result['active_users']}")
else:
metrics.set_active_users(0, 'daily')
self.logger.debug("Updated active users: 0")
# Проверяем синхронную БД BotDB
elif hasattr(db, 'connect') and hasattr(db, 'cursor'):
# Используем синхронный запрос для BotDB в отдельном потоке
import asyncio
from concurrent.futures import ThreadPoolExecutor
current_timestamp = int(time.time())
one_day_ago = current_timestamp - (24 * 60 * 60) # 24 часа назад
active_users_query = """
SELECT COUNT(DISTINCT user_id) as active_users
FROM our_users
WHERE date_changed > ?
"""
def sync_db_query():
try:
db.connect()
db.cursor.execute(active_users_query, (one_day_ago,))
result = db.cursor.fetchone()
return result[0] if result else 0
finally:
db.close()
# Выполняем синхронный запрос в отдельном потоке
loop = asyncio.get_event_loop()
with ThreadPoolExecutor() as executor:
result = await loop.run_in_executor(executor, sync_db_query)
metrics.set_active_users(result, 'daily')
self.logger.debug(f"Updated active users: {result}")
else:
metrics.set_active_users(0, 'daily')
self.logger.warning("Database doesn't support fetch_one or connect methods")
except Exception as e:
self.logger.error(f"Error collecting user metrics: {e}")
metrics.set_active_users(0, 'daily')
class DependencyProvider(Protocol):
"""Protocol for dependency injection."""
def get_db(self) -> DatabaseProvider:
"""Get database instance."""
...
class BackgroundMetricsCollector:
"""Background service for collecting periodic metrics using dependency injection."""
def __init__(
self,
dependency_provider: DependencyProvider,
metrics_collector: MetricsCollector,
interval: int = 60
):
self.dependency_provider = dependency_provider
self.metrics_collector = metrics_collector
self.interval = interval
self.running = False
self.logger = logging.getLogger(__name__)
async def start(self):
"""Start background metrics collection."""
self.running = True
self.logger.info("Background metrics collector started")
while self.running:
try:
await self._collect_metrics()
await asyncio.sleep(self.interval)
except Exception as e:
self.logger.error(f"Error in background metrics collection: {e}")
await asyncio.sleep(self.interval)
async def stop(self):
"""Stop background metrics collection."""
self.running = False
self.logger.info("Background metrics collector stopped")
async def _collect_metrics(self):
"""Collect periodic metrics using dependency injection."""
try:
db = self.dependency_provider.get_db()
if db:
await self.metrics_collector.collect_user_metrics(db)
else:
self.logger.warning("Database not available for metrics collection")
except Exception as e:
self.logger.error(f"Error collecting metrics: {e}")
class MetricsExporter:
"""HTTP server for exposing Prometheus metrics."""
def __init__(self, host: str = "0.0.0.0", port: int = 8000):
self.host = host
self.port = port
self.app = web.Application()
self.runner: Optional[web.AppRunner] = None
self.site: Optional[web.TCPSite] = None
self.logger = logging.getLogger(__name__)
# Setup routes
self.app.router.add_get('/metrics', self.metrics_handler)
self.app.router.add_get('/health', self.health_handler)
self.app.router.add_get('/', self.root_handler)
async def start(self):
"""Start the metrics server."""
try:
self.runner = web.AppRunner(self.app)
await self.runner.setup()
self.site = web.TCPSite(self.runner, self.host, self.port)
await self.site.start()
self.logger.info(f"Metrics server started on {self.host}:{self.port}")
except Exception as e:
self.logger.error(f"Failed to start metrics server: {e}")
raise
async def stop(self):
"""Stop the metrics server."""
try:
if self.site:
await self.site.stop()
self.logger.info("Metrics server site stopped")
if self.runner:
await self.runner.cleanup()
self.logger.info("Metrics server runner cleaned up")
except Exception as e:
self.logger.error(f"Error stopping metrics server: {e}")
finally:
# Очищаем ссылки
self.site = None
self.runner = None
# Даем время на закрытие всех соединений
await asyncio.sleep(0.1)
self.logger.info("Metrics server stopped")
async def metrics_handler(self, request: web.Request) -> web.Response:
"""Handle /metrics endpoint for Prometheus."""
try:
metrics_data = metrics.get_metrics()
self.logger.debug(f"Generated metrics: {len(metrics_data)} bytes")
return web.Response(
body=metrics_data,
content_type='text/plain; version=0.0.4'
)
except Exception as e:
self.logger.error(f"Error generating metrics: {e}")
return web.Response(
text=f"Error generating metrics: {e}",
status=500
)
async def health_handler(self, request: web.Request) -> web.Response:
"""Handle /health endpoint for health checks."""
return web.json_response({
"status": "healthy",
"service": "telegram-bot-metrics"
})
async def root_handler(self, request: web.Request) -> web.Response:
"""Handle root endpoint with basic info."""
return web.json_response({
"service": "Telegram Bot Metrics Exporter",
"endpoints": {
"/metrics": "Prometheus metrics",
"/health": "Health check",
"/": "This info"
}
})
class MetricsManager:
"""Main class for managing metrics collection and export."""
def __init__(self, host: str = "0.0.0.0", port: int = 8000):
self.exporter = MetricsExporter(host, port)
# Dependency injection setup
from helper_bot.utils.base_dependency_factory import get_global_instance
dependency_provider = get_global_instance()
metrics_collector = UserMetricsCollector(logging.getLogger(__name__))
self.collector = BackgroundMetricsCollector(
dependency_provider=dependency_provider,
metrics_collector=metrics_collector
)
self.logger = logging.getLogger(__name__)
async def start(self):
"""Start metrics collection and export."""
try:
# Start metrics exporter
await self.exporter.start()
# Start background collector
asyncio.create_task(self.collector.start())
self.logger.info("Metrics manager started successfully")
except Exception as e:
self.logger.error(f"Failed to start metrics manager: {e}")
raise
async def stop(self):
"""Stop metrics collection and export."""
try:
# Останавливаем background collector
if hasattr(self, 'collector'):
await self.collector.stop()
self.logger.info("Background metrics collector stopped")
# Останавливаем exporter
if hasattr(self, 'exporter'):
await self.exporter.stop()
self.logger.info("Metrics exporter stopped")
except Exception as e:
self.logger.error(f"Error stopping metrics manager: {e}")
# Не вызываем raise, чтобы не прерывать процесс завершения
finally:
# Очищаем ссылки
self.collector = None
self.exporter = None
self.logger.info("Metrics manager stopped successfully")