Enhance metrics handling and logging in the bot

- Integrated starting and stopping of the metrics scheduler in `run_helper.py` for better resource management.
- Improved logging for metrics server operations in `server_prometheus.py`, ensuring clearer error reporting and status updates.
- Updated metrics middleware to collect comprehensive metrics for all event types, enhancing monitoring capabilities.
- Added active-user metrics tracking in `admin_handlers.py` to provide insight into user engagement.
- Refactored command and callback handling in `metrics_middleware.py` to improve clarity and error handling.
This commit is contained in:
2025-09-03 16:16:14 +03:00
parent c8c7d50cbb
commit fe06008930
6 changed files with 598 additions and 80 deletions

View File

@@ -5,11 +5,18 @@ Provides /metrics endpoint and health check for the bot.
"""
import asyncio
import logging
from aiohttp import web
from typing import Optional
from .utils.metrics import metrics
# Импортируем логгер из проекта
try:
from logs.custom_logger import logger
except ImportError:
# Fallback для случаев, когда custom_logger недоступен
import logging
logger = logging.getLogger(__name__)
class MetricsServer:
"""HTTP server for Prometheus metrics and health checks."""
@@ -20,7 +27,6 @@ class MetricsServer:
self.app = web.Application()
self.runner: Optional[web.AppRunner] = None
self.site: Optional[web.TCPSite] = None
self.logger = logging.getLogger(__name__)
# Настраиваем роуты
self.app.router.add_get('/metrics', self.metrics_handler)
@@ -29,11 +35,11 @@ class MetricsServer:
async def metrics_handler(self, request: web.Request) -> web.Response:
"""Handle /metrics endpoint for Prometheus scraping."""
try:
self.logger.debug("Generating metrics...")
logger.debug("Generating metrics...")
# Проверяем, что metrics доступен
if not metrics:
self.logger.error("Metrics object is not available")
logger.error("Metrics object is not available")
return web.Response(
text="Metrics not available",
status=500
@@ -41,16 +47,16 @@ class MetricsServer:
# Генерируем метрики в формате Prometheus
metrics_data = metrics.get_metrics()
self.logger.debug(f"Generated metrics: {len(metrics_data)} bytes")
logger.debug(f"Generated metrics: {len(metrics_data)} bytes")
return web.Response(
body=metrics_data,
content_type='text/plain; version=0.0.4'
)
except Exception as e:
self.logger.error(f"Error generating metrics: {e}")
logger.error(f"Error generating metrics: {e}")
import traceback
self.logger.error(f"Traceback: {traceback.format_exc()}")
logger.error(f"Traceback: {traceback.format_exc()}")
return web.Response(
text=f"Error generating metrics: {e}",
status=500
@@ -89,7 +95,7 @@ class MetricsServer:
status=200
)
except Exception as e:
self.logger.error(f"Health check failed: {e}")
logger.error(f"Health check failed: {e}")
return web.Response(
text=f"ERROR: Health check failed: {e}",
content_type='text/plain',
@@ -105,13 +111,13 @@ class MetricsServer:
self.site = web.TCPSite(self.runner, self.host, self.port)
await self.site.start()
self.logger.info(f"Metrics server started on {self.host}:{self.port}")
self.logger.info("Available endpoints:")
self.logger.info(f" - /metrics - Prometheus metrics")
self.logger.info(f" - /health - Health check")
logger.info(f"Metrics server started on {self.host}:{self.port}")
logger.info("Available endpoints:")
logger.info(f" - /metrics - Prometheus metrics")
logger.info(f" - /health - Health check")
except Exception as e:
self.logger.error(f"Failed to start metrics server: {e}")
logger.error(f"Failed to start metrics server: {e}")
raise
async def stop(self) -> None:
@@ -119,14 +125,14 @@ class MetricsServer:
try:
if self.site:
await self.site.stop()
self.logger.info("Metrics server site stopped")
logger.info("Metrics server site stopped")
if self.runner:
await self.runner.cleanup()
self.logger.info("Metrics server runner cleaned up")
logger.info("Metrics server runner cleaned up")
except Exception as e:
self.logger.error(f"Error stopping metrics server: {e}")
logger.error(f"Error stopping metrics server: {e}")
async def __aenter__(self):
"""Async context manager entry."""
@@ -156,10 +162,8 @@ async def stop_metrics_server() -> None:
if metrics_server:
try:
await metrics_server.stop()
logger = logging.getLogger(__name__)
logger.info("Metrics server stopped successfully")
except Exception as e:
logger = logging.getLogger(__name__)
logger.error(f"Error stopping metrics server: {e}")
finally:
metrics_server = None