Files
telegram-helper-bot/helper_bot/server_prometheus.py
Andrey c8c7d50cbb Refactor metrics handling and improve logging
- Removed the MetricsManager initialization from `run_helper.py` to avoid duplication, as metrics are now handled in `main.py`.
- Updated logging levels in `server_prometheus.py` and `metrics_middleware.py` to use debug instead of info for less critical messages.
- Added metrics configuration to `BaseDependencyFactory` for better management of metrics settings.
- Deleted the obsolete `metrics_exporter.py` file to streamline the codebase.
- Updated various tests to reflect changes in the metrics handling and ensure proper functionality.
2025-09-03 00:33:20 +03:00

166 lines
5.8 KiB
Python

"""
HTTP server for metrics endpoint integration with centralized Prometheus monitoring.
Provides /metrics endpoint and health check for the bot.
"""
import asyncio
import logging
from aiohttp import web
from typing import Optional
from .utils.metrics import metrics
class MetricsServer:
    """HTTP server for Prometheus metrics and health checks.

    Routes registered on the aiohttp application:

    * ``GET /metrics`` - payload returned by ``metrics.get_metrics()``.
    * ``GET /health``  - ``OK`` when metrics can be generated.

    The server can be driven explicitly through :meth:`start` / :meth:`stop`
    or used as an async context manager (``async with MetricsServer(): ...``).
    """

    def __init__(self, host: str = '0.0.0.0', port: int = 8080):
        """Create the application and register routes; nothing is bound yet.

        Args:
            host: Interface passed to ``web.TCPSite`` by :meth:`start`.
            port: TCP port passed to ``web.TCPSite`` by :meth:`start`.
        """
        self.host = host
        self.port = port
        self.app = web.Application()
        # Both are None whenever the server is not running: before start(),
        # after a failed start(), and after stop().
        self.runner: Optional[web.AppRunner] = None
        self.site: Optional[web.TCPSite] = None
        self.logger = logging.getLogger(__name__)

        # Register routes.
        self.app.router.add_get('/metrics', self.metrics_handler)
        self.app.router.add_get('/health', self.health_handler)

    async def metrics_handler(self, request: web.Request) -> web.Response:
        """Handle /metrics endpoint for Prometheus scraping.

        Args:
            request: Incoming request (unused).

        Returns:
            200 with the payload of ``metrics.get_metrics()``, or 500 with a
            plain-text message when the metrics object is falsy or metric
            generation raises.
        """
        try:
            self.logger.debug("Generating metrics...")

            # Make sure the metrics object is available.
            if not metrics:
                self.logger.error("Metrics object is not available")
                return web.Response(
                    text="Metrics not available",
                    status=500
                )

            # Generate metrics in the Prometheus format.
            metrics_data = metrics.get_metrics()
            self.logger.debug(f"Generated metrics: {len(metrics_data)} bytes")

            return web.Response(
                body=metrics_data,
                content_type='text/plain; version=0.0.4'
            )
        except Exception as e:
            # exc_info attaches the traceback to this single log record, so
            # no function-level ``import traceback`` is needed.
            self.logger.error(f"Error generating metrics: {e}", exc_info=True)
            return web.Response(
                text=f"Error generating metrics: {e}",
                status=500
            )

    async def health_handler(self, request: web.Request) -> web.Response:
        """Handle /health endpoint for health checks.

        Args:
            request: Incoming request (unused).

        Returns:
            200 ``OK`` when ``metrics.get_metrics()`` yields a non-empty
            result; 503 when the metrics object is falsy, the result is empty
            or generation raises; 500 on any other unexpected error.
        """
        try:
            # Check that the metrics object is available.
            if not metrics:
                return web.Response(
                    text="ERROR: Metrics not available",
                    content_type='text/plain',
                    status=503
                )

            # Check that metrics can actually be produced.
            try:
                metrics_data = metrics.get_metrics()
                if not metrics_data:
                    return web.Response(
                        text="ERROR: Empty metrics",
                        content_type='text/plain',
                        status=503
                    )
            except Exception as e:
                return web.Response(
                    text=f"ERROR: Metrics generation failed: {e}",
                    content_type='text/plain',
                    status=503
                )

            return web.Response(
                text="OK",
                content_type='text/plain',
                status=200
            )
        except Exception as e:
            self.logger.error(f"Health check failed: {e}")
            return web.Response(
                text=f"ERROR: Health check failed: {e}",
                content_type='text/plain',
                status=500
            )

    async def start(self) -> None:
        """Start the HTTP server.

        Raises:
            Exception: Whatever runner setup or site start raised (for
                example ``OSError`` when the port is busy). Before
                re-raising, any partially initialised runner is cleaned up
                and ``runner`` / ``site`` are reset to ``None``.
        """
        try:
            self.runner = web.AppRunner(self.app)
            await self.runner.setup()

            self.site = web.TCPSite(self.runner, self.host, self.port)
            await self.site.start()
        except Exception as e:
            self.logger.error(f"Failed to start metrics server: {e}")
            # Release whatever was set up before the failure; otherwise the
            # runner would linger until somebody remembered to call stop().
            failed_runner = self.runner
            self.runner = None
            self.site = None
            if failed_runner is not None:
                try:
                    await failed_runner.cleanup()
                except Exception as cleanup_error:
                    # Never let cleanup mask the original start-up error.
                    self.logger.error(
                        f"Error cleaning up after failed start: {cleanup_error}"
                    )
            raise
        else:
            self.logger.info(f"Metrics server started on {self.host}:{self.port}")
            self.logger.info("Available endpoints:")
            self.logger.info(" - /metrics - Prometheus metrics")
            self.logger.info(" - /health - Health check")

    async def stop(self) -> None:
        """Stop the HTTP server.

        Best-effort: errors are logged and never propagated. Calling it more
        than once, or without a prior successful :meth:`start`, does nothing.
        """
        site, runner = self.site, self.runner
        # Drop the references first so a repeated stop() cannot touch a site
        # or runner that has already been shut down.
        self.site = None
        self.runner = None

        if site is not None:
            try:
                await site.stop()
                self.logger.info("Metrics server site stopped")
            except Exception as e:
                self.logger.error(f"Error stopping metrics server: {e}")

        # Handled separately so a failing site.stop() cannot skip the runner
        # cleanup.
        if runner is not None:
            try:
                await runner.cleanup()
                self.logger.info("Metrics server runner cleaned up")
            except Exception as e:
                self.logger.error(f"Error stopping metrics server: {e}")

    async def __aenter__(self):
        """Async context manager entry: start the server and return it."""
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: stop the server."""
        await self.stop()
# Global server instance for use in main.py.
# Assigned by start_metrics_server() and reset to None by stop_metrics_server().
metrics_server: Optional[MetricsServer] = None
async def start_metrics_server(host: str = '0.0.0.0', port: int = 8080) -> MetricsServer:
    """Start metrics server and return instance.

    The new instance is stored in the module-level ``metrics_server``. If a
    previous instance is still stored there, it is stopped first so its
    runner and listening socket are not orphaned when the reference is
    overwritten.

    Args:
        host: Interface passed to ``MetricsServer``.
        port: TCP port passed to ``MetricsServer``.

    Returns:
        The started ``MetricsServer`` instance.

    Raises:
        Exception: Whatever ``MetricsServer.start()`` raises.
    """
    global metrics_server

    previous = metrics_server
    if previous is not None:
        logging.getLogger(__name__).warning(
            "Metrics server already started; stopping the previous instance"
        )
        await previous.stop()

    metrics_server = MetricsServer(host, port)
    await metrics_server.start()
    return metrics_server
async def stop_metrics_server() -> None:
    """Stop the module-level metrics server, if one is stored.

    Does nothing when ``metrics_server`` is falsy. Otherwise awaits its
    ``stop()``, logs the outcome and always resets the global to ``None``.
    """
    global metrics_server

    if not metrics_server:
        return

    log = logging.getLogger(__name__)
    try:
        await metrics_server.stop()
        log.info("Metrics server stopped successfully")
    except Exception as e:
        log.error(f"Error stopping metrics server: {e}")
    finally:
        # Forget the instance whether or not stopping succeeded.
        metrics_server = None