Files
prod/infra/monitoring/prometheus_server.py

144 lines
5.9 KiB
Python

import asyncio
import logging
from aiohttp import web
try:
    # Preferred form: relative import, valid when this module is loaded as
    # part of its package.
    from .metrics_collector import MetricsCollector
except ImportError:
    # Relative imports raise ImportError when there is no parent package
    # (e.g. the file is run directly); fall back to a top-level import.
    from metrics_collector import MetricsCollector

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class PrometheusServer:
    """aiohttp application that serves collector metrics to Prometheus.

    Routes registered at construction time:

    * ``/``        - plain-text index of the available endpoints
    * ``/metrics`` - metrics in the Prometheus text exposition format
    * ``/health``  - always answers ``OK``
    """

    # (key in the collector's metrics dict == exported metric name, HELP text),
    # listed in the order the samples are written to the output.
    _GAUGES = (
        ("cpu_usage_percent", "CPU usage percentage"),
        ("load_average_1m", "1 minute load average"),
        ("load_average_5m", "5 minute load average"),
        ("load_average_15m", "15 minute load average"),
        ("ram_usage_percent", "RAM usage percentage"),
        ("disk_usage_percent", "Disk usage percentage"),
        ("disk_io_percent", "Disk I/O usage percentage"),
        ("swap_usage_percent", "Swap usage percentage"),
        ("system_uptime_seconds", "System uptime in seconds"),
        ("monitor_uptime_seconds", "Monitor uptime in seconds"),
    )

    def __init__(self, host='0.0.0.0', port=9091):
        """Create the collector and the web application and register routes.

        Args:
            host: Interface address the HTTP server binds to.
            port: TCP port the HTTP server listens on.
        """
        self.host = host
        self.port = port
        self.metrics_collector = MetricsCollector()
        self.app = web.Application()
        self.setup_routes()

    def setup_routes(self):
        """Register the GET handlers for ``/``, ``/metrics`` and ``/health``."""
        self.app.router.add_get('/', self.root_handler)
        self.app.router.add_get('/metrics', self.metrics_handler)
        self.app.router.add_get('/health', self.health_handler)

    async def root_handler(self, request):
        """Return a plain-text index page listing the available endpoints."""
        return web.Response(
            text="Prometheus Metrics Server\n\n"
                 "Available endpoints:\n"
                 "- /metrics - Prometheus metrics\n"
                 "- /health - Health check",
            content_type='text/plain'
        )

    async def health_handler(self, request):
        """Health check endpoint: unconditionally respond with ``OK``."""
        return web.Response(
            text="OK",
            content_type='text/plain'
        )

    async def metrics_handler(self, request):
        """Serve the current metrics in Prometheus text format.

        Returns:
            A 200 plain-text response with the formatted metrics, or a 500
            plain-text response carrying the error message if collecting or
            formatting the metrics raised.
        """
        try:
            # NOTE(review): get_metrics_data() is called synchronously on the
            # event loop; confirm the collector does not block for long.
            metrics_data = self.metrics_collector.get_metrics_data()
            prometheus_metrics = self._format_prometheus_metrics(metrics_data)
        except Exception as e:
            # logger.exception keeps the traceback, which logger.error dropped.
            logger.exception("Ошибка при получении метрик: %s", e)
            return web.Response(
                text=f"Error: {str(e)}",
                status=500,
                content_type='text/plain'
            )
        return web.Response(
            text=prometheus_metrics,
            content_type='text/plain'
        )

    @staticmethod
    def _escape_label_value(value) -> str:
        """Escape a label value for the Prometheus text format.

        Backslash, double quote and line feed are the characters that must be
        escaped inside a quoted label value.
        """
        return (
            str(value)
            .replace("\\", "\\\\")  # must run first so later escapes survive
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

    def _format_prometheus_metrics(self, metrics_data: dict) -> str:
        """Render the collected metrics in Prometheus text exposition format.

        Args:
            metrics_data: Mapping of metric name to value. Only the names
                listed in ``_GAUGES`` are exported; missing keys and ``None``
                values are skipped. ``None`` or an empty mapping yields only
                the ``system_info`` sample.

        Returns:
            The exposition text. Every line, including the last one, ends
            with a line feed, as the text format requires.
        """
        data = metrics_data or {}
        os_label = self._escape_label_value(self.metrics_collector.os_type)

        # Constant-valued info metric carrying the OS type as a label.
        lines = [
            "# HELP system_info System information",
            "# TYPE system_info gauge",
            f'system_info{{os="{os_label}"}} 1',
        ]

        for name, help_text in self._GAUGES:
            value = data.get(name)
            if value is None:
                # "None" is not a valid sample value; omit the metric instead.
                continue
            lines.append(f"# HELP {name} {help_text}")
            lines.append(f"# TYPE {name} gauge")
            lines.append(f"{name} {value}")

        return "\n".join(lines) + "\n"

    async def start(self):
        """Start the HTTP server.

        Returns:
            The ``web.AppRunner`` that owns the server; pass it to ``stop()``.

        Raises:
            Whatever ``TCPSite`` creation or start raises (e.g. the port is
            already in use); the runner is cleaned up before re-raising.
        """
        runner = web.AppRunner(self.app)
        await runner.setup()
        try:
            site = web.TCPSite(runner, self.host, self.port)
            await site.start()
        except BaseException:
            # Do not leak the already set-up runner when binding fails or the
            # task is cancelled; the original error is always re-raised.
            await runner.cleanup()
            raise
        logger.info(
            "Prometheus сервер запущен на http://%s:%s", self.host, self.port
        )
        return runner

    async def stop(self, runner):
        """Stop the HTTP server by cleaning up the runner from ``start()``."""
        await runner.cleanup()
        logger.info("Prometheus сервер остановлен")