Files
prod/infra/alertmanager/alertmanager.yml
Andrey 1db579797d refactor: update Nginx configuration and Docker setup
- Change user directive in Nginx configuration from 'nginx' to 'www-data'.
- Update upstream server configurations in Nginx to use 'localhost' instead of service names.
- Modify Nginx server block to redirect HTTP to a status page instead of Grafana.
- Rename Alertmanager location from '/alertmanager/' to '/alerts/' for consistency.
- Remove deprecated status page configuration and related files.
- Adjust Prometheus configuration to reflect the new Docker network settings.
2025-09-18 21:21:23 +03:00

120 lines
2.9 KiB
YAML

# Alertmanager Configuration
# This file configures how alerts are handled and routed
global:
  # SMTP configuration for email notifications.
  # NOTE(review): {{DOMAIN}} and {{SMTP_PASSWORD}} look like deploy-time
  # placeholders - confirm they are substituted before Alertmanager loads
  # this file; they are kept quoted so the raw text stays a valid YAML string.
  # NOTE(review): every receiver visible in this file uses webhook_configs,
  # so these SMTP settings only matter once an email_configs receiver exists.
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@{{DOMAIN}}'
  smtp_auth_username: 'alerts@{{DOMAIN}}'
  smtp_auth_password: '{{SMTP_PASSWORD}}'
  smtp_require_tls: true

  # Resolve timeout: how long Alertmanager waits before declaring an alert
  # resolved when the alert itself carries no end time and is not updated.
  resolve_timeout: 5m
# Notification template files, selected by glob pattern
templates:
  - '/etc/alertmanager/templates/*.tmpl'
# Route configuration - defines how alerts are routed.
# The top-level route is the fallback: an alert that matches none of the
# child routes below is delivered to its receiver. Child routes inherit any
# setting they do not override (here: group_by and group_interval).
route:
  # Alerts sharing the same values for these labels form one notification group
  group_by: ['alertname', 'cluster', 'service']
  # Initial wait before the first notification for a new group
  group_wait: 10s
  # Wait before notifying about new alerts added to an already-notified group
  group_interval: 10s
  # Wait before re-sending a notification that is still firing
  repeat_interval: 1h
  receiver: 'web.hook'

  # NOTE(review): child routes are evaluated in order and the first match
  # wins because no route sets `continue: true`. An alert labelled
  # severity=critical or severity=warning therefore stops at the severity
  # routes and never reaches the service-specific routes below - confirm
  # this precedence is intended.
  # NOTE(review): `match` is deprecated in favour of `matchers` in
  # Alertmanager >= 0.22; kept as-is here to avoid a version-dependent change.
  routes:
    # Critical alerts - immediate notification
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 5s
      repeat_interval: 5m

    # Warning alerts - grouped notification
    - match:
        severity: warning
      receiver: 'warning-alerts'
      group_wait: 30s
      repeat_interval: 30m

    # Bot-specific alerts
    - match:
        service: telegram-bot
      receiver: 'bot-alerts'
      group_wait: 10s
      repeat_interval: 15m
    - match:
        service: anon-bot
      receiver: 'bot-alerts'
      group_wait: 10s
      repeat_interval: 15m

    # Infrastructure alerts
    - match:
        service: prometheus
      receiver: 'infrastructure-alerts'
      group_wait: 30s
      repeat_interval: 1h
    - match:
        service: grafana
      receiver: 'infrastructure-alerts'
      group_wait: 30s
      repeat_interval: 1h
    - match:
        service: nginx
      receiver: 'infrastructure-alerts'
      group_wait: 30s
      repeat_interval: 1h
# Inhibition rules - suppress certain alerts when others are firing.
# A rule mutes alerts matching `target_match` while an alert matching
# `source_match` is firing, provided both carry equal values for every
# label listed in `equal`.
# NOTE(review): `source_match` / `target_match` are deprecated in favour of
# `source_matchers` / `target_matchers` in Alertmanager >= 0.22; kept as-is
# here to avoid a version-dependent change.
inhibit_rules:
  # Suppress warning alerts when critical alerts are firing
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'cluster', 'service']

  # Suppress individual instance alerts when the entire service is down
  - source_match:
      alertname: 'ServiceDown'
    target_match:
      alertname: 'InstanceDown'
    equal: ['service']
# Receiver configurations.
# Every receiver posts to an HTTP endpoint on localhost:5001, using a
# distinct path per alert class. `send_resolved: true` means a notification
# is also sent when the alerts in a group resolve.
receivers:
  # Default webhook receiver (for testing)
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:5001/'
        send_resolved: true

  # Critical alerts - immediate notification via webhook
  - name: 'critical-alerts'
    webhook_configs:
      - url: 'http://localhost:5001/critical'
        send_resolved: true

  # Warning alerts - less urgent notification
  - name: 'warning-alerts'
    webhook_configs:
      - url: 'http://localhost:5001/warning'
        send_resolved: true

  # Bot-specific alerts
  - name: 'bot-alerts'
    webhook_configs:
      - url: 'http://localhost:5001/bot'
        send_resolved: true

  # Infrastructure alerts
  - name: 'infrastructure-alerts'
    webhook_configs:
      - url: 'http://localhost:5001/infrastructure'
        send_resolved: true