- Change user directive in Nginx configuration from 'nginx' to 'www-data'. - Update upstream server configurations in Nginx to use 'localhost' instead of service names. - Modify Nginx server block to redirect HTTP to a status page instead of Grafana. - Rename Alertmanager location from '/alertmanager/' to '/alerts/' for consistency. - Remove deprecated status page configuration and related files. - Adjust Prometheus configuration to reflect the new Docker network settings.
120 lines
2.9 KiB
YAML
120 lines
2.9 KiB
YAML
# Alertmanager Configuration
|
|
# This file configures how alerts are handled and routed
|
|
|
|
global:
  # Resolve timeout applied to alerts
  resolve_timeout: 5m

  # SMTP settings for email notifications.
  # NOTE(review): {{DOMAIN}} and {{SMTP_PASSWORD}} look like placeholders that
  # are substituted before this file is loaded - confirm against the deploy
  # tooling. They stay quoted so the rendered values remain strings.
  smtp_smarthost: 'localhost:587'
  smtp_require_tls: true
  smtp_from: 'alerts@{{DOMAIN}}'
  smtp_auth_username: 'alerts@{{DOMAIN}}'
  smtp_auth_password: '{{SMTP_PASSWORD}}'

# Glob of template files used for alert formatting
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Route configuration - defines how alerts are routed.
#
# Alerts enter at the root route and are compared against the child routes in
# the order listed. No child route sets `continue`, so the first child that
# matches handles the alert; anything unmatched stays with the root receiver.
#
# Child routes use the `matchers` list; the older `match` map is deprecated
# (requires Alertmanager >= 0.22).
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    # Critical alerts - immediate notification
    - matchers:
        - 'severity = "critical"'
      receiver: 'critical-alerts'
      group_wait: 5s
      repeat_interval: 5m

    # Warning alerts - grouped notification
    - matchers:
        - 'severity = "warning"'
      receiver: 'warning-alerts'
      group_wait: 30s
      repeat_interval: 30m

    # Bot-specific alerts
    - matchers:
        - 'service = "telegram-bot"'
      receiver: 'bot-alerts'
      group_wait: 10s
      repeat_interval: 15m

    - matchers:
        - 'service = "anon-bot"'
      receiver: 'bot-alerts'
      group_wait: 10s
      repeat_interval: 15m

    # Infrastructure alerts
    - matchers:
        - 'service = "prometheus"'
      receiver: 'infrastructure-alerts'
      group_wait: 30s
      repeat_interval: 1h

    - matchers:
        - 'service = "grafana"'
      receiver: 'infrastructure-alerts'
      group_wait: 30s
      repeat_interval: 1h

    - matchers:
        - 'service = "nginx"'
      receiver: 'infrastructure-alerts'
      group_wait: 30s
      repeat_interval: 1h

# Inhibition rules - suppress certain alerts when others are firing.
#
# A target alert is muted while a source alert is firing and both carry the
# same values for every label listed under `equal`.
# `source_matchers` / `target_matchers` replace the deprecated
# `source_match` / `target_match` maps (requires Alertmanager >= 0.22).
inhibit_rules:
  # Suppress warning alerts when critical alerts are firing
  - source_matchers:
      - 'severity = "critical"'
    target_matchers:
      - 'severity = "warning"'
    equal: ['alertname', 'cluster', 'service']

  # Suppress individual instance alerts when the entire service is down
  - source_matchers:
      - 'alertname = "ServiceDown"'
    target_matchers:
      - 'alertname = "InstanceDown"'
    equal: ['service']

# Receiver configurations.
# Every receiver below posts to a webhook on localhost:5001 and also sends
# resolved notifications (`send_resolved: true`); only the URL path differs.
receivers:
  # Default webhook receiver (for testing)
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:5001/'
        send_resolved: true

  # Critical alerts - immediate notification via webhook
  - name: 'critical-alerts'
    webhook_configs:
      - url: 'http://localhost:5001/critical'
        send_resolved: true

  # Warning alerts - less urgent notification
  - name: 'warning-alerts'
    webhook_configs:
      - url: 'http://localhost:5001/warning'
        send_resolved: true

  # Bot-specific alerts
  - name: 'bot-alerts'
    webhook_configs:
      - url: 'http://localhost:5001/bot'
        send_resolved: true

  # Infrastructure alerts
  - name: 'infrastructure-alerts'
    webhook_configs:
      - url: 'http://localhost:5001/infrastructure'
        send_resolved: true