# prod/infra/alertmanager/alertmanager.yml

# Alertmanager Configuration
# This file configures how alerts are handled and routed

global:
  # Resolve timeout: five minutes.
  resolve_timeout: 5m

  # Outgoing mail (SMTP) settings for email notifications. The {{DOMAIN}} and
  # {{SMTP_PASSWORD}} placeholders are presumably substituted by a deployment
  # step before Alertmanager loads this file -- TODO confirm.
  # NOTE(review): none of the receivers visible in this file define
  # email_configs, so these settings may currently be unused.
  smtp_require_tls: true
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@{{DOMAIN}}'
  smtp_auth_username: 'alerts@{{DOMAIN}}'
  smtp_auth_password: '{{SMTP_PASSWORD}}'

# Templates for alert formatting
templates: ['/etc/alertmanager/templates/*.tmpl']  # glob over *.tmpl files

# Route configuration - defines how alerts are routed
route:
  # Top-level route: default receiver plus the grouping and timing settings
  # that child routes override where they specify their own values.
  receiver: 'web.hook'
  group_by:
    - 'alertname'
    - 'cluster'
    - 'service'
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h

  # Child routes, kept in their original order.
  # NOTE(review): no child route sets `continue`, so presumably an alert stops
  # at the first route it matches; alerts labelled severity=critical or
  # severity=warning would then never reach the service-specific routes
  # further down -- confirm this ordering is intended.
  routes:
    # Severity: critical -> shortest group_wait and repeat_interval.
    - receiver: 'critical-alerts'
      match: {severity: critical}
      group_wait: 5s
      repeat_interval: 5m

    # Severity: warning -> longer wait so related warnings are batched.
    - receiver: 'warning-alerts'
      match: {severity: warning}
      group_wait: 30s
      repeat_interval: 30m

    # Bot services (telegram-bot, anon-bot) share one receiver and timing.
    - receiver: 'bot-alerts'
      match: {service: telegram-bot}
      group_wait: 10s
      repeat_interval: 15m

    - receiver: 'bot-alerts'
      match: {service: anon-bot}
      group_wait: 10s
      repeat_interval: 15m

    # Infrastructure services (prometheus, grafana, nginx) share one receiver
    # and timing.
    - receiver: 'infrastructure-alerts'
      match: {service: prometheus}
      group_wait: 30s
      repeat_interval: 1h

    - receiver: 'infrastructure-alerts'
      match: {service: grafana}
      group_wait: 30s
      repeat_interval: 1h

    - receiver: 'infrastructure-alerts'
      match: {service: nginx}
      group_wait: 30s
      repeat_interval: 1h

# Inhibition rules - suppress certain alerts when others are firing
inhibit_rules:
  # Source: severity=critical. Target: severity=warning. The rule applies only
  # when source and target agree on the alertname, cluster and service labels.
  - source_match: {severity: 'critical'}
    target_match: {severity: 'warning'}
    equal:
      - 'alertname'
      - 'cluster'
      - 'service'

  # Source: alertname=ServiceDown. Target: alertname=InstanceDown. The rule
  # applies only when both alerts carry the same service label.
  - source_match: {alertname: 'ServiceDown'}
    target_match: {alertname: 'InstanceDown'}
    equal:
      - 'service'

# Receiver configurations
receivers:
  # Every receiver below is a webhook on localhost:5001 that differs only in
  # its URL path, and each one also sends resolved notifications.

  # Receiver named by the top-level route (marked as "for testing").
  - name: 'web.hook'
    webhook_configs:
      - send_resolved: true
        url: 'http://localhost:5001/'

  # Target of the severity=critical route.
  - name: 'critical-alerts'
    webhook_configs:
      - send_resolved: true
        url: 'http://localhost:5001/critical'

  # Target of the severity=warning route.
  - name: 'warning-alerts'
    webhook_configs:
      - send_resolved: true
        url: 'http://localhost:5001/warning'

  # Target of the telegram-bot and anon-bot service routes.
  - name: 'bot-alerts'
    webhook_configs:
      - send_resolved: true
        url: 'http://localhost:5001/bot'

  # Target of the prometheus, grafana and nginx service routes.
  - name: 'infrastructure-alerts'
    webhook_configs:
      - send_resolved: true
        url: 'http://localhost:5001/infrastructure'