feat: integrate Uptime Kuma and Alertmanager into Docker setup
- Add Uptime Kuma service for status monitoring with health checks.
- Introduce Alertmanager service for alert management and notifications.
- Update docker-compose.yml to include new services and their configurations.
- Enhance Makefile with commands for managing Uptime Kuma and Alertmanager logs.
- Modify Ansible playbook to install necessary packages and configure SSL for new services.
- Update Nginx configuration to route traffic to Uptime Kuma and Alertmanager.
- Adjust Prometheus configuration to include alert rules and external URLs.
This commit is contained in:
185
infra/alertmanager/alertmanager.yml
Normal file
185
infra/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,185 @@
|
||||
# Alertmanager Configuration
|
||||
# This file configures how alerts are handled and routed
|
||||
|
||||
global:
  # Time after which an alert is declared resolved if it has not been updated.
  resolve_timeout: 5m

  # Outbound mail settings inherited by every email receiver below.
  # NOTE(review): {{DOMAIN}} and {{SMTP_PASSWORD}} look like deploy-time
  # placeholders - confirm they are substituted before Alertmanager loads
  # this file, and that the real password is never committed.
  # NOTE(review): inside a container `localhost` is the container itself -
  # confirm an SMTP relay is actually reachable at this address.
  smtp_smarthost: 'localhost:587'
  smtp_require_tls: true
  smtp_from: 'alerts@{{DOMAIN}}'
  smtp_auth_username: 'alerts@{{DOMAIN}}'
  smtp_auth_password: '{{SMTP_PASSWORD}}'
|
||||
|
||||
# Notification template files to load (glob pattern).
templates:
- '/etc/alertmanager/templates/*.tmpl'
|
||||
|
||||
# Routing tree: every alert enters at the root route and ends up at a receiver.
route:
  # Receiver used when none of the child routes below matches.
  receiver: 'web.hook'
  # Alerts sharing these label values are batched into one notification.
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h

  # Child routes are tried top to bottom. None of them sets `continue`, so
  # the first match wins: the severity routes take precedence over the
  # per-service routes that follow.
  routes:
  # severity=critical: shortest wait, most frequent reminders
  - receiver: 'critical-alerts'
    match:
      severity: critical
    group_wait: 5s
    repeat_interval: 5m

  # severity=warning: batched for longer, reminded less often
  - receiver: 'warning-alerts'
    match:
      severity: warning
    group_wait: 30s
    repeat_interval: 30m

  # Bot services
  - receiver: 'bot-alerts'
    match:
      service: telegram-bot
    group_wait: 10s
    repeat_interval: 15m

  - receiver: 'bot-alerts'
    match:
      service: anon-bot
    group_wait: 10s
    repeat_interval: 15m

  # Monitoring / proxy infrastructure services
  - receiver: 'infrastructure-alerts'
    match:
      service: prometheus
    group_wait: 30s
    repeat_interval: 1h

  - receiver: 'infrastructure-alerts'
    match:
      service: grafana
    group_wait: 30s
    repeat_interval: 1h

  - receiver: 'infrastructure-alerts'
    match:
      service: nginx
    group_wait: 30s
    repeat_interval: 1h
|
||||
|
||||
# Inhibition: mute a target alert while a matching source alert is firing.
inhibit_rules:
# A firing critical alert mutes the warning that has the same alertname,
# cluster and service.
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'cluster', 'service']

# A firing ServiceDown mutes InstanceDown alerts with the same service label.
- source_match:
    alertname: 'ServiceDown'
  target_match:
    alertname: 'InstanceDown'
  equal: ['service']
|
||||
|
||||
# Receiver configurations
|
||||
receivers:
|
||||
# Catch-all receiver referenced by the root route (testing webhook).
# NOTE(review): inside a container `localhost` is the container itself -
# confirm something is listening on port 5001 there.
- name: 'web.hook'
  webhook_configs:
  - send_resolved: true
    url: 'http://localhost:5001/'
|
||||
|
||||
# Critical alerts: e-mail (plain-text and HTML parts) plus a webhook.
- name: 'critical-alerts'
  email_configs:
  - to: 'admin@{{DOMAIN}}'
    # email_config has no `subject` or `body` key, and unknown keys make
    # Alertmanager refuse to load the file. The subject is set through
    # `headers`, the plain-text part through `text`.
    headers:
      Subject: '🚨 CRITICAL ALERT: {{ .GroupLabels.alertname }}'
    text: |
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      Severity: {{ .Labels.severity }}
      Service: {{ .Labels.service }}
      Instance: {{ .Labels.instance }}
      Time: {{ .StartsAt }}
      {{ end }}
    # `time` is not one of the route's group_by labels, so
    # `.GroupLabels.time` always rendered empty; the Time row now shows the
    # start time of the first alert in the group.
    html: |
      <h2>🚨 Critical Alert</h2>
      <table>
        <tr><td><strong>Alert:</strong></td><td>{{ .GroupLabels.alertname }}</td></tr>
        <tr><td><strong>Service:</strong></td><td>{{ .GroupLabels.service }}</td></tr>
        <tr><td><strong>Time:</strong></td><td>{{ (index .Alerts 0).StartsAt }}</td></tr>
      </table>
      <h3>Alerts:</h3>
      <ul>
      {{ range .Alerts }}
        <li><strong>{{ .Annotations.summary }}</strong><br/>
        {{ .Annotations.description }}<br/>
        <small>Instance: {{ .Labels.instance }} | Time: {{ .StartsAt }}</small>
        </li>
      {{ end }}
      </ul>
  webhook_configs:
  - url: 'http://localhost:5001/critical'
    send_resolved: true
|
||||
|
||||
# Warning alerts: e-mail plus a webhook.
- name: 'warning-alerts'
  email_configs:
  - to: 'admin@{{DOMAIN}}'
    # email_config has no `subject` or `body` key, and unknown keys make
    # Alertmanager refuse to load the file. The subject is set through
    # `headers`, the plain-text part through `text`.
    headers:
      Subject: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
    text: |
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      Severity: {{ .Labels.severity }}
      Service: {{ .Labels.service }}
      Instance: {{ .Labels.instance }}
      Time: {{ .StartsAt }}
      {{ end }}
  webhook_configs:
  - url: 'http://localhost:5001/warning'
    send_resolved: true
|
||||
|
||||
# Bot alerts (telegram-bot / anon-bot routes): e-mail plus a webhook.
- name: 'bot-alerts'
  email_configs:
  - to: 'bot-admin@{{DOMAIN}}'
    # email_config has no `subject` or `body` key, and unknown keys make
    # Alertmanager refuse to load the file. The subject is set through
    # `headers`, the plain-text part through `text`.
    headers:
      Subject: '🤖 Bot Alert: {{ .GroupLabels.alertname }}'
    text: |
      Bot Alert: {{ .GroupLabels.alertname }}
      Service: {{ .GroupLabels.service }}

      {{ range .Alerts }}
      - {{ .Annotations.summary }}
        {{ .Annotations.description }}
        Instance: {{ .Labels.instance }}
        Time: {{ .StartsAt }}
      {{ end }}
  webhook_configs:
  - url: 'http://localhost:5001/bot'
    send_resolved: true
|
||||
|
||||
# Infrastructure alerts (prometheus / grafana / nginx routes): e-mail plus a
# webhook.
- name: 'infrastructure-alerts'
  email_configs:
  - to: 'infra@{{DOMAIN}}'
    # email_config has no `subject` or `body` key, and unknown keys make
    # Alertmanager refuse to load the file. The subject is set through
    # `headers`, the plain-text part through `text`.
    headers:
      Subject: '🏗️ Infrastructure Alert: {{ .GroupLabels.alertname }}'
    text: |
      Infrastructure Alert: {{ .GroupLabels.alertname }}
      Service: {{ .GroupLabels.service }}

      {{ range .Alerts }}
      - {{ .Annotations.summary }}
        {{ .Annotations.description }}
        Instance: {{ .Labels.instance }}
        Time: {{ .StartsAt }}
      {{ end }}
  webhook_configs:
  - url: 'http://localhost:5001/infrastructure'
    send_resolved: true
|
||||
Reference in New Issue
Block a user