feat: integrate Uptime Kuma and Alertmanager into Docker setup

- Add Uptime Kuma service for status monitoring with health checks.
- Introduce Alertmanager service for alert management and notifications.
- Update docker-compose.yml to include new services and their configurations.
- Enhance Makefile with commands for managing Uptime Kuma and Alertmanager logs.
- Modify Ansible playbook to install necessary packages and configure SSL for new services.
- Update Nginx configuration to route traffic to Uptime Kuma and Alertmanager.
- Adjust Prometheus configuration to include alert rules and external URLs.
This commit is contained in:
2025-09-16 21:50:56 +03:00
parent 5e10204137
commit 9ec3f02767
20 changed files with 2173 additions and 38 deletions

View File

@@ -0,0 +1,185 @@
# Alertmanager Configuration
# This file configures how alerts are handled and routed
global:
  # Time after which an alert that carries no end time is considered resolved
  # if it has not been updated.
  resolve_timeout: 5m

  # Default SMTP settings inherited by every email receiver.
  # NOTE(review): `{{DOMAIN}}` and `{{SMTP_PASSWORD}}` look like deploy-time
  # placeholders; confirm they are substituted before Alertmanager reads this
  # file.
  # NOTE(review): `localhost` is resolved from wherever Alertmanager itself
  # runs; confirm an SMTP relay is reachable there on port 587.
  smtp_smarthost: 'localhost:587'
  smtp_require_tls: true
  smtp_from: 'alerts@{{DOMAIN}}'
  smtp_auth_username: 'alerts@{{DOMAIN}}'
  smtp_auth_password: '{{SMTP_PASSWORD}}'
# Glob of custom notification template files made available to receivers.
templates:
  - '/etc/alertmanager/templates/*.tmpl'
# Routing tree. Every alert enters at the root route and is handed to the
# first child route that matches it (no child sets `continue`); alerts that
# match no child are delivered to the root receiver.
route:
  receiver: 'web.hook'
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  routes:
    # Severity routes are evaluated first, so a critical or warning alert is
    # claimed here and never reaches the service routes further down.
    - receiver: 'critical-alerts'
      match:
        severity: critical
      group_wait: 5s
      repeat_interval: 5m
    - receiver: 'warning-alerts'
      match:
        severity: warning
      group_wait: 30s
      repeat_interval: 30m
    # Bot services. `match_re` patterns are fully anchored, so this matches
    # exactly `telegram-bot` or `anon-bot`.
    - receiver: 'bot-alerts'
      match_re:
        service: 'telegram-bot|anon-bot'
      group_wait: 10s
      repeat_interval: 15m
    # Infrastructure services: exactly `prometheus`, `grafana` or `nginx`.
    - receiver: 'infrastructure-alerts'
      match_re:
        service: 'prometheus|grafana|nginx'
      group_wait: 30s
      repeat_interval: 1h
# Inhibition: while a "source" alert is firing, "target" alerts are muted,
# provided source and target share the same value for every label listed
# under `equal`.
inhibit_rules:
  # A critical alert mutes the warning alert with the same name, cluster and
  # service.
  - equal:
      - alertname
      - cluster
      - service
    source_match:
      severity: critical
    target_match:
      severity: warning
  # A ServiceDown alert mutes the InstanceDown alerts of the same service.
  - equal:
      - service
    source_match:
      alertname: ServiceDown
    target_match:
      alertname: InstanceDown
# Receivers: the notification targets that routes refer to by name.
#
# Email entries follow Alertmanager's `email_config` schema: the subject is
# set through `headers.Subject` and the plain-text body through `text`.
# Alertmanager rejects unknown keys when loading its configuration, so the
# keys `subject:` and `body:` must not be used here.
#
# NOTE(review): receivers that set only `text` should still get
# Alertmanager's default HTML part; add `html: ''` to those entries if
# text-only mail is wanted (confirm against the deployed version).
# NOTE(review): the webhook URLs point at `localhost:5001`, which is resolved
# from wherever Alertmanager itself runs; confirm a listener exists there.
receivers:
  # Fallback receiver used by the root route (testing webhook).
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:5001/'
        send_resolved: true

  # Critical alerts: email (text + HTML) and webhook.
  - name: 'critical-alerts'
    email_configs:
      - to: 'admin@{{DOMAIN}}'
        headers:
          Subject: '🚨 CRITICAL ALERT: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Service: {{ .Labels.service }}
          Instance: {{ .Labels.instance }}
          Time: {{ .StartsAt }}
          {{ end }}
        # The summary table shows the start time of the first alert in the
        # group; `time` is not one of the route's `group_by` labels, so
        # `.GroupLabels.time` would always render empty.
        html: |
          <h2>🚨 Critical Alert</h2>
          <table>
          <tr><td><strong>Alert:</strong></td><td>{{ .GroupLabels.alertname }}</td></tr>
          <tr><td><strong>Service:</strong></td><td>{{ .GroupLabels.service }}</td></tr>
          <tr><td><strong>Time:</strong></td><td>{{ (index .Alerts 0).StartsAt }}</td></tr>
          </table>
          <h3>Alerts:</h3>
          <ul>
          {{ range .Alerts }}
          <li><strong>{{ .Annotations.summary }}</strong><br/>
          {{ .Annotations.description }}<br/>
          <small>Instance: {{ .Labels.instance }} | Time: {{ .StartsAt }}</small>
          </li>
          {{ end }}
          </ul>
    webhook_configs:
      - url: 'http://localhost:5001/critical'
        send_resolved: true

  # Warning alerts: email and webhook.
  - name: 'warning-alerts'
    email_configs:
      - to: 'admin@{{DOMAIN}}'
        headers:
          Subject: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Service: {{ .Labels.service }}
          Instance: {{ .Labels.instance }}
          Time: {{ .StartsAt }}
          {{ end }}
    webhook_configs:
      - url: 'http://localhost:5001/warning'
        send_resolved: true

  # Bot service alerts: email to the bot admin and webhook.
  - name: 'bot-alerts'
    email_configs:
      - to: 'bot-admin@{{DOMAIN}}'
        headers:
          Subject: '🤖 Bot Alert: {{ .GroupLabels.alertname }}'
        text: |
          Bot Alert: {{ .GroupLabels.alertname }}
          Service: {{ .GroupLabels.service }}
          {{ range .Alerts }}
          - {{ .Annotations.summary }}
          {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          Time: {{ .StartsAt }}
          {{ end }}
    webhook_configs:
      - url: 'http://localhost:5001/bot'
        send_resolved: true

  # Infrastructure alerts: email to the infra mailbox and webhook.
  - name: 'infrastructure-alerts'
    email_configs:
      - to: 'infra@{{DOMAIN}}'
        headers:
          Subject: '🏗️ Infrastructure Alert: {{ .GroupLabels.alertname }}'
        text: |
          Infrastructure Alert: {{ .GroupLabels.alertname }}
          Service: {{ .GroupLabels.service }}
          {{ range .Alerts }}
          - {{ .Annotations.summary }}
          {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          Time: {{ .StartsAt }}
          {{ end }}
    webhook_configs:
      - url: 'http://localhost:5001/infrastructure'
        send_resolved: true