Add Healthchecks service details to architecture and backup documentation, including its role as a dead man's switch for backups. Update backup scripts to utilize systemd timers instead of cron for improved scheduling. Enhance network topology documentation to reflect Healthchecks integration in the VPS Miran setup. This update clarifies backup processes and enhances overall system reliability.
59 lines
2.0 KiB
Bash
Executable File
59 lines
2.0 KiB
Bash
Executable File
#!/bin/bash
# Backup watchdog.
#
# Intended to run once a day (e.g. at 12:00) from backup-watchdog-timers.timer.
# Performs two independent checks and reports every problem it finds:
#   1. systemd timers listed by `systemctl list-timers --failed`;
#   2. per-backup marker files "$BACKUP_OK_DIR/backup-<name>.ok": a marker
#      that is missing or at least MAX_AGE_HOURS old raises an alert.
# Each problem is sent through NOTIFY_SCRIPT (arguments: title, message) and
# printed to stdout.
#
# Environment overrides (defaults in parentheses):
#   NOTIFY_SCRIPT  (/root/scripts/notify-telegram.sh)
#   MAX_AGE_HOURS  (24)
#   BACKUP_OK_DIR  (/var/run)
#
# Exit status: 0 when both checks pass, 1 when any check failed, the script
# was not started as root, or MAX_AGE_HOURS is not a non-negative integer.

set -u

NOTIFY_SCRIPT="${NOTIFY_SCRIPT:-/root/scripts/notify-telegram.sh}"
MAX_AGE_HOURS="${MAX_AGE_HOURS:-24}"
BACKUP_OK_DIR="${BACKUP_OK_DIR:-/var/run}"

# Backups whose marker file "backup-<name>.ok" is expected to be fresh.
BACKUP_NAMES=(
  vps-miran ct101-pgdump immich-photos vps-mtproto etc-pve ct104-pgdump
  vaultwarden-data ct103-gitea-pgdump vm200-pgdump ct105-vectors
  restic-yandex restic-yandex-photos
)

# Send an alert through NOTIFY_SCRIPT.
# Arguments: $1 - title, $2 - message body.
# Best-effort: a delivery problem is logged to stderr but never changes the
# watchdog's own result, so this always returns 0.
notify() {
  local title=$1
  local body=$2
  if [ ! -x "$NOTIFY_SCRIPT" ]; then
    echo "[watchdog] Скрипт уведомлений недоступен: $NOTIFY_SCRIPT" >&2
    return 0
  fi
  "$NOTIFY_SCRIPT" "$title" "$body" \
    || echo "[watchdog] Не удалось отправить уведомление: $title" >&2
}

if [ "$(id -u)" -ne 0 ]; then
  echo "Запускайте под root." >&2
  exit 1
fi

# A non-numeric override would make the age comparison below error out.
if ! [[ "$MAX_AGE_HOURS" =~ ^[0-9]+$ ]]; then
  echo "[watchdog] MAX_AGE_HOURS должно быть целым числом: $MAX_AGE_HOURS" >&2
  exit 1
fi

# Overall result; set to 1 by any failing check so that both checks always run.
status=0

# 1. Failed systemd timers.
# systemctl errors are discarded and an empty result counts as "no failures";
# `|| true` absorbs grep's non-zero exit when there is nothing to print.
# NOTE(review): a timer unit usually stays active even when the service it
# triggers fails, so this may miss failed backup services — confirm whether
# failed services (systemctl list-units --type=service --state=failed)
# should be checked as well.
FAILED=$(systemctl list-timers --failed --no-legend --no-pager 2>/dev/null | grep -v '^$' || true)
if [ -n "$FAILED" ]; then
  MSG="Провалившиеся таймеры:
$FAILED"
  notify "⚠️ Systemd timers" "$MSG"
  echo "[watchdog] Найдены провалившиеся таймеры"
  echo "$FAILED"
  status=1
fi

# 2. Marker files: alert when a marker is missing or too old.
STALE=""
now=$(date +%s)
for name in "${BACKUP_NAMES[@]}"; do
  OK_FILE="$BACKUP_OK_DIR/backup-$name.ok"
  # A marker that disappears between the -f test and stat is reported as
  # missing rather than as a file with a bogus multi-decade age.
  if [ -f "$OK_FILE" ] && mtime=$(stat -c %Y "$OK_FILE" 2>/dev/null); then
    AGE_HOURS=$(( (now - mtime) / 3600 ))
    if [ "$AGE_HOURS" -ge "$MAX_AGE_HOURS" ]; then
      STALE+="backup-$name.ok (${AGE_HOURS}h)"$'\n'
    fi
  else
    STALE+="backup-$name.ok (отсутствует)"$'\n'
  fi
done

if [ -n "$STALE" ]; then
  MSG="Файлы .ok старше ${MAX_AGE_HOURS} ч или отсутствуют (последний успешный бэкап):
$STALE"
  notify "⚠️ Backup watchdog" "$MSG"
  echo "[watchdog] Устаревшие healthcheck-файлы"
  echo "$STALE"
  status=1
fi

if [ "$status" -eq 0 ]; then
  echo "[watchdog] OK: таймеры и healthcheck-файлы в порядке"
fi
exit "$status"