Update architecture and backup documentation to include Healthchecks integration

Add Healthchecks service details to architecture and backup documentation, including its role as a dead man's switch for backups. Update backup scripts to utilize systemd timers instead of cron for improved scheduling. Enhance network topology documentation to reflect Healthchecks integration in the VPS Miran setup. This update clarifies backup processes and enhances overall system reliability.
This commit is contained in:
2026-02-28 15:43:39 +03:00
parent 16c254510a
commit 53769e6832
61 changed files with 1697 additions and 39 deletions

58
scripts/watchdog-timers.sh Executable file
View File

@@ -0,0 +1,58 @@
#!/bin/bash
# Watchdog for systemd timers and backup healthcheck marker files.
#
# Meant to be run once a day (for example at 12:00) by
# backup-watchdog-timers.timer. Two independent checks are performed and BOTH
# always run, so that one problem can never hide the other:
#   1. systemd: timer units in the "failed" state, plus units activated by a
#      timer that are in the "failed" state (a failed backup job leaves its
#      service failed while the timer unit itself stays healthy);
#   2. marker files "$BACKUP_OK_DIR/backup-<name>.ok" that are missing or
#      whose mtime is MAX_AGE_HOURS or more in the past.
# Every failing check calls NOTIFY_SCRIPT "<title>" "<message>" (best effort)
# and prints the details to stdout.
#
# Environment overrides (defaults equal the previously hard-coded values):
#   NOTIFY_SCRIPT   notification script (default /root/scripts/notify-telegram.sh)
#   MAX_AGE_HOURS   maximum marker age in whole hours (default 24)
#   BACKUP_OK_DIR   directory holding the marker files (default /var/run)
#   BACKUP_NAMES    whitespace-separated list of backup names
#
# Exit status: 0 when everything is fine, 1 when any check failed, when the
# script is not run as root, or when MAX_AGE_HOURS is not a number.
#
# NOTE(review): /var/run is normally a tmpfs that is emptied on reboot, so
# every marker will be reported as missing until the backups have run again
# after a restart - confirm this is the intended behaviour.

set -u

NOTIFY_SCRIPT="${NOTIFY_SCRIPT:-/root/scripts/notify-telegram.sh}"
MAX_AGE_HOURS="${MAX_AGE_HOURS:-24}"
BACKUP_OK_DIR="${BACKUP_OK_DIR:-/var/run}"
BACKUP_NAMES="${BACKUP_NAMES:-vps-miran ct101-pgdump immich-photos vps-mtproto etc-pve ct104-pgdump vaultwarden-data ct103-gitea-pgdump vm200-pgdump ct105-vectors restic-yandex restic-yandex-photos}"

# Send a notification. Arguments: $1 - title, $2 - message body.
# Never fails the watchdog: an unusable NOTIFY_SCRIPT or a failed send is only
# reported on stderr, so the alert is still visible in the service log.
notify() {
  local title=$1 body=$2
  if [[ ! -x "$NOTIFY_SCRIPT" ]]; then
    printf '[watchdog] NOTIFY_SCRIPT недоступен: %s\n' "$NOTIFY_SCRIPT" >&2
    return 0
  fi
  "$NOTIFY_SCRIPT" "$title" "$body" \
    || printf '[watchdog] Не удалось отправить уведомление\n' >&2
  return 0
}

# Check 1: failed timer units and failed units activated by timers.
# Returns 0 when nothing is failed, 1 after reporting failures.
check_failed_timers() {
  local failed_timers failed_services="" unit report=""

  # systemctl errors are tolerated here exactly as before (treated as
  # "nothing failed"); grep exits 1 on empty input, hence "|| true".
  failed_timers=$(systemctl list-timers --failed --no-legend --no-pager 2>/dev/null \
    | grep -v '^$' || true)

  # "list-timers --failed" only looks at the state of the timer unit itself.
  # The last column of list-timers is the unit the timer activates; ask
  # systemd whether that unit is in the failed state.
  while IFS= read -r unit; do
    [[ -n "$unit" ]] || continue
    if systemctl is-failed --quiet "$unit" 2>/dev/null; then
      failed_services+="$unit"$'\n'
    fi
  done < <(systemctl list-timers --all --no-legend --no-pager 2>/dev/null \
    | awk 'NF { print $NF }')

  if [[ -n "$failed_timers" ]]; then
    report="Провалившиеся таймеры:
$failed_timers"
  fi
  if [[ -n "$failed_services" ]]; then
    if [[ -n "$report" ]]; then
      report+=$'\n'
    fi
    report+="Провалившиеся сервисы, запускаемые таймерами:
${failed_services%$'\n'}"
  fi

  [[ -n "$report" ]] || return 0

  notify "⚠️ Systemd timers" "$report"
  echo "[watchdog] Найдены провалившиеся таймеры"
  printf '%s\n' "$report"
  return 1
}

# Check 2: marker files that are missing, unreadable, or too old.
# Returns 0 when every marker is fresh, 1 after reporting stale ones.
check_stale_markers() {
  local -a names=()
  local name marker mtime now age_hours stale="" report

  # Split into an array instead of an unquoted expansion so that a name
  # containing glob characters is never expanded against the filesystem.
  read -r -a names <<<"$BACKUP_NAMES"
  now=$(date +%s)

  for name in "${names[@]}"; do
    marker="$BACKUP_OK_DIR/backup-$name.ok"
    if [[ ! -f "$marker" ]]; then
      stale+="backup-$name.ok (отсутствует)"$'\n'
      continue
    fi
    # Report an unreadable mtime explicitly instead of computing an age
    # relative to the epoch.
    if ! mtime=$(stat -c %Y -- "$marker" 2>/dev/null); then
      stale+="backup-$name.ok (не удалось прочитать время изменения)"$'\n'
      continue
    fi
    age_hours=$(( (now - mtime) / 3600 ))
    if [ "$age_hours" -ge "$MAX_AGE_HOURS" ]; then
      stale+="backup-$name.ok (${age_hours}h)"$'\n'
    fi
  done

  [[ -n "$stale" ]] || return 0

  report="Файлы .ok старше ${MAX_AGE_HOURS} ч или отсутствуют (последний успешный бэкап):
$stale"
  notify "⚠️ Backup watchdog" "$report"
  echo "[watchdog] Устаревшие healthcheck-файлы"
  printf '%s\n' "$stale"
  return 1
}

# Entry point: validate the environment, run both checks, exit 0 or 1.
main() {
  local rc=0

  if [ "$(id -u)" -ne 0 ]; then
    echo "Запускайте под root." >&2
    exit 1
  fi
  if [[ ! "$MAX_AGE_HOURS" =~ ^[0-9]+$ ]]; then
    printf 'MAX_AGE_HOURS должно быть неотрицательным целым числом: %s\n' \
      "$MAX_AGE_HOURS" >&2
    exit 1
  fi

  # Run both checks unconditionally; remember whether either one failed.
  check_failed_timers || rc=1
  check_stale_markers || rc=1

  if [ "$rc" -eq 0 ]; then
    echo "[watchdog] OK: таймеры и healthcheck-файлы в порядке"
  fi
  exit "$rc"
}

main "$@"