# telegram-helper-bot/helper_bot/scripts/monitor_bot.sh

#!/bin/bash

# Script for monitoring and auto-restarting the Telegram bot
# Usage: ./monitor_bot.sh

# Abort on any unhandled command failure. Commands used as `if` conditions
# are exempt, so a failing probe inside an `if` does not end the script.
set -e

# Configuration
# Every setting can be overridden from the environment, for example:
#   CHECK_INTERVAL=15 MAX_FAILURES=5 ./monitor_bot.sh
# The value after ":-" is the default used when the variable is unset or empty.
BOT_CONTAINER="${BOT_CONTAINER:-telegram-helper-bot}"              # container name to watch
HEALTH_ENDPOINT="${HEALTH_ENDPOINT:-http://localhost:8080/health}" # URL probed with curl
CHECK_INTERVAL="${CHECK_INTERVAL:-60}"  # seconds
MAX_FAILURES="${MAX_FAILURES:-3}"       # failed health checks before a restart
LOG_FILE="${LOG_FILE:-logs/bot_monitor.log}"  # relative paths resolve against the current directory

# Colors for output
# ANSI-C quoting ($'...') stores the real ESC byte in each variable, so the
# sequences take effect even when printed by a plain `echo`, which does not
# interpret a literal "\033" on its own.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
NC=$'\033[0m' # No Color

# Logging function
# Prints "<timestamp> - <message>" to stdout and appends the same line, with
# colour sequences removed, to $LOG_FILE (its directory is created on demand).
# Arguments: $1 - message; may embed colour codes either as literal "\033[...m"
#            text or as real escape characters.
# Returns:   always 0, so a logging problem never stops the script under
#            `set -e`; a failed file write is reported on stderr instead.
log() {
    local message=${1-}
    local esc=$'\033'
    local literal_esc='\033'
    local stamp coloured plain log_dir

    stamp=$(date '+%Y-%m-%d %H:%M:%S')

    # Backslash escapes are not interpreted when printing, so convert the
    # literal "\033" text into a real ESC character for the terminal.
    coloured=${message//"$literal_esc"/$esc}
    printf '%s - %s\n' "$stamp" "$coloured"

    # Keep the log file free of terminal control sequences.
    plain=$(printf '%s' "$coloured" | sed "s/${esc}\[[0-9;]*m//g") || plain=$coloured

    log_dir=$(dirname -- "$LOG_FILE")
    if ! { mkdir -p -- "$log_dir" && printf '%s - %s\n' "$stamp" "$plain" >>"$LOG_FILE"; } 2>/dev/null; then
        printf '%s - WARNING: cannot write to log file %s\n' "$stamp" "$LOG_FILE" >&2
    fi

    return 0
}

# Check if container is running
# Lists the names of running containers with `docker ps` and looks for an
# exact, literal match of $BOT_CONTAINER. The name is not treated as a regular
# expression, so a name containing "." cannot match a different container.
# Returns: 0 if the container is listed; 1 otherwise, including when
#          BOT_CONTAINER is empty or `docker ps` itself fails.
check_container_running() {
    local running_names

    [[ -n "$BOT_CONTAINER" ]] || return 1

    # Capture first so a docker failure is seen here rather than hidden
    # behind the exit status of grep.
    running_names=$(docker ps --format '{{.Names}}') || return 1

    # -x: whole-line match, -F: fixed string (no regex metacharacters).
    if grep -qxF -- "$BOT_CONTAINER" <<<"$running_names"; then
        return 0
    fi
    return 1
}

# Check health endpoint
# Probes $HEALTH_ENDPOINT with curl, discarding all output.
# Returns: 0 when curl succeeds, 1 on any curl failure.
check_health() {
    # -f: treat HTTP error responses as failures; the two timeouts bound
    # how long a hung endpoint can stall the caller.
    local -a probe_opts=(-f --connect-timeout 5 --max-time 10)

    curl "${probe_opts[@]}" "$HEALTH_ENDPOINT" >/dev/null 2>&1 || return 1
    return 0
}

# Restart container
# Runs `docker restart` on $BOT_CONTAINER, waits 30 seconds, then polls
# check_health up to 10 times with 10 seconds after each failed attempt.
# Returns: 0 once a health check passes; 1 if the restart command fails or
#          no health check passes within the 10 attempts.
restart_container() {
    log "${YELLOW}Restarting container ${BOT_CONTAINER}...${NC}"

    # Bail out early when docker itself refuses the restart.
    if ! docker restart "$BOT_CONTAINER" >/dev/null 2>&1; then
        log "${RED}Failed to restart container${NC}"
        return 1
    fi

    log "${GREEN}Container restarted successfully${NC}"

    # Give the container time to start before the first probe.
    log "Waiting for container to be ready..."
    sleep 30

    local probe
    for ((probe = 0; probe < 10; probe++)); do
        if check_health; then
            log "${GREEN}Container is healthy after restart${NC}"
            return 0
        fi
        sleep 10
    done

    log "${RED}Container failed to become healthy after restart${NC}"
    return 1
}

# Main monitoring loop
# Logs the active settings, then loops forever. Each pass:
#   - container not running      -> restart it straight away;
#   - running and healthy        -> clear the failure counter;
#   - running but health failed  -> count the failure and restart once the
#                                   counter reaches $MAX_FAILURES.
# The loop sleeps $CHECK_INTERVAL seconds between passes and never returns.
main() {
    log "${GREEN}Starting bot monitoring...${NC}"
    log "Container: $BOT_CONTAINER"
    log "Health endpoint: $HEALTH_ENDPOINT"
    log "Check interval: ${CHECK_INTERVAL}s"
    log "Max failures: $MAX_FAILURES"

    local consecutive_failures=0

    while :; do
        if ! check_container_running; then
            log "${RED}Container $BOT_CONTAINER is not running!${NC}"
            if restart_container; then
                consecutive_failures=0
            else
                consecutive_failures=$((consecutive_failures + 1))
            fi
        elif check_health; then
            # Only announce a recovery when there was something to recover from.
            if ((consecutive_failures > 0)); then
                log "${GREEN}Container recovered, resetting failure count${NC}"
                consecutive_failures=0
            fi
            log "${GREEN}Container is healthy${NC}"
        else
            consecutive_failures=$((consecutive_failures + 1))
            log "${YELLOW}Health check failed (${consecutive_failures}/${MAX_FAILURES})${NC}"

            if [ "$consecutive_failures" -ge "$MAX_FAILURES" ]; then
                log "${RED}Max failures reached, restarting container${NC}"
                if restart_container; then
                    consecutive_failures=0
                else
                    # Counter is left as-is, so the next failed check
                    # triggers another restart attempt.
                    log "${RED}Failed to restart container after max failures${NC}"
                fi
            fi
        fi

        sleep "$CHECK_INTERVAL"
    done
}

# Handle script interruption
# On SIGINT or SIGTERM, record the shutdown in the log and exit with status 0.
stop_monitoring() {
    log "Monitoring stopped by user"
    exit 0
}
trap stop_monitoring INT TERM

# Run main function
# Start the monitoring loop, passing through any command-line arguments.
main "$@"