chore: enhance deployment workflow with improved health checks and manual trigger

- Updated the deployment job to allow manual triggering via workflow_dispatch. - Implemented a retry mechanism for health checks on Prometheus and Grafana to improve reliability. - Increased wait time for services to start before health checks are performed. - Modified health check messages for better clarity and added logging for failed checks.
2026-01-25 16:58:16 +03:00
parent fde1f14708
commit 0cdc40cd21
1 changed files with 45 additions and 15 deletions
--- a/.github/workflows/pipeline.yml
+++ b/.github/workflows/pipeline.yml
@@ -67,7 +67,7 @@ jobs:
    runs-on: ubuntu-latest
    name: Deploy
    needs: test
-    if: success() && github.ref == 'refs/heads/main'
+    if: github.event_name == 'workflow_dispatch'  # Только ручной запуск через кнопку
    environment:
      name: production
    
@@ -134,22 +134,52 @@ jobs:
          port: ${{ vars.SSH_PORT || secrets.SSH_PORT || 22 }}
          script: |
            echo "🏥 Running health checks..."
-            sleep 15  # Даем время сервисам запуститься
            
-            # Проверяем Prometheus
-            if curl -f http://localhost:9090/-/healthy > /dev/null 2>&1; then
-              echo "✅ Prometheus is healthy"
+            # Проверяем статус контейнеров сначала
+            echo "📊 Checking container status..."
+            cd /home/prod
+            docker-compose ps || docker ps --filter "name=bots_"
+            
+            # Ждем запуска сервисов (увеличено время)
+            echo "⏳ Waiting for services to start (30 seconds)..."
+            sleep 30
+            
+            # Функция для проверки с повторными попытками
+            check_health() {
+              local service=$1
+              local url=$2
+              local max_attempts=5
+              local attempt=1
+              
+              echo "🔍 Checking $service health..."
+              
+              while [ $attempt -le $max_attempts ]; do
+                if curl -f -s --max-time 5 "$url" > /dev/null 2>&1; then
+                  echo "✅ $service is healthy (attempt $attempt/$max_attempts)"
+                  return 0
                else
-              echo "❌ Prometheus health check failed"
-              exit 1
+                  echo "⏳ $service not ready yet (attempt $attempt/$max_attempts), waiting 10 seconds..."
+                  sleep 10
+                  attempt=$((attempt + 1))
+                fi
+              done
+              
+              echo "❌ $service health check failed after $max_attempts attempts"
+              return 1
+            }
+            
+            # Проверяем Prometheus с повторными попытками
+            if ! check_health "Prometheus" "http://localhost:9090/-/healthy"; then
+              echo "⚠️  Prometheus health check failed, but continuing..."
+              echo "📊 Checking Prometheus logs:"
+              docker-compose logs --tail=20 prometheus || true
            fi
            
-            # Проверяем Grafana
-            if curl -f http://localhost:3000/api/health > /dev/null 2>&1; then
-              echo "✅ Grafana is healthy"
-            else
-              echo "❌ Grafana health check failed"
-              exit 1
+            # Проверяем Grafana с повторными попытками
+            if ! check_health "Grafana" "http://localhost:3000/api/health"; then
+              echo "⚠️  Grafana health check failed, but continuing..."
+              echo "📊 Checking Grafana logs:"
+              docker-compose logs --tail=20 grafana || true
            fi
            
            # Проверяем статус контейнеров
@@ -157,7 +187,7 @@ jobs:
            cd /home/prod
            docker-compose ps || docker ps --filter "name=bots_"
            
-            echo "✅ All health checks passed"
+            echo "✅ Health checks completed"
      
      - name: Send deployment notification
        if: always()