Skip to content

Commit 12f5bad

Browse files
committed
Added enhanced log collection script and workflow steps for improved debugging of test failures
Signed-off-by: Helber Belmiro <[email protected]>
1 parent 8b394bf commit 12f5bad

File tree

2 files changed: +207 additions, −0 deletions
.github/resources/scripts/collect-enhanced-logs.sh

Lines changed: 127 additions & 0 deletions
Original file line number	Diff line number	Diff line change
@@ -0,0 +1,127 @@
1+
#!/usr/bin/env bash
#
# Collect enhanced diagnostic logs from a Kubernetes namespace to aid
# debugging of e2e test failures.
#
# Usage:
#   collect-enhanced-logs.sh --ns NAMESPACE [--output FILE]
#                            [--test-context CTX] [--start-time ISO8601]

# Fail fast: abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail

NS=""
OUTPUT_FILE="/tmp/enhanced_pod_logs.txt"
TEST_CONTEXT=""
START_TIME=""

# Parse command-line flags; every flag takes a value, and ${2:?...}
# aborts with a message if the value is missing.
while [[ "$#" -gt 0 ]]; do
  case $1 in
    --ns) NS="${2:?--ns requires a value}"; shift ;;
    --output) OUTPUT_FILE="${2:?--output requires a value}"; shift ;;
    --test-context) TEST_CONTEXT="${2:?--test-context requires a value}"; shift ;;
    --start-time) START_TIME="${2:?--start-time requires a value}"; shift ;;
    *) echo "Unknown parameter passed: $1" >&2; exit 1 ;;
  esac
  shift
done

# Ensure the directory that will hold the report exists.
# (Previously this created an unused directory named /tmp/enhanced.log.)
mkdir -p "$(dirname "$OUTPUT_FILE")"

if [[ -z "$NS" ]]; then
  echo "Namespace (--ns) parameter is required." >&2
  exit 1
fi
27+
28+
# Verify that the given namespace exists in the cluster; print an error
# and terminate the script if it does not.
function check_namespace {
  local ns=$1
  if kubectl get namespace "$ns" &>/dev/null; then
    return 0
  fi
  echo "Namespace '$ns' does not exist."
  exit 1
}
34+
35+
#######################################
# Write a comprehensive diagnostic report for a namespace to $OUTPUT_FILE:
# pod overview, Argo workflows, events, and per-pod metadata,
# descriptions, and logs.
# Globals (read): OUTPUT_FILE, TEST_CONTEXT, START_TIME
# Arguments:      $1 - namespace to inspect
# Outputs:        report in $OUTPUT_FILE; completion message on stdout
#######################################
function collect_comprehensive_logs {
  local NAMESPACE=$1

  # Report header (first write truncates the output file).
  {
    echo "===== ENHANCED LOG COLLECTION REPORT ====="
    echo "Collection Time: $(date)"
    echo "Test Context: ${TEST_CONTEXT:-Not specified}"
    echo "Test Start Time: ${START_TIME:-Not specified}"
    echo "Namespace: ${NAMESPACE}"
    echo ""
  } > "$OUTPUT_FILE"

  # 1. All pods with labels and placement details.
  echo "===== POD OVERVIEW WITH LABELS =====" >> "$OUTPUT_FILE"
  kubectl get pods -n "${NAMESPACE}" -o wide --show-labels >> "$OUTPUT_FILE" 2>&1 || echo "Failed to get pod overview" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"

  # 2. Argo Workflow objects (CRD may be absent; tolerate failure).
  echo "===== ARGO WORKFLOWS =====" >> "$OUTPUT_FILE"
  kubectl get workflows -n "${NAMESPACE}" -o wide --show-labels >> "$OUTPUT_FILE" 2>&1 || echo "No workflows found or failed to get workflows" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"

  # 3. All namespace events, oldest first.
  echo "===== RECENT EVENTS =====" >> "$OUTPUT_FILE"
  kubectl get events -n "${NAMESPACE}" --sort-by='.lastTimestamp' >> "$OUTPUT_FILE" 2>&1 || echo "Failed to get events" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"

  # 4. Prefer pods created after START_TIME; the lexicographic awk compare
  #    is valid because both sides are ISO-8601 UTC timestamps. Fall back
  #    to every pod when none match or no start time was given.
  local POD_NAMES
  if [[ -n "$START_TIME" ]]; then
    echo "===== PODS CREATED DURING TEST (after $START_TIME) =====" >> "$OUTPUT_FILE"
    POD_NAMES=$(kubectl get pods -n "${NAMESPACE}" -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.metadata.creationTimestamp}{"\n"}{end}' | awk -v start_time="$START_TIME" '$2 >= start_time {print $1}')
    if [[ -n "$POD_NAMES" ]]; then
      echo "Test-related pods: $POD_NAMES" >> "$OUTPUT_FILE"
    else
      echo "No pods found created after $START_TIME" >> "$OUTPUT_FILE"
      # Fall back to all pods
      POD_NAMES=$(kubectl get pods -n "${NAMESPACE}" -o custom-columns=":metadata.name" --no-headers)
    fi
  else
    POD_NAMES=$(kubectl get pods -n "${NAMESPACE}" -o custom-columns=":metadata.name" --no-headers)
  fi
  echo "" >> "$OUTPUT_FILE"

  if [[ -z "${POD_NAMES}" ]]; then
    echo "No pods found in namespace '${NAMESPACE}'." >> "$OUTPUT_FILE"
    return
  fi

  # 5. Per-pod details and logs. POD_NAMES is intentionally unquoted:
  #    it is a whitespace-separated list of DNS-safe pod names.
  local POD_NAME
  for POD_NAME in ${POD_NAMES}; do
    {
      echo "=========================================="
      echo "POD: ${POD_NAME}"
      echo "=========================================="

      echo "----- POD METADATA -----"
      kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o yaml | grep -E "(name:|namespace:|labels:|annotations:|creationTimestamp:|phase:|conditions:)" || echo "Failed to get pod metadata"

      echo ""
      echo "----- POD DESCRIPTION -----"
      kubectl describe pod "${POD_NAME}" -n "${NAMESPACE}" || echo "Failed to describe pod ${POD_NAME}"

      # Fetch logs per container when there is more than one:
      # 'kubectl logs' without -c fails outright on multi-container pods
      # that have no default-container annotation.
      local CONTAINERS
      CONTAINERS=$(kubectl get pod "${POD_NAME}" -n "${NAMESPACE}" -o jsonpath='{.spec.containers[*].name}' 2>/dev/null)
      if [[ $(echo "$CONTAINERS" | wc -w) -gt 1 ]]; then
        echo ""
        echo "----- CONTAINER LOGS -----"
        local CONTAINER
        for CONTAINER in $CONTAINERS; do
          echo "--- Container: $CONTAINER ---"
          kubectl logs "${POD_NAME}" -c "$CONTAINER" -n "${NAMESPACE}" --tail=500 || echo "No logs for container $CONTAINER"
        done
      else
        echo ""
        echo "----- POD LOGS (last 1000 lines) -----"
        kubectl logs "${POD_NAME}" -n "${NAMESPACE}" --tail=1000 || echo "No logs found for pod ${POD_NAME}"
      fi

      echo ""
      echo "=========================================="
      echo ""
    } >> "$OUTPUT_FILE"
  done

  # 6. Pipeline run CRs, if that CRD is installed.
  echo "===== PIPELINE RUNS (if available) =====" >> "$OUTPUT_FILE"
  kubectl get runs -n "${NAMESPACE}" -o wide --show-labels >> "$OUTPUT_FILE" 2>&1 || echo "No pipeline runs found or CRD not available" >> "$OUTPUT_FILE"
  echo "" >> "$OUTPUT_FILE"

  echo "Enhanced log collection completed. Output saved to: $OUTPUT_FILE"
}
125+
126+
# Entry point: validate the target namespace, then produce the report.
check_namespace "$NS"
collect_comprehensive_logs "$NS"

.github/workflows/e2e-test.yml

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,40 @@ jobs:
158158
tls_enabled: ${{ matrix.pod_to_pod_tls_enabled }}
159159
ca_cert_path: ${{ env.CA_CERT_PATH }}
160160

161+
# Gather enhanced pod/workflow diagnostics when the e2e test step fails,
# then publish them as a build artifact.
- name: Collect Pod logs on test failure
  id: collect-failure-logs
  shell: bash
  if: ${{ steps.test-run.outcome == 'failure' }}
  continue-on-error: true
  run: |
    echo "=== Collecting enhanced logs after test failure ==="
    NAMESPACE="${{ steps.configure.outputs.NAMESPACE }}"
    TEST_START_TIME=$(date -u -d '30 minutes ago' '+%Y-%m-%dT%H:%M:%SZ')
    TEST_CONTEXT="${{ matrix.test_label }}_K8s-${{ matrix.k8s_version }}_cache-${{ matrix.cache_enabled }}"

    # Run the enhanced log collection script.
    chmod +x ./.github/resources/scripts/collect-enhanced-logs.sh
    ./.github/resources/scripts/collect-enhanced-logs.sh \
      --ns "$NAMESPACE" \
      --output /tmp/enhanced_failure_logs.txt \
      --test-context "$TEST_CONTEXT" \
      --start-time "$TEST_START_TIME"

    # Append Ginkgo test output if available.
    if [ -f "${{ env.E2E_TESTS_DIR }}/reports/junit.xml" ]; then
      echo "=== GINKGO TEST RESULTS ===" >> /tmp/enhanced_failure_logs.txt
      cat "${{ env.E2E_TESTS_DIR }}/reports/junit.xml" >> /tmp/enhanced_failure_logs.txt 2>/dev/null || true
    fi

- name: Upload Pod logs on test failure
  uses: actions/upload-artifact@v4
  if: ${{ steps.test-run.outcome == 'failure' && steps.collect-failure-logs.outcome == 'success' }}
  continue-on-error: true
  with:
    name: failure-logs-${{ matrix.test_label }}-K8s-${{ matrix.k8s_version }}-cache-${{ matrix.cache_enabled }}-argo-${{ matrix.argo_version }}-proxy-${{ matrix.proxy }}-storage-${{ matrix.storage }}
    path: /tmp/enhanced_failure_logs.txt
    retention-days: 30
194+
161195
- name: Notify test reports
162196
shell: bash
163197
if: ${{ steps.test-run.outcome == 'success' }}
@@ -264,6 +298,52 @@ jobs:
264298
user_namespace: ${{ env.USER_NAMESPACE }}
265299
report_name: "E2EMultiUserTests_K8s=${{ matrix.k8s_version }}_cacheEnabled=${{ matrix.cache_enabled }}_multiUser=${{ matrix.multi_user }}_storage=${{ matrix.storage }}"
266300

301+
# Multi-user variant: gather diagnostics from the pipelines namespace and,
# when it differs, from the user namespace too, then publish as an artifact.
- name: Collect Pod logs on test failure
  id: collect-failure-logs
  shell: bash
  if: ${{ steps.test-run.outcome == 'failure' }}
  continue-on-error: true
  run: |
    echo "=== Collecting enhanced logs after test failure ==="
    NAMESPACE="${{ steps.configure.outputs.NAMESPACE }}"
    TEST_START_TIME=$(date -u -d '30 minutes ago' '+%Y-%m-%dT%H:%M:%SZ')
    TEST_CONTEXT="MultiUser_K8s-${{ matrix.k8s_version }}_cache-${{ matrix.cache_enabled }}_storage-${{ matrix.storage }}"

    # Run the enhanced log collection script.
    chmod +x ./.github/resources/scripts/collect-enhanced-logs.sh
    ./.github/resources/scripts/collect-enhanced-logs.sh \
      --ns "$NAMESPACE" \
      --output /tmp/enhanced_failure_logs.txt \
      --test-context "$TEST_CONTEXT" \
      --start-time "$TEST_START_TIME"

    # Also collect user-namespace logs for multi-user tests.
    USER_NS="${{ env.USER_NAMESPACE }}"
    if [ "$USER_NS" != "$NAMESPACE" ]; then
      echo "=== USER NAMESPACE LOGS ===" >> /tmp/enhanced_failure_logs.txt
      ./.github/resources/scripts/collect-enhanced-logs.sh \
        --ns "$USER_NS" \
        --output /tmp/user_ns_logs.txt \
        --test-context "$TEST_CONTEXT" \
        --start-time "$TEST_START_TIME"
      cat /tmp/user_ns_logs.txt >> /tmp/enhanced_failure_logs.txt 2>/dev/null || true
    fi

    # Append Ginkgo test output if available.
    if [ -f "${{ env.E2E_TESTS_DIR }}/reports/junit.xml" ]; then
      echo "=== GINKGO TEST RESULTS ===" >> /tmp/enhanced_failure_logs.txt
      cat "${{ env.E2E_TESTS_DIR }}/reports/junit.xml" >> /tmp/enhanced_failure_logs.txt 2>/dev/null || true
    fi

- name: Upload Pod logs on test failure
  uses: actions/upload-artifact@v4
  if: ${{ steps.test-run.outcome == 'failure' && steps.collect-failure-logs.outcome == 'success' }}
  continue-on-error: true
  with:
    name: failure-logs-multiuser-K8s-${{ matrix.k8s_version }}-cache-${{ matrix.cache_enabled }}-storage-${{ matrix.storage }}
    path: /tmp/enhanced_failure_logs.txt
    retention-days: 30
346+
267347
- name: Notify test reports
268348
shell: bash
269349
if: ${{ steps.test-run.outcome == 'success' }}

0 commit comments

Comments
 (0)