From 5879ce330621451f6aa7b954def8a3610a07cb97 Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Wed, 8 Apr 2026 15:58:50 -0700 Subject: [PATCH 1/2] ci: add OOM kill detection to integration test runs Monitor kernel dmesg for OOM killer messages during integration tests to help diagnose flaky dpkg segfaults (exit status 139) on deb distros that may be caused by memory pressure on the CI runners. Signed-off-by: Brian Goff --- .github/workflows/ci.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ffffa3dc..4fd278152 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -265,6 +265,13 @@ jobs: echo "================ CLEANUP COMPLETE ================" - name: Use azure ubuntu archive uses: ./.github/actions/dns-spoof-ubuntu-archive + - name: Start OOM monitor + run: | + sudo dmesg --clear || true + setsid sh -c 'sudo dmesg --follow 2>/dev/null | \ + grep --line-buffered -iE "oom|out of memory|killed process|invoked oom" \ + > /tmp/oom-monitor.log 2>&1' & + echo "OOM_MONITOR_PID=$!" 
>> "$GITHUB_ENV" - name: Pre-build base images run: | set -eu @@ -296,6 +303,22 @@ jobs: env: TEST_SUITE: ${{ matrix.suite }} TEST_SKIP: ${{ matrix.skip }} + - name: Check for OOM kills + if: always() + run: | + kill -- -"${OOM_MONITOR_PID}" 2>/dev/null || true + mkdir -p /tmp/reports + + if [ -s /tmp/oom-monitor.log ]; then + echo "::warning::OOM kills detected during test run" + cat /tmp/oom-monitor.log + cp /tmp/oom-monitor.log /tmp/reports/oom-monitor.log + else + echo "No OOM kills detected" + fi + + # Capture recent dmesg for context on any failures + sudo dmesg -T 2>/dev/null | tail -200 > /tmp/reports/dmesg-tail.log || true - name: Get traces if: always() run: | From c1d73ff4387d0041040bc6431094bd7bf0fa00ff Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Fri, 10 Apr 2026 10:59:54 -0700 Subject: [PATCH 2/2] ci: collect dockerd pprof dumps on test timeout Add timeout signaling from test2json2gha to GITHUB_OUTPUT so subsequent CI steps can detect when tests timed out. On timeout, the dump logs step now collects goroutine stacks, a binary heap profile, and the dockerd binary from the runner for offline analysis with go tool pprof. Signed-off-by: Brian Goff --- .github/workflows/ci.yml | 23 +++++++++++++++++++---- cmd/test2json2gha/main.go | 26 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4fd278152..fd53a0ca1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -291,6 +291,7 @@ jobs: env: TEST_SUITE: ${{ matrix.suite }} - name: Run integration tests + id: run-tests run: | set -ex if [ -n "${TEST_SUITE}" ] && [ ! 
"${TEST_SUITE}" = "other" ]; then @@ -350,15 +351,29 @@ jobs: set -e dir="$(mktemp -d)" - f="${dir}/dockerd.log" - echo "DOCKERD_LOG_PATH=${f}" >> $GITHUB_OUTPUT - sudo journalctl -u docker > "${f}" + echo "DOCKERD_LOG_PATH=${dir}" >> $GITHUB_OUTPUT + + if [ "${{ steps.run-tests.outputs.test_timeout }}" = "true" ]; then + echo "::group::Collecting pprof data from dockerd (test timeout detected)" + curl --unix-socket /var/run/docker.sock \ + -o "${dir}/goroutine-stacks.txt" \ + "http://localhost/debug/pprof/goroutine?debug=2" || true + + curl --unix-socket /var/run/docker.sock \ + -o "${dir}/heap-profile.bin" \ + "http://localhost/debug/pprof/heap" || true + + cp "$(which dockerd)" "${dir}/dockerd" || true + echo "::endgroup::" + fi + + sudo journalctl -u docker > "${dir}/dockerd.log" - name: Upload buildkit logs if: failure() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: e2e-dockerd-logs-${{ matrix.suite }} - path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }} + path: ${{ steps.dump-logs.outputs.DOCKERD_LOG_PATH }}/* retention-days: 1 unit: diff --git a/cmd/test2json2gha/main.go b/cmd/test2json2gha/main.go index 3c81441bf..55d546de7 100644 --- a/cmd/test2json2gha/main.go +++ b/cmd/test2json2gha/main.go @@ -5,6 +5,7 @@ import ( "flag" "fmt" "io" + "iter" "log/slog" "os" "runtime/debug" @@ -73,6 +74,7 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) { var wg waitGroup results.markUnfinishedAsTimeout() + signalTimeout(results.Results()) wg.Go(func() { var rf ResultsFormatter @@ -135,6 +137,30 @@ func do(in io.Reader, out io.Writer, cfg config) (bool, error) { return bool(anyFailed), nil } +// signalTimeout writes test_timeout=true to GITHUB_OUTPUT if any test timed out. +// This allows subsequent CI steps to detect that a timeout occurred. 
+func signalTimeout(results iter.Seq[*TestResult]) {
+	ghOutput := os.Getenv("GITHUB_OUTPUT")
+	if ghOutput == "" {
+		return // GITHUB_OUTPUT is unset, so there is nowhere to report to.
+	}
+	for r := range results {
+		if r.timeout {
+			f, err := os.OpenFile(ghOutput, os.O_WRONLY|os.O_APPEND, 0)
+			if err != nil {
+				slog.Error("Error opening GITHUB_OUTPUT", "error", err)
+				return
+			}
+			// Always close the file, and report a failed close as well as a failed write.
+			_, werr := fmt.Fprintln(f, "test_timeout=true")
+			if cerr := f.Close(); werr != nil || cerr != nil {
+				slog.Error("Error writing timeout status to GITHUB_OUTPUT", "error", werr, "close_error", cerr)
+			}
+			return // The first timed-out result is enough; the flag is written at most once.
+		}
+	}
+}
+
 type waitGroup struct {
 	sync.WaitGroup
 }