From 9856925142dcc88075fac55d1c9da0ce794d3e52 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Tue, 16 Jun 2026 18:49:01 +0000 Subject: [PATCH 1/3] Use deterministic alphabetical test run order in surefire Set surefire runOrder to alphabetical so test classes execute in the same sequence on every machine. The previous default (filesystem order) varied between CI runners and local checkouts, which made the component-c fork hang appear at an unpredictable class boundary and prevented local reproduction. Deterministic ordering makes the hang reproduce at a stable point so the responsible class can be identified. --- pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pom.xml b/pom.xml index 5762dc2289e..60e55d8b39b 100644 --- a/pom.xml +++ b/pom.xml @@ -411,6 +411,10 @@ ${test-forkCount} false + + alphabetical ${test-forkedProcessTimeout} From 220e00e90d5b396223d6cdd5fea865200822d3c4 Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Tue, 16 Jun 2026 20:08:56 +0000 Subject: [PATCH 2/3] Capture stacks of stalled surefire forks in test container Add a watchdog to the docker test entrypoint that detects a surefire fork whose cumulative CPU time stops advancing while the process stays alive (the signature of a fork that finished its tests but never exits). On detection it dumps the fork's Java stacks (jstack -l and forced jstack -F -l), mixed native frames (jstack -m), and per-thread kernel wait channels from /proc to the job log and to target/thread-dumps. This surfaces the JVM-shutdown stall behind the component-c job hang, which is not a live non-daemon Java thread (ruled out locally) and is therefore only observable in CI at the moment of the stall, before the fork is force-killed. 
--- docker/entrypoint.sh | 84 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 53dfabb96e6..cb6e7c9c17e 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -51,10 +51,94 @@ if [ "$compile_transient_failure" = true ]; then else echo "No transient Maven repository error detected; no retry needed." fi + +# --- Hung-fork diagnostics ------------------------------------------------- +# Some fork JVMs finish their tests but never exit (the JVM stays alive while +# surefire waits, eventually hitting the job timeout). The thread leak is not a +# live non-daemon Java thread (verified locally), so the stall happens at JVM +# shutdown -- a blocking shutdown hook or a stuck native frame. Neither is +# visible after the fork is killed, so we snapshot the fork's stacks the moment +# it goes idle, before surefire force-kills it. +# +# Detection: the surefire fork (its command line contains "surefirebooter") +# burns CPU while running tests; if its cumulative CPU time stops advancing for +# a sustained window while the process is still alive, it is stalled. We then +# emit Java (jstack -l), native (jstack -m), and per-thread kernel wait-channel +# (/proc/<pid>/task/*/wchan) snapshots to the job log and to target artifacts. +dump_dir="/github/workspace/target/thread-dumps" +mkdir -p "$dump_dir" +jstack_bin="$JAVA_HOME/bin/jstack" + +hung_fork_watchdog() { + local poll=3 idle_limit=12 # dump after ~12s of zero CPU progress + local last_pid="" last_cpu=-1 idle=0 dumps=0 + while true; do + sleep "$poll" + # Locate the current surefire fork via its command line. Use bash string + # matching (not grep) so the scan never matches its own helper process. 
+ local pid="" p cl + for p in /proc/[0-9]*; do + [ -r "$p/cmdline" ] || continue + cl=$(tr '\0' ' ' < "$p/cmdline" 2>/dev/null) + case "$cl" in + *surefirebooter*) pid="${p#/proc/}"; break ;; + esac + done + if [ -z "$pid" ]; then last_pid=""; last_cpu=-1; idle=0; continue; fi + + # Cumulative CPU (utime+stime, clock ticks) from /proc/<pid>/stat. + local stat rest cpu + stat=$(cat "/proc/$pid/stat" 2>/dev/null) || continue + rest=${stat#*") "} + # shellcheck disable=SC2086 + set -- $rest + cpu=$(( ${12} + ${13} )) + + if [ "$pid" != "$last_pid" ]; then + last_pid="$pid"; last_cpu="$cpu"; idle=0; dumps=0; continue + fi + if [ "$cpu" = "$last_cpu" ]; then + idle=$(( idle + poll )) + else + idle=0; last_cpu="$cpu" + fi + + if [ "$idle" -ge "$idle_limit" ] && [ "$dumps" -lt 3 ]; then + local ts f + ts=$(date +%H%M%S) + f="$dump_dir/stall_${pid}_${ts}.txt" + { + echo "================ STALLED SUREFIRE FORK ================" + echo "time=$(date +%H:%M:%S) pid=$pid cpu_ticks=$cpu idle>=${idle}s dump#$((dumps+1))" + echo "--- cmdline ---"; tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null; echo + echo "--- /proc/$pid/status (State/Threads) ---" + grep -E '^(State|Threads):' "/proc/$pid/status" 2>/dev/null + echo "--- per-thread kernel wait channel (tid comm wchan) ---" + for t in /proc/$pid/task/*; do + echo " ${t##*/} $(cat "$t/comm" 2>/dev/null) $(cat "$t/wchan" 2>/dev/null)" + done + echo "--- jstack -l (live attach: Java threads + locks) ---" + "$jstack_bin" -l "$pid" 2>&1 + echo "--- jstack -F -l (forced: works when the JVM is unresponsive) ---" + "$jstack_bin" -F -l "$pid" 2>&1 + echo "--- jstack -m (mixed Java+native frames) ---" + "$jstack_bin" -m "$pid" 2>&1 + echo "======================================================" + } 2>&1 | tee -a "$f" + dumps=$(( dumps + 1 )) + fi + done +} + +hung_fork_watchdog & +watchdog_pid=$! 
+ mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \ | stdbuf -oL grep -Ev "already exists in destination.|Using incubator" \ | tee $log +kill "$watchdog_pid" 2>/dev/null + grep_args="SUCCESS" grepvals="$( tail -n 100 $log | grep $grep_args)" From bec5e70d4ad047fa06fd172589aee9c1e7ab138b Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Tue, 16 Jun 2026 20:53:49 +0000 Subject: [PATCH 3/3] Remove surefire fork watchdog from test container entrypoint The watchdog crashed healthy test forks and turned nearly every Java test group red. It matched any process whose command line contained "surefirebooter", which includes the /bin/sh wrapper that launches the fork and waits in do_wait with zero CPU forever, so the zero-CPU-progress detector fired on essentially every fork after 12s. The subsequent jstack attach to a live fork disrupted surefire's master/fork stream protocol, killing the JVM (exit 131 / SIGQUIT) and producing "forked VM terminated without properly saying goodbye". The forced and mixed-mode jstack variants are also no-ops on JDK 17. Attaching to a busy fork is inherently unsafe for diagnostics, so drop the watchdog entirely and restore the previous entrypoint. The deterministic alphabetical run order added separately is unaffected. --- docker/entrypoint.sh | 84 -------------------------------------------- 1 file changed, 84 deletions(-) diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index cb6e7c9c17e..53dfabb96e6 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -51,94 +51,10 @@ if [ "$compile_transient_failure" = true ]; then else echo "No transient Maven repository error detected; no retry needed." fi - -# --- Hung-fork diagnostics ------------------------------------------------- -# Some fork JVMs finish their tests but never exit (the JVM stays alive while -# surefire waits, eventually hitting the job timeout). 
The thread leak is not a -# live non-daemon Java thread (verified locally), so the stall happens at JVM -# shutdown -- a blocking shutdown hook or a stuck native frame. Neither is -# visible after the fork is killed, so we snapshot the fork's stacks the moment -# it goes idle, before surefire force-kills it. -# -# Detection: the surefire fork (its command line contains "surefirebooter") -# burns CPU while running tests; if its cumulative CPU time stops advancing for -# a sustained window while the process is still alive, it is stalled. We then -# emit Java (jstack -l), native (jstack -m), and per-thread kernel wait-channel -# (/proc/<pid>/task/*/wchan) snapshots to the job log and to target artifacts. -dump_dir="/github/workspace/target/thread-dumps" -mkdir -p "$dump_dir" -jstack_bin="$JAVA_HOME/bin/jstack" - -hung_fork_watchdog() { - local poll=3 idle_limit=12 # dump after ~12s of zero CPU progress - local last_pid="" last_cpu=-1 idle=0 dumps=0 - while true; do - sleep "$poll" - # Locate the current surefire fork via its command line. Use bash string - # matching (not grep) so the scan never matches its own helper process. - local pid="" p cl - for p in /proc/[0-9]*; do - [ -r "$p/cmdline" ] || continue - cl=$(tr '\0' ' ' < "$p/cmdline" 2>/dev/null) - case "$cl" in - *surefirebooter*) pid="${p#/proc/}"; break ;; - esac - done - if [ -z "$pid" ]; then last_pid=""; last_cpu=-1; idle=0; continue; fi - - # Cumulative CPU (utime+stime, clock ticks) from /proc/<pid>/stat. 
- local stat rest cpu - stat=$(cat "/proc/$pid/stat" 2>/dev/null) || continue - rest=${stat#*") "} - # shellcheck disable=SC2086 - set -- $rest - cpu=$(( ${12} + ${13} )) - - if [ "$pid" != "$last_pid" ]; then - last_pid="$pid"; last_cpu="$cpu"; idle=0; dumps=0; continue - fi - if [ "$cpu" = "$last_cpu" ]; then - idle=$(( idle + poll )) - else - idle=0; last_cpu="$cpu" - fi - - if [ "$idle" -ge "$idle_limit" ] && [ "$dumps" -lt 3 ]; then - local ts f - ts=$(date +%H%M%S) - f="$dump_dir/stall_${pid}_${ts}.txt" - { - echo "================ STALLED SUREFIRE FORK ================" - echo "time=$(date +%H:%M:%S) pid=$pid cpu_ticks=$cpu idle>=${idle}s dump#$((dumps+1))" - echo "--- cmdline ---"; tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null; echo - echo "--- /proc/$pid/status (State/Threads) ---" - grep -E '^(State|Threads):' "/proc/$pid/status" 2>/dev/null - echo "--- per-thread kernel wait channel (tid comm wchan) ---" - for t in /proc/$pid/task/*; do - echo " ${t##*/} $(cat "$t/comm" 2>/dev/null) $(cat "$t/wchan" 2>/dev/null)" - done - echo "--- jstack -l (live attach: Java threads + locks) ---" - "$jstack_bin" -l "$pid" 2>&1 - echo "--- jstack -F -l (forced: works when the JVM is unresponsive) ---" - "$jstack_bin" -F -l "$pid" 2>&1 - echo "--- jstack -m (mixed Java+native frames) ---" - "$jstack_bin" -m "$pid" 2>&1 - echo "======================================================" - } 2>&1 | tee -a "$f" - dumps=$(( dumps + 1 )) - fi - done -} - -hung_fork_watchdog & -watchdog_pid=$! - mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \ | stdbuf -oL grep -Ev "already exists in destination.|Using incubator" \ | tee $log -kill "$watchdog_pid" 2>/dev/null - grep_args="SUCCESS" grepvals="$( tail -n 100 $log | grep $grep_args)"