|
| 1 | +#!/usr/bin/env bash |
| 2 | +# Deep diagnostic wrapper around `yarn test:browser` for investigating |
| 3 | +# browser-test hangs and silent failures. |
| 4 | +# |
| 5 | +# Wraps a single vitest run with a hybrid 1s-steady / 0.1s-burst sampler |
| 6 | +# that captures /proc state (wchan, stack, syscall, fd), cgroup CPU/memory |
| 7 | +# counters, /proc/net socket state counts, and a CPU-availability |
| 8 | +# microbench. When vitest goes silent for >3s the sampler enters dense |
| 9 | +# burst mode and emits a stacks_snapshot every 1s of burst. Outer 90s |
| 10 | +# timeout converts a hang into a clean exit with the full diagnostic dump |
| 11 | +# emitted to stderr by the EXIT trap. |
| 12 | +# |
| 13 | +# Single-tenant invocation: |
| 14 | +# bash yarn-project/kv-store/scripts/probe-test-browser.sh |
| 15 | +# |
| 16 | +# Parallel invocation (avoids tmp-path collisions across slots): |
| 17 | +# DIAG_DIR=/tmp/diag.$$ bash yarn-project/kv-store/scripts/probe-test-browser.sh |
| 18 | +# |
| 19 | +# Was used to root-cause the kv-store CDP teardown deadlock that |
| 20 | +# scripts/run-browser-tests.sh now works around; kept for future regression |
| 21 | +# debugging. scripts/repro-browser-hang.sh runs this in a parallel grind |
| 22 | +# under docker_isolate constraints. |
| 23 | + |
| 24 | +cd "$(dirname "$0")/.." |
| 25 | + |
| 26 | +DIAG_DIR=${DIAG_DIR:-/tmp/diag} |
| 27 | +PROBE_LOG=$DIAG_DIR/probe.log |
| 28 | +STACKS_LOG=$DIAG_DIR/stacks.log |
| 29 | +VITEST_LOG=$DIAG_DIR/vitest.log |
| 30 | +mkdir -p "$DIAG_DIR" |
| 31 | +: > "$PROBE_LOG" |
| 32 | +: > "$STACKS_LOG" |
| 33 | +: > "$VITEST_LOG" |
| 34 | + |
| 35 | +echo "probe-test-browser v2: starting at $(date +%T) (pid $$)" >&2 |
| 36 | + |
| 37 | +pids_of_interest() { |
| 38 | + pgrep -f 'chrom|node|vitest|esbuild|yarn' 2>/dev/null |
| 39 | +} |
| 40 | + |
| 41 | +# Force fixed CPU demand and measure what fraction we actually got. |
| 42 | +# ratio = (user+sys)/real. ~1.0 = full CPU; <0.5 = heavily preempted by host. |
| 43 | +# Disambiguates "processes not asking for CPU" (scenario A) from "host not |
| 44 | +# scheduling us" (scenario B) when cgroup counters are flat. |
| 45 | +microbench() { |
| 46 | + local TIMEFORMAT='microbench: real=%R user=%U sys=%S' |
| 47 | + { time awk 'BEGIN{x=0; for(i=0;i<100000;i++) x+=i*i; print x > "/dev/null"}' \ |
| 48 | + </dev/null 2>/dev/null; } 2>&1 |
| 49 | +} |
| 50 | + |
| 51 | +snapshot() { |
| 52 | + local tag="$1" |
| 53 | + echo "=== $(date +%T.%N | cut -c1-12) $tag ===" |
| 54 | + echo "-- /tmp --" |
| 55 | + df -h /tmp 2>/dev/null | tail -1 |
| 56 | + du -sh /tmp 2>/dev/null |
| 57 | + echo "-- cgroup mem --" |
| 58 | + if [ -r /sys/fs/cgroup/memory.current ]; then |
| 59 | + awk '{printf "current %d bytes (%.0f MB)\n", $1, $1/1048576}' /sys/fs/cgroup/memory.current |
| 60 | + cat /sys/fs/cgroup/memory.events 2>/dev/null |
| 61 | + fi |
| 62 | + echo "-- cgroup cpu --" |
| 63 | + cat /sys/fs/cgroup/cpu.stat 2>/dev/null |
| 64 | + echo "-- top by rss --" |
| 65 | + ps -eo pid,ppid,stat,rss,pcpu,wchan:20,comm --sort=-rss --no-headers 2>/dev/null | head -15 |
| 66 | + echo "-- procs of interest --" |
| 67 | + for pid in $(pids_of_interest); do |
| 68 | + [ -d "/proc/$pid" ] || continue |
| 69 | + local state threads rss_kb wchan cmd |
| 70 | + state=$(awk '{print $3}' "/proc/$pid/stat" 2>/dev/null) |
| 71 | + wchan=$(cat "/proc/$pid/wchan" 2>/dev/null) |
| 72 | + rss_kb=$(awk '/^VmRSS:/{print $2}' "/proc/$pid/status" 2>/dev/null) |
| 73 | + threads=$(awk '/^Threads:/{print $2}' "/proc/$pid/status" 2>/dev/null) |
| 74 | + cmd=$({ tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null | cut -c1-120) |
| 75 | + echo "pid=$pid state=$state threads=$threads rss=${rss_kb}kB wchan=$wchan cmd=$cmd" |
| 76 | + done |
| 77 | + # TCP state hex (col 4): 01=ESTABLISHED 04=FIN_WAIT1 05=FIN_WAIT2 06=TIME_WAIT |
| 78 | + # 07=CLOSE 08=CLOSE_WAIT 09=LAST_ACK 0A=LISTEN 0B=CLOSING |
| 79 | + echo "-- tcp4 sockets (state counts) --" |
| 80 | + awk 'NR>1{print $4}' /proc/net/tcp 2>/dev/null | sort | uniq -c |
| 81 | + echo "-- tcp6 sockets (state counts) --" |
| 82 | + awk 'NR>1{print $4}' /proc/net/tcp6 2>/dev/null | sort | uniq -c |
| 83 | + # Unix socket state (col 6): 01=SS_UNCONNECTED 02=SS_CONNECTING 03=SS_CONNECTED |
| 84 | + # 04=SS_DISCONNECTING — CDP/vitest IPC is typically over SS_CONNECTED unix sockets. |
| 85 | + echo "-- unix sockets (state counts) --" |
| 86 | + awk 'NR>1{print $6}' /proc/net/unix 2>/dev/null | sort | uniq -c |
| 87 | +} |
| 88 | + |
| 89 | +stacks_snapshot() { |
| 90 | + local tag="$1" |
| 91 | + echo "=== $(date +%T.%N | cut -c1-12) $tag ===" |
| 92 | + echo "-- lsof tcp/unix (-n -P, head -40) --" |
| 93 | + lsof -n -P -i TCP 2>/dev/null | head -25 |
| 94 | + lsof -n -P -U 2>/dev/null | head -15 |
| 95 | + for pid in $(pids_of_interest); do |
| 96 | + [ -d "/proc/$pid" ] || continue |
| 97 | + echo "--- pid=$pid ---" |
| 98 | + { tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null | cut -c1-140 |
| 99 | + echo "" |
| 100 | + # /proc/$pid/syscall: syscall_nr arg0..arg5 sp pc (readable by process owner) |
| 101 | + echo "syscall:" |
| 102 | + cat "/proc/$pid/syscall" 2>/dev/null | head -1 |
| 103 | + # /proc/$pid/stack: may require CAP_SYS_ADMIN; print only if non-empty |
| 104 | + local stack_content |
| 105 | + stack_content=$(cat "/proc/$pid/stack" 2>/dev/null | head -15) |
| 106 | + if [ -n "$stack_content" ]; then |
| 107 | + echo "kernel stack:" |
| 108 | + echo "$stack_content" |
| 109 | + fi |
| 110 | + # Per-thread wchan — tells us if threads are stuck on different things |
| 111 | + echo "thread wchans (sorted | uniq -c):" |
| 112 | + for t in /proc/$pid/task/*/wchan; do |
| 113 | + cat "$t" 2>/dev/null |
| 114 | + echo "" |
| 115 | + done | sort | uniq -c | sort -rn | head -10 |
| 116 | + echo "socket/pipe fds:" |
| 117 | + for fd in /proc/$pid/fd/*; do |
| 118 | + local target |
| 119 | + target=$(readlink "$fd" 2>/dev/null) || continue |
| 120 | + case "$target" in |
| 121 | + socket:*|pipe:*|anon_inode:*) echo " $(basename "$fd") -> $target" ;; |
| 122 | + esac |
| 123 | + done |
| 124 | + done |
| 125 | +} |
| 126 | + |
| 127 | +probe_loop() { |
| 128 | + local last_vitest_size=0 |
| 129 | + local silent_since |
| 130 | + silent_since=$(date +%s) |
| 131 | + local burst_count=0 |
| 132 | + local burst_cap=200 # ~20s dense sampling at 0.1s |
| 133 | + local steady_count=0 # increments once per steady iteration |
| 134 | + while true; do |
| 135 | + local now_s cur_size silent_for |
| 136 | + now_s=$(date +%s) |
| 137 | + cur_size=$(stat -c %s "$VITEST_LOG" 2>/dev/null || echo 0) |
| 138 | + if [ "$cur_size" != "$last_vitest_size" ]; then |
| 139 | + last_vitest_size=$cur_size |
| 140 | + silent_since=$now_s |
| 141 | + burst_count=0 |
| 142 | + fi |
| 143 | + silent_for=$((now_s - silent_since)) |
| 144 | + |
| 145 | + if [ "$silent_for" -ge 3 ] && [ "$burst_count" -lt "$burst_cap" ]; then |
| 146 | + { |
| 147 | + snapshot "burst(${silent_for}s silent)" |
| 148 | + # During a hang, microbench every burst snapshot — it's the core |
| 149 | + # signal for "is the host giving us CPU?" Overhead is irrelevant |
| 150 | + # since tests aren't running. |
| 151 | + microbench |
| 152 | + } >> "$PROBE_LOG" 2>&1 |
| 153 | + # Stacks every 10th burst tick (~every 1s of burst) to keep log size sane |
| 154 | + if [ $((burst_count % 10)) -eq 0 ]; then |
| 155 | + stacks_snapshot "burst(${silent_for}s silent)" >> "$STACKS_LOG" 2>&1 |
| 156 | + fi |
| 157 | + burst_count=$((burst_count + 1)) |
| 158 | + sleep 0.1 |
| 159 | + else |
| 160 | + { |
| 161 | + snapshot "steady" |
| 162 | + # Microbench every 5th steady iteration to bound CPU overhead |
| 163 | + # while tests are actually running (~0.4% vs tests' CPU budget). |
| 164 | + if [ $((steady_count % 5)) -eq 0 ]; then |
| 165 | + microbench |
| 166 | + fi |
| 167 | + } >> "$PROBE_LOG" 2>&1 |
| 168 | + # Periodic in-flight stacks baseline — gives us a "healthy waiting" |
| 169 | + # reference to diff against burst-triggered dumps during a hang. |
| 170 | + if [ $((steady_count % 10)) -eq 0 ]; then |
| 171 | + stacks_snapshot "steady(tick=${steady_count})" >> "$STACKS_LOG" 2>&1 |
| 172 | + fi |
| 173 | + steady_count=$((steady_count + 1)) |
| 174 | + sleep 1 |
| 175 | + fi |
| 176 | + done |
| 177 | +} |
| 178 | + |
| 179 | +probe_loop & |
| 180 | +PROBE_PID=$! |
| 181 | + |
| 182 | +cleanup() { |
| 183 | + local rc=$? |
| 184 | + kill "$PROBE_PID" 2>/dev/null |
| 185 | + wait "$PROBE_PID" 2>/dev/null |
| 186 | + # One last stacks snapshot at the moment of failure |
| 187 | + stacks_snapshot "cleanup" >> "$STACKS_LOG" 2>&1 |
| 188 | + { |
| 189 | + echo "" |
| 190 | + echo "=== VITEST LOG (tail -300) ===" |
| 191 | + tail -300 "$VITEST_LOG" |
| 192 | + echo "" |
| 193 | + echo "=== FULL PROBE LOG ($(wc -l < "$PROBE_LOG") lines) ===" |
| 194 | + cat "$PROBE_LOG" |
| 195 | + echo "" |
| 196 | + echo "=== FULL STACKS LOG ($(wc -l < "$STACKS_LOG") lines, test exit=$rc) ===" |
| 197 | + cat "$STACKS_LOG" |
| 198 | + echo "=== END DIAGNOSTICS ===" |
| 199 | + } >&2 |
| 200 | + exit "$rc" |
| 201 | +} |
| 202 | +trap cleanup EXIT |
| 203 | + |
| 204 | +# Tee vitest output to a file so the probe loop can detect silence. |
| 205 | +# pipefail ensures we surface timeout's exit code, not tee's. |
| 206 | +set -o pipefail |
| 207 | +timeout -v 90s yarn test:browser 2>&1 | tee "$VITEST_LOG" |
0 commit comments