diff --git a/.test_patterns.yml b/.test_patterns.yml index 5a334cd586c6..afe61faee696 100644 --- a/.test_patterns.yml +++ b/.test_patterns.yml @@ -146,6 +146,11 @@ tests: - *palla # yarn-project tests + # Attempt to catch all kv-store browser test failures (consider them quarantined for now) + - regex: "yarn-project/kv-store" + error_regex: "vitest" + owners: + - *martin - regex: "yarn-project/kv-store" error_regex: "Could not import your test module" owners: @@ -153,7 +158,7 @@ tests: - regex: "yarn-project/kv-store" error_regex: "timeout: sending signal TERM to command" owners: - - *alex + - *martin - regex: "yarn-project/kv-store" error_regex: "Failed to fetch dynamically imported module" owners: diff --git a/yarn-project/bootstrap.sh b/yarn-project/bootstrap.sh index 75c3aacbce73..28cc9d41619f 100755 --- a/yarn-project/bootstrap.sh +++ b/yarn-project/bootstrap.sh @@ -170,7 +170,7 @@ function test_cmds { # Exclusions: # end-to-end: e2e tests handled separately with end-to-end/bootstrap.sh. - # kv-store: Uses mocha so will need different treatment. + # kv-store: per-file fan-out handled by kv-store/bootstrap.sh test_cmds. for test in !(end-to-end|kv-store|aztec)/src/**/*.test.ts; do # Skip benchmarks here. [[ "$test" =~ \.bench\.test\.ts$ ]] && continue @@ -218,8 +218,8 @@ function test_cmds { echo "${prefix}${cmd_env} yarn-project/scripts/run_test.sh $test" done - # Uses mocha for browser tests, so we have to treat it differently. - echo "$hash:ISOLATE=1 cd yarn-project/kv-store && yarn test" + # kv-store: per-file fan-out (mocha for node tests, vitest for browser tests). + kv-store/bootstrap.sh test_cmds # Aztec CLI tests aztec/bootstrap.sh test_cmds diff --git a/yarn-project/kv-store/bootstrap.sh b/yarn-project/kv-store/bootstrap.sh new file mode 100755 index 000000000000..0f4d344a8812 --- /dev/null +++ b/yarn-project/kv-store/bootstrap.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +source $(git rev-parse --show-toplevel)/ci3/source_bootstrap + +hash=$(../bootstrap.sh hash) + +function test_cmds { + # Node tests (mocha): files outside the browser-test and bench dirs. + # Mirrors .mocharc.json's spec. + for test in src/**/!(indexeddb|sqlite-opfs|bench)/*.test.ts; do + echo "$hash yarn-project/kv-store/scripts/run_test.sh $test" + done + + # Browser tests (vitest + chromium). Each file runs in its own ISOLATE + # container — running multiple files in a single vitest invocation + # triggers a CDP teardown deadlock on the 2-CPU CI executor. See + # scripts/run-browser-tests.sh for the root-cause analysis. + for test in src/indexeddb/*.test.ts src/sqlite-opfs/*.test.ts; do + echo "$hash:ISOLATE=1 yarn-project/kv-store/scripts/run_test.sh $test" + done +} + +case "$cmd" in + "") + ;; + *) + default_cmd_handler "$@" + ;; +esac diff --git a/yarn-project/kv-store/package.json b/yarn-project/kv-store/package.json index 781fd22c1077..b9efae78907a 100644 --- a/yarn-project/kv-store/package.json +++ b/yarn-project/kv-store/package.json @@ -16,9 +16,9 @@ "build:dev": "../scripts/tsc.sh --watch", "clean": "rm -rf ./dest .tsbuildinfo", "test:node": "NODE_NO_WARNINGS=1 mocha --config ./.mocharc.json", - "test:browser": "vitest run --config ./vitest.config.ts", + "test:browser": "bash scripts/run-browser-tests.sh", "bench:browser": "VITE_BENCH=1 vitest run --config ./vitest.config.ts src/bench", - "test": "yarn test:node", + "test": "yarn test:node && yarn test:browser", "test:jest": "NODE_NO_WARNINGS=1 node --experimental-vm-modules ../node_modules/.bin/jest --passWithNoTests --maxWorkers=${JEST_MAX_WORKERS:-8}" }, "inherits": [ diff --git a/yarn-project/kv-store/package.local.json b/yarn-project/kv-store/package.local.json index b430a575259d..87f59219bf36 100644 --- a/yarn-project/kv-store/package.local.json +++ b/yarn-project/kv-store/package.local.json @@ -1,5 +1,6 @@ { "scripts": { - "test": "yarn test:node" + "test": "yarn test:node && yarn test:browser", + "test:browser": "bash scripts/run-browser-tests.sh" } } diff --git a/yarn-project/kv-store/scripts/probe-test-browser.sh b/yarn-project/kv-store/scripts/probe-test-browser.sh new file mode 100755 index 000000000000..f0cfb168159a --- /dev/null +++ b/yarn-project/kv-store/scripts/probe-test-browser.sh @@ -0,0 +1,207 @@ +#!/usr/bin/env bash +# Deep diagnostic wrapper around `yarn test:browser` for investigating +# browser-test hangs and silent failures. +# +# Wraps a single vitest run with a hybrid 1s-steady / 0.1s-burst sampler +# that captures /proc state (wchan, stack, syscall, fd), cgroup CPU/memory +# counters, /proc/net socket state counts, and a CPU-availability +# microbench. When vitest goes silent for >3s the sampler enters dense +# burst mode and emits a stacks_snapshot every 1s of burst. Outer 90s +# timeout converts a hang into a clean exit with the full diagnostic dump +# emitted to stderr by the EXIT trap. +# +# Single-tenant invocation: +# bash yarn-project/kv-store/scripts/probe-test-browser.sh +# +# Parallel invocation (avoids tmp-path collisions across slots): +# DIAG_DIR=/tmp/diag.$$ bash yarn-project/kv-store/scripts/probe-test-browser.sh +# +# Was used to root-cause the kv-store CDP teardown deadlock that +# scripts/run-browser-tests.sh now works around; kept for future regression +# debugging. scripts/repro-browser-hang.sh runs this in a parallel grind +# under docker_isolate constraints. + +cd "$(dirname "$0")/.." + +DIAG_DIR=${DIAG_DIR:-/tmp/diag} +PROBE_LOG=$DIAG_DIR/probe.log +STACKS_LOG=$DIAG_DIR/stacks.log +VITEST_LOG=$DIAG_DIR/vitest.log +mkdir -p "$DIAG_DIR" +: > "$PROBE_LOG" +: > "$STACKS_LOG" +: > "$VITEST_LOG" + +echo "probe-test-browser v2: starting at $(date +%T) (pid $$)" >&2 + +pids_of_interest() { + pgrep -f 'chrom|node|vitest|esbuild|yarn' 2>/dev/null +} + +# Force fixed CPU demand and measure what fraction we actually got. +# ratio = (user+sys)/real. ~1.0 = full CPU; <0.5 = heavily preempted by host. +# Disambiguates "processes not asking for CPU" (scenario A) from "host not +# scheduling us" (scenario B) when cgroup counters are flat. +microbench() { + local TIMEFORMAT='microbench: real=%R user=%U sys=%S' + { time awk 'BEGIN{x=0; for(i=0;i<100000;i++) x+=i*i; print x > "/dev/null"}' \ + /dev/null; } 2>&1 +} + +snapshot() { + local tag="$1" + echo "=== $(date +%T.%N | cut -c1-12) $tag ===" + echo "-- /tmp --" + df -h /tmp 2>/dev/null | tail -1 + du -sh /tmp 2>/dev/null + echo "-- cgroup mem --" + if [ -r /sys/fs/cgroup/memory.current ]; then + awk '{printf "current %d bytes (%.0f MB)\n", $1, $1/1048576}' /sys/fs/cgroup/memory.current + cat /sys/fs/cgroup/memory.events 2>/dev/null + fi + echo "-- cgroup cpu --" + cat /sys/fs/cgroup/cpu.stat 2>/dev/null + echo "-- top by rss --" + ps -eo pid,ppid,stat,rss,pcpu,wchan:20,comm --sort=-rss --no-headers 2>/dev/null | head -15 + echo "-- procs of interest --" + for pid in $(pids_of_interest); do + [ -d "/proc/$pid" ] || continue + local state threads rss_kb wchan cmd + state=$(awk '{print $3}' "/proc/$pid/stat" 2>/dev/null) + wchan=$(cat "/proc/$pid/wchan" 2>/dev/null) + rss_kb=$(awk '/^VmRSS:/{print $2}' "/proc/$pid/status" 2>/dev/null) + threads=$(awk '/^Threads:/{print $2}' "/proc/$pid/status" 2>/dev/null) + cmd=$({ tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null | cut -c1-120) + echo "pid=$pid state=$state threads=$threads rss=${rss_kb}kB wchan=$wchan cmd=$cmd" + done + # TCP state hex (col 4): 01=ESTABLISHED 04=FIN_WAIT1 05=FIN_WAIT2 06=TIME_WAIT + # 07=CLOSE 08=CLOSE_WAIT 09=LAST_ACK 0A=LISTEN 0B=CLOSING + echo "-- tcp4 sockets (state counts) --" + awk 'NR>1{print $4}' /proc/net/tcp 2>/dev/null | sort | uniq -c + echo "-- tcp6 sockets (state counts) --" + awk 'NR>1{print $4}' /proc/net/tcp6 2>/dev/null | sort | uniq -c + # Unix socket state (col 6): 01=SS_UNCONNECTED 02=SS_CONNECTING 03=SS_CONNECTED + # 04=SS_DISCONNECTING — CDP/vitest IPC is typically over SS_CONNECTED unix sockets. + echo "-- unix sockets (state counts) --" + awk 'NR>1{print $6}' /proc/net/unix 2>/dev/null | sort | uniq -c +} + +stacks_snapshot() { + local tag="$1" + echo "=== $(date +%T.%N | cut -c1-12) $tag ===" + echo "-- lsof tcp/unix (-n -P, head -40) --" + lsof -n -P -i TCP 2>/dev/null | head -25 + lsof -n -P -U 2>/dev/null | head -15 + for pid in $(pids_of_interest); do + [ -d "/proc/$pid" ] || continue + echo "--- pid=$pid ---" + { tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null | cut -c1-140 + echo "" + # /proc/$pid/syscall: syscall_nr arg0..arg5 sp pc (readable by process owner) + echo "syscall:" + cat "/proc/$pid/syscall" 2>/dev/null | head -1 + # /proc/$pid/stack: may require CAP_SYS_ADMIN; print only if non-empty + local stack_content + stack_content=$(cat "/proc/$pid/stack" 2>/dev/null | head -15) + if [ -n "$stack_content" ]; then + echo "kernel stack:" + echo "$stack_content" + fi + # Per-thread wchan — tells us if threads are stuck on different things + echo "thread wchans (sorted | uniq -c):" + for t in /proc/$pid/task/*/wchan; do + cat "$t" 2>/dev/null + echo "" + done | sort | uniq -c | sort -rn | head -10 + echo "socket/pipe fds:" + for fd in /proc/$pid/fd/*; do + local target + target=$(readlink "$fd" 2>/dev/null) || continue + case "$target" in + socket:*|pipe:*|anon_inode:*) echo " $(basename "$fd") -> $target" ;; + esac + done + done +} + +probe_loop() { + local last_vitest_size=0 + local silent_since + silent_since=$(date +%s) + local burst_count=0 + local burst_cap=200 # ~20s dense sampling at 0.1s + local steady_count=0 # increments once per steady iteration + while true; do + local now_s cur_size silent_for + now_s=$(date +%s) + cur_size=$(stat -c %s "$VITEST_LOG" 2>/dev/null || echo 0) + if [ "$cur_size" != "$last_vitest_size" ]; then + last_vitest_size=$cur_size + silent_since=$now_s + burst_count=0 + fi + silent_for=$((now_s - silent_since)) + + if [ "$silent_for" -ge 3 ] && [ "$burst_count" -lt "$burst_cap" ]; then + { + snapshot "burst(${silent_for}s silent)" + # During a hang, microbench every burst snapshot — it's the core + # signal for "is the host giving us CPU?" Overhead is irrelevant + # since tests aren't running. + microbench + } >> "$PROBE_LOG" 2>&1 + # Stacks every 10th burst tick (~every 1s of burst) to keep log size sane + if [ $((burst_count % 10)) -eq 0 ]; then + stacks_snapshot "burst(${silent_for}s silent)" >> "$STACKS_LOG" 2>&1 + fi + burst_count=$((burst_count + 1)) + sleep 0.1 + else + { + snapshot "steady" + # Microbench every 5th steady iteration to bound CPU overhead + # while tests are actually running (~0.4% vs tests' CPU budget). + if [ $((steady_count % 5)) -eq 0 ]; then + microbench + fi + } >> "$PROBE_LOG" 2>&1 + # Periodic in-flight stacks baseline — gives us a "healthy waiting" + # reference to diff against burst-triggered dumps during a hang. + if [ $((steady_count % 10)) -eq 0 ]; then + stacks_snapshot "steady(tick=${steady_count})" >> "$STACKS_LOG" 2>&1 + fi + steady_count=$((steady_count + 1)) + sleep 1 + fi + done +} + +probe_loop & +PROBE_PID=$! + +cleanup() { + local rc=$? + kill "$PROBE_PID" 2>/dev/null + wait "$PROBE_PID" 2>/dev/null + # One last stacks snapshot at the moment of failure + stacks_snapshot "cleanup" >> "$STACKS_LOG" 2>&1 + { + echo "" + echo "=== VITEST LOG (tail -300) ===" + tail -300 "$VITEST_LOG" + echo "" + echo "=== FULL PROBE LOG ($(wc -l < "$PROBE_LOG") lines) ===" + cat "$PROBE_LOG" + echo "" + echo "=== FULL STACKS LOG ($(wc -l < "$STACKS_LOG") lines, test exit=$rc) ===" + cat "$STACKS_LOG" + echo "=== END DIAGNOSTICS ===" + } >&2 + exit "$rc" +} +trap cleanup EXIT + +# Tee vitest output to a file so the probe loop can detect silence. +# pipefail ensures we surface timeout's exit code, not tee's. +set -o pipefail +timeout -v 90s yarn test:browser 2>&1 | tee "$VITEST_LOG" diff --git a/yarn-project/kv-store/scripts/repro-browser-hang.sh b/yarn-project/kv-store/scripts/repro-browser-hang.sh new file mode 100755 index 000000000000..84f3ff75f871 --- /dev/null +++ b/yarn-project/kv-store/scripts/repro-browser-hang.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Reproduces the kv-store browser-test hang under CI3 ISOLATE constraints. +# +# Failure mode: CDP CLOSE_WAIT deadlock between vitest and chromium's network +# service at test-file transitions. Vitest closes 6 of 10 CDP TCP connections +# on file-boundary teardown; chromium's network-service event loop, starved of +# CPU under --cpus=2, never drains the closed sockets. Vitest's teardown hangs +# waiting for the close-handshake. Both processes end up with zero on-CPU +# threads. Outer `timeout -v 90s` in probe-test-browser.sh fires SIGTERM after +# 90s of silence. +# +# Crucially, this does NOT reproduce on unconstrained hardware — chromium has +# enough cores to drain sockets immediately. Must run inside docker_isolate +# (--cpus=2 --memory=8g --tmpfs /tmp:exec,size=1g) to surface the hang. +# +# Usage: +# bash yarn-project/kv-store/scripts/repro-browser-hang.sh [JOBS] +# +# JOBS: parallel container count (default 8). Higher = faster repro, +# higher RAM/CPU ceiling on host. 8 reliably catches a hang in ~3min on +# a multi-core box. +# +# On hang: parallel halts (rc=124) and the failing job's full probe diagnostic +# (vitest tail + probe.log + stacks.log) is dumped to this script's stderr by +# dump_fail. Capture stderr to a file if you want to keep it. +# +# Example: capture both streams for later analysis +# bash yarn-project/kv-store/scripts/repro-browser-hang.sh 8 > /tmp/repro.log 2>&1 + +set -uo pipefail + +cd "$(git rev-parse --show-toplevel)" + +JOBS=${1:-8} + +echo "=== repro-browser-hang start: $(date -Is), jobs=$JOBS ===" +while true; do + echo './ci3/dump_fail "CPUS=2 MEM=8g TMPFS_SIZE=1g ./ci3/docker_isolate \"cd yarn-project/kv-store && bash scripts/probe-test-browser.sh\"" >/dev/null' +done | parallel -j"$JOBS" --halt now,fail=1 +rc=$? +echo "=== repro-browser-hang end (parallel rc=$rc): $(date -Is) ===" +exit $rc diff --git a/yarn-project/kv-store/scripts/run-browser-tests.sh b/yarn-project/kv-store/scripts/run-browser-tests.sh new file mode 100755 index 000000000000..b13f9f662c28 --- /dev/null +++ b/yarn-project/kv-store/scripts/run-browser-tests.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Local-dev entrypoint for `yarn test:browser`: runs every browser test file +# in its own vitest process, sequentially. Delegates per-file dispatch to +# scripts/run_test.sh (which CI also uses for per-file fan-out). +# +# Why not a single `vitest run` over all files: vitest+chromium have a CDP +# teardown deadlock at test-file transitions under CPU-constrained +# environments (CI3 ISOLATE: --cpus=2). Vitest closes a cohort of CDP TCP +# connections when switching files; chromium's network service can't drain +# them fast enough under contention; vitest's teardown blocks indefinitely +# on the close-handshake. By running each file in a separate vitest process +# the close-handshake only happens at process exit, avoiding the cross-file +# teardown path entirely. See scripts/repro-browser-hang.sh for the repro. +set -euo pipefail + +cd "$(dirname "$0")/.." + +files=$(find src/indexeddb src/sqlite-opfs -name '*.test.ts' 2>/dev/null | sort) + +if [ -z "$files" ]; then + echo "No test files found in src/indexeddb or src/sqlite-opfs" + exit 0 +fi + +count=$(echo "$files" | wc -l) +echo "Running $count browser test files (one vitest process per file)" + +i=0 +for f in $files; do + i=$((i + 1)) + echo "==> [$i/$count] $f" + bash scripts/run_test.sh "$f" +done diff --git a/yarn-project/kv-store/scripts/run_test.sh b/yarn-project/kv-store/scripts/run_test.sh new file mode 100755 index 000000000000..999aa9dd3271 --- /dev/null +++ b/yarn-project/kv-store/scripts/run_test.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# Runs a single kv-store test file. Dispatches to vitest+chromium for +# browser tests (under src/indexeddb or src/sqlite-opfs) and to mocha for +# everything else. Emitted by yarn-project/kv-store/bootstrap.sh test_cmds +# for CI per-file fan-out and runnable directly for local reproduction: +# +# yarn-project/kv-store/scripts/run_test.sh src/lmdb-v2/store.test.ts +source $(git rev-parse --show-toplevel)/ci3/source + +test=${1:?"Usage: $0 "} +cd .. + +case "$test" in + src/indexeddb/*|src/sqlite-opfs/*) + exec yarn vitest run --config ./vitest.config.ts "$test" + ;; + *) + NODE_NO_WARNINGS=1 exec yarn mocha --config ./.mocharc.json "$test" + ;; +esac diff --git a/yarn-project/kv-store/vitest.config.ts b/yarn-project/kv-store/vitest.config.ts index cee6d7f49cd1..7e73fbcc72a2 100644 --- a/yarn-project/kv-store/vitest.config.ts +++ b/yarn-project/kv-store/vitest.config.ts @@ -52,12 +52,7 @@ export default defineConfig({ reporters: ['verbose'], include: [ './src/indexeddb/**/*.test.ts', - // sqlite-opfs browser tests hang intermittently in the 2-CPU CI container - // (see https://github.com/AztecProtocol/aztec-packages/pull/22693). The - // backend is still experimental — gate these behind VITE_SQLITE_OPFS=1 - // so they run in dev but don't block the merge-train until the hang is - // root-caused. - ...(process.env.VITE_SQLITE_OPFS === '1' ? ['./src/sqlite-opfs/**/*.test.ts'] : []), + './src/sqlite-opfs/**/*.test.ts', // Benchmarks self-skip unless VITE_BENCH=1; include so they're discoverable. './src/bench/indexeddb/**/*.test.ts', './src/bench/sqlite-opfs/**/*.test.ts',