Skip to content

Commit 832fddf

Browse files
committed
Add GAP9 cluster hang-trace debug script
scripts/gap9-cluster-hang-trace.sh traces a GAP9 (GVSoC) cluster core (default pe0; use pe8 for the cluster controller) to locate where a training/inference run hangs or crashes. It distinguishes a PC-corruption trap loop (c.unimp), an FC Invalid fetch, a stalled/idle core, and a clean exit, then prints the last instructions + addr2line of the crash site. Needs the GVSoC debug models on LD_LIBRARY_PATH (handled internally).
1 parent 9e51c34 commit 832fddf

1 file changed

Lines changed: 116 additions & 0 deletions

File tree

scripts/gap9-cluster-hang-trace.sh

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/usr/bin/env bash
2+
3+
# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
# gap9-cluster-hang-trace.sh
8+
#
9+
# Locate where a GAP9 (GVSoC) training/inference run dies on the cluster.
10+
#
11+
# Why this exists:
12+
# When a GAP9 cluster run corrupts a control pointer (e.g. a buffer/DMA
13+
# overrun clobbers the pi_cl fork-dispatch entry), a worker core jumps to a
14+
# garbage PC and spins on an illegal `c.unimp` instruction. Tracing the FC is
15+
# useless (the FC just idle-spins in the scheduler), so this script traces a
16+
# *cluster* core, watches for the PC-corruption trap loop, stops GVSoC, and
17+
# prints the last valid instructions + addr2line of the crash site — i.e. the
18+
# function/closure that was running when the corruption surfaced.
19+
#
20+
# It distinguishes these outcomes (it also stops early on a stalled core or low disk):
21+
# - PC-CORRUPTION (c.unimp trap loop) -> the informative case; shows crash site
22+
# - FC "Invalid fetch" -> FC-side abort (GVSoC dies, trace lost)
23+
# - GVSoC exits on its own, no c.unimp -> ran clean (bug not triggered)
24+
#
25+
# Requirements:
26+
# - The test must already be generated+built (a build_master with a binary).
27+
# - GVSoC `--trace` needs the debug models dir on LD_LIBRARY_PATH (handled here),
28+
# otherwise it aborts with "mem_plug_debug.so: cannot open shared object".
29+
# - Run as the SAME user that built build_master (root/agent ownership of the
30+
# gvsoc_workdir artifacts must be writable, else gapy fails to dump flash).
31+
#
32+
# Usage:
33+
# scripts/gap9-cluster-hang-trace.sh [TEST_NAME] [CORE] [TEST_DIR]
34+
# Examples:
35+
# scripts/gap9-cluster-hang-trace.sh # mobilenetv1_train, pe0
36+
# scripts/gap9-cluster-hang-trace.sh resnet8_train pe3
37+
#
38+
# Env overrides:
39+
# GAP9_SDK (default /app/install/gap9-sdk)
40+
# GAP9_GCC (default /app/install/gcc/gap9)
41+
# PASS_CYCLES (default 40000000) intended as: cycles past which "no c.unimp" => ran clean (NOTE: currently defined but not checked by the polling loop)
42+
43+
# Deliberately no `set -e`: later steps legitimately return non-zero during
# normal operation (e.g. `grep -c` exits 1 when the count is 0).
set +e

# Positional arguments (all optional).
TEST_NAME="${1:-mobilenetv1_train}" # name of the test binary inside build_master
CORE="${2:-pe0}"                    # cluster core whose instruction trace is captured
TEST_DIR="${3:-TEST_GAP9}"          # directory under DeeployTest/ that holds build_master

# Environment overrides.
GAP9_SDK="${GAP9_SDK:-/app/install/gap9-sdk}"
GAP9_GCC="${GAP9_GCC:-/app/install/gcc/gap9}"
# NOTE(review): PASS_CYCLES is assigned here but nothing else in this script
# reads it, so it currently has no effect on the outcome.
PASS_CYCLES="${PASS_CYCLES:-40000000}"
GARBAGE='c.unimp' # illegal-instruction marker of the PC-corruption trap loop

# --- locate the repo's DeeployTest build dir ---
# The repo root is taken to be the parent of the directory containing this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO="$(cd "$SCRIPT_DIR/.." && pwd)"
BUILD="$REPO/DeeployTest/$TEST_DIR/build_master"
ELF="$BUILD/$TEST_NAME"
A2L="$GAP9_GCC/bin/riscv32-unknown-elf-addr2line"
OUT="/tmp/gap9_hang_${TEST_NAME}_${CORE}.trace"

# Fail fast (on stderr) when there is nothing to run.
if [ ! -x "$ELF" ]; then
  echo "ERROR: binary not found: $ELF (generate+build the test first)" >&2
  exit 1
fi
# addr2line is only needed for the final report, so a missing tool is a
# warning, not an error: the trace itself is still useful.
if [ ! -x "$A2L" ]; then
  echo "WARNING: addr2line not found: $A2L (crash-site symbolisation will fail)" >&2
fi
62+
63+
# --- extract the GVSoC gapy run command emitted by CMake for this target ---
# The command line is recovered from the target's build.make: everything from
# the gapy executable path up to and including the --binary=...<TEST_NAME> argument.
GVMAKE="$BUILD/DeeployTest/Platforms/GAP9/CMakeFiles/gvsoc_${TEST_NAME}.dir/build.make"
if [ ! -r "$GVMAKE" ]; then
  # Separate diagnosis for a missing file, so it is not confused with a
  # build.make whose contents simply do not match the pattern.
  echo "ERROR: could not extract gapy gvsoc command from $GVMAKE (file missing or unreadable)" >&2
  exit 1
fi
# grep flags: -h no file-name prefix, -o print only the matched text,
# -a treat the file as text. Only the first match is used.
CMD="$(grep -hoa "${GAP9_SDK}/utils/gapy_v2/bin/gapy --target=gap9.evk[^\"]*--binary=[^ \"]*${TEST_NAME}" "$GVMAKE" | head -1)"
if [ -z "$CMD" ]; then
  echo "ERROR: could not extract gapy gvsoc command from $GVMAKE" >&2
  exit 1
fi
67+
68+
# --- environment (debug models on LD_LIBRARY_PATH so --trace doesn't abort) ---
SDK_CFG="$GAP9_SDK/configs/gap9_evk_audio.sh"
if [ -r "$SDK_CFG" ]; then
  # shellcheck disable=SC1090
  source "$SDK_CFG" >/dev/null 2>&1
else
  # Previously a missing config was silently ignored; say so, but keep going.
  echo "WARNING: SDK config not found: $SDK_CFG (continuing with the current environment)" >&2
fi
export CHIP_FAMILY=9 CHIP_VERSION=2 GVSOC_INSTALL_DIR="$GAP9_SDK/install/workstation"
# Append the previous value only when it is non-empty: a trailing ':' would add
# an empty entry, which the dynamic loader interprets as the current directory.
export LD_LIBRARY_PATH="$GAP9_SDK/install/workstation/models/debug${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
unset PYTHONPATH
cd "$BUILD" || exit 1
# stale root-owned gvsoc artifacts block writing; drop the ones gapy regenerates
find gvsoc_workdir -maxdepth 1 \( -name 'chip.*' -o -name 'flash.bin' \) ! -writable -delete 2>/dev/null
76+
77+
# --- run GVSoC with the instruction trace of one cluster core and poll it ---
echo ">>> tracing /chip/cluster/$CORE/insn for $TEST_NAME (out: $OUT)"
rm -f "$OUT"
# $CMD holds the complete gapy command line and is intentionally left unquoted
# so that it is word-split into the program and its arguments.
# shellcheck disable=SC2086
stdbuf -oL -eL $CMD "--trace=/chip/cluster/$CORE/insn" > "$OUT" 2>&1 &
GVPID=$!
reason="?" # "?" means: no stop condition has been detected yet
prevsz=-1; stall=0; STALL_NEEDED=12 # ~60s of no new pe trace => pe core idle/hung

# stop_gvsoc: force-kill the background job and any gvsoc_launcher process.
# Note that pkill matches by name, so it is not limited to this run's launcher.
stop_gvsoc() {
  kill -9 "$GVPID" 2>/dev/null
  pkill -9 gvsoc_launcher 2>/dev/null
}
trap 'reason=ctrl-c; stop_gvsoc' INT

# Poll every 5 s while the background job is alive.
while kill -0 "$GVPID" 2>/dev/null; do
  sleep 5
  sz=$(stat -c%s "$OUT" 2>/dev/null || echo 0)
  # Digits of the second ':'-separated field of the newest trace line for this core.
  cyc=$(grep -a "$CORE/insn" "$OUT" 2>/dev/null | tail -1 | awk -F: '{print $2}' | tr -dc 0-9)
  hit=$(grep -ac "$GARBAGE" "$OUT" 2>/dev/null)
  hit=${hit:-0} # grep prints nothing if the file is unreadable; keep the test below numeric
  # Free space on the filesystem that actually holds the trace file (not necessarily '/').
  free_gb=$(df -BG --output=avail "$OUT" 2>/dev/null | tail -1 | tr -dc 0-9)
  # An unchanged trace size counts as one stalled poll; any growth resets the counter.
  if [ "$sz" -eq "$prevsz" ]; then stall=$((stall+1)); else stall=0; fi
  prevsz=$sz
  echo "[$(date +%H:%M:%S)] $((sz/1048576))MB cyc=${cyc:-?} c.unimp=${hit} stall=${stall}/${STALL_NEEDED} free=${free_gb}G"
  if [ "$hit" -gt 0 ]; then
    reason="PC-CORRUPTION (c.unimp trap loop) — crash site below"
    stop_gvsoc; break
  fi
  if grep -qa "Invalid fetch" "$OUT" 2>/dev/null; then
    reason="FC Invalid fetch — GVSoC aborted, cluster trace lost (try a build that crashes cluster-side)"
    stop_gvsoc; break
  fi
  if [ "$stall" -ge "$STALL_NEEDED" ]; then
    reason="$CORE STALLED at cyc ${cyc} — this core stopped executing (idle during a master/optimizer phase, OR a hang). Trace another core / run the full training to tell which."
    stop_gvsoc; break
  fi
  if [ -n "$free_gb" ] && [ "$free_gb" -lt 8 ]; then
    reason="ABORT: low disk"
    stop_gvsoc; break
  fi
done

# The loop ended without a stop condition, so GVSoC exited by itself. It may
# have written a c.unimp or "Invalid fetch" line after the last poll (or
# before the first one), so classify the final trace once more before
# declaring the run clean.
if [ "$reason" = "?" ]; then
  if grep -qa "$GARBAGE" "$OUT" 2>/dev/null; then
    reason="PC-CORRUPTION (c.unimp trap loop) — crash site below"
  elif grep -qa "Invalid fetch" "$OUT" 2>/dev/null; then
    reason="FC Invalid fetch — GVSoC aborted, cluster trace lost (try a build that crashes cluster-side)"
  else
    reason="GVSoC exited on its own — run finished, NO hang/crash detected"
  fi
fi
99+
100+
# --- report: result banner, instructions before the anchor, addr2line ---

# strip_ansi: filter stdin to stdout, removing ANSI SGR colour sequences
# (ESC [ <digits and ';'> m) so that the text patterns below can match.
strip_ansi() {
  sed -E 's/\x1b\[[0-9;]*m//g'
}

# window_start ANCHOR SPAN: print the first line number of a look-back window
# of SPAN lines ending at line ANCHOR, clamped so it is never below 1.
window_start() {
  local first=$(( $1 - $2 ))
  if [ "$first" -lt 1 ]; then first=1; fi
  printf '%s\n' "$first"
}

echo ""
echo "===================== RESULT: $reason ====================="
# Anchor = first line containing the c.unimp marker; if there is none, the
# last trace line of the traced core.
LN=$(grep -an "$GARBAGE" "$OUT" | head -1 | cut -d: -f1)
[ -z "$LN" ] && LN=$(grep -an "$CORE/insn" "$OUT" | tail -1 | cut -d: -f1)
echo "anchor line ${LN:-none} last_cycle=$(grep -a "$CORE/insn" "$OUT" 2>/dev/null | tail -1 | awk -F: '{print $2}') pe_trace_lines=$(grep -ac "$CORE/insn" "$OUT" 2>/dev/null)"
if [ -n "$LN" ]; then
  S=$(window_start "$LN" 40)
  echo "--- ~40 instructions before the crash anchor ---"
  # Drop everything up to the "<core>/insn ]" tag and cap the line width at 95 columns.
  sed -n "${S},${LN}p" "$OUT" | strip_ansi | sed -E 's#.*'"$CORE"'/insn *\] *##' | cut -c1-95
  echo "--- distinct functions near the crash (call sequence) ---"
  # From the last 6000 trace lines of this core: take the "name:digit" tokens
  # that follow "] ", de-duplicate in first-seen order, show the last 25.
  grep -a "$CORE/insn" "$OUT" | tail -6000 | strip_ansi | grep -oE '\] [A-Za-z0-9_]+:[0-9]' | awk '{print $2}' | awk '!s[$0]++' | tail -25
  echo "--- addr2line of the 12 PCs before the anchor ---"
  # Clamp the window start and skip it entirely when the anchor is line 1;
  # otherwise sed would be handed a zero or negative line address.
  if [ "$LN" -gt 1 ]; then
    for pc in $(sed -n "$(window_start "$LN" 12),$((LN-1))p" "$OUT" | strip_ansi | grep -oE ' M [0-9a-f]+ ' | awk '{print $2}'); do
      printf "0x%s -> " "$pc"; "$A2L" -f -e "$ELF" "0x$pc" | tr '\n' ' '; echo
    done
  fi
fi
# Always show the last "Invalid fetch" line, if any.
grep -a "Invalid fetch" "$OUT" 2>/dev/null | tail -1 | strip_ansi

0 commit comments

Comments
 (0)