|
#!/usr/bin/env bash

# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0

# gap9-cluster-hang-trace.sh
#
# Locate where a GAP9 (GVSoC) training/inference run dies on the cluster.
#
# Why this exists:
#   When a GAP9 cluster run corrupts a control pointer (e.g. a buffer/DMA
#   overrun clobbers the pi_cl fork-dispatch entry), a worker core jumps to a
#   garbage PC and spins on an illegal `c.unimp` instruction. Tracing the FC is
#   useless (the FC just idle-spins in the scheduler), so this script traces a
#   *cluster* core, watches for the PC-corruption trap loop, stops GVSoC, and
#   prints the last valid instructions + addr2line of the crash site — i.e. the
#   function/closure that was running when the corruption surfaced.
#
# The RESULT line reports one of these outcomes:
#   - PC-CORRUPTION (c.unimp trap loop) -> the informative case; shows crash site
#   - FC "Invalid fetch"                -> FC-side abort (GVSoC dies, trace lost)
#   - no corruption past the target cyc -> ran clean (bug not triggered)
#   - <core> STALLED                    -> trace file stopped growing for ~60s
#                                          (core idle or hung; trace another core)
#   - ABORT: low disk                   -> fewer than 8G free, run stopped
#   - GVSoC exited on its own           -> run finished, no hang/crash detected
#   - ctrl-c                            -> interrupted by the user
#
# Requirements:
#   - The test must already be generated+built (a build_master with a binary).
#   - GVSoC `--trace` needs the debug models dir on LD_LIBRARY_PATH (handled here),
#     otherwise it aborts with "mem_plug_debug.so: cannot open shared object".
#   - Run as the SAME user that built build_master (root/agent ownership of the
#     gvsoc_workdir artifacts must be writable, else gapy fails to dump flash).
#
# Usage:
#   scripts/gap9-cluster-hang-trace.sh [TEST_NAME] [CORE] [TEST_DIR]
# Examples:
#   scripts/gap9-cluster-hang-trace.sh                       # mobilenetv1_train, pe0
#   scripts/gap9-cluster-hang-trace.sh resnet8_train pe3
#
# Env overrides:
#   GAP9_SDK     (default /app/install/gap9-sdk)
#   GAP9_GCC     (default /app/install/gcc/gap9)
#   PASS_CYCLES  (default 40000000) cycles past which "no c.unimp" => ran clean

# Keep errexit explicitly off: several probes in this script return non-zero in
# normal operation (e.g. grep with no match) and are checked by hand instead.
set +e

# --- positional arguments (all optional) ---
TEST_NAME="${1:-mobilenetv1_train}"
CORE="${2:-pe0}"
TEST_DIR="${3:-TEST_GAP9}"

# --- environment overrides ---
GAP9_SDK="${GAP9_SDK:-/app/install/gap9-sdk}"
GAP9_GCC="${GAP9_GCC:-/app/install/gcc/gap9}"
PASS_CYCLES="${PASS_CYCLES:-40000000}"

GARBAGE='c.unimp' # illegal-instruction marker of the PC-corruption trap loop

# --- locate the repo's DeeployTest build dir ---
# CDPATH is blanked for the lookup: with CDPATH set, `cd` to a relative
# directory may print the resolved path, which would pollute the captured
# value. `--` guards against a script path that starts with '-'.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO="$(CDPATH='' cd -- "$SCRIPT_DIR/.." && pwd)"
BUILD="$REPO/DeeployTest/$TEST_DIR/build_master"
ELF="$BUILD/$TEST_NAME"
A2L="$GAP9_GCC/bin/riscv32-unknown-elf-addr2line"
OUT="/tmp/gap9_hang_${TEST_NAME}_${CORE}.trace"
| 60 | + |
| 61 | +[ -x "$ELF" ] || { echo "ERROR: binary not found: $ELF (generate+build the test first)"; exit 1; } |
| 62 | + |
| 63 | +# --- extract the GVSoC gapy run command emitted by CMake for this target --- |
| 64 | +GVMAKE="$BUILD/DeeployTest/Platforms/GAP9/CMakeFiles/gvsoc_${TEST_NAME}.dir/build.make" |
| 65 | +CMD="$(grep -hoa "${GAP9_SDK}/utils/gapy_v2/bin/gapy --target=gap9.evk[^\"]*--binary=[^ \"]*${TEST_NAME}" "$GVMAKE" 2>/dev/null | head -1)" |
| 66 | +[ -n "$CMD" ] || { echo "ERROR: could not extract gapy gvsoc command from $GVMAKE"; exit 1; } |
| 67 | + |
# --- environment (debug models on LD_LIBRARY_PATH so --trace doesn't abort) ---
# Load the SDK board configuration. Its output is silenced; a missing file is
# reported but not fatal, so the run is still attempted.
sdk_env="$GAP9_SDK/configs/gap9_evk_audio.sh"
if [[ -r "$sdk_env" ]]; then
  # shellcheck disable=SC1090
  source "$sdk_env" >/dev/null 2>&1
else
  echo "WARNING: SDK environment script not found: $sdk_env" >&2
fi

export CHIP_FAMILY=9 CHIP_VERSION=2 GVSOC_INSTALL_DIR="$GAP9_SDK/install/workstation"

# Prepend the debug models dir. The previous value is appended only when it is
# non-empty: a trailing ':' would create an empty entry, which the dynamic
# loader interprets as the current working directory.
export LD_LIBRARY_PATH="$GAP9_SDK/install/workstation/models/debug${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
unset PYTHONPATH

cd "$BUILD" || { echo "ERROR: cannot cd to build dir: $BUILD" >&2; exit 1; }

# stale root-owned gvsoc artifacts block writing; drop the ones gapy regenerates
find gvsoc_workdir -maxdepth 1 \( -name 'chip.*' -o -name 'flash.bin' \) ! -writable -delete 2>/dev/null
| 76 | + |
# --- launch GVSoC with an instruction trace on the chosen cluster core ---
echo ">>> tracing /chip/cluster/$CORE/insn for $TEST_NAME (out: $OUT)"
rm -f "$OUT"
# $CMD is a flat command line scraped from build.make, so it is deliberately
# left unquoted to be split into words.
# shellcheck disable=SC2086
stdbuf -oL -eL $CMD --trace="/chip/cluster/$CORE/insn" > "$OUT" 2>&1 &
GVPID=$!
reason="?"
prevsz=-1; stall=0; STALL_NEEDED=12 # ~60s of no new pe trace => pe core idle/hung
MIN_FREE_GB=8                       # stop before the trace file fills the disk
out_dir=$(dirname -- "$OUT")        # free space is measured where the trace lives

# Cycle budget after which a run with no c.unimp counts as "ran clean".
pass_cycles="${PASS_CYCLES:-40000000}"
if ! [[ "$pass_cycles" =~ ^[0-9]+$ ]]; then
  echo "WARNING: PASS_CYCLES='$pass_cycles' is not a non-negative integer; using 40000000" >&2
  pass_cycles=40000000
fi

# Hard-stop the simulator: the backgrounded gapy process plus any process
# matching `gvsoc_launcher` (presumably the simulator gapy spawns).
stop_gvsoc() {
  kill -9 "$GVPID" 2>/dev/null
  pkill -9 gvsoc_launcher 2>/dev/null
}
trap 'reason=ctrl-c; stop_gvsoc' INT

# --- poll the trace every 5s until an outcome is detected or GVSoC exits ---
while kill -0 "$GVPID" 2>/dev/null; do
  sleep 5
  sz=$(stat -c%s "$OUT" 2>/dev/null || echo 0)
  # second ':'-separated field of the newest trace line for this core
  cyc=$(grep -a "$CORE/insn" "$OUT" 2>/dev/null | tail -1 | awk -F: '{print $2}' | tr -dc 0-9)
  hit=$(grep -ac "$GARBAGE" "$OUT" 2>/dev/null)
  hit=${hit:-0} # grep prints nothing if the file is unreadable
  free_gb=$(df -BG --output=avail -- "$out_dir" 2>/dev/null | tail -1 | tr -dc 0-9)

  # count consecutive polls in which the trace file did not grow
  if [ "$sz" -eq "$prevsz" ]; then
    stall=$((stall + 1))
  else
    stall=0
  fi
  prevsz=$sz

  echo "[$(date +%H:%M:%S)] $((sz/1048576))MB cyc=${cyc:-?} c.unimp=${hit} stall=${stall}/${STALL_NEEDED} free=${free_gb}G"

  if [ "$hit" -gt 0 ]; then
    reason="PC-CORRUPTION (c.unimp trap loop) — crash site below"
    stop_gvsoc; break
  fi
  if grep -qa "Invalid fetch" "$OUT" 2>/dev/null; then
    reason="FC Invalid fetch — GVSoC aborted, cluster trace lost (try a build that crashes cluster-side)"
    stop_gvsoc; break
  fi
  # documented PASS_CYCLES outcome: far enough with no c.unimp => ran clean
  if [[ "$cyc" =~ ^[0-9]+$ ]] && (( 10#$cyc >= 10#$pass_cycles )); then
    reason="RAN CLEAN — no $GARBAGE up to cyc ${cyc} (PASS_CYCLES=${pass_cycles}); bug not triggered"
    stop_gvsoc; break
  fi
  if [ "$stall" -ge "$STALL_NEEDED" ]; then
    reason="$CORE STALLED at cyc ${cyc} — this core stopped executing (idle during a master/optimizer phase, OR a hang). Trace another core / run the full training to tell which."
    stop_gvsoc; break
  fi
  if [ -n "$free_gb" ] && [ "$free_gb" -lt "$MIN_FREE_GB" ]; then
    reason="ABORT: low disk"
    stop_gvsoc; break
  fi
done
# loop fell through without a detected crash -> GVSoC exited by itself
[ "$reason" = "?" ] && reason="GVSoC exited on its own — run finished, NO hang/crash detected"
| 99 | + |
# Remove ANSI SGR colour sequences (ESC[<digits>m) from stdin.
strip_ansi() {
  sed -E 's/\x1b\[[0-9]*m//g'
}

#######################################
# Print the post-mortem for one traced core.
# The anchor is the first line matching the garbage marker, or, if there is
# none, the last trace line of the core.
# Arguments:
#   $1 - trace file
#   $2 - core name as it appears in the trace path (e.g. pe0)
#   $3 - garbage-instruction marker (grep pattern)
#   $4 - addr2line command
#   $5 - ELF the PCs are resolved against
# Outputs:
#   anchor summary, up to 40 trace lines ending at the anchor, the distinct
#   function labels seen in the last 6000 core trace lines, addr2line for the
#   PCs found in the (up to) 12 lines before the anchor, and the last
#   "Invalid fetch" line if any.
#######################################
print_crash_report() {
  local trace=$1 core=$2 marker=$3 a2l=$4 elf=$5
  local anchor ctx_start pc_start pc_end pc have_a2l=1

  anchor=$(grep -an -- "$marker" "$trace" 2>/dev/null | head -n 1 | cut -d: -f1)
  if [[ -z "$anchor" ]]; then
    anchor=$(grep -an -- "$core/insn" "$trace" 2>/dev/null | tail -n 1 | cut -d: -f1)
  fi
  echo "anchor line ${anchor:-none} last_cycle=$(grep -a -- "$core/insn" "$trace" 2>/dev/null | tail -1 | awk -F: '{print $2}') pe_trace_lines=$(grep -ac -- "$core/insn" "$trace" 2>/dev/null)"

  if [[ -n "$anchor" ]]; then
    ctx_start=$((anchor - 40))
    if (( ctx_start < 1 )); then ctx_start=1; fi
    echo "--- ~40 instructions before the crash anchor ---"
    sed -n "${ctx_start},${anchor}p" "$trace" \
      | strip_ansi \
      | sed -E 's#.*'"$core"'/insn *\] *##' \
      | cut -c1-95

    echo "--- distinct functions near the crash (call sequence) ---"
    grep -a -- "$core/insn" "$trace" \
      | tail -6000 \
      | strip_ansi \
      | grep -oE '\] [A-Za-z0-9_]+:[0-9]' \
      | awk '{print $2}' \
      | awk '!s[$0]++' \
      | tail -25

    echo "--- addr2line of the 12 PCs before the anchor ---"
    # Clamp the window to the start of the file: an anchor within the first
    # 12 lines would otherwise hand sed a zero/negative line address.
    pc_end=$((anchor - 1))
    pc_start=$((anchor - 12))
    if (( pc_start < 1 )); then pc_start=1; fi
    if (( pc_end >= 1 )); then
      if ! command -v "$a2l" >/dev/null 2>&1; then
        echo "WARNING: addr2line not found: $a2l (PCs listed unresolved)" >&2
        have_a2l=0
      fi
      while IFS= read -r pc; do
        printf '0x%s -> ' "$pc"
        if (( have_a2l )); then
          # stdin is detached so the tool cannot consume the PC list
          "$a2l" -f -e "$elf" "0x$pc" </dev/null | tr '\n' ' '
        else
          printf '? '
        fi
        echo
      done < <(sed -n "${pc_start},${pc_end}p" "$trace" \
        | strip_ansi \
        | grep -oE ' M [0-9a-f]+ ' \
        | awk '{print $2}')
    fi
  fi

  grep -a "Invalid fetch" "$trace" 2>/dev/null | tail -1 | strip_ansi
}

echo ""
echo "===================== RESULT: $reason ====================="
print_crash_report "$OUT" "$CORE" "$GARBAGE" "$A2L" "$ELF"
0 commit comments