Skip to content

Commit 553ff35

Browse files
sbryngelson and claude committed
Harden Frontier CI orchestration scripts
- Eliminate config table duplication by writing configs to a file and reading it inside the sbatch heredoc (now unquoted for variable expansion)
- Add EXIT trap in sbatch scripts to kill orphaned SSH processes on job cancellation
- Add per-config timeout (90 min tests, 120 min benchmarks) to prevent a single hanging config from consuming the full walltime
- Extract SLURM account, partition, walltime, and node count into variables at the top of each orchestration script
- Validate GPU count from rocm-smi (1-16 range) with diagnostic output on failure in both test and benchmark compute-node scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6e173e4 commit 553ff35

4 files changed

Lines changed: 133 additions & 94 deletions

File tree

.github/scripts/frontier_bench_config.sh

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,13 @@ device_opts=""
2020
if [ "$device" = "gpu" ]; then
2121
gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
2222
n_ranks=$(echo "$gpus" | wc -w)
23+
if [ "$n_ranks" -lt 1 ] || [ "$n_ranks" -gt 16 ]; then
24+
echo "ERROR: Unexpected GPU count ($n_ranks). Expected 1-16 for Frontier MI250X."
25+
echo "rocm-smi output:"
26+
rocm-smi --showid
27+
exit 1
28+
fi
29+
echo "Detected $n_ranks GPUs: $gpus"
2330
gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//')
2431
device_opts="--gpu"
2532
[ "$interface" = "acc" ] && device_opts+=" acc"

.github/scripts/frontier_test_config.sh

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,13 @@ rdma=""
2828
if [ "$device" = "gpu" ]; then
2929
gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
3030
ngpus=$(echo "$gpus" | wc -w)
31+
if [ "$ngpus" -lt 1 ] || [ "$ngpus" -gt 16 ]; then
32+
echo "ERROR: Unexpected GPU count ($ngpus). Expected 1-16 for Frontier MI250X."
33+
echo "rocm-smi output:"
34+
rocm-smi --showid
35+
exit 1
36+
fi
37+
echo "Detected $ngpus GPUs: $gpus"
3138
./mfc.sh test -v -a $rdma --max-attempts 3 -j $ngpus $device_opts -- -c "$cluster"
3239
else
3340
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c "$cluster"

.github/scripts/run_frontier_all_benchmarks.sh

Lines changed: 60 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22
# Orchestrate all Frontier benchmark configs in one multi-node SLURM allocation.
33
# 1. Builds all configs on the login node (PR and master, in parallel)
4-
# 2. Submits a single 6-node SLURM job running benchmarks in parallel via ssh
4+
# 2. Submits a single SLURM job running benchmarks in parallel via ssh
55

66
set -euo pipefail
77

@@ -10,6 +10,12 @@ trap '' HUP
1010

1111
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1212

13+
# SLURM parameters
14+
SLURM_ACCOUNT="ENG160"
15+
SLURM_PARTITION="extended"
16+
SLURM_WALLTIME="05:59:00"
17+
CONFIG_TIMEOUT=7200 # 120 min per config
18+
1319
# Benchmark configs: version cluster device interface
1420
# 6 total: 3 configs x 2 versions (PR + master)
1521
configs=(
@@ -26,6 +32,10 @@ echo "=========================================="
2632
echo "Frontier consolidated benchmarks: $num_nodes configs on $num_nodes nodes"
2733
echo "=========================================="
2834

35+
# Write config file for sbatch to read (single source of truth)
36+
config_file="frontier-bench-configs.txt"
37+
printf '%s\n' "${configs[@]}" > "$config_file"
38+
2939
# --- Phase 1: Create per-config source copies ---
3040
for cfg in "${configs[@]}"; do
3141
read -r version cluster device interface <<< "$cfg"
@@ -127,63 +137,65 @@ echo "=========================================="
127137
# --- Phase 3: Submit one sbatch job with N nodes ---
128138
output_file="bench-frontier-all.out"
129139

130-
submit_output=$(sbatch <<'OUTER'
140+
submit_output=$(sbatch <<OUTER
131141
#!/bin/bash
132142
#SBATCH -J MFC-frontier-all-bench
133-
#SBATCH -A ENG160
134-
#SBATCH -N 6
135-
#SBATCH -t 05:59:00
136-
#SBATCH -obench-frontier-all.out
137-
#SBATCH -p extended
143+
#SBATCH -A $SLURM_ACCOUNT
144+
#SBATCH -N $num_nodes
145+
#SBATCH -t $SLURM_WALLTIME
146+
#SBATCH -o$output_file
147+
#SBATCH -p $SLURM_PARTITION
138148
139149
set -x
140150
141-
cd "$SLURM_SUBMIT_DIR"
142-
echo "Running in $(pwd)"
143-
echo "Allocated nodes: $SLURM_NODELIST"
151+
cd "\$SLURM_SUBMIT_DIR"
152+
echo "Running in \$(pwd)"
153+
echo "Allocated nodes: \$SLURM_NODELIST"
144154
145155
# Get list of individual node hostnames
146-
mapfile -t nodes < <(scontrol show hostnames "$SLURM_NODELIST")
147-
echo "Nodes: ${nodes[*]}"
156+
mapfile -t nodes < <(scontrol show hostnames "\$SLURM_NODELIST")
157+
echo "Nodes: \${nodes[*]}"
148158
149-
# Config table (must match the outer script)
150-
configs=(
151-
"pr frontier gpu acc"
152-
"pr frontier gpu omp"
153-
"pr frontier_amd gpu omp"
154-
"master frontier gpu acc"
155-
"master frontier gpu omp"
156-
"master frontier_amd gpu omp"
157-
)
159+
# Read config table from file (written by outer script, avoids duplication)
160+
mapfile -t configs < "$config_file"
158161
159162
pids=()
160163
161-
for i in "${!configs[@]}"; do
162-
read -r version cluster device interface <<< "${configs[$i]}"
163-
node="${nodes[$i]}"
164-
dir="${version}-${cluster}-${device}-${interface}"
165-
outfile="${dir}/bench-${device}-${interface}.out"
166-
167-
echo "[$node] Starting bench: $version $cluster $device $interface in $dir"
168-
169-
ssh -q -o StrictHostKeyChecking=no "$node" \
170-
"cd $SLURM_SUBMIT_DIR/$dir && bash .github/scripts/frontier_bench_config.sh $cluster $device $interface" \
171-
> "$outfile" 2>&1 &
172-
pids+=($!)
164+
cleanup() {
165+
echo "Cleaning up — killing all remote processes..."
166+
for pid in "\${pids[@]}"; do
167+
kill "\$pid" 2>/dev/null
168+
done
169+
wait
170+
}
171+
trap cleanup EXIT
172+
173+
for i in "\${!configs[@]}"; do
174+
read -r version cluster device interface <<< "\${configs[\$i]}"
175+
node="\${nodes[\$i]}"
176+
dir="\${version}-\${cluster}-\${device}-\${interface}"
177+
outfile="\${dir}/bench-\${device}-\${interface}.out"
178+
179+
echo "[\$node] Starting bench: \$version \$cluster \$device \$interface in \$dir"
180+
181+
timeout $CONFIG_TIMEOUT ssh -q -o StrictHostKeyChecking=no "\$node" \
182+
"cd \$SLURM_SUBMIT_DIR/\$dir && bash .github/scripts/frontier_bench_config.sh \$cluster \$device \$interface" \
183+
> "\$outfile" 2>&1 &
184+
pids+=(\$!)
173185
done
174186
175187
echo "All bench configs launched, waiting for completion..."
176188
177189
# Wait for all and collect exit codes
178190
overall_exit=0
179-
for i in "${!pids[@]}"; do
180-
read -r version cluster device interface <<< "${configs[$i]}"
181-
pid=${pids[$i]}
182-
if wait "$pid"; then
183-
echo "PASSED: $version $cluster $device $interface (PID $pid)"
191+
for i in "\${!pids[@]}"; do
192+
read -r version cluster device interface <<< "\${configs[\$i]}"
193+
pid=\${pids[\$i]}
194+
if wait "\$pid"; then
195+
echo "PASSED: \$version \$cluster \$device \$interface (PID \$pid)"
184196
else
185-
code=$?
186-
echo "FAILED: $version $cluster $device $interface (PID $pid, exit code $code)"
197+
code=\$?
198+
echo "FAILED: \$version \$cluster \$device \$interface (PID \$pid, exit code \$code)"
187199
overall_exit=1
188200
fi
189201
done
@@ -192,19 +204,19 @@ done
192204
echo ""
193205
echo "=========================================="
194206
echo "Benchmark summary:"
195-
for cfg in "${configs[@]}"; do
196-
read -r version cluster device interface <<< "$cfg"
197-
dir="${version}-${cluster}-${device}-${interface}"
198-
yaml="${dir}/bench-${device}-${interface}.yaml"
199-
if [ -f "$yaml" ]; then
200-
echo " $version $cluster $device $interface: OK ($(stat -c%s "$yaml" 2>/dev/null) bytes)"
207+
for cfg in "\${configs[@]}"; do
208+
read -r version cluster device interface <<< "\$cfg"
209+
dir="\${version}-\${cluster}-\${device}-\${interface}"
210+
yaml="\${dir}/bench-\${device}-\${interface}.yaml"
211+
if [ -f "\$yaml" ]; then
212+
echo " \$version \$cluster \$device \$interface: OK (\$(stat -c%s "\$yaml" 2>/dev/null) bytes)"
201213
else
202-
echo " $version $cluster $device $interface: MISSING YAML"
214+
echo " \$version \$cluster \$device \$interface: MISSING YAML"
203215
fi
204216
done
205217
echo "=========================================="
206218
207-
exit $overall_exit
219+
exit \$overall_exit
208220
OUTER
209221
)
210222

.github/scripts/run_frontier_all_tests.sh

Lines changed: 59 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22
# Orchestrate all Frontier test configs in one multi-node SLURM allocation.
33
# 1. Builds all configs on the login node (in parallel, different modules each)
4-
# 2. Submits a single 5-node SLURM job running tests in parallel via ssh
4+
# 2. Submits a single SLURM job running tests in parallel via ssh
55

66
set -euo pipefail
77

@@ -10,6 +10,12 @@ trap '' HUP
1010

1111
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1212

13+
# SLURM parameters
14+
SLURM_ACCOUNT="ENG160"
15+
SLURM_PARTITION="extended"
16+
SLURM_WALLTIME="05:59:00"
17+
CONFIG_TIMEOUT=5400 # 90 min per config
18+
1319
# Config table: cluster device interface
1420
configs=(
1521
"frontier gpu acc"
@@ -24,6 +30,10 @@ echo "=========================================="
2430
echo "Frontier consolidated tests: $num_nodes configs"
2531
echo "=========================================="
2632

33+
# Write config file for sbatch to read (single source of truth)
34+
config_file="frontier-test-configs.txt"
35+
printf '%s\n' "${configs[@]}" > "$config_file"
36+
2737
# --- Phase 1: Create per-config source copies ---
2838
# Build exclude list to prevent copying into self
2939
excludes=""
@@ -132,62 +142,65 @@ echo "=========================================="
132142
# --- Phase 3: Submit one sbatch job with N nodes ---
133143
output_file="test-frontier-all.out"
134144

135-
submit_output=$(sbatch <<'OUTER'
145+
submit_output=$(sbatch <<OUTER
136146
#!/bin/bash
137147
#SBATCH -J MFC-frontier-all-tests
138-
#SBATCH -A ENG160
139-
#SBATCH -N 5
140-
#SBATCH -t 05:59:00
141-
#SBATCH -otest-frontier-all.out
142-
#SBATCH -p extended
148+
#SBATCH -A $SLURM_ACCOUNT
149+
#SBATCH -N $num_nodes
150+
#SBATCH -t $SLURM_WALLTIME
151+
#SBATCH -o$output_file
152+
#SBATCH -p $SLURM_PARTITION
143153
144154
set -x
145155
146-
cd "$SLURM_SUBMIT_DIR"
147-
echo "Running in $(pwd)"
148-
echo "Allocated nodes: $SLURM_NODELIST"
156+
cd "\$SLURM_SUBMIT_DIR"
157+
echo "Running in \$(pwd)"
158+
echo "Allocated nodes: \$SLURM_NODELIST"
149159
150160
# Get list of individual node hostnames
151-
mapfile -t nodes < <(scontrol show hostnames "$SLURM_NODELIST")
152-
echo "Nodes: ${nodes[*]}"
161+
mapfile -t nodes < <(scontrol show hostnames "\$SLURM_NODELIST")
162+
echo "Nodes: \${nodes[*]}"
153163
154-
# Config table (must match the outer script)
155-
configs=(
156-
"frontier gpu acc"
157-
"frontier gpu omp"
158-
"frontier cpu none"
159-
"frontier_amd gpu omp"
160-
"frontier_amd cpu none"
161-
)
164+
# Read config table from file (written by outer script, avoids duplication)
165+
mapfile -t configs < "$config_file"
162166
163167
pids=()
164168
165-
for i in "${!configs[@]}"; do
166-
read -r cluster device interface <<< "${configs[$i]}"
167-
node="${nodes[$i]}"
168-
dir="test-${cluster}-${device}-${interface}"
169-
outfile="test-${cluster}-${device}-${interface}.out"
170-
171-
echo "[$node] Starting test: $cluster $device $interface in $dir"
172-
173-
ssh -q -o StrictHostKeyChecking=no "$node" \
174-
"cd $SLURM_SUBMIT_DIR/$dir && bash .github/scripts/frontier_test_config.sh $cluster $device $interface" \
175-
> "$outfile" 2>&1 &
176-
pids+=($!)
169+
cleanup() {
170+
echo "Cleaning up — killing all remote processes..."
171+
for pid in "\${pids[@]}"; do
172+
kill "\$pid" 2>/dev/null
173+
done
174+
wait
175+
}
176+
trap cleanup EXIT
177+
178+
for i in "\${!configs[@]}"; do
179+
read -r cluster device interface <<< "\${configs[\$i]}"
180+
node="\${nodes[\$i]}"
181+
dir="test-\${cluster}-\${device}-\${interface}"
182+
outfile="test-\${cluster}-\${device}-\${interface}.out"
183+
184+
echo "[\$node] Starting test: \$cluster \$device \$interface in \$dir"
185+
186+
timeout $CONFIG_TIMEOUT ssh -q -o StrictHostKeyChecking=no "\$node" \
187+
"cd \$SLURM_SUBMIT_DIR/\$dir && bash .github/scripts/frontier_test_config.sh \$cluster \$device \$interface" \
188+
> "\$outfile" 2>&1 &
189+
pids+=(\$!)
177190
done
178191
179192
echo "All test configs launched, waiting for completion..."
180193
181194
# Wait for all and collect exit codes
182195
overall_exit=0
183-
for i in "${!pids[@]}"; do
184-
read -r cluster device interface <<< "${configs[$i]}"
185-
pid=${pids[$i]}
186-
if wait "$pid"; then
187-
echo "PASSED: $cluster $device $interface (PID $pid)"
196+
for i in "\${!pids[@]}"; do
197+
read -r cluster device interface <<< "\${configs[\$i]}"
198+
pid=\${pids[\$i]}
199+
if wait "\$pid"; then
200+
echo "PASSED: \$cluster \$device \$interface (PID \$pid)"
188201
else
189-
code=$?
190-
echo "FAILED: $cluster $device $interface (PID $pid, exit code $code)"
202+
code=\$?
203+
echo "FAILED: \$cluster \$device \$interface (PID \$pid, exit code \$code)"
191204
overall_exit=1
192205
fi
193206
done
@@ -196,18 +209,18 @@ done
196209
echo ""
197210
echo "=========================================="
198211
echo "Test summary:"
199-
for cfg in "${configs[@]}"; do
200-
read -r cluster device interface <<< "$cfg"
201-
outfile="test-${cluster}-${device}-${interface}.out"
202-
if [ -f "$outfile" ]; then
203-
echo " $cluster $device $interface: $(tail -n 1 "$outfile")"
212+
for cfg in "\${configs[@]}"; do
213+
read -r cluster device interface <<< "\$cfg"
214+
outfile="test-\${cluster}-\${device}-\${interface}.out"
215+
if [ -f "\$outfile" ]; then
216+
echo " \$cluster \$device \$interface: \$(tail -n 1 "\$outfile")"
204217
else
205-
echo " $cluster $device $interface: NO OUTPUT FILE"
218+
echo " \$cluster \$device \$interface: NO OUTPUT FILE"
206219
fi
207220
done
208221
echo "=========================================="
209222
210-
exit $overall_exit
223+
exit \$overall_exit
211224
OUTER
212225
)
213226

0 commit comments

Comments
 (0)