11#! /bin/bash
22# Orchestrate all Frontier benchmark configs in one multi-node SLURM allocation.
33# 1. Builds all configs on the login node (PR and master, in parallel)
4- # 2. Submits a single 6-node SLURM job running benchmarks in parallel via ssh
4+ # 2. Submits a single SLURM job running benchmarks in parallel via ssh
55
66set -euo pipefail
77
@@ -10,6 +10,12 @@ trap '' HUP
1010
1111SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1212
13+ # SLURM parameters
14+ SLURM_ACCOUNT="ENG160"
15+ SLURM_PARTITION="extended"
16+ SLURM_WALLTIME="05:59:00"
17+ CONFIG_TIMEOUT=7200 # 120 min per config
18+
1319# Benchmark configs: version cluster device interface
1420# 6 total: 3 configs x 2 versions (PR + master)
1521configs=(
@@ -26,6 +32,10 @@ echo "=========================================="
2632echo " Frontier consolidated benchmarks: $num_nodes configs on $num_nodes nodes"
2733echo " =========================================="
2834
35+ # Write config file for sbatch to read (single source of truth)
36+ config_file="frontier-bench-configs.txt"
37+ printf '%s\n' "${configs[@]}" > "$config_file"
38+
2939# --- Phase 1: Create per-config source copies ---
3040for cfg in "${configs[@]}"; do
3141 read -r version cluster device interface <<< "$cfg"
@@ -127,63 +137,65 @@ echo "=========================================="
127137# --- Phase 3: Submit one sbatch job with N nodes ---
128138output_file="bench-frontier-all.out"
129139
130- submit_output=$(sbatch << 'OUTER'
140+ submit_output=$(sbatch << OUTER
131141#!/bin/bash
132142#SBATCH -J MFC-frontier-all-bench
133- #SBATCH -A ENG160
134- #SBATCH -N 6
135- #SBATCH -t 05:59:00
136- #SBATCH -obench-frontier-all.out
137- #SBATCH -p extended
143+ #SBATCH -A $SLURM_ACCOUNT
144+ #SBATCH -N $num_nodes
145+ #SBATCH -t $SLURM_WALLTIME
146+ #SBATCH -o $output_file
147+ #SBATCH -p $SLURM_PARTITION
138148
139149set -x
140150
141- cd "$SLURM_SUBMIT_DIR"
142- echo "Running in $(pwd)"
143- echo "Allocated nodes: $SLURM_NODELIST"
151+ cd "\$SLURM_SUBMIT_DIR"
152+ echo "Running in \$(pwd)"
153+ echo "Allocated nodes: \$SLURM_NODELIST"
144154
145155# Get list of individual node hostnames
146- mapfile -t nodes < <(scontrol show hostnames "$SLURM_NODELIST")
147- echo "Nodes: ${nodes[*]}"
156+ mapfile -t nodes < <(scontrol show hostnames "\$SLURM_NODELIST")
157+ echo "Nodes: \${nodes[*]}"
148158
149- # Config table (must match the outer script)
150- configs=(
151- "pr frontier gpu acc"
152- "pr frontier gpu omp"
153- "pr frontier_amd gpu omp"
154- "master frontier gpu acc"
155- "master frontier gpu omp"
156- "master frontier_amd gpu omp"
157- )
159+ # Read config table from file (written by outer script, avoids duplication)
160+ mapfile -t configs < "$config_file"
158161
159162pids=()
160163
161- for i in "${!configs[@]}"; do
162- read -r version cluster device interface <<< "${configs[$i]}"
163- node="${nodes[$i]}"
164- dir="${version}-${cluster}-${device}-${interface}"
165- outfile="${dir}/bench-${device}-${interface}.out"
166-
167- echo "[$node] Starting bench: $version $cluster $device $interface in $dir"
168-
169- ssh -q -o StrictHostKeyChecking=no "$node" \
170- "cd $SLURM_SUBMIT_DIR/$dir && bash .github/scripts/frontier_bench_config.sh $cluster $device $interface" \
171- > "$outfile" 2>&1 &
172- pids+=($!)
164+ cleanup() {
165+ echo "Cleaning up — killing all remote processes..."
166+ for pid in "\${pids[@]}"; do
167+ kill "\$pid" 2>/dev/null
168+ done
169+ wait
170+ }
171+ trap cleanup EXIT
172+
173+ for i in "\${!configs[@]}"; do
174+ read -r version cluster device interface <<< "\${configs[\$i]}"
175+ node="\${nodes[\$i]}"
176+ dir="\${version}-\${cluster}-\${device}-\${interface}"
177+ outfile="\${dir}/bench-\${device}-\${interface}.out"
178+
179+ echo "[\$node] Starting bench: \$version \$cluster \$device \$interface in \$dir"
180+
181+ timeout $CONFIG_TIMEOUT ssh -q -o StrictHostKeyChecking=no "\$node" \
182+ "cd \$SLURM_SUBMIT_DIR/\$dir && bash .github/scripts/frontier_bench_config.sh \$cluster \$device \$interface" \
183+ > "\$outfile" 2>&1 &
184+ pids+=(\$!)
173185done
174186
175187echo "All bench configs launched, waiting for completion..."
176188
177189# Wait for all and collect exit codes
178190overall_exit=0
179- for i in "${!pids[@]}"; do
180- read -r version cluster device interface <<< "${configs[$i]}"
181- pid=${pids[$i]}
182- if wait "$pid"; then
183- echo "PASSED: $version $cluster $device $interface (PID $pid)"
191+ for i in "\${!pids[@]}"; do
192+ read -r version cluster device interface <<< "\${configs[\$i]}"
193+ pid=\${pids[\$i]}
194+ if wait "\$pid"; then
195+ echo "PASSED: \$version \$cluster \$device \$interface (PID \$pid)"
184196 else
185- code=$?
186- echo "FAILED: $version $cluster $device $interface (PID $pid, exit code $code)"
197+ code=\$?
198+ echo "FAILED: \$version \$cluster \$device \$interface (PID \$pid, exit code \$code)"
187199 overall_exit=1
188200 fi
189201done
@@ -192,19 +204,19 @@ done
192204echo ""
193205echo "=========================================="
194206echo "Benchmark summary:"
195- for cfg in "${configs[@]}"; do
196- read -r version cluster device interface <<< "$cfg"
197- dir="${version}-${cluster}-${device}-${interface}"
198- yaml="${dir}/bench-${device}-${interface}.yaml"
199- if [ -f "$yaml" ]; then
200- echo " $version $cluster $device $interface: OK ($(stat -c%s "$yaml" 2>/dev/null) bytes)"
207+ for cfg in "\${configs[@]}"; do
208+ read -r version cluster device interface <<< "\$cfg"
209+ dir="\${version}-\${cluster}-\${device}-\${interface}"
210+ yaml="\${dir}/bench-\${device}-\${interface}.yaml"
211+ if [ -f "\$yaml" ]; then
212+ echo " \$version \$cluster \$device \$interface: OK (\$(stat -c%s "\$yaml" 2>/dev/null) bytes)"
201213 else
202- echo " $version $cluster $device $interface: MISSING YAML"
214+ echo " \$version \$cluster \$device \$interface: MISSING YAML"
203215 fi
204216done
205217echo "=========================================="
206218
207- exit $overall_exit
219+ exit \$overall_exit
208220OUTER
209221)
210222
0 commit comments