File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1+ #! /bin/bash
2+ # Provides clean_build(): renames build/ aside and deletes it in the background.
3+ # mv is a metadata-only operation that succeeds even with stale NFS file handles,
4+ # unlike rm -rf which fails on ESTALE. The background delete is best-effort and
5+ # scoped to this job's PID to avoid races with concurrent matrix jobs.
6+ #
7+ # Usage: source .github/scripts/clean-build.sh
8+ # clean_build
9+
# clean_build: get ./build out of the way quickly, then delete it asynchronously.
#
# Strategy (per this file's header): rename build/ to build.stale.<PID> and
# remove the renamed directory in a detached background job, so the caller
# does not wait on (or fail because of) a slow or failing recursive delete.
#
# Globals:   none (operates on ./build and ./build.stale.* in the current dir)
# Arguments: none
# Outputs:   a warning on stderr if build/ could not be renamed or removed
# Returns:   always 0 (cleanup is best-effort, matching the original contract)
clean_build() {
    local stale_dir="build.stale.$$"

    # Sweep leftovers from previous runs before adding a new one. Best-effort:
    # errors are intentionally suppressed.
    rm -rf build.stale.* 2>/dev/null || true

    # Nothing to move aside.
    if [ ! -e build ] && [ ! -L build ]; then
        return 0
    fi

    if mv -- build "$stale_dir" 2>/dev/null; then
        # Detached, best-effort delete; the caller never waits on it.
        rm -rf -- "$stale_dir" 2>/dev/null &
        # Guarded so a disown failure cannot abort callers running under set -e.
        disown "$!" 2>/dev/null || true
        return 0
    fi

    # The rename failed, so build/ is still in place. Callers decide whether to
    # rebuild by testing for build/, so try hard not to leave stale output there.
    echo "clean_build: could not rename build/ aside; falling back to rm -rf" >&2
    rm -rf -- build 2>/dev/null || true
    if [ -e build ]; then
        echo "clean_build: WARNING: build/ could not be removed; stale artifacts may be reused" >&2
    fi
    return 0
}
Original file line number Diff line number Diff line change @@ -22,7 +22,8 @@ case "$cluster" in
2222 * ) echo " ERROR: Unknown cluster '$cluster '" ; exit 1 ;;
2323esac
2424
25- rm -rf build
25+ source .github/scripts/clean-build.sh
26+ clean_build
2627
2728. ./mfc.sh load -c " $flag " -m g
2829
Original file line number Diff line number Diff line change 1+ #! /bin/bash
2+ # Provides retry_sbatch(): submits a job script string via sbatch with retries.
3+ # Only retries on known transient SLURM/infrastructure errors (socket timeouts,
4+ # connection failures). Hard failures (bad account, invalid partition, QOS
5+ # violations) are not retried.
6+ #
7+ # Usage: source .github/scripts/retry-sbatch.sh
8+ # job_id=$(retry_sbatch "$script_contents")
9+
# retry_sbatch: submit a job script (passed as a string) via sbatch, retrying
# only when the failure output looks like a transient infrastructure error.
#
# Arguments:
#   $1 - full contents of the batch script; piped to sbatch on stdin
#   $2 - (optional) maximum number of submission attempts; default 3
#   $3 - (optional) delay between attempts, as accepted by sleep; default 30
# Outputs:
#   stdout - the numeric job id, on success only
#   stderr - progress and failure diagnostics
# Returns:
#   0 if a job id was found in the sbatch output
#   1 on a non-transient failure, or once all attempts are exhausted
retry_sbatch() {
    local script_contents="${1-}"
    local max_attempts="${2:-3}"
    local retry_delay="${3:-30}"
    local attempt
    local submit_output=""
    # The job id is taken from wherever it appears after the marker text, so
    # trailing text such as " on cluster X" does not break the match.
    local submitted_re='Submitted batch job ([0-9]+)'
    local transient_re='timed out|connection refused|connection reset|temporarily unavailable'

    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
        printf 'sbatch attempt %d of %d...\n' "$attempt" "$max_attempts" >&2

        # sbatch's exit status is deliberately ignored: success is decided
        # solely by whether a job id appears in its combined output.
        submit_output=$(printf '%s\n' "$script_contents" | sbatch 2>&1) || true

        if [[ "$submit_output" =~ $submitted_re ]]; then
            printf '%s\n' "${BASH_REMATCH[1]}"
            return 0
        fi

        printf 'sbatch failed: %s\n' "$submit_output" >&2

        # Anything not on the transient list is treated as a hard failure.
        if ! grep -qiE "$transient_re" <<<"$submit_output"; then
            printf '%s\n' "Non-transient sbatch failure — not retrying." >&2
            return 1
        fi

        # No point sleeping after the final attempt.
        if (( attempt < max_attempts )); then
            printf 'Transient error — retrying in %ss...\n' "$retry_delay" >&2
            sleep "$retry_delay"
        fi
    done

    printf 'sbatch failed after %s attempts. Last error: %s\n' \
        "$max_attempts" "$submit_output" >&2
    return 1
}
40+
Original file line number Diff line number Diff line change @@ -85,9 +85,9 @@ if [ "$device" = "cpu" ]; then
8585 case " $cluster " in
8686 phoenix)
8787 sbatch_device_opts=" \
88- #SBATCH -p cpu-small
89- #SBATCH --ntasks-per-node=24
90- #SBATCH --mem-per-cpu=2G "
88+ #SBATCH -p cpu-small,cpu-medium,cpu-large
89+ #SBATCH --ntasks-per-node=12
90+ #SBATCH --mem-per-cpu=8G "
9191 ;;
9292 frontier|frontier_amd)
9393 sbatch_device_opts=" \
@@ -161,8 +161,9 @@ rm -f "$output_file"
161161# --- Module load mode (short form) ---
162162module_mode=$( [ " $device " = " gpu" ] && echo " g" || echo " c" )
163163
164- # --- Submit ---
165- submit_output=$( sbatch << EOT
164+ # --- Submit (with retries for transient SLURM errors) ---
165+ source " ${SCRIPT_DIR} /retry-sbatch.sh"
166+ _sbatch_script=$( cat << EOT
166167#!/bin/bash
167168#SBATCH -J ${job_prefix} -${job_slug}
168169#SBATCH --account=${account}
@@ -192,12 +193,8 @@ $sbatch_script_contents
192193EOT
193194)
194195
195- job_id=$( echo " $submit_output " | grep -oE ' [0-9]+' )
196- if [ -z " $job_id " ]; then
197- echo " ERROR: Failed to submit job. sbatch output:"
198- echo " $submit_output "
199- exit 1
200- fi
196+ job_id=$( retry_sbatch " $_sbatch_script " )
197+ unset _sbatch_script
201198
202199echo " Submitted batch job $job_id "
203200echo " $job_id " > " $id_file "
Original file line number Diff line number Diff line change 2525# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
2626# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
2727if [ " $job_cluster " = " phoenix" ]; then
28- rm -rf build
28+ source .github/scripts/clean-build.sh
29+ clean_build
2930fi
3031
3132if [ ! -d " build" ]; then
Original file line number Diff line number Diff line change @@ -8,12 +8,26 @@ set -euo pipefail
88source .github/scripts/gpu-opts.sh
99build_opts=" $gpu_opts "
1010
11+ # --- Phoenix TMPDIR setup ---
12+ # Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
13+ # spawning MPI processes, it fills up and ORTE session dir creation fails.
14+ # Redirect TMPDIR to project storage, same as bench.sh.
if [ "$job_cluster" = "phoenix" ]; then
    tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
    mkdir -p "$tmpbuild"
    # mktemp -d creates a uniquely named directory atomically, so two
    # concurrent jobs can never be handed the same run directory (a
    # RANDOM-based name can collide, and one job's EXIT trap would then
    # delete the other job's live TMPDIR).
    currentdir=$(mktemp -d "$tmpbuild/run-XXXXXX")
    export TMPDIR="$currentdir"
    # Remove this job's private temp directory on exit; best-effort.
    trap 'rm -rf -- "$currentdir" || true' EXIT
fi
23+
1124# --- Build (if not pre-built on login node) ---
1225# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
1326# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
1427# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
1528if [ " $job_cluster " = " phoenix" ]; then
16- rm -rf build
29+ source .github/scripts/clean-build.sh
30+ clean_build
1731fi
1832
1933if [ ! -d " build" ]; then
Original file line number Diff line number Diff line change @@ -20,7 +20,8 @@ build_opts="$gpu_opts"
2020
2121. ./mfc.sh load -c $compiler_flag -m $( [ " $job_device " = " gpu" ] && echo " g" || echo " c" )
2222
23- rm -rf build
23+ source .github/scripts/clean-build.sh
24+ clean_build
2425
2526source .github/scripts/retry-build.sh
2627if [ " $run_bench " == " bench" ]; then
Original file line number Diff line number Diff line change @@ -70,7 +70,7 @@ o-gpu nvhpc cuda/12.3.0 cmake/3.26.3
7070o-gpu CC=nvc CXX=nvc++ FC=nvfortran
7171
7272dai NCSA DeltaAI
73- dai-all python cmake nvhpc-openmpi3/24.3 cuda
73+ dai-all python cmake nvidia/25.5
7474dai-all CC=nvc CXX=nvc++ FC=nvfortran
7575dai-gpu MFC_CUDA_CC=89,90
7676
You can’t perform that action at this time.
0 commit comments