Skip to content

Commit 956ea0c

Browse files
authored
Merge branch 'master' into bala_force_changes
2 parents 8e916c6 + 92e751f commit 956ea0c

8 files changed

Lines changed: 85 additions & 16 deletions

File tree

.github/scripts/clean-build.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
2+
# Provides clean_build(): renames build/ aside and deletes it in the background.
3+
# mv is a metadata-only operation that succeeds even with stale NFS file handles,
4+
# unlike rm -rf which fails on ESTALE. The background delete is best-effort and
5+
# scoped to this job's PID to avoid races with concurrent matrix jobs.
6+
#
7+
# Usage: source .github/scripts/clean-build.sh
8+
# clean_build
9+
10+
# clean_build: move ./build out of the way and delete it asynchronously.
#
# Works relative to the current directory. Takes no arguments. Every step is
# best-effort: failures are silenced and never abort the caller.
clean_build() {
    # Name of the set-aside directory, keyed on this shell's PID.
    local stale_dir="build.stale.$$"

    # Sweep away set-aside directories left behind by earlier runs.
    { rm -rf build.stale.*; } 2>/dev/null || true

    # Rename first; the actual deletion then happens in the background so the
    # caller does not wait for it.
    { mv build "$stale_dir"; } 2>/dev/null || true
    rm -rf "$stale_dir" 2>/dev/null &
    disown
}

.github/scripts/prebuild-case-optimization.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ case "$cluster" in
2222
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
2323
esac
2424

25-
rm -rf build
25+
source .github/scripts/clean-build.sh
26+
clean_build
2627

2728
. ./mfc.sh load -c "$flag" -m g
2829

.github/scripts/retry-sbatch.sh

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/bin/bash
2+
# Provides retry_sbatch(): submits a job script string via sbatch with retries.
3+
# Only retries on known transient SLURM/infrastructure errors (socket timeouts,
4+
# connection failures). Hard failures (bad account, invalid partition, QOS
5+
# violations) are not retried.
6+
#
7+
# Usage: source .github/scripts/retry-sbatch.sh
8+
# job_id=$(retry_sbatch "$script_contents")
9+
10+
# retry_sbatch SCRIPT_CONTENTS
#
# Pipes SCRIPT_CONTENTS to `sbatch` on stdin, making up to 3 attempts.
#
# Arguments: $1 - full text of the batch script to submit
# Outputs:   stdout - the numeric job id, on success only
#            stderr - progress and error messages
# Returns:   0 if sbatch reported "Submitted batch job <id>";
#            1 on a non-transient failure, or once all attempts are used up.
#
# Safe to call with `set -e` active: every command that may legitimately fail
# is either guarded with `|| true` or evaluated inside an `if` condition, so a
# failed submission reaches the retry logic instead of aborting the shell.
retry_sbatch() {
    local script_contents="$1"
    local max_attempts=3
    local attempt=1
    local submit_output last_output=""
    # Confirmation line printed by sbatch on success; group 1 is the job id.
    local submitted_re='Submitted batch job ([0-9]+)'
    # Error texts treated as transient (matched case-insensitively).
    local transient_re='timed out|connection refused|connection reset|temporarily unavailable'

    while [ "$attempt" -le "$max_attempts" ]; do
        echo "sbatch attempt $attempt of $max_attempts..." >&2
        # sbatch's exit status is deliberately ignored; success is decided by
        # whether its output contains the confirmation line.
        submit_output=$(printf '%s\n' "$script_contents" | sbatch 2>&1) || true

        # A regex match inside `if` cannot trip errexit, unlike a bare
        # `var=$(... | grep ...)` assignment whose grep finds nothing.
        if [[ "$submit_output" =~ $submitted_re ]]; then
            printf '%s\n' "${BASH_REMATCH[1]}"
            return 0
        fi

        last_output="$submit_output"
        printf 'sbatch failed: %s\n' "$submit_output" >&2

        if ! grep -qiE -- "$transient_re" <<<"$submit_output"; then
            echo "Non-transient sbatch failure — not retrying." >&2
            return 1
        fi

        # Only wait when another attempt will actually follow.
        if [ "$attempt" -lt "$max_attempts" ]; then
            echo "Transient error — retrying in 30s..." >&2
            sleep 30
        fi
        attempt=$((attempt + 1))
    done

    printf 'sbatch failed after %s attempts. Last error: %s\n' "$max_attempts" "$last_output" >&2
    return 1
}
40+

.github/scripts/submit-slurm-job.sh

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,9 @@ if [ "$device" = "cpu" ]; then
8585
case "$cluster" in
8686
phoenix)
8787
sbatch_device_opts="\
88-
#SBATCH -p cpu-small
89-
#SBATCH --ntasks-per-node=24
90-
#SBATCH --mem-per-cpu=2G"
88+
#SBATCH -p cpu-small,cpu-medium,cpu-large
89+
#SBATCH --ntasks-per-node=12
90+
#SBATCH --mem-per-cpu=8G"
9191
;;
9292
frontier|frontier_amd)
9393
sbatch_device_opts="\
@@ -161,8 +161,9 @@ rm -f "$output_file"
161161
# --- Module load mode (short form) ---
162162
module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c")
163163

164-
# --- Submit ---
165-
submit_output=$(sbatch <<EOT
164+
# --- Submit (with retries for transient SLURM errors) ---
165+
source "${SCRIPT_DIR}/retry-sbatch.sh"
166+
_sbatch_script=$(cat <<EOT
166167
#!/bin/bash
167168
#SBATCH -J ${job_prefix}-${job_slug}
168169
#SBATCH --account=${account}
@@ -192,12 +193,8 @@ $sbatch_script_contents
192193
EOT
193194
)
194195

195-
job_id=$(echo "$submit_output" | grep -oE '[0-9]+')
196-
if [ -z "$job_id" ]; then
197-
echo "ERROR: Failed to submit job. sbatch output:"
198-
echo "$submit_output"
199-
exit 1
200-
fi
196+
job_id=$(retry_sbatch "$_sbatch_script")
197+
unset _sbatch_script
201198

202199
echo "Submitted batch job $job_id"
203200
echo "$job_id" > "$id_file"

.github/workflows/common/bench.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ fi
2525
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
2626
# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
2727
if [ "$job_cluster" = "phoenix" ]; then
28-
rm -rf build
28+
source .github/scripts/clean-build.sh
29+
clean_build
2930
fi
3031

3132
if [ ! -d "build" ]; then

.github/workflows/common/test.sh

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,26 @@ set -euo pipefail
88
source .github/scripts/gpu-opts.sh
99
build_opts="$gpu_opts"
1010

11+
# --- Phoenix TMPDIR setup ---
12+
# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
13+
# spawning MPI processes, it fills up and ORTE session dir creation fails.
14+
# Redirect TMPDIR to project storage, same as bench.sh.
15+
if [ "$job_cluster" = "phoenix" ]; then
    # Parent directory on project storage that holds the per-run temp dirs.
    tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
    mkdir -p "$tmpbuild"
    # mktemp -d creates a new, uniquely named directory atomically. A name
    # derived from $RANDOM can collide between concurrent jobs, and the first
    # job to exit would then delete the other's live TMPDIR via the trap below.
    currentdir=$(mktemp -d "$tmpbuild/run-XXXXXX")
    export TMPDIR="$currentdir"
    # Best-effort removal of this run's temp directory when the script exits.
    trap 'rm -rf "$currentdir" || true' EXIT
fi
23+
1124
# --- Build (if not pre-built on login node) ---
1225
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
1326
# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
1427
# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
1528
if [ "$job_cluster" = "phoenix" ]; then
16-
rm -rf build
29+
source .github/scripts/clean-build.sh
30+
clean_build
1731
fi
1832

1933
if [ ! -d "build" ]; then

.github/workflows/frontier/build.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ build_opts="$gpu_opts"
2020

2121
. ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
2222

23-
rm -rf build
23+
source .github/scripts/clean-build.sh
24+
clean_build
2425

2526
source .github/scripts/retry-build.sh
2627
if [ "$run_bench" == "bench" ]; then

toolchain/modules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ o-gpu nvhpc cuda/12.3.0 cmake/3.26.3
7070
o-gpu CC=nvc CXX=nvc++ FC=nvfortran
7171

7272
dai NCSA DeltaAI
73-
dai-all python cmake nvhpc-openmpi3/24.3 cuda
73+
dai-all python cmake nvidia/25.5
7474
dai-all CC=nvc CXX=nvc++ FC=nvfortran
7575
dai-gpu MFC_CUDA_CC=89,90
7676

0 commit comments

Comments
 (0)