Skip to content

Commit bbae88f

Browse files
authored
Merge branch 'master' into fix/viscous-3d-gpu-private
2 parents ebbdeb3 + 0ac2be7 commit bbae88f

99 files changed

Lines changed: 6546 additions & 4598 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/scripts/clean-build.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
2+
# Provides clean_build(): renames build/ aside and deletes it in the background.
3+
# mv is a metadata-only operation that succeeds even with stale NFS file handles,
4+
# unlike rm -rf which fails on ESTALE. The background delete is best-effort and
5+
# scoped to this job's PID to avoid races with concurrent matrix jobs.
6+
#
7+
# Usage: source .github/scripts/clean-build.sh
8+
# clean_build
9+
10+
# clean_build: get ./build out of the way so the next build starts fresh.
#
# Arguments: none (operates on the current working directory).
# Outputs:   a warning on stderr if build/ could not be removed.
# Returns:   always 0 -- cleanup is best-effort and must not abort callers
#            that run under `set -e`.
#
# Strategy: rename build/ to build.stale.<PID> and delete the renamed tree in
# a detached background job. If the rename itself fails, build/ would be left
# in place and callers that test `[ ! -d build ]` would skip rebuilding, so a
# direct delete is attempted as a fallback and any leftover is reported.
clean_build() {
  local stale_dir="build.stale.$$"

  # Clean up leftover stale directories from previous runs before adding a new one.
  rm -rf build.stale.* 2>/dev/null || true

  # No build tree: nothing to move or delete.
  if [ ! -e build ] && [ ! -L build ]; then
    return 0
  fi

  if ! mv build "$stale_dir" 2>/dev/null; then
    # The rename failed, so build/ is still present. Try removing it directly
    # (best-effort) rather than silently keeping a stale tree.
    rm -rf build 2>/dev/null || true
    if [ -e build ]; then
      echo "WARNING: clean_build could not move or remove build/; stale build artifacts may be reused." >&2
    fi
    return 0
  fi

  # Delete the renamed tree in the background and detach it from this shell so
  # the caller does not wait for it. A failure to disown is not fatal.
  rm -rf "$stale_dir" 2>/dev/null &
  disown 2>/dev/null || true
}

.github/scripts/monitor_slurm_job.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,15 @@ get_job_state() {
5252
# Fallback to sacct (works for completed/historical jobs)
5353
if command -v sacct >/dev/null 2>&1; then
5454
state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
55+
# When a job is preempted+requeued, sacct -X reports PREEMPTED for the
56+
# original attempt while the requeued run may have completed. Check all
57+
# records (without -X) for a terminal state that supersedes PREEMPTED.
58+
if [ "$state" = "PREEMPTED" ]; then
59+
requeue_state=$(sacct -j "$jid" -n -P -o State 2>/dev/null | grep -v PREEMPTED | head -n1 | cut -d'|' -f1 || true)
60+
if [ -n "$requeue_state" ]; then
61+
state="$requeue_state"
62+
fi
63+
fi
5564
if [ -n "$state" ]; then
5665
echo "$state"
5766
return

.github/scripts/prebuild-case-optimization.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ case "$cluster" in
2222
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
2323
esac
2424

25-
rm -rf build
25+
source .github/scripts/clean-build.sh
26+
clean_build
2627

2728
. ./mfc.sh load -c "$flag" -m g
2829

.github/scripts/retry-sbatch.sh

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/bin/bash
2+
# Provides retry_sbatch(): submits a job script string via sbatch with retries.
3+
# Only retries on known transient SLURM/infrastructure errors (socket timeouts,
4+
# connection failures). Hard failures (bad account, invalid partition, QOS
5+
# violations) are not retried.
6+
#
7+
# Usage: source .github/scripts/retry-sbatch.sh
8+
# job_id=$(retry_sbatch "$script_contents")
9+
10+
# retry_sbatch SCRIPT_CONTENTS
#
# Pipes SCRIPT_CONTENTS to `sbatch` on stdin, retrying up to 3 times.
#
# Arguments: $1 - full text of the batch script to submit.
# Outputs:   the numeric job id on stdout on success; progress and error
#            messages on stderr.
# Returns:   0 once sbatch reports "Submitted batch job <id>";
#            1 on a non-transient failure (no retry) or after all attempts
#            failed with transient errors.
#
# Every command that is expected to fail on an unsuccessful attempt sits in a
# conditional or is guarded, so the retry logic also works when the function
# is called directly from a `set -e` shell (not only inside `$(...)`).
retry_sbatch() {
  local script_contents="$1"
  local max_attempts=3
  local attempt=1
  local submit_output job_id last_output=""
  local submitted_re='Submitted batch job ([0-9]+)'
  local transient_re='timed out|connection refused|connection reset|temporarily unavailable'

  while [ "$attempt" -le "$max_attempts" ]; do
    echo "sbatch attempt $attempt of $max_attempts..." >&2

    # sbatch's exit status is deliberately ignored; success is decided by
    # whether its output contains a job id.
    submit_output=$(printf '%s\n' "$script_contents" | sbatch 2>&1) || true

    if [[ "$submit_output" =~ $submitted_re ]]; then
      job_id="${BASH_REMATCH[1]}"
      echo "$job_id"
      return 0
    fi

    last_output="$submit_output"
    echo "sbatch failed: $submit_output" >&2

    # Here-string instead of `echo | grep -q`: no pipeline, so `pipefail`
    # cannot turn an early grep exit into a false "non-transient" verdict.
    if ! grep -qiE "$transient_re" <<<"$submit_output"; then
      echo "Non-transient sbatch failure — not retrying." >&2
      return 1
    fi

    if [ "$attempt" -lt "$max_attempts" ]; then
      echo "Transient error — retrying in 30s..." >&2
      sleep 30
    fi
    attempt=$((attempt + 1))
  done

  echo "sbatch failed after $max_attempts attempts. Last error: $last_output" >&2
  return 1
}
40+

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 74 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
#!/bin/bash
2-
# Run PR and master benchmarks in parallel and verify outputs
2+
# Run PR and master benchmarks and verify outputs.
3+
# Both SLURM jobs are submitted up front so they run concurrently on
4+
# compute nodes (fair comparison under the same cluster load), but
5+
# monitoring happens sequentially to stay within the per-user cgroup
6+
# memory limit on login nodes (4 GB on Phoenix shared by 7 runners).
37
# Usage: run_parallel_benchmarks.sh <device> <interface> <cluster>
48

59
set -euo pipefail
@@ -17,88 +21,104 @@ cluster="$3"
1721
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1822

1923
echo "=========================================="
20-
echo "Starting parallel benchmark jobs..."
24+
echo "Starting benchmark jobs..."
2125
echo "=========================================="
2226

23-
# For Phoenix GPU benchmarks, select a consistent GPU partition before launching
24-
# both parallel jobs so PR and master always land on the same GPU type.
27+
# For Phoenix GPU benchmarks, select a consistent GPU partition so PR and
28+
# master always land on the same GPU type.
2529
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
2630
echo "Selecting Phoenix GPU partition for benchmark consistency..."
27-
# Require 2 nodes so both PR and master jobs can run concurrently.
31+
# Require 2 nodes so both jobs can run concurrently on compute.
2832
GPU_PARTITION_MIN_NODES=2 source "${SCRIPT_DIR}/select-gpu-partition.sh"
2933
BENCH_GPU_PARTITION="$SELECTED_GPU_PARTITION"
3034
export BENCH_GPU_PARTITION
3135
fi
3236

33-
# Run both jobs with monitoring using dedicated script from PR
34-
# Use stdbuf for line-buffered output and prefix each line for clarity
35-
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
36-
pr_pid=$!
37-
echo "PR job started in background (PID: $pr_pid)"
38-
39-
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" master "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[MASTER] $line"; done) &
40-
master_pid=$!
41-
echo "Master job started in background (PID: $master_pid)"
42-
43-
echo "Waiting for both jobs to complete..."
44-
45-
# Wait and capture exit codes reliably.
46-
# Use `wait ... || exit=$?` to avoid set -e aborting on the first failure
47-
# (which would orphan the second job).
37+
# The bench script must come from the PR tree (master may not have it).
38+
PR_BENCH_SCRIPT="$(cd "${SCRIPT_DIR}/../workflows/common" && pwd)/bench.sh"
39+
# Must match the slug computed by submit-slurm-job.sh:
40+
# basename("bench.sh") → "bench" → "bench-${device}-${interface}"
41+
job_slug="bench-${device}-${interface}"
42+
43+
# --- Phase 1: Submit both SLURM jobs (no monitoring yet) ---
44+
echo "Submitting PR benchmark..."
45+
(cd pr && SUBMIT_ONLY=1 bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster")
46+
pr_job_id=$(cat "pr/${job_slug}.slurm_job_id")
47+
echo "PR job submitted: $pr_job_id"
48+
49+
echo "Submitting master benchmark..."
50+
(cd master && SUBMIT_ONLY=1 bash "${SCRIPT_DIR}/submit-slurm-job.sh" "$PR_BENCH_SCRIPT" "$device" "$interface" "$cluster")
51+
master_job_id=$(cat "master/${job_slug}.slurm_job_id")
52+
echo "Master job submitted: $master_job_id"
53+
54+
echo "Both SLURM jobs submitted — running concurrently on compute nodes."
55+
echo "Monitoring sequentially to conserve login node memory."
56+
57+
# --- Phase 2: Monitor sequentially (one at a time on login node) ---
58+
echo ""
59+
echo "=== Monitoring PR job $pr_job_id ==="
4860
pr_exit=0
49-
master_exit=0
50-
51-
wait "$pr_pid" || pr_exit=$?
61+
bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$pr_job_id" "pr/${job_slug}.out" || pr_exit=$?
5262
if [ "$pr_exit" -ne 0 ]; then
53-
echo "PR job exited with code: $pr_exit"
54-
echo "Last 50 lines of PR job log:"
55-
tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
63+
echo "PR job exited with code: $pr_exit"
64+
tail -n 50 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
5665
else
57-
echo "PR job completed successfully"
66+
echo "PR job completed successfully"
5867
fi
5968

60-
wait "$master_pid" || master_exit=$?
69+
echo ""
70+
echo "=== Monitoring master job $master_job_id ==="
71+
master_exit=0
72+
bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$master_job_id" "master/${job_slug}.out" || master_exit=$?
6173
if [ "$master_exit" -ne 0 ]; then
62-
echo "Master job exited with code: $master_exit"
63-
echo "Last 50 lines of master job log:"
64-
tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
74+
echo "Master job exited with code: $master_exit"
75+
tail -n 50 "master/${job_slug}.out" 2>/dev/null || echo " Could not read master log"
6576
else
66-
echo "Master job completed successfully"
77+
echo "Master job completed successfully"
6778
fi
6879

69-
# Warn if either job failed (partial results may still be usable)
80+
# --- Phase 3: Verify outputs ---
7081
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
71-
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
72-
echo "Checking for partial results..."
82+
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
83+
echo "Checking for partial results..."
7384
else
74-
echo "=========================================="
75-
echo "Both benchmark jobs completed successfully!"
76-
echo "=========================================="
85+
echo "=========================================="
86+
echo "Both benchmark jobs completed successfully!"
87+
echo "=========================================="
7788
fi
7889

79-
# Final verification that output files exist before proceeding
80-
pr_yaml="pr/bench-${device}-${interface}.yaml"
81-
master_yaml="master/bench-${device}-${interface}.yaml"
90+
pr_yaml="pr/${job_slug}.yaml"
91+
master_yaml="master/${job_slug}.yaml"
92+
93+
# Wait briefly for YAML files to appear on NFS. When monitoring starts
94+
# after a job has already completed (common for the second job), the
95+
# recovery path in run_monitored_slurm_job.sh sleeps 30s, but NFS
96+
# propagation can take longer under load.
97+
for yaml in "$pr_yaml" "$master_yaml"; do
98+
attempts=0
99+
while [ ! -f "$yaml" ] && [ $attempts -lt 6 ]; do
100+
echo "Waiting for $yaml to appear (NFS propagation)..."
101+
sleep 5
102+
attempts=$((attempts + 1))
103+
done
104+
done
82105

83106
if [ ! -f "$pr_yaml" ]; then
84-
echo "ERROR: PR benchmark output not found: $pr_yaml"
85-
ls -la pr/ || true
86-
echo ""
87-
echo "Last 100 lines of PR log:"
88-
tail -n 100 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
89-
exit 1
107+
echo "ERROR: PR benchmark output not found: $pr_yaml"
108+
ls -la pr/ || true
109+
echo ""
110+
tail -n 100 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
111+
exit 1
90112
fi
91113

92114
if [ ! -f "$master_yaml" ]; then
93-
echo "ERROR: Master benchmark output not found: $master_yaml"
94-
ls -la master/ || true
95-
echo ""
96-
echo "Last 100 lines of master log:"
97-
tail -n 100 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
98-
exit 1
115+
echo "ERROR: Master benchmark output not found: $master_yaml"
116+
ls -la master/ || true
117+
echo ""
118+
tail -n 100 "master/${job_slug}.out" 2>/dev/null || echo " Could not read master log"
119+
exit 1
99120
fi
100121

101122
echo "Verified both YAML files exist:"
102123
echo " - $pr_yaml"
103124
echo " - $master_yaml"
104-

.github/scripts/submit-slurm-job.sh

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,9 @@ if [ "$device" = "cpu" ]; then
8585
case "$cluster" in
8686
phoenix)
8787
sbatch_device_opts="\
88-
#SBATCH -p cpu-small
89-
#SBATCH --ntasks-per-node=24
90-
#SBATCH --mem-per-cpu=2G"
88+
#SBATCH -p cpu-small,cpu-medium,cpu-large
89+
#SBATCH --ntasks-per-node=12
90+
#SBATCH --mem-per-cpu=8G"
9191
;;
9292
frontier|frontier_amd)
9393
sbatch_device_opts="\
@@ -161,8 +161,9 @@ rm -f "$output_file"
161161
# --- Module load mode (short form) ---
162162
module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c")
163163

164-
# --- Submit ---
165-
submit_output=$(sbatch <<EOT
164+
# --- Submit (with retries for transient SLURM errors) ---
165+
source "${SCRIPT_DIR}/retry-sbatch.sh"
166+
_sbatch_script=$(cat <<EOT
166167
#!/bin/bash
167168
#SBATCH -J ${job_prefix}-${job_slug}
168169
#SBATCH --account=${account}
@@ -192,16 +193,16 @@ $sbatch_script_contents
192193
EOT
193194
)
194195

195-
job_id=$(echo "$submit_output" | grep -oE '[0-9]+')
196-
if [ -z "$job_id" ]; then
197-
echo "ERROR: Failed to submit job. sbatch output:"
198-
echo "$submit_output"
199-
exit 1
200-
fi
196+
job_id=$(retry_sbatch "$_sbatch_script")
197+
unset _sbatch_script
201198

202199
echo "Submitted batch job $job_id"
203200
echo "$job_id" > "$id_file"
204201
echo "Job ID written to $id_file"
205202

206-
# --- Monitor ---
207-
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
203+
# --- Monitor (skip if SUBMIT_ONLY=1, e.g. for parallel submission) ---
204+
if [ "${SUBMIT_ONLY:-0}" = "1" ]; then
205+
echo "SUBMIT_ONLY mode: skipping monitor (job_id=$job_id output=$output_file)"
206+
else
207+
bash "$SCRIPT_DIR/run_monitored_slurm_job.sh" "$job_id" "$output_file"
208+
fi

.github/workflows/common/bench.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ fi
2525
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
2626
# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
2727
if [ "$job_cluster" = "phoenix" ]; then
28-
rm -rf build
28+
source .github/scripts/clean-build.sh
29+
clean_build
2930
fi
3031

3132
if [ ! -d "build" ]; then

.github/workflows/common/test.sh

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,26 @@ set -euo pipefail
88
source .github/scripts/gpu-opts.sh
99
build_opts="$gpu_opts"
1010

11+
# --- Phoenix TMPDIR setup ---
12+
# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
13+
# spawning MPI processes, it fills up and ORTE session dir creation fails.
14+
# Redirect TMPDIR to project storage, same as bench.sh.
15+
if [ "$job_cluster" = "phoenix" ]; then
16+
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
17+
currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
18+
mkdir -p $tmpbuild
19+
mkdir -p $currentdir
20+
export TMPDIR=$currentdir
21+
trap 'rm -rf "$currentdir" || true' EXIT
22+
fi
23+
1124
# --- Build (if not pre-built on login node) ---
1225
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
1326
# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
1427
# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
1528
if [ "$job_cluster" = "phoenix" ]; then
16-
rm -rf build
29+
source .github/scripts/clean-build.sh
30+
clean_build
1731
fi
1832

1933
if [ ! -d "build" ]; then

.github/workflows/frontier/build.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ build_opts="$gpu_opts"
2020

2121
. ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
2222

23-
rm -rf build
23+
source .github/scripts/clean-build.sh
24+
clean_build
2425

2526
source .github/scripts/retry-build.sh
2627
if [ "$run_bench" == "bench" ]; then

.github/workflows/test.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,9 @@ jobs:
234234
# submit-slurm-job.sh can detect and cancel stale SLURM jobs on retry.
235235
clean: false
236236

237+
- name: Clean stale output files
238+
run: rm -f *.out
239+
237240
- name: Build (login node)
238241
if: matrix.cluster != 'phoenix'
239242
timeout-minutes: 60
@@ -317,6 +320,9 @@ jobs:
317320
with:
318321
clean: false
319322

323+
- name: Clean stale output files
324+
run: rm -f *.out
325+
320326
- name: Pre-Build (SLURM)
321327
if: matrix.cluster == 'phoenix'
322328
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}

0 commit comments

Comments
 (0)