Skip to content

Commit 825adb2

Browse files
authored
CI: shard the AMD case-optimization pre-build (#1582)
1 parent ff4002b commit 825adb2

4 files changed

Lines changed: 112 additions & 7 deletions

File tree

.github/scripts/prebuild-case-optimization.sh

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,44 @@ case "$cluster" in
2222
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
2323
esac
2424

25+
# Optional sharding (format "i/N", e.g. "1/2"), set by submit-slurm-job.sh's
26+
# [shard] argument via $job_shard: shard i builds every Nth case of the sorted
27+
# case list. Unset = build all cases in one job (default; other clusters).
28+
shard="${job_shard:-}"
29+
if [ -n "$shard" ]; then
30+
# Validate full shape: must be exactly "digits/digits" — one slash with
31+
# non-empty, purely numeric, non-leading-zero parts on both sides.
32+
# Split first, then validate each part independently so that inputs like
33+
# "1/" "/2" "//" "1/2/3" "a/b" "12" are all caught before any arithmetic.
34+
shard_idx="${shard%%/*}"
35+
shard_count="${shard##*/}"
36+
# Reject empty, non-numeric, or leading-zero parts (a slash-less input passes here; the exact-shape check below rejects it)
37+
case "$shard_idx" in
38+
''|*[!0-9]*|0*) echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1 ;;
39+
esac
40+
case "$shard_count" in
41+
''|*[!0-9]*|0*) echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1 ;;
42+
esac
43+
# Confirm the string is exactly "idx/count" — catches "12" (no slash, where
44+
# idx and count both equal the whole string) and "1/2/3" (extra slash, where
45+
# idx=1 and count=3 each pass the per-part checks above; only this rejects it).
46+
if [ "$shard" != "$shard_idx/$shard_count" ]; then
47+
echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1
48+
fi
49+
if [ "$shard_idx" -lt 1 ] || [ "$shard_idx" -gt "$shard_count" ]; then
50+
echo "ERROR: bad shard '$shard' (expected i/N with 1 <= i <= N)"; exit 1
51+
fi
52+
fi
53+
2554
# Phoenix starts fresh (no prior dep build); other clusters pre-build deps via
2655
# build.sh first, so we must preserve them and only clean MFC target staging.
56+
# Sharded jobs share one workspace and run concurrently, so the workflow
57+
# cleans once before submitting them — cleaning here would wipe a sibling
58+
# shard's in-progress build.
2759
if [ "$cluster" = "phoenix" ]; then
2860
source .github/scripts/clean-build.sh
2961
clean_build
30-
else
62+
elif [ -z "$shard" ]; then
3163
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
3264
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
3365
fi
@@ -40,7 +72,49 @@ case "$job_interface" in
4072
*) echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
4173
esac
4274

75+
# Case-optimized simulation builds land in per-case hash-named staging dirs,
76+
# but syscheck/pre_process/post_process hash identically across these cases.
77+
# Concurrent shards must not build those shared staging dirs simultaneously:
78+
# shard 1 builds them first and drops a done marker; other shards wait for it,
79+
# after which their builds no-op in the shared dirs.
80+
if [ -n "$shard" ] && [ "$shard_count" -gt 1 ]; then
81+
shared_marker_done="build/.prebuild-shared-targets-done"
82+
shared_marker_failed="build/.prebuild-shared-targets-failed"
83+
set -- benchmarks/*/case.py
84+
first_case="$1"
85+
if [ "$shard_idx" -eq 1 ]; then
86+
# Remove both markers at the start so reruns and manual invocations
87+
# never observe stale state from a prior run.
88+
rm -f "$shared_marker_done" "$shared_marker_failed"
89+
echo "=== Shard 1/$shard_count: building shared targets ==="
90+
# Write the failure marker if the build exits non-zero so other shards
91+
# can detect the failure immediately instead of waiting 90 minutes.
92+
trap 'touch "$shared_marker_failed"' ERR
93+
./mfc.sh build -i "$first_case" -t syscheck pre_process post_process --case-optimization $gpu_opts -j 8
94+
trap - ERR
95+
touch "$shared_marker_done"
96+
else
97+
echo "=== Shard $shard_idx/$shard_count: waiting for shard 1 to build shared targets ==="
98+
waited=0
99+
until [ -f "$shared_marker_done" ]; do
100+
if [ -f "$shared_marker_failed" ]; then
101+
echo "ERROR: shard 1 failed to build shared targets; see shard 1 log"; exit 1
102+
fi
103+
if [ "$waited" -ge 5400 ]; then
104+
echo "ERROR: timed out waiting for $shared_marker_done"; exit 1
105+
fi
106+
sleep 30
107+
waited=$((waited + 30))
108+
done
109+
fi
110+
fi
111+
112+
idx=0
43113
for case in benchmarks/*/case.py; do
114+
idx=$((idx + 1))
115+
if [ -n "$shard" ] && [ $(((idx - 1) % shard_count)) -ne $((shard_idx - 1)) ]; then
116+
continue
117+
fi
44118
echo "=== Pre-building: $case ==="
45119
./mfc.sh run "$case" --case-optimization $gpu_opts -j 8 --dry-run
46120
done

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,12 @@ bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$pr_job_id" "pr/${job_slug}.out
6262
if [ "$pr_exit" -ne 0 ]; then
6363
echo "PR job exited with code: $pr_exit"
6464
tail -n 50 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
65+
# The PR benchmark run genuinely failed (cases crashed/hung/SIGTERM'd, not a
66+
# monitor false-positive -- run_monitored_slurm_job.sh re-checks sacct). Fail
67+
# the job instead of falling through to the YAML-exists check, which would let
68+
# a broken PR pass green as long as a partial YAML was written. Scoped to PR
69+
# only: a master/baseline infra flake stays a warning and does not red-cross.
70+
exit 1
6571
else
6672
echo "PR job completed successfully"
6773
fi

.github/workflows/test.yml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,13 @@ jobs:
402402
cluster_name: 'Oak Ridge | Frontier (AMD)'
403403
device: 'cpu'
404404
interface: 'none'
405+
shard: '1/2'
406+
- runner: 'frontier'
407+
cluster: 'frontier_amd'
408+
cluster_name: 'Oak Ridge | Frontier (AMD)'
409+
device: 'cpu'
410+
interface: 'none'
411+
shard: '2/2'
405412
runs-on:
406413
group: phoenix
407414
labels: ${{ matrix.runner }}
@@ -420,7 +427,7 @@ jobs:
420427

421428
- name: Fetch Dependencies
422429
if: matrix.cluster != 'phoenix'
423-
timeout-minutes: 60
430+
timeout-minutes: 120
424431
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
425432

426433
- name: Build
@@ -523,7 +530,22 @@ jobs:
523530

524531
- name: Pre-Build (SLURM)
525532
if: matrix.cluster == 'frontier_amd'
526-
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }}
533+
# AMD flang is slow enough that one serial pre-build job exceeds its
534+
# walltime, so split the case list across two concurrent SLURM jobs.
535+
# The shards share this workspace and skip their in-job staging clean,
536+
# so clean once here on the login node before submitting.
537+
run: |
538+
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
539+
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
540+
# Clear BOTH shared-target markers before submitting: a shard other than 1 may
# start first, and it aborts as soon as it sees a failure marker, so a stale
# one left by an earlier run must not survive until shard 1 gets to remove it.
rm -f build/.prebuild-shared-targets-done build/.prebuild-shared-targets-failed
541+
bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }} 1/2 &
542+
pid1=$!
543+
bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }} 2/2 &
544+
pid2=$!
545+
rc=0
546+
wait "$pid1" || rc=1
547+
wait "$pid2" || rc=1
548+
exit $rc
527549
528550
- name: Build & Run Case-Optimization Tests
529551
if: matrix.cluster != 'phoenix' && matrix.cluster != 'frontier_amd'
@@ -546,6 +568,8 @@ jobs:
546568
if: always()
547569
run: |
548570
for f in prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out \
571+
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}-1-of-2.out \
572+
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}-2-of-2.out \
549573
run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out; do
550574
[ -f "$f" ] && echo "=== $f ===" && cat "$f"
551575
done
@@ -556,5 +580,5 @@ jobs:
556580
with:
557581
name: case-opt-${{ strategy.job-index }}-${{ matrix.cluster }}-${{ matrix.interface }}
558582
path: |
559-
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out
583+
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}*.out
560584
run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out

benchmarks/igr/case.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,11 @@
102102
"patch_icpp(1)%length_x": 2 * math.pi * L,
103103
"patch_icpp(1)%length_y": 2 * math.pi * L,
104104
"patch_icpp(1)%length_z": 2 * math.pi * L,
105-
"patch_icpp(1)%vel(1)": f"{V0}*sin(x/{L})*cos(y/{L})*sin(z/{L})",
106-
"patch_icpp(1)%vel(2)": f"-{V0}*cos(x/{L})*sin(y/{L})*sin(z/{L})",
105+
"patch_icpp(1)%vel(1)": 0.0,
106+
"patch_icpp(1)%vel(2)": 0.0,
107107
"patch_icpp(1)%vel(3)": 0,
108-
"patch_icpp(1)%pres": f"{P0} + ({rho0}*{V0}**2/16)*(cos(2*x/{L}) + cos(2*y/{L}))*(cos(2*z/{L}) + 2)",
108+
"patch_icpp(1)%pres": 0.0,
109+
"patch_icpp(1)%hcid": 380,
109110
"patch_icpp(1)%alpha_rho(1)": 1,
110111
"patch_icpp(1)%alpha(1)": 1,
111112
# Fluids Physical Parameters

0 commit comments

Comments
 (0)