@@ -22,12 +22,44 @@ case "$cluster" in
2222 * ) echo " ERROR: Unknown cluster '$cluster '" ; exit 1 ;;
2323esac
2424
# Optional sharding (format "i/N", e.g. "1/2"), set by submit-slurm-job.sh's
# [shard] argument via $job_shard: shard i builds every Nth case of the sorted
# case list. Unset = build all cases in one job (default; other clusters).

#######################################
# Check a shard specification without touching any global state.
# Arguments: $1 - candidate spec, expected form "i/N"
# Returns:   0 if $1 is exactly "i/N" with 1 <= i <= N
#            1 if the shape is wrong (no slash, more than one slash, an empty
#              or non-numeric part, a leading zero, or more than 9 digits)
#            2 if the shape is fine but i > N
#######################################
_shard_spec_ok() {
    local spec="$1" part

    # Count slashes first. "${x%%/*}" and "${x##*/}" on "1/2/3" yield "1" and
    # "3", which both look valid on their own, so the slash count has to be
    # checked explicitly rather than inferred from the parts.
    case "$spec" in
        */*/*) return 1 ;;   # more than one slash, e.g. "1/2/3" or "//"
        */*)   ;;            # exactly one slash: validate both sides below
        *)     return 1 ;;   # no slash at all, e.g. "12" or ""
    esac

    for part in "${spec%/*}" "${spec#*/}"; do
        case "$part" in
            ''|*[!0-9]*|0*) return 1 ;;   # empty, non-digit, or leading zero
        esac
        # Cap the length: a digit string too large for the shell's integers
        # makes `[ ... -le ... ]` fail with status 2 (an error, not "false"),
        # which would otherwise let the spec slip past the range check and
        # overflow the modulo arithmetic that later selects cases.
        [ "${#part}" -le 9 ] || return 1
    done

    # i >= 1 is already guaranteed (non-empty digits, no leading zero), so
    # only the upper bound remains to be checked.
    [ "${spec%/*}" -le "${spec#*/}" ] || return 2
    return 0
}

shard="${job_shard:-}"
if [ -n "$shard" ]; then
    shard_status=0
    _shard_spec_ok "$shard" || shard_status=$?
    case "$shard_status" in
        0) ;;
        2) echo "ERROR: bad shard '$shard' (expected i/N with 1 <= i <= N)"; exit 1 ;;
        *) echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1 ;;
    esac
    # The spec holds exactly one slash here, so shortest-match trimming is exact.
    shard_idx="${shard%/*}"
    shard_count="${shard#*/}"
fi
53+
# Pre-build cleaning policy, chosen per cluster:
#   * phoenix starts fresh (no prior dep build), so it runs clean_build from
#     .github/scripts/clean-build.sh.
#   * other clusters pre-build deps via build.sh first; those must be kept,
#     so only the hex-named MFC target directories directly under
#     build/staging and build/install are removed.
#   * sharded jobs on those clusters remove nothing: they share one workspace
#     and run concurrently, so the workflow cleans once before submitting
#     them, and cleaning here would wipe a sibling shard's in-progress build.
case "$cluster" in
    phoenix)
        source .github/scripts/clean-build.sh
        clean_build
        ;;
    *)
        if [ -z "$shard" ]; then
            for stale_root in build/staging build/install; do
                # Best effort: a missing directory or a failed removal is ignored.
                find "$stale_root" -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
            done
        fi
        ;;
esac
@@ -40,7 +72,49 @@ case "$job_interface" in
4072 * ) echo " ERROR: prebuild requires gpu interface (acc or omp)" ; exit 1 ;;
4173esac
4274
# Case-optimized simulation builds land in per-case hash-named staging dirs,
# but syscheck/pre_process/post_process hash identically across these cases.
# Concurrent shards must not build those shared staging dirs simultaneously:
# shard 1 builds them first and drops a done marker; other shards wait for it,
# after which their builds no-op in the shared dirs.
#
# NOTE(review): a waiting shard that starts before shard 1 has removed the
# markers could still see a done marker left by an earlier run — confirm that
# the workflow's pre-submit clean also removes the build/.prebuild-* markers.
if [ -n "$shard" ] && [ "$shard_count" -gt 1 ]; then
    shared_marker_done="build/.prebuild-shared-targets-done"
    shared_marker_failed="build/.prebuild-shared-targets-failed"
    if [ "$shard_idx" -eq 1 ]; then
        # Remove both markers at the start so reruns and manual invocations
        # do not inherit state from a prior run.
        rm -f "$shared_marker_done" "$shared_marker_failed"

        # Pick the first benchmark case in glob order. A loop with an
        # immediate break is used instead of `set --` so the script's own
        # positional parameters are left untouched.
        for first_case in benchmarks/*/case.py; do
            break
        done

        echo "=== Shard 1/$shard_count: building shared targets ==="
        # Check the build status explicitly instead of relying on an ERR trap:
        # the trap only stops the script when errexit is active, and without
        # it a failed build would fall through and publish the done marker.
        # $gpu_opts is left unquoted, as elsewhere in this script, so that it
        # can expand to several options — presumably intentional.
        build_status=0
        ./mfc.sh build -i "$first_case" -t syscheck pre_process post_process --case-optimization $gpu_opts -j 8 || build_status=$?
        if [ "$build_status" -ne 0 ]; then
            # Let the other shards fail fast instead of waiting out the timeout.
            touch "$shared_marker_failed"
            exit "$build_status"
        fi
        touch "$shared_marker_done"
    else
        echo "=== Shard $shard_idx/$shard_count: waiting for shard 1 to build shared targets ==="
        # Poll every 30 seconds, giving up after 5400 seconds (90 minutes).
        waited=0
        until [ -f "$shared_marker_done" ]; do
            if [ -f "$shared_marker_failed" ]; then
                echo "ERROR: shard 1 failed to build shared targets; see shard 1 log"; exit 1
            fi
            if [ "$waited" -ge 5400 ]; then
                echo "ERROR: timed out waiting for $shared_marker_done"; exit 1
            fi
            sleep 30
            waited=$(( waited + 30 ))
        done
    fi
fi
111+
# Dry-run every benchmark case with case optimization enabled. Cases are
# numbered from 1 in glob order; when sharding is active, this job handles
# only the cases whose zero-based position modulo the shard count equals its
# own zero-based shard index.
idx=0
for case in benchmarks/*/case.py; do
    idx=$(( idx + 1 ))
    if [ -n "$shard" ]; then
        # The modulo is evaluated only when a shard is set, so an unset
        # shard_count is never used as a divisor.
        [ $(( (idx - 1) % shard_count )) -eq $(( shard_idx - 1 )) ] || continue
    fi
    echo "=== Pre-building: $case ==="
    ./mfc.sh run "$case" --case-optimization $gpu_opts -j 8 --dry-run
done
0 commit comments