Skip to content

Commit 3b3a25e

Browse files
committed
merge: resolve conflict with check_manual_registry_bcasts from master
2 parents 9c819d4 + 825adb2 commit 3b3a25e

20 files changed

Lines changed: 541 additions & 130 deletions

.github/scripts/prebuild-case-optimization.sh

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,44 @@ case "$cluster" in
2222
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
2323
esac
2424

25+
# Optional sharding (format "i/N", e.g. "1/2"), set by submit-slurm-job.sh's
26+
# [shard] argument via $job_shard: shard i builds every Nth case of the sorted
27+
# case list. Unset = build all cases in one job (default; other clusters).
28+
shard="${job_shard:-}"
29+
if [ -n "$shard" ]; then
30+
# Validate full shape: must be exactly "digits/digits" — one slash with
31+
# non-empty, purely numeric, non-leading-zero parts on both sides.
32+
# Split first, then validate each part independently so that inputs like
33+
# "1/" "/2" "//" "1/2/3" "a/b" "12" are all caught before any arithmetic.
34+
shard_idx="${shard%%/*}"
35+
shard_count="${shard##*/}"
36+
# %%/* keeps the text before the FIRST slash and ##*/ the text after the LAST
# slash; with no slash at all, both expansions return the whole string.
37+
case "$shard_idx" in
38+
''|*[!0-9]*|0*) echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1 ;;
39+
esac
40+
case "$shard_count" in
41+
''|*[!0-9]*|0*) echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1 ;;
42+
esac
43+
# Confirm the string round-trips to exactly "idx/count". This is the ONLY
44+
# check that rejects "12" (no slash: idx=count=12) and "1/2/3" (extra slash:
45+
# idx=1, count=3, so both parts pass the per-part checks above).
46+
if [ "$shard" != "$shard_idx/$shard_count" ]; then
47+
echo "ERROR: bad shard '$shard' (expected i/N)"; exit 1
48+
fi
49+
# Range check; both operands are known to be plain decimal digits here.
if [ "$shard_idx" -lt 1 ] || [ "$shard_idx" -gt "$shard_count" ]; then
50+
echo "ERROR: bad shard '$shard' (expected i/N with 1 <= i <= N)"; exit 1
51+
fi
52+
fi
53+
2554
# Phoenix starts fresh (no prior dep build); other clusters pre-build deps via
2655
# build.sh first, so we must preserve them and only clean MFC target staging.
56+
# Sharded jobs share one workspace and run concurrently, so the workflow
57+
# cleans once before submitting them — cleaning here would wipe a sibling
58+
# shard's in-progress build.
2759
if [ "$cluster" = "phoenix" ]; then
2860
source .github/scripts/clean-build.sh
2961
clean_build
30-
else
62+
elif [ -z "$shard" ]; then
3163
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
3264
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
3365
fi
@@ -40,7 +72,49 @@ case "$job_interface" in
4072
*) echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
4173
esac
4274

75+
# Case-optimized simulation builds land in per-case hash-named staging dirs,
76+
# but syscheck/pre_process/post_process hash identically across these cases.
77+
# Concurrent shards must not build those shared staging dirs simultaneously:
78+
# shard 1 builds them first and drops a done marker; other shards wait for it,
79+
# after which their builds no-op in the shared dirs.
80+
# Serialise the build of the targets that every shard shares.
# Only runs for sharded jobs with more than one shard. Reads $shard,
# $shard_idx, $shard_count and $gpu_opts, which are set earlier in this script.
#   shard 1      : builds the shared targets, then drops the "done" marker
#                  (or the "failed" marker and exits with the build's status).
#   other shards : poll every 30 s for a marker, for at most 5400 s (90 min).
if [ -n "$shard" ] && [ "$shard_count" -gt 1 ]; then
    shared_marker_done="build/.prebuild-shared-targets-done"
    shared_marker_failed="build/.prebuild-shared-targets-failed"
    # NOTE: this overwrites the positional parameters; $1 becomes the first
    # case in glob (sorted) order and is used only to drive the shared build.
    set -- benchmarks/*/case.py
    first_case="$1"
    if [ "$shard_idx" -eq 1 ]; then
        # Remove both markers at the start so reruns and manual invocations
        # do not observe stale state from a prior run. A sibling shard that
        # starts polling before this line runs can still see a stale marker,
        # so whoever submits the shards should clear the markers as well.
        rm -f "$shared_marker_done" "$shared_marker_failed"
        echo "=== Shard 1/$shard_count: building shared targets ==="
        # Check the build status explicitly rather than through an ERR trap:
        # a trap only records the failure and leaves aborting to errexit, so
        # with errexit off the done marker below would still be written. It
        # would also replace any ERR trap installed earlier in the script.
        # $gpu_opts is intentionally unquoted so it splits into separate flags.
        build_rc=0
        ./mfc.sh build -i "$first_case" -t syscheck pre_process post_process --case-optimization $gpu_opts -j 8 || build_rc=$?
        if [ "$build_rc" -ne 0 ]; then
            # Let the waiting shards fail fast instead of waiting 90 minutes.
            touch "$shared_marker_failed"
            exit "$build_rc"
        fi
        touch "$shared_marker_done"
    else
        echo "=== Shard $shard_idx/$shard_count: waiting for shard 1 to build shared targets ==="
        waited=0
        until [ -f "$shared_marker_done" ]; do
            if [ -f "$shared_marker_failed" ]; then
                echo "ERROR: shard 1 failed to build shared targets; see shard 1 log"; exit 1
            fi
            if [ "$waited" -ge 5400 ]; then
                echo "ERROR: timed out waiting for $shared_marker_done"; exit 1
            fi
            sleep 30
            waited=$((waited + 30))
        done
    fi
fi
111+
112+
# Pre-build every benchmark case, numbering them from 1 in glob (sorted) order.
# In sharded mode only the cases assigned to this shard are built: case k goes
# to shard ((k - 1) mod shard_count) + 1, i.e. round-robin over the list.
idx=0
for case in benchmarks/*/case.py; do
    idx=$((idx + 1))
    if [ -n "$shard" ]; then
        # The modulo is evaluated only when sharding is on, so an unset
        # shard_count is never used as a divisor.
        if [ $(((idx - 1) % shard_count)) -ne $((shard_idx - 1)) ]; then
            continue
        fi
    fi
    echo "=== Pre-building: $case ==="
    # $gpu_opts is intentionally unquoted so it splits into separate flags.
    ./mfc.sh run "$case" --case-optimization $gpu_opts -j 8 --dry-run
done

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,12 @@ bash "${SCRIPT_DIR}/run_monitored_slurm_job.sh" "$pr_job_id" "pr/${job_slug}.out
6262
if [ "$pr_exit" -ne 0 ]; then
6363
echo "PR job exited with code: $pr_exit"
6464
tail -n 50 "pr/${job_slug}.out" 2>/dev/null || echo " Could not read PR log"
65+
# The PR benchmark run genuinely failed (cases crashed/hung/SIGTERM'd, not a
66+
# monitor false-positive -- run_monitored_slurm_job.sh re-checks sacct). Fail
67+
# the job instead of falling through to the YAML-exists check, which would let
68+
# a broken PR pass green as long as a partial YAML was written. Scoped to PR
69+
# only: a master/baseline infra flake stays a warning and does not red-cross.
70+
exit 1
6571
else
6672
echo "PR job completed successfully"
6773
fi

.github/workflows/test.yml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,13 @@ jobs:
402402
cluster_name: 'Oak Ridge | Frontier (AMD)'
403403
device: 'cpu'
404404
interface: 'none'
405+
shard: '1/2'
406+
- runner: 'frontier'
407+
cluster: 'frontier_amd'
408+
cluster_name: 'Oak Ridge | Frontier (AMD)'
409+
device: 'cpu'
410+
interface: 'none'
411+
shard: '2/2'
405412
runs-on:
406413
group: phoenix
407414
labels: ${{ matrix.runner }}
@@ -420,7 +427,7 @@ jobs:
420427

421428
- name: Fetch Dependencies
422429
if: matrix.cluster != 'phoenix'
423-
timeout-minutes: 60
430+
timeout-minutes: 120
424431
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
425432

426433
- name: Build
@@ -523,7 +530,22 @@ jobs:
523530

524531
- name: Pre-Build (SLURM)
525532
if: matrix.cluster == 'frontier_amd'
526-
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }}
533+
# AMD flang is slow enough that one serial pre-build job exceeds its
534+
# walltime, so split the case list across two concurrent SLURM jobs.
535+
# The shards share this workspace and skip their in-job staging clean,
536+
# so clean once here on the login node before submitting.
537+
run: |
  # Clean hash-named staging/install dirs once, before either shard starts.
  # Errors are ignored on purpose: the directories may not exist yet.
  find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
  find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
  # Clear BOTH sync markers. A stale "failed" marker left by an earlier run
  # would make a waiting shard abort immediately if it starts polling before
  # shard 1 has removed it.
  rm -f build/.prebuild-shared-targets-done build/.prebuild-shared-targets-failed
  bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }} 1/2 &
  pid1=$!
  bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }} 2/2 &
  pid2=$!
  # Wait for both shards so neither is orphaned; fail the step if either failed.
  rc=0
  wait "$pid1" || rc=1
  wait "$pid2" || rc=1
  exit $rc
527549
528550
- name: Build & Run Case-Optimization Tests
529551
if: matrix.cluster != 'phoenix' && matrix.cluster != 'frontier_amd'
@@ -546,6 +568,8 @@ jobs:
546568
if: always()
547569
run: |
548570
for f in prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out \
571+
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}-1-of-2.out \
572+
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}-2-of-2.out \
549573
run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out; do
550574
[ -f "$f" ] && echo "=== $f ===" && cat "$f"
551575
done
@@ -556,5 +580,5 @@ jobs:
556580
with:
557581
name: case-opt-${{ strategy.job-index }}-${{ matrix.cluster }}-${{ matrix.interface }}
558582
path: |
559-
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out
583+
prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}*.out
560584
run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out

benchmarks/igr/case.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,11 @@
102102
"patch_icpp(1)%length_x": 2 * math.pi * L,
103103
"patch_icpp(1)%length_y": 2 * math.pi * L,
104104
"patch_icpp(1)%length_z": 2 * math.pi * L,
105-
"patch_icpp(1)%vel(1)": f"{V0}*sin(x/{L})*cos(y/{L})*sin(z/{L})",
106-
"patch_icpp(1)%vel(2)": f"-{V0}*cos(x/{L})*sin(y/{L})*sin(z/{L})",
105+
"patch_icpp(1)%vel(1)": 0.0,
106+
"patch_icpp(1)%vel(2)": 0.0,
107107
"patch_icpp(1)%vel(3)": 0,
108-
"patch_icpp(1)%pres": f"{P0} + ({rho0}*{V0}**2/16)*(cos(2*x/{L}) + cos(2*y/{L}))*(cos(2*z/{L}) + 2)",
108+
"patch_icpp(1)%pres": 0.0,
109+
"patch_icpp(1)%hcid": 380,
109110
"patch_icpp(1)%alpha_rho(1)": 1,
110111
"patch_icpp(1)%alpha(1)": 1,
111112
# Fluids Physical Parameters

src/common/include/acc_macros.fpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,8 @@
121121
& copyout_val.strip('\n') + create_val.strip('\n') + &
122122
& no_create_val.strip('\n') + present_val.strip('\n') + &
123123
& deviceptr_val.strip('\n') + attach_val.strip('\n')
124-
#:set acc_directive = '!$acc parallel ' + &
125-
& acc_clause_val + extraAccArgs_val.strip('\n')
124+
#:set acc_directive = FOLD_DIRECTIVE('!$acc parallel ' + &
125+
& acc_clause_val + extraAccArgs_val.strip('\n'), '!$acc').strip('\n')
126126
#:set end_acc_directive = '!$acc end parallel'
127127
$:acc_directive
128128
$:code
@@ -153,8 +153,8 @@
153153
& copyout_val.strip('\n') + create_val.strip('\n') + &
154154
& no_create_val.strip('\n') + present_val.strip('\n') + &
155155
& deviceptr_val.strip('\n') + attach_val.strip('\n')
156-
#:set acc_directive = '!$acc parallel loop ' + &
157-
& clause_val + extraAccArgs_val.strip('\n')
156+
#:set acc_directive = FOLD_DIRECTIVE('!$acc parallel loop ' + &
157+
& clause_val + extraAccArgs_val.strip('\n'), '!$acc').strip('\n')
158158
$:acc_directive
159159
#:enddef
160160

src/common/include/omp_macros.fpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@
141141
& deviceptr_val.strip('\n') + attach_val.strip('\n')
142142

143143
#:set omp_clause_val = omp_clause_val.strip('\n')
144-
#:set omp_directive = '!$omp target teams ' + omp_clause_val + extraOmpArgs_val.strip('\n')
144+
#:set omp_directive = FOLD_DIRECTIVE('!$omp target teams ' + omp_clause_val + extraOmpArgs_val.strip('\n'), '!$omp').strip('\n')
145145

146146
#:set omp_end_directive = '!$omp end target teams'
147147
$:omp_directive
@@ -186,7 +186,7 @@
186186
#:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) '
187187
#:endif
188188

189-
#:set omp_directive = omp_start_directive + clause_val + extraOmpArgs_val.strip('\n')
189+
#:set omp_directive = FOLD_DIRECTIVE(omp_start_directive + clause_val + extraOmpArgs_val.strip('\n'), '!$omp').strip('\n')
190190
$:omp_directive
191191
#:enddef
192192

src/common/include/shared_parallel_macros.fpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,29 @@
117117
#:endif
118118
$:extraArgs_val
119119
#:enddef
120+
121+
#:def FOLD_DIRECTIVE(directive, sentinel, width=200)
122+
#! Fold a long GPU directive across free-form continuation lines so it stays
123+
#! under nvfortran's ~1000-char source-line limit. Breaks only at whole-clause
124+
#! boundaries (clause(args) groups and bare keywords), repeating the sentinel
125+
#! (e.g. '!$acc&') on each continuation -- which fypp's --no-folding cannot do
126+
#! because its generic folder omits the sentinel. Every emitted line is no
127+
#! longer than the prefix plus the single longest clause, i.e. no longer than
128+
#! the unfolded line a build with one fewer clause already compiles.
#!
#! Arguments:
#!   directive - full directive text, including its leading sentinel.
#!   sentinel  - directive sentinel without '&' (e.g. '!$acc', '!$omp');
#!               continuation lines start with sentinel + '& '.
#!   width     - soft line-length limit (default 200). The trailing ' &' is
#!               not counted, so a folded line can be width + 2 characters,
#!               and a single token longer than width is emitted unbroken.
#! Result: the folded lines joined with newlines. Tokens are re-joined with
#! single spaces, so the original spacing between clauses is not preserved.
#!
#! NOTE(review): the clause pattern stops at the first ')', so a clause with
#! nested parentheses (e.g. an array section inside copyin(...)) is tokenised
#! in pieces and may be folded inside the clause -- confirm no caller passes
#! such a clause if the whole-clause guarantee matters.
#! NOTE(review): presumably the 're' module is made available to fypp by the
#! build (e.g. via -m re) -- confirm.
129+
#:set _toks = re.findall(r'\w+\([^)]*\)|\S+', directive)
130+
#:set _lines = []
131+
#:set _cur = ''
132+
#:for _t in _toks
133+
#:if _cur == ''
134+
#:set _cur = _t
135+
#:elif len(_cur) + 1 + len(_t) > width
136+
#:set _lines = _lines + [_cur + ' &']
137+
#:set _cur = sentinel + '& ' + _t
138+
#:else
139+
#:set _cur = _cur + ' ' + _t
140+
#:endif
141+
#:endfor
142+
#:set _lines = _lines + [_cur]
143+
$:'\n'.join(_lines)
144+
#:enddef
120145
! New line at end of file is required for FYPP

src/common/m_global_parameters_common.fpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,7 @@ contains
419419
muscl_order = dflt_int
420420
num_fluids = dflt_int
421421
igr = .false.
422+
igr_order = dflt_int
422423
mhd = .false.
423424
relativity = .false.
424425
#:endif

src/pre_process/m_mpi_proxy.fpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ contains
8080

8181
! manual: patch_icpp (complex members: alter_patch, sph_har_coeff, size() arrays)
8282
do i = 1, num_patches_max
83-
#:for VAR in [ 'geometry', 'smooth_patch_id']
83+
#:for VAR in [ 'geometry', 'smooth_patch_id', 'hcid']
8484
call MPI_BCAST(patch_icpp(i)%${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
8585
#:endfor
8686

@@ -91,7 +91,7 @@ contains
9191
#:for VAR in [ 'x_centroid', 'y_centroid', 'z_centroid', &
9292
& 'length_x', 'length_y', 'length_z', 'radius', 'epsilon', &
9393
& 'beta', 'smooth_coeff', 'rho', 'p0', 'm0', 'r0', 'v0', &
94-
& 'pres', 'gamma', 'pi_inf', 'hcid', 'cv', 'qv', 'qvp', &
94+
& 'pres', 'gamma', 'pi_inf', 'cv', 'qv', 'qvp', &
9595
& 'cf_val', 'Bx', 'By', 'Bz']
9696
call MPI_BCAST(patch_icpp(i)%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
9797
#:endfor

0 commit comments

Comments
 (0)