Skip to content

Commit f95b2c4

Browse files
sbryngelsonclaudeSpencer Bryngelson
authored
Work around CCE 19.0.0 compiler bugs for Cray+OpenACC builds (#1286)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com> Co-authored-by: Spencer Bryngelson <sbryngelson@login10.frontier.olcf.ornl.gov>
1 parent d4336a1 commit f95b2c4

14 files changed

Lines changed: 222 additions & 71 deletions

File tree

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/bin/bash
# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL
# from the runner OS) before the SLURM job completes. When the monitor exits
# non-zero, sacct is used to verify the job's actual final state; if the SLURM
# job succeeded we exit 0 so the CI step is not falsely marked as failed.
#
# Usage: run_monitored_slurm_job.sh <job_id> <output_file>
#
# Arguments:
#   job_id       SLURM job id, passed to monitor_slurm_job.sh and to sacct.
#   output_file  Passed through unchanged to monitor_slurm_job.sh.
#
# Exit status:
#   0  the monitor succeeded, or the monitor failed but sacct reports the job
#      as COMPLETED with exit code 0:0.
#   1  wrong number of arguments, or the monitor failed and sacct does not
#      report a successful completion (including when sacct itself fails).

set -euo pipefail

if [ $# -ne 2 ]; then
    # Usage errors are diagnostics, so they go to stderr.
    echo "Usage: $0 <job_id> <output_file>" >&2
    exit 1
fi

job_id="$1"
output_file="$2"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Capture the monitor's status without tripping `set -e`.
monitor_exit=0
bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?

if [ "$monitor_exit" -ne 0 ]; then
    echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
    # Give the SLURM epilog time to finalize if the job just finished
    sleep 30

    # Query State and ExitCode in ONE sacct call so both values come from the
    # same allocation record (-X), and capture the output before parsing it.
    # Parsing inside a `cmd | head | ... || echo FALLBACK` pipeline is unsafe
    # under pipefail: a non-zero stage (e.g. sacct hit by SIGPIPE once head
    # exits) appends the fallback text to output that was already produced.
    record=""
    if sacct_out=$(sacct -j "$job_id" -n -X -P -o State,ExitCode 2>/dev/null); then
        record=${sacct_out%%$'\n'*}   # first line only
    fi
    record=${record// /}              # drop spaces, e.g. "CANCELLED by 123"

    final_state="UNKNOWN"
    final_exit=""
    case "$record" in
        *'|'*)
            final_state=${record%%|*}
            final_exit=${record#*|}
            ;;
    esac
    # sacct can succeed yet report nothing usable; never leave the state empty.
    [ -n "$final_state" ] || final_state="UNKNOWN"

    echo "Final SLURM state=$final_state exit=$final_exit"
    if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
        echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
    else
        echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" >&2
        exit 1
    fi
fi

.github/workflows/frontier/build.sh

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,7 @@ build_opts="$gpu_opts"
2020

2121
. ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
2222

23-
# Only set up build cache for test suite, not benchmarks
24-
if [ "$run_bench" != "bench" ]; then
25-
source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
26-
fi
23+
rm -rf build
2724

2825
source .github/scripts/retry-build.sh
2926
if [ "$run_bench" == "bench" ]; then

.github/workflows/frontier/submit.sh

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@ fi
4545

4646
# Select SBATCH params based on job type
4747
if [ "$job_type" = "bench" ]; then
48-
sbatch_account="#SBATCH -A ENG160"
49-
sbatch_time="#SBATCH -t 05:59:00"
50-
sbatch_partition="#SBATCH -p extended"
51-
sbatch_extra=""
48+
sbatch_account="#SBATCH -A CFD154"
49+
sbatch_time="#SBATCH -t 01:59:00"
50+
sbatch_partition="#SBATCH -p batch"
51+
sbatch_extra="#SBATCH --qos=normal"
5252
else
5353
sbatch_account="#SBATCH -A CFD154"
5454
sbatch_time="#SBATCH -t 01:59:00"
@@ -102,5 +102,4 @@ fi
102102

103103
echo "Submitted batch job $job_id"
104104

105-
# Use resilient monitoring instead of sbatch -W
106-
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
105+
bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"

.github/workflows/phoenix/bench.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
source .github/scripts/bench-preamble.sh
44

5-
tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
5+
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
66
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
77
mkdir -p $tmpbuild
88
mkdir -p $currentdir
@@ -15,6 +15,8 @@ else
1515
bench_opts="--mem 1"
1616
fi
1717

18+
rm -rf build
19+
1820
source .github/scripts/retry-build.sh
1921
RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1
2022

.github/workflows/phoenix/submit.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,5 @@ fi
9494

9595
echo "Submitted batch job $job_id"
9696

97-
# Use resilient monitoring instead of sbatch -W
9897
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
99-
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
98+
bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"

.github/workflows/phoenix/test.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
source .github/scripts/gpu-opts.sh
44
build_opts="$gpu_opts"
55

6-
# Set up persistent build cache
7-
source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
6+
rm -rf build
87

98
# Build with retry; smoke-test cached binaries to catch architecture mismatches
109
# (SIGILL from binaries compiled on a different compute node).

.github/workflows/test.yml

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,12 @@ jobs:
9898
- name: Clone
9999
uses: actions/checkout@v4
100100

101+
- name: Restore Build Cache
102+
uses: actions/cache@v4
103+
with:
104+
path: build
105+
key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
106+
101107
- name: Setup MacOS
102108
if: matrix.os == 'macos'
103109
run: |
@@ -131,32 +137,20 @@ jobs:
131137
printenv | sort > /tmp/env_after
132138
diff /tmp/env_before /tmp/env_after | grep '^>' | sed 's/^> //' >> $GITHUB_ENV
133139
134-
- name: Get system info for cache key
135-
id: sys-info
136-
run: |
137-
{
138-
uname -m
139-
cat /proc/cpuinfo 2>/dev/null | grep 'model name' | head -1 || sysctl -n machdep.cpu.brand_string 2>/dev/null || true
140-
if command -v ifx &>/dev/null; then ifx --version 2>/dev/null | head -1; else ${FC:-gfortran} --version 2>/dev/null | head -1 || true; fi
141-
${CC:-gcc} --version 2>/dev/null | head -1 || true
142-
} | (sha256sum 2>/dev/null || shasum -a 256) | cut -c1-16 > /tmp/sys-hash
143-
echo "sys-hash=$(cat /tmp/sys-hash)" >> "$GITHUB_OUTPUT"
144-
145-
- name: Restore Build Cache
146-
uses: actions/cache@v4
140+
- name: Set up Python 3.14
141+
uses: actions/setup-python@v5
147142
with:
148-
path: build
149-
key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ steps.sys-info.outputs.sys-hash }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
143+
python-version: '3.14'
150144

151145
- name: Build
152146
run: |
153-
/bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} $PRECISION $TEST_ALL
147+
/bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
154148
env:
155149
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
156-
PRECISION: ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }}
157150

158151
- name: Test
159-
run: bash .github/scripts/run-tests-with-retry.sh -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT
152+
run: |
153+
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
160154
env:
161155
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
162156
TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
@@ -186,7 +180,7 @@ jobs:
186180
cluster_name: 'Georgia Tech | Phoenix'
187181
device: 'cpu'
188182
interface: 'none'
189-
# Frontier (ORNL) — build on login node, GPU tests sharded for batch partition
183+
# Frontier (ORNL) — CCE
190184
- runner: 'frontier'
191185
cluster: 'frontier'
192186
cluster_name: 'Oak Ridge | Frontier'
@@ -243,21 +237,30 @@ jobs:
243237
- name: Clone
244238
uses: actions/checkout@v4
245239
with:
246-
clean: false
240+
clean: true
247241

248242
- name: Build
249243
if: matrix.cluster != 'phoenix'
250244
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3
251245
with:
252-
max_attempts: 3
246+
max_attempts: 2
253247
retry_wait_seconds: 60
254248
timeout_minutes: 60
255249
command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
256-
on_retry_command: ./mfc.sh clean
250+
on_retry_command: rm -rf build
257251

258252
- name: Test
259253
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}
260254

255+
- name: Cancel SLURM Jobs
256+
if: cancelled()
257+
run: |
258+
find . -name "*.slurm_job_id" | while read -r f; do
259+
job_id=$(cat "$f")
260+
echo "Cancelling SLURM job $job_id"
261+
scancel "$job_id" 2>/dev/null || true
262+
done
263+
261264
- name: Compute Log Slug
262265
if: always()
263266
id: log
@@ -321,25 +324,28 @@ jobs:
321324
- name: Clone
322325
uses: actions/checkout@v4
323326
with:
324-
clean: false
327+
clean: true
325328

326329
- name: Pre-Build (SLURM)
327330
if: matrix.cluster == 'phoenix'
328331
run: bash .github/workflows/phoenix/submit.sh .github/scripts/prebuild-case-optimization.sh ${{ matrix.device }} ${{ matrix.interface }}
329332

330333
- name: Pre-Build (login node)
331334
if: matrix.cluster != 'phoenix'
332-
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3
333-
with:
334-
max_attempts: 3
335-
retry_wait_seconds: 60
336-
timeout_minutes: 120
337-
command: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
338-
on_retry_command: ./mfc.sh clean
335+
run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
339336

340337
- name: Run Case-Optimization Tests
341338
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }}
342339

340+
- name: Cancel SLURM Jobs
341+
if: cancelled()
342+
run: |
343+
find . -name "*.slurm_job_id" | while read -r f; do
344+
job_id=$(cat "$f")
345+
echo "Cancelling SLURM job $job_id"
346+
scancel "$job_id" 2>/dev/null || true
347+
done
348+
343349
- name: Print Logs
344350
if: always()
345351
run: |

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,4 +105,8 @@ benchmarks/*.png
105105
*.avi
106106

107107
**isolation_rules/
108-
**.supercode/
108+
**.supercode/
109+
# CCE stress-test log directories (local testing artifacts)
110+
cce_*/
111+
cce_*.log
112+
run_cce_*.sh

CMakeLists.txt

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -224,13 +224,24 @@ endif()
224224

225225
if (CMAKE_BUILD_TYPE STREQUAL "Release")
226226
# Processor tuning: Check if we can target the host's native CPU's ISA.
227-
CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE)
228-
if (SUPPORTS_MARCH_NATIVE)
229-
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>)
230-
else()
231-
CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE)
232-
if (SUPPORTS_MCPU_NATIVE)
233-
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>)
227+
# Skip for gcov builds — -march=native on newer CPUs (e.g. Granite Rapids)
228+
# can emit instructions the system assembler doesn't support.
229+
if (NOT MFC_GCov)
230+
CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE)
231+
if (SUPPORTS_MARCH_NATIVE)
232+
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>)
233+
# Disable AVX-512 FP16: gfortran >=12 emits vmovw instructions on
234+
# Granite Rapids CPUs, but binutils <2.38 cannot assemble them.
235+
# FP16 is unused in MFC's double-precision computations.
236+
CHECK_FORTRAN_COMPILER_FLAG("-mno-avx512fp16" SUPPORTS_MNO_AVX512FP16)
237+
if (SUPPORTS_MNO_AVX512FP16)
238+
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mno-avx512fp16>)
239+
endif()
240+
else()
241+
CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE)
242+
if (SUPPORTS_MCPU_NATIVE)
243+
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>)
244+
endif()
234245
endif()
235246
endif()
236247

@@ -397,6 +408,7 @@ HANDLE_SOURCES(simulation ON)
397408
HANDLE_SOURCES(post_process ON)
398409
HANDLE_SOURCES(syscheck OFF)
399410

411+
400412
# MFC_SETUP_TARGET: Given a target (herein <target>), this macro creates a new
401413
# executable <target> with the appropriate sources, compiler definitions, and
402414
# linked libraries (assuming HANDLE_SOURCES was called on <target>).
@@ -633,6 +645,23 @@ if (MFC_SIMULATION)
633645
MFC_SETUP_TARGET(TARGET simulation
634646
SOURCES "${simulation_SRCs}"
635647
MPI FFTW OpenACC OpenMP)
648+
# CCE 19.0.0 IPA workaround: two files trigger IPA crashes:
649+
# m_bubbles_EL: castIsValid assertion (InstCombine/foldIntegerTypedPHI)
650+
# m_phase_change: bring_routine_resident SIGSEGV
651+
# Disabling IPA per-file avoids the crashes while preserving IPA for
652+
# the rest of simulation (needed for thermochem INLINEALWAYS inlining).
653+
# Applied to Cray+OpenACC and Cray CPU, but NOT Cray+OpenMP: on OpenMP,
654+
# m_thermochem uses !DIR$ INLINEALWAYS (requires IPA), so disabling IPA
655+
# for these files breaks thermochem on-device calls. On OpenACC the
656+
# pyrometheus patch emits !$acc routine seq instead (no IPA needed).
657+
# See PR #1286.
658+
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND NOT MFC_OpenMP)
659+
set_source_files_properties(
660+
"${CMAKE_BINARY_DIR}/fypp/simulation/m_bubbles_EL.fpp.f90"
661+
"${CMAKE_BINARY_DIR}/fypp/simulation/m_phase_change.fpp.f90"
662+
PROPERTIES COMPILE_OPTIONS "-Oipa0"
663+
)
664+
endif()
636665
endif()
637666

638667
if (MFC_POST_PROCESS)

src/common/include/parallel_macros.fpp

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,18 +48,46 @@
4848

4949
#:enddef
5050

51-
#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, extraAccArgs=None, extraOmpArgs=None)
51+
#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, cray_noinline=False, extraAccArgs=None, extraOmpArgs=None)
5252
#:assert isinstance(cray_inline, bool)
53+
#:assert isinstance(cray_noinline, bool)
54+
#:assert not (cray_inline and cray_noinline), "cray_inline and cray_noinline are mutually exclusive"
5355
#:set acc_directive = ACC_ROUTINE(function_name=function_name, parallelism=parallelism, nohost=nohost, extraAccArgs=extraAccArgs)
5456
#:set omp_directive = OMP_ROUTINE(function_name=function_name, nohost=nohost, extraOmpArgs=extraOmpArgs)
5557

56-
#:if cray_inline == True
58+
#:if cray_noinline == True
59+
#:if not isinstance(function_name, str)
60+
#:stop "When using cray_noinline, function name must be given and given as a string"
61+
#:endif
62+
#:set cray_noinline_directive = ('!DIR$ NOINLINE ' + function_name).strip('\n')
63+
#ifdef _CRAYFTN
64+
#if MFC_OpenACC
65+
$:acc_directive
66+
#elif MFC_OpenMP
67+
$:omp_directive
68+
#else
69+
$:cray_noinline_directive
70+
#endif
71+
#! On non-Cray CPU builds (no _CRAYFTN, no MFC_OpenACC, no MFC_OpenMP), nothing is
72+
#! emitted — intentional, since !DIR$ NOINLINE is a Cray-specific directive.
73+
#elif MFC_OpenACC
74+
$:acc_directive
75+
#elif MFC_OpenMP
76+
$:omp_directive
77+
#endif
78+
#:elif cray_inline == True
5779
#:if not isinstance(function_name, str)
5880
#:stop "When inlining for Cray Compiler, function name must be given and given as a string"
5981
#:endif
6082
#:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n')
6183
#ifdef _CRAYFTN
84+
#if MFC_OpenACC
85+
$:acc_directive
86+
#elif MFC_OpenMP
87+
$:omp_directive
88+
#else
6289
$:cray_directive
90+
#endif
6391
#elif MFC_OpenACC
6492
$:acc_directive
6593
#elif MFC_OpenMP

0 commit comments

Comments
 (0)