Skip to content

Commit f0e117d

Browse files
Merge branch 'MFlowCode:master' into master
2 parents 16bd456 + c76be93 commit f0e117d

107 files changed

Lines changed: 13850 additions & 20898 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.ffmt.toml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# MFC Fortran formatting configuration
2+
# These are the defaults — this file makes them explicit.
3+
4+
indent-width = 4
5+
keyword-case = "lower"
6+
normalize-keywords = true
7+
indent-fypp = true
8+
9+
[whitespace]
10+
relational = true
11+
logical = true
12+
plusminus = true
13+
multdiv = false
14+
power = false
15+
assignment = true
16+
declaration = true
17+
comma = true
18+
slice-colon = false

.github/scripts/run_case_optimization.sh

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then
1313
ngpus=1
1414
fi
1515

16-
# Verify the venv Python interpreter exists (created by ./mfc.sh build)
17-
if [ ! -x build/venv/bin/python3 ]; then
18-
echo "ERROR: build/venv/bin/python3 not found."
19-
echo "The MFC build venv may not have been created. Was the pre-build step successful?"
20-
exit 1
21-
fi
22-
2316
benchmarks=(
2417
benchmarks/5eq_rk3_weno3_hllc/case.py
2518
benchmarks/viscous_weno5_sgb_acoustic/case.py
@@ -28,6 +21,30 @@ benchmarks=(
2821
benchmarks/igr/case.py
2922
)
3023

24+
# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only;
25+
# build case-optimized binaries here on the compute node before running.
26+
# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
27+
#
28+
# Clean stale MFC target staging before building. On self-hosted CI runners,
29+
# corrupted intermediate files from a prior failed build (e.g. CCE optcg crash)
30+
# can persist and poison subsequent builds. Each case-opt config gets its own
31+
# hash-named staging dir, but install dirs and other artifacts may be stale.
32+
if [ "$job_cluster" != "phoenix" ]; then
33+
# Clean stale MFC target dirs (hash-named) from prior builds, but
34+
# preserve dependency dirs (hipfort, fftw, etc.) since the compute
35+
# node has no internet to re-fetch them.
36+
echo "=== Cleaning stale MFC target staging/install ==="
37+
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
38+
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
39+
40+
echo "=== Building case-optimized binaries on compute node ==="
41+
for case in "${benchmarks[@]}"; do
42+
echo "--- Building: $case ---"
43+
./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
44+
done
45+
echo "=== All case-optimized binaries built ==="
46+
fi
47+
3148
passed=0
3249
failed=0
3350
failed_cases=""
@@ -44,7 +61,7 @@ for case in "${benchmarks[@]}"; do
4461
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"
4562

4663
# Build + run with --case-optimization, small grid, 10 timesteps
47-
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
64+
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -c "$job_cluster" -- --gbpp 1 --steps 10; then
4865
# Validate output
4966
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
5067
echo "PASS: $case_name"

.github/workflows/bench.yml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,42 +68,47 @@ jobs:
6868
flag: f
6969
device: gpu
7070
interface: acc
71-
build_script: "bash .github/workflows/frontier/build.sh gpu acc bench"
71+
build_script: "bash .github/workflows/frontier/build.sh gpu acc"
7272
- cluster: frontier
7373
name: Oak Ridge | Frontier (CCE)
7474
group: phoenix
7575
labels: frontier
7676
flag: f
7777
device: gpu
7878
interface: omp
79-
build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
79+
build_script: "bash .github/workflows/frontier/build.sh gpu omp"
8080
- cluster: frontier_amd
8181
name: Oak Ridge | Frontier (AMD)
8282
group: phoenix
8383
labels: frontier
8484
flag: famd
8585
device: gpu
8686
interface: omp
87-
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
87+
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp"
8888
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
8989
runs-on:
9090
group: ${{ matrix.group }}
9191
labels: ${{ matrix.labels }}
9292
timeout-minutes: 480
9393
steps:
94+
- name: Clean stale output files
95+
run: rm -f *.out
96+
9497
- name: Clone - PR
9598
uses: actions/checkout@v4
9699
with:
97100
path: pr
101+
clean: false
98102

99103
- name: Clone - Master
100104
uses: actions/checkout@v4
101105
with:
102106
repository: MFlowCode/MFC
103107
ref: master
104108
path: master
109+
clean: false
105110

106-
- name: Setup & Build
111+
- name: Fetch Dependencies
107112
if: matrix.build_script != ''
108113
timeout-minutes: 150
109114
run: |

.github/workflows/common/bench.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,18 @@ if [ "$job_cluster" = "phoenix" ]; then
2121
trap 'rm -rf "$currentdir" || true' EXIT
2222
fi
2323

24-
# --- Build (if not pre-built on login node) ---
25-
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
24+
# --- Build ---
25+
# Phoenix builds everything inside SLURM (no login-node build step).
26+
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
27+
# source code is built here on the compute node.
2628
# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
2729
if [ "$job_cluster" = "phoenix" ]; then
2830
source .github/scripts/clean-build.sh
2931
clean_build
3032
fi
3133

32-
if [ ! -d "build" ]; then
33-
source .github/scripts/retry-build.sh
34-
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
35-
fi
34+
source .github/scripts/retry-build.sh
35+
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
3636

3737
# --- Bench cluster flag ---
3838
if [ "$job_cluster" = "phoenix" ]; then

.github/workflows/common/build.sh

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/bin/bash
2+
# Build-only script for all clusters.
3+
# Runs inside a SLURM job via submit-slurm-job.sh.
4+
# Builds MFC by invoking `./mfc.sh test` with --dry-run, so no tests are executed.
5+
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster
6+
7+
set -euo pipefail
8+
9+
source .github/scripts/gpu-opts.sh
10+
build_opts="$gpu_opts"
11+
12+
# --- Phoenix TMPDIR setup ---
13+
if [ "$job_cluster" = "phoenix" ]; then
14+
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
15+
currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
16+
mkdir -p $tmpbuild
17+
mkdir -p $currentdir
18+
export TMPDIR=$currentdir
19+
trap 'rm -rf "$currentdir" || true' EXIT
20+
fi
21+
22+
# --- Build ---
23+
# Phoenix builds everything inside SLURM (no login-node build step).
24+
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
25+
# source code is built here on the compute node.
26+
# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled
27+
# on a different microarchitecture.
28+
if [ "$job_cluster" = "phoenix" ]; then
29+
source .github/scripts/clean-build.sh
30+
clean_build
31+
fi
32+
33+
source .github/scripts/retry-build.sh
34+
35+
# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
36+
# (SIGILL from binaries compiled on a different compute node).
37+
validate_cmd=""
38+
if [ "$job_cluster" = "phoenix" ]; then
39+
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
40+
fi
41+
42+
RETRY_VALIDATE_CMD="$validate_cmd" \
43+
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1

.github/workflows/common/test.sh

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/bin/bash
2-
# Unified test script for all clusters.
2+
# Test-only script for all clusters.
33
# Runs inside a SLURM job via submit-slurm-job.sh.
4+
# Assumes MFC is already built (by a prior build.sh SLURM job).
45
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster
56

67
set -euo pipefail
@@ -9,9 +10,6 @@ source .github/scripts/gpu-opts.sh
910
build_opts="$gpu_opts"
1011

1112
# --- Phoenix TMPDIR setup ---
12-
# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
13-
# spawning MPI processes, it fills up and ORTE session dir creation fails.
14-
# Redirect TMPDIR to project storage, same as bench.sh.
1513
if [ "$job_cluster" = "phoenix" ]; then
1614
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
1715
currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
@@ -21,29 +19,6 @@ if [ "$job_cluster" = "phoenix" ]; then
2119
trap 'rm -rf "$currentdir" || true' EXIT
2220
fi
2321

24-
# --- Build (if not pre-built on login node) ---
25-
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
26-
# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
27-
# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
28-
if [ "$job_cluster" = "phoenix" ]; then
29-
source .github/scripts/clean-build.sh
30-
clean_build
31-
fi
32-
33-
if [ ! -d "build" ]; then
34-
source .github/scripts/retry-build.sh
35-
36-
# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
37-
# (SIGILL from binaries compiled on a different compute node).
38-
validate_cmd=""
39-
if [ "$job_cluster" = "phoenix" ]; then
40-
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
41-
fi
42-
43-
RETRY_VALIDATE_CMD="$validate_cmd" \
44-
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
45-
fi
46-
4722
# --- GPU detection and thread count ---
4823
device_opts=""
4924
rdma_opts=""
@@ -88,4 +63,4 @@ if [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
8863
prune_flag="--only-changes"
8964
fi
9065

91-
./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster
66+
./mfc.sh test -v --max-attempts 3 --no-build $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster

.github/workflows/frontier/build.sh

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ esac
1414

1515
job_device=$1
1616
job_interface=$2
17-
run_bench=$3
1817
source .github/scripts/gpu-opts.sh
1918
build_opts="$gpu_opts"
2019

@@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh
2423
clean_build
2524

2625
source .github/scripts/retry-build.sh
27-
if [ "$run_bench" == "bench" ]; then
28-
retry_build ./mfc.sh build -j 8 $build_opts || exit 1
29-
else
30-
retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1
31-
fi
26+
retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1

.github/workflows/test.yml

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ jobs:
162162
os: ['ubuntu', 'macos']
163163
mpi: ['mpi']
164164
precision: ['']
165-
debug: ['debug', 'no-debug']
165+
debug: ['reldebug', 'no-debug']
166166
intel: [true, false]
167167
exclude:
168168
- os: macos
@@ -225,7 +225,7 @@ jobs:
225225
if: matrix.os == 'macos'
226226
run: |
227227
brew update
228-
brew upgrade
228+
brew upgrade || true
229229
brew install coreutils python fftw hdf5 gcc@15 boost open-mpi lapack
230230
echo "FC=gfortran-15" >> $GITHUB_ENV
231231
echo "BOOST_INCLUDE=/opt/homebrew/include/" >> $GITHUB_ENV
@@ -245,14 +245,20 @@ jobs:
245245
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
246246
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
247247
sudo apt-get update
248-
sudo apt-get install -y intel-oneapi-compiler-fortran intel-oneapi-mpi intel-oneapi-mpi-devel
248+
sudo apt-get install -y intel-oneapi-compiler-fortran intel-oneapi-compiler-dpcpp-cpp intel-oneapi-mpi intel-oneapi-mpi-devel
249249
# Export only new/changed env vars from setvars.sh.
250250
# `printenv >> $GITHUB_ENV` dumps all vars including shell internals
251251
# with special characters that corrupt GITHUB_ENV parsing.
252252
printenv | sort > /tmp/env_before
253253
source /opt/intel/oneapi/setvars.sh
254254
printenv | sort > /tmp/env_after
255255
diff /tmp/env_before /tmp/env_after | grep '^>' | sed 's/^> //' >> $GITHUB_ENV
256+
echo "FC=ifx" >> $GITHUB_ENV
257+
echo "CC=icx" >> $GITHUB_ENV
258+
echo "CXX=icpx" >> $GITHUB_ENV
259+
echo "MPIFC=mpiifx" >> $GITHUB_ENV
260+
echo "MPICC=mpiicx" >> $GITHUB_ENV
261+
echo "MPICXX=mpiicpx" >> $GITHUB_ENV
256262
257263
- name: Build
258264
run: |
@@ -266,7 +272,7 @@ jobs:
266272
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $ONLY_CHANGES $TEST_ALL $TEST_PCT
267273
env:
268274
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
269-
TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
275+
TEST_PCT: ${{ matrix.debug == 'reldebug' && '-% 20' || '' }}
270276
ONLY_CHANGES: ${{ github.event_name == 'pull_request' && '--only-changes' || '' }}
271277

272278
self:
@@ -400,11 +406,14 @@ jobs:
400406
echo "Coverage cache: none available — full test suite will run"
401407
fi
402408
403-
- name: Build (login node)
409+
- name: Fetch Dependencies
404410
if: matrix.cluster != 'phoenix'
405411
timeout-minutes: 60
406412
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
407413

414+
- name: Build
415+
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/build.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}
416+
408417
- name: Test
409418
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}
410419

@@ -421,23 +430,29 @@ jobs:
421430
if: always()
422431
id: log
423432
run: |
424-
SLUG="test-${{ matrix.device }}-${{ matrix.interface }}"
433+
SHARD_SUFFIX=""
425434
SHARD="${{ matrix.shard }}"
426435
if [ -n "$SHARD" ]; then
427-
SLUG="${SLUG}-$(echo "$SHARD" | sed 's|/|-of-|')"
436+
SHARD_SUFFIX="-$(echo "$SHARD" | sed 's|/|-of-|')"
428437
fi
429-
echo "slug=${SLUG}" >> "$GITHUB_OUTPUT"
438+
echo "build_slug=build-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
439+
echo "test_slug=test-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
430440
431441
- name: Print Logs
432442
if: always()
433-
run: cat ${{ steps.log.outputs.slug }}.out
443+
run: |
444+
for f in ${{ steps.log.outputs.build_slug }}.out ${{ steps.log.outputs.test_slug }}.out; do
445+
[ -f "$f" ] && echo "=== $f ===" && cat "$f"
446+
done
434447
435448
- name: Archive Logs
436449
uses: actions/upload-artifact@v4
437450
if: matrix.cluster != 'phoenix'
438451
with:
439-
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }}
440-
path: ${{ steps.log.outputs.slug }}.out
452+
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.test_slug }}
453+
path: |
454+
${{ steps.log.outputs.build_slug }}.out
455+
${{ steps.log.outputs.test_slug }}.out
441456
442457
case-optimization:
443458
name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})"
@@ -486,15 +501,20 @@ jobs:
486501
- name: Clean stale output files
487502
run: rm -f *.out
488503

504+
- name: Fetch Dependencies
505+
if: matrix.cluster != 'phoenix'
506+
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
507+
489508
- name: Pre-Build (SLURM)
490509
if: matrix.cluster == 'phoenix'
491510
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}
492511

493-
- name: Pre-Build (login node)
512+
- name: Build & Run Case-Optimization Tests
494513
if: matrix.cluster != 'phoenix'
495-
run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
514+
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
496515

497516
- name: Run Case-Optimization Tests
517+
if: matrix.cluster == 'phoenix'
498518
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
499519

500520
- name: Cancel SLURM Jobs

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,4 @@ benchmarks/*.png
113113
cce_*/
114114
cce_*.log
115115
run_cce_*.sh
116+
.ffmt_cache/

0 commit comments

Comments
 (0)