Skip to content

Commit 4accfe0

Browse files
authored
Merge branch 'master' into local-aware-ibm
2 parents 8008855 + 45cd6bb commit 4accfe0

47 files changed

Lines changed: 1689 additions & 2042 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.coderabbit.yaml

Lines changed: 0 additions & 64 deletions
This file was deleted.

.github/CODEOWNERS

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,3 @@
11
# Spencer H. Bryngelson (sbryngelson) is the
22
# default owner of all files in the repository.
33
* @sbryngelson
4-
5-
# @core-devs
6-
src/ @MFlowCode/core-devs
7-
docs/ @MFlowCode/core-devs
8-
toolchain/ @MFlowCode/core-devs
9-
tests/ @MFlowCode/core-devs
10-
benchmarks/ @MFlowCode/core-devs
11-
mfc.sh @MFlowCode/core-devs
12-
CMakeLists.txt @MFlowCode/core-devs
13-
14-
# @physics-devs
15-
src/ @MFlowCode/physics-devs

.github/Dockerfile

Lines changed: 0 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ ARG CXX_COMPILER
77
ARG FC_COMPILER
88
ARG COMPILER_PATH
99
ARG COMPILER_LD_LIBRARY_PATH
10-
ARG AFAR_VERSION
11-
ARG OLCF_AFAR_ROOT=""
1210

1311
ENV DEBIAN_FRONTEND=noninteractive
1412
ENV TZ=UTC
@@ -29,47 +27,10 @@ RUN apt-get update -y && \
2927
python3.12 python3.12-venv python3-pip \
3028
libfftw3-dev \
3129
openmpi-bin libopenmpi-dev; \
32-
elif [ "$TARGET" = "amd" ]; then \
33-
apt-get install -y \
34-
build-essential git make gcc g++ gfortran bc \
35-
python3.12 python3.12-venv python3-pip \
36-
libfftw3-dev libnuma1 libdrm2 libdrm-amdgpu1; \
3730
fi && \
3831
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 && \
3932
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
4033

41-
# AMD: download AFAR, install cmake 3.28 (Ubuntu 22.04 ships 3.22 which doesn't
42-
# recognize amdflang as LLVMFlang), then build MPICH with amdflang so the
43-
# generated mpi.mod is compiler-compatible.
44-
RUN if [ "$TARGET" = "amd" ] && [ -n "$AFAR_VERSION" ]; then \
45-
OLCF_AFAR_ROOT="/opt/${AFAR_VERSION}" && \
46-
wget -q "https://repo.radeon.com/rocm/misc/flang/${AFAR_VERSION}-ubuntu.tar.bz2" -O /tmp/afar.tar.bz2 && \
47-
tar -xjf /tmp/afar.tar.bz2 -C /opt/ && \
48-
rm /tmp/afar.tar.bz2 && \
49-
CMAKE_VER=3.28.6 && \
50-
wget -q "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VER}/cmake-${CMAKE_VER}-linux-x86_64.sh" \
51-
-O /tmp/cmake-install.sh && \
52-
chmod +x /tmp/cmake-install.sh && \
53-
/tmp/cmake-install.sh --prefix=/usr/local --skip-license --exclude-subdir && \
54-
rm /tmp/cmake-install.sh && \
55-
printf '#!/bin/bash\nargs=()\nwhile [ "$#" -gt 0 ]; do\n if [ "$1" = "-soname" ]; then\n args+=("-Wl,-soname,$2"); shift 2\n else\n args+=("$1"); shift\n fi\ndone\nexec '"${OLCF_AFAR_ROOT}"'/bin/amdflang "${args[@]}"\n' \
56-
> /usr/local/bin/amdflang-ld-wrap && \
57-
chmod +x /usr/local/bin/amdflang-ld-wrap && \
58-
MPICH_VER=3.4.3 && \
59-
wget -q "https://www.mpich.org/static/downloads/${MPICH_VER}/mpich-${MPICH_VER}.tar.gz" \
60-
-O /tmp/mpich.tar.gz && \
61-
mkdir -p /tmp/mpich-src && \
62-
tar -xzf /tmp/mpich.tar.gz -C /tmp/mpich-src --strip-components=1 && \
63-
cd /tmp/mpich-src && \
64-
FC=/usr/local/bin/amdflang-ld-wrap CC=gcc CXX=g++ \
65-
./configure --prefix=/opt/mpich --enable-shared --disable-static \
66-
--with-device=ch3 2>&1 && \
67-
make -j$(nproc) 2>&1 && \
68-
make install 2>&1 && \
69-
cd / && \
70-
rm -rf /tmp/mpich-src /tmp/mpich.tar.gz; \
71-
fi
72-
7334
ENV OMPI_ALLOW_RUN_AS_ROOT=1
7435
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
7536
ENV HYDRA_LAUNCHER=fork
@@ -80,7 +41,6 @@ COPY ../ /opt/MFC
8041
ENV CC=${CC_COMPILER}
8142
ENV CXX=${CXX_COMPILER}
8243
ENV FC=${FC_COMPILER}
83-
ENV OLCF_AFAR_ROOT=${OLCF_AFAR_ROOT}
8444
ENV PATH="${COMPILER_PATH}:/opt/mpich/bin:$PATH"
8545
ENV LD_LIBRARY_PATH="${COMPILER_LD_LIBRARY_PATH}:/opt/mpich/lib:${LD_LIBRARY_PATH:-}"
8646

@@ -95,17 +55,13 @@ RUN echo "TARGET=$TARGET CC=$CC_COMPILER FC=$FC_COMPILER" && \
9555
cd /opt/MFC && \
9656
if [ "$TARGET" = "gpu" ]; then \
9757
./mfc.sh build --gpu acc -j $(nproc); \
98-
elif [ "$TARGET" = "amd" ]; then \
99-
./mfc.sh build --gpu mp -j $(nproc); \
10058
else \
10159
./mfc.sh build -j $(nproc); \
10260
fi
10361

10462
RUN cd /opt/MFC && \
10563
if [ "$TARGET" = "gpu" ]; then \
10664
./mfc.sh test -a --dry-run --gpu acc -j $(nproc); \
107-
elif [ "$TARGET" = "amd" ]; then \
108-
./mfc.sh test -a --dry-run --gpu mp -j $(nproc); \
10965
else \
11066
./mfc.sh test -a --dry-run -j $(nproc); \
11167
fi

.github/scripts/prebuild-case-optimization.sh

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
#!/bin/bash
22

3-
# Pre-builds all benchmark cases with --case-optimization.
4-
# No GPU hardware needed — compilation only.
3+
# Pre-builds all benchmark cases with --case-optimization using --dry-run so
4+
# binaries are cached before the GPU run job. No simulation is executed.
55
# Can run in two modes:
66
# 1. Direct (Frontier login nodes): pass cluster/device/interface as args
7-
# 2. Inside SLURM (Phoenix): uses $job_device/$job_interface from submit-slurm-job.sh
7+
# 2. Inside SLURM (Phoenix/frontier_amd): uses $job_device/$job_interface
88
# Usage: bash prebuild-case-optimization.sh [<cluster> <device> <interface>]
99

1010
set -e
@@ -22,14 +22,18 @@ case "$cluster" in
2222
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
2323
esac
2424

25-
source .github/scripts/clean-build.sh
26-
clean_build
25+
# Phoenix starts fresh (no prior dep build); other clusters pre-build deps via
26+
# build.sh first, so we must preserve them and only clean the hash-named MFC target dirs under build/staging and build/install.
27+
if [ "$cluster" = "phoenix" ]; then
28+
source .github/scripts/clean-build.sh
29+
clean_build
30+
else
31+
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
32+
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
33+
fi
2734

2835
. ./mfc.sh load -c "$flag" -m g
2936

30-
# Set GPU build flags from interface — this is always a GPU build.
31-
# Don't use gpu-opts.sh since $job_device may be "cpu" when submitted
32-
# to a CPU SLURM partition (no GPU hardware needed for compilation).
3337
case "$job_interface" in
3438
acc) gpu_opts="--gpu acc" ;;
3539
omp) gpu_opts="--gpu mp" ;;
@@ -38,5 +42,5 @@ esac
3842

3943
for case in benchmarks/*/case.py; do
4044
echo "=== Pre-building: $case ==="
41-
./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
45+
./mfc.sh run "$case" --case-optimization $gpu_opts -j 8 --dry-run
4246
done

.github/scripts/run_case_optimization.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,14 @@ benchmarks=(
2323

2424
# For Frontier: deps were fetched on the login node via --deps-only;
2525
# build case-optimized binaries here on the compute node before running.
26-
# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
26+
# For Phoenix and frontier_amd: prebuild-case-optimization.sh already built
27+
# everything in a prior SLURM job (via --dry-run), so skip the build here.
2728
#
2829
# Clean stale MFC target staging before building. On self-hosted CI runners,
2930
# corrupted intermediate files from a prior failed build (e.g. CCE optcg crash)
3031
# can persist and poison subsequent builds. Each case-opt config gets its own
3132
# hash-named staging dir, but install dirs and other artifacts may be stale.
32-
if [ "$job_cluster" != "phoenix" ]; then
33+
if [ "$job_cluster" != "phoenix" ] && [ "$job_cluster" != "frontier_amd" ]; then
3334
# Clean stale MFC target dirs (hash-named) from prior builds, but
3435
# preserve dependency dirs (hipfort, fftw, etc.) since the compute
3536
# node has no internet to re-fetch them.

.github/workflows/docker.yml

Lines changed: 2 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ jobs:
2828
- { name: 'cpu', tag: 'cpu', runner: 'ubuntu-22.04-arm', base_image: 'ubuntu:22.04' }
2929
- { name: 'gpu', tag: 'gpu-nvidia', runner: 'ubuntu-22.04', base_image: 'nvcr.io/nvidia/nvhpc:24.5-devel-cuda_multi-ubuntu22.04', compiler_arch: 'Linux_x86_64' }
3030
- { name: 'gpu', tag: 'gpu-nvidia', runner: 'ubuntu-22.04-arm', base_image: 'nvcr.io/nvidia/nvhpc:24.5-devel-cuda_multi-ubuntu22.04', compiler_arch: 'Linux_aarch64' }
31-
- { name: 'amd', tag: 'gpu-amd', runner: 'ubuntu-22.04', base_image: 'ubuntu:22.04' }
3231
runs-on: ${{ matrix.config.runner }}
3332
outputs:
3433
tag: ${{ steps.clone.outputs.tag }}
@@ -135,37 +134,6 @@ jobs:
135134
${{ env.GH_REGISTRY }}:${{ env.TAG }}-${{ matrix.config.tag }}-${{ matrix.config.runner }}
136135
push: true
137136

138-
- name: Set AMD AFAR vars
139-
if: ${{ matrix.config.name == 'amd' }}
140-
run: |
141-
AFAR=rocm-afar-8873-drop-22.2.0
142-
ROOT="/opt/${AFAR}"
143-
echo "AFAR_VERSION=${AFAR}" >> $GITHUB_ENV
144-
echo "AFAR_ROOT=${ROOT}" >> $GITHUB_ENV
145-
146-
- name: Build and push image (amd)
147-
if: ${{ matrix.config.name == 'amd' }}
148-
uses: docker/build-push-action@v6
149-
with:
150-
context: /mnt/share
151-
file: /mnt/share/Dockerfile
152-
build-args: |
153-
BASE_IMAGE=${{ matrix.config.base_image }}
154-
TARGET=amd
155-
AFAR_VERSION=${{ env.AFAR_VERSION }}
156-
OLCF_AFAR_ROOT=${{ env.AFAR_ROOT }}
157-
CC_COMPILER=gcc
158-
CXX_COMPILER=g++
159-
FC_COMPILER=${{ env.AFAR_ROOT }}/bin/amdflang
160-
COMPILER_PATH=${{ env.AFAR_ROOT }}/lib/llvm/bin:${{ env.AFAR_ROOT }}/bin
161-
COMPILER_LD_LIBRARY_PATH=${{ env.AFAR_ROOT }}/lib:${{ env.AFAR_ROOT }}/lib/llvm/lib
162-
labels: |
163-
org.opencontainers.image.source=https://github.com/${{ github.repository }}
164-
tags: |
165-
${{ secrets.DOCKERHUB_USERNAME }}/mfc:${{ env.TAG }}-gpu-amd-ubuntu-22.04
166-
${{ env.GH_REGISTRY }}:${{ env.TAG }}-gpu-amd-ubuntu-22.04
167-
push: true
168-
169137
manifests:
170138
runs-on: ubuntu-latest
171139
needs: Container
@@ -201,9 +169,8 @@ jobs:
201169
run: |
202170
GH="${{ env.GH_REGISTRY }}"
203171
for R in "$DH" "$GH"; do
204-
docker buildx imagetools create -t $R:$TAG-cpu $R:$TAG-cpu-ubuntu-22.04 $R:$TAG-cpu-ubuntu-22.04-arm
172+
docker buildx imagetools create -t $R:$TAG-cpu $R:$TAG-cpu-ubuntu-22.04 $R:$TAG-cpu-ubuntu-22.04-arm
205173
docker buildx imagetools create -t $R:$TAG-gpu-nvidia $R:$TAG-gpu-nvidia-ubuntu-22.04 $R:$TAG-gpu-nvidia-ubuntu-22.04-arm
206-
docker buildx imagetools create -t $R:$TAG-gpu-amd $R:$TAG-gpu-amd-ubuntu-22.04
207174
done
208175
209176
- name: Update latest tags
@@ -214,7 +181,6 @@ jobs:
214181
run: |
215182
GH="${{ env.GH_REGISTRY }}"
216183
for R in "$DH" "$GH"; do
217-
docker buildx imagetools create -t $R:latest-cpu $R:$TAG-cpu-ubuntu-22.04 $R:$TAG-cpu-ubuntu-22.04-arm
184+
docker buildx imagetools create -t $R:latest-cpu $R:$TAG-cpu-ubuntu-22.04 $R:$TAG-cpu-ubuntu-22.04-arm
218185
docker buildx imagetools create -t $R:latest-gpu-nvidia $R:$TAG-gpu-nvidia-ubuntu-22.04 $R:$TAG-gpu-nvidia-ubuntu-22.04-arm
219-
docker buildx imagetools create -t $R:latest-gpu-amd $R:$TAG-gpu-amd-ubuntu-22.04
220186
done

.github/workflows/fp-stability.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,12 @@ jobs:
101101
run: ~/.local/verrou/bin/valgrind --version
102102

103103
- name: Build MFC (debug, serial)
104+
# FFLAGS=-fno-inline prevents gfortran from inlining small functions into
105+
# their callers. Without it, DWARF debug info attributes inlined ops to
106+
# the caller's line (often a do-loop header), making Verrou dd_line point
107+
# to loop boundaries instead of the actual arithmetic.
108+
env:
109+
FFLAGS: "-fno-inline"
104110
run: ./mfc.sh build -t pre_process simulation --no-mpi --debug -j"$(nproc)"
105111

106112
- name: Run FP Stability Suite

.github/workflows/test.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -659,12 +659,16 @@ jobs:
659659
if: matrix.cluster == 'phoenix'
660660
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}
661661

662+
- name: Pre-Build (SLURM)
663+
if: matrix.cluster == 'frontier_amd'
664+
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh gpu ${{ matrix.interface }} ${{ matrix.cluster }}
665+
662666
- name: Build & Run Case-Optimization Tests
663-
if: matrix.cluster != 'phoenix'
667+
if: matrix.cluster != 'phoenix' && matrix.cluster != 'frontier_amd'
664668
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
665669

666670
- name: Run Case-Optimization Tests
667-
if: matrix.cluster == 'phoenix'
671+
if: matrix.cluster == 'phoenix' || matrix.cluster == 'frontier_amd'
668672
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
669673

670674
- name: Cancel SLURM Jobs

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,5 @@ cce_*/
114114
cce_*.log
115115
run_cce_*.sh
116116
.ffmt_cache/
117+
# FP-stability log artifacts (generated by ./mfc.sh fp-stability)
118+
fp-stability-logs/

.typos.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ tru = "tru" # typo for "true" in "when_tru" - tests dependency keys
3131
PNGs = "PNGs"
3232

3333
[files]
34-
extend-exclude = ["docs/documentation/references*", "docs/references.bib", "tests/", "toolchain/cce_simulation_workgroup_256.sh", "build-docs/", "build/", "build_test/"]
34+
extend-exclude = ["docs/documentation/references*", "docs/references.bib", "tests/", "toolchain/cce_simulation_workgroup_256.sh", "build-docs/", "build/", "build_test/", "fp-stability-logs/"]

0 commit comments

Comments
 (0)