Skip to content

Concurrent Frontier CI: parallel test+bench, consolidated SLURM jobs #4171

Concurrent Frontier CI: parallel test+bench, consolidated SLURM jobs

Concurrent Frontier CI: parallel test+bench, consolidated SLURM jobs #4171

Workflow file for this run

# Display name of the workflow.
name: Test Suite

# Triggers: pushes to master, any pull request, and manual dispatch.
on:
  push:
    branches:
      - master
  pull_request:
  workflow_dispatch:

# Runs are grouped by workflow name + ref; starting a new run in a group
# cancels the run of that group that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
lint-gate:
  # Formatting, spelling and lint checks. The github, self and bench jobs
  # list this job in their `needs`.
  name: Lint Gate
  runs-on: ubuntu-latest
  steps:
    - name: Clone
      uses: actions/checkout@v4

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"

    - name: Initialize MFC
      run: ./mfc.sh init

    # Re-run the formatter, then fail if it changed any tracked file.
    - name: Check Formatting
      run: |
        ./mfc.sh format -j $(nproc)
        git diff --exit-code || (echo "::error::Code is not formatted. Run './mfc.sh format' locally." && exit 1)

    - name: Spell Check
      run: ./mfc.sh spelling

    - name: Lint Toolchain
      run: ./mfc.sh lint

    # The three steps below use `! grep`: the leading `!` inverts grep's exit
    # status, so a step fails when grep finds a match under ./src.

    # Raw `!$acc` / `!$omp` directives are rejected everywhere except the
    # excluded macro and syscheck files.
    - name: Lint Source - No Raw Directives
      run: |
        ! grep -iR '!\$acc\|!\$omp' --exclude="parallel_macros.fpp" --exclude="acc_macros.fpp" --exclude="omp_macros.fpp" --exclude="shared_parallel_macros.fpp" --exclude="syscheck.fpp" ./src/*

    # Rejects the listed double-precision spellings outside the excluded
    # syscheck directory and nvtx / precision_select files.
    - name: Lint Source - No Double Precision Intrinsics
      run: |
        ! grep -iR 'double_precision\|dsqrt\|dexp\|dlog\|dble\|dabs\|double\ precision\|real(8)\|real(4)\|dprod\|dmin\|dmax\|dfloat\|dreal\|dcos\|dsin\|dtan\|dsign\|dtanh\|dsinh\|dcosh\|d0' --exclude-dir=syscheck --exclude="*nvtx*" --exclude="*precision_select*" ./src/*

    # Rejects literal `...`, `---` and `===` sequences in the sources.
    - name: Lint Source - No Junk Code
      run: |
        ! grep -iR -e '\.\.\.' -e '\-\-\-' -e '===' ./src/*
file-changes:
  # Runs dorny/paths-filter with the filters file .github/file-filter.yml and
  # re-exports its `checkall` output as a job output, which other jobs read
  # through `needs.file-changes.outputs.checkall`.
  name: Detect File Changes
  runs-on: ubuntu-latest
  outputs:
    checkall: ${{ steps.changes.outputs.checkall }}
  steps:
    - name: Clone
      uses: actions/checkout@v4

    - name: Detect Changes
      id: changes
      uses: dorny/paths-filter@v3
      with:
        filters: '.github/file-filter.yml'
github:
  # Build and test on GitHub-hosted runners across an OS / debug / Intel
  # matrix. Skipped unless file-changes reports checkall == 'true'.
  name: Github
  needs: [lint-gate, file-changes]
  if: needs.file-changes.outputs.checkall == 'true'
  runs-on: ${{ matrix.os }}-latest
  # A failing leg of this job does not fail the workflow run.
  continue-on-error: true
  strategy:
    fail-fast: false
    matrix:
      os: ['ubuntu', 'macos']
      mpi: ['mpi']
      precision: ['']
      debug: ['debug', 'no-debug']
      intel: [true, false]
      exclude:
        # No Intel leg on macOS.
        - os: macos
          intel: true
      include:
        # One additional leg: single precision without MPI.
        - os: ubuntu
          mpi: no-mpi
          precision: single
          debug: no-debug
          intel: false
  steps:
    - name: Clone
      uses: actions/checkout@v4

    # Homebrew dependencies; FC and BOOST_INCLUDE are exported to later steps
    # through $GITHUB_ENV.
    - name: Setup MacOS
      if: matrix.os == 'macos'
      run: |
        brew update
        brew upgrade
        brew install coreutils python fftw hdf5 gcc@15 boost open-mpi lapack
        echo "FC=gfortran-15" >> $GITHUB_ENV
        echo "BOOST_INCLUDE=/opt/homebrew/include/" >> $GITHUB_ENV

    - name: Setup Ubuntu
      if: matrix.os == 'ubuntu' && matrix.intel == false
      run: |
        sudo apt update -y
        sudo apt install -y cmake gcc g++ python3 python3-dev hdf5-tools \
        libfftw3-dev libhdf5-dev openmpi-bin libopenmpi-dev \
        libblas-dev liblapack-dev

    # Installs the oneAPI Fortran compiler and MPI, sources setvars.sh, then
    # appends the resulting environment to $GITHUB_ENV so later steps see it.
    - name: Setup Ubuntu (Intel)
      if: matrix.os == 'ubuntu' && matrix.intel == true
      run: |
        wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
        sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
        sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
        sudo apt-get update
        sudo apt-get install -y intel-oneapi-compiler-fortran intel-oneapi-mpi intel-oneapi-mpi-devel
        source /opt/intel/oneapi/setvars.sh
        printenv >> $GITHUB_ENV

    - name: Set up Python 3.14
      uses: actions/setup-python@v5
      with:
        python-version: '3.14'

    # `--dry-run` invocation of the test command, used here as the build step.
    - name: Build
      run: |
        /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} $PRECISION $TEST_ALL
      env:
        # Only emit a precision flag when the matrix value is non-empty.
        # Interpolating `--${{ matrix.precision }}` directly produced a bare
        # `--` (the conventional end-of-options marker) for the default ''
        # value, placed right before $TEST_ALL.
        # NOTE(review): how mfc.sh treats a bare `--` is not visible in this
        # file; confirm the Build step behaves as intended with --test-all.
        PRECISION: ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }}
        TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}

    # Debug legs additionally pass `-% 20`.
    - name: Test
      run: |
        /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
      env:
        TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
        TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
self:
  # Test runs on the self-hosted cluster runners. Only runs in the
  # MFlowCode/MFC repository and when file-changes reports checkall == 'true'.
  # The "-<interface>" suffix is left out of the job name when the interface
  # is 'none'.
  name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
  needs: [lint-gate, file-changes]
  if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
  timeout-minutes: 480
  continue-on-error: false
  runs-on:
    group: phoenix
    labels: ${{ matrix.runner }}
  strategy:
    matrix:
      include:
        # Phoenix (GT) — build+test combined in SLURM job
        - runner: gt
          cluster: phoenix
          cluster_name: Georgia Tech | Phoenix
          device: gpu
          interface: acc
        - runner: gt
          cluster: phoenix
          cluster_name: Georgia Tech | Phoenix
          device: gpu
          interface: omp
        - runner: gt
          cluster: phoenix
          cluster_name: Georgia Tech | Phoenix
          device: cpu
          interface: none
        # Frontier (ORNL) — all configs consolidated into one 5-node SLURM job
        - runner: frontier
          cluster: frontier_all
          cluster_name: Oak Ridge | Frontier
          device: all
          interface: configs
  env:
    # NODE_OPTIONS is only given a value on the Phoenix legs.
    NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
    ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
    ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
  steps:
    - name: Clone
      uses: actions/checkout@v4

    # Separate build step for clusters other than 'phoenix' and
    # 'frontier_all'; no entry of the current matrix satisfies this condition.
    - name: Build
      if: matrix.cluster != 'phoenix' && matrix.cluster != 'frontier_all'
      run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

    # Passes the cluster's test.sh plus device/interface to its submit.sh.
    - name: Test
      if: matrix.cluster != 'frontier_all'
      run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}

    - name: Build & Test (Frontier All Configs)
      if: matrix.cluster == 'frontier_all'
      run: bash .github/scripts/run_frontier_all_tests.sh

    # Best effort: missing log files are not an error.
    - name: Print Logs
      if: always()
      run: |
        cat test-*.out 2>/dev/null || true

    # Logs are uploaded for every cluster except 'phoenix'.
    - name: Archive Logs
      if: always() && matrix.cluster != 'phoenix'
      uses: actions/upload-artifact@v4
      with:
        name: logs-${{ matrix.cluster }}
        path: test-*.out
bench:
  # Benchmarks the PR checkout against master on the self-hosted cluster
  # runners. Only runs in the MFlowCode/MFC repository and when file-changes
  # reports checkall == 'true'.
  name: "Benchmark | ${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
  if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
  needs: [lint-gate, file-changes]
  continue-on-error: false
  timeout-minutes: 480
  strategy:
    fail-fast: true
    matrix:
      include:
        - cluster: phoenix
          name: Georgia Tech | Phoenix (NVHPC)
          group: phoenix
          labels: gt
          flag: p
          device: cpu
          interface: none
          build_script: ""
        - cluster: phoenix
          name: Georgia Tech | Phoenix (NVHPC)
          group: phoenix
          labels: gt
          flag: p
          device: gpu
          interface: acc
          build_script: ""
        - cluster: phoenix
          name: Georgia Tech | Phoenix (NVHPC)
          group: phoenix
          labels: gt
          flag: p
          device: gpu
          interface: omp
          build_script: ""
        # Frontier — all configs consolidated into one 6-node SLURM job
        - cluster: frontier_all
          name: Oak Ridge | Frontier
          group: phoenix
          labels: frontier
          device: all
          interface: configs
          build_script: ""
  runs-on:
    group: ${{ matrix.group }}
    labels: ${{ matrix.labels }}
  env:
    ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
    ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
  steps:
    # The workflow's own ref is checked out into ./pr and master into ./master.
    - name: Clone - PR
      uses: actions/checkout@v4
      with:
        path: pr

    - name: Clone - Master
      uses: actions/checkout@v4
      with:
        repository: MFlowCode/MFC
        ref: master
        path: master

    # Runs the matrix build script in both checkouts in parallel and waits
    # for both. Every current matrix entry has an empty build_script, so this
    # step is skipped for all of them.
    - name: Setup & Build
      if: matrix.build_script != '' && matrix.cluster != 'frontier_all'
      run: |
        (cd pr && ${{ matrix.build_script }}) &
        (cd master && ${{ matrix.build_script }}) &
        wait %1 && wait %2

    - name: Bench (Master v. PR)
      if: matrix.cluster != 'frontier_all'
      run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

    - name: Bench All Configs (Frontier)
      if: matrix.cluster == 'frontier_all'
      run: bash pr/.github/scripts/run_frontier_all_benchmarks.sh

    # `mfc.sh load` is sourced, so it only affects the shell that sources it.
    # It must share a subshell with bench_diff; in a separate `( ... )` its
    # effects would be discarded before bench_diff starts.
    - name: Generate & Post Comment
      if: matrix.cluster != 'frontier_all'
      run: |
        (cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml)

    - name: Generate & Post Comments (Frontier)
      if: matrix.cluster == 'frontier_all'
      run: bash pr/.github/scripts/frontier_bench_post.sh

    # Best effort: missing files are not an error.
    - name: Print Logs
      if: always()
      run: |
        cat pr/bench-*.* master/bench-*.* 2>/dev/null || true
        cat pr-*/bench-*.* master-*/bench-*.* 2>/dev/null || true

    # Benchmark files are uploaded for every cluster except 'phoenix'.
    - name: Archive Logs
      if: always() && matrix.cluster != 'phoenix'
      uses: actions/upload-artifact@v4
      with:
        name: bench-${{ matrix.cluster }}
        path: |
          pr*/bench-*.*
          master*/bench-*.*
cancel-on-test-failure:
  # Cancels the whole workflow run once the `self` job has failed
  # (`if: failure()` combined with `needs: [self]`).
  name: Cancel on Test Failure
  needs: [self]
  if: failure()
  runs-on: ubuntu-latest
  # Cancelling a run needs write access to Actions. Declaring it here keeps
  # the job from depending on the repository's default token permissions,
  # which may be read-only. Permissions not listed are set to none; this job
  # uses no other scope.
  permissions:
    actions: write
  steps:
    - name: Cancel Workflow Run
      run: gh run cancel ${{ github.run_id }} --repo ${{ github.repository }}
      env:
        # The gh CLI reads its token from GH_TOKEN.
        GH_TOKEN: ${{ github.token }}