Skip to content

Commit bc0f282

Browse files
authored
Merge branch 'master' into ci-test
2 parents a7219b3 + 356b61f commit bc0f282

115 files changed

Lines changed: 1653 additions & 506 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,16 @@ else
5252
echo "Master job completed successfully"
5353
fi
5454

55-
# Check if either job failed
55+
# Warn if either job failed (partial results may still be usable)
5656
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
57-
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
58-
exit 1
57+
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
58+
echo "Checking for partial results..."
59+
else
60+
echo "=========================================="
61+
echo "Both benchmark jobs completed successfully!"
62+
echo "=========================================="
5963
fi
6064

61-
echo "=========================================="
62-
echo "Both benchmark jobs completed successfully!"
63-
echo "=========================================="
64-
6565
# Final verification that output files exist before proceeding
6666
pr_yaml="pr/bench-${device}-${interface}.yaml"
6767
master_yaml="master/bench-${device}-${interface}.yaml"
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash
2+
# Sets up a persistent build cache for self-hosted CI runners.
3+
# Creates a symlink: ./build -> /storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/<key>/build
4+
#
5+
# Each runner gets its own cache keyed by (cluster, device, interface, runner).
6+
# This avoids cross-runner path issues entirely — CMake's absolute paths are
7+
# always correct because the same runner always uses the same workspace path.
8+
#
9+
# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> [<interface>]  (interface defaults to "none"; RUNNER_NAME must be set)
10+
11+
_cache_cluster="${1:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
12+
_cache_device="${2:?}"
13+
_cache_interface="${3:-none}"
14+
_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"
15+
16+
_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
17+
_cache_base="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/${_cache_key}/build"
18+
19+
mkdir -p "$_cache_base"
20+
_cache_dir="$(cd "$_cache_base" && pwd -P)"
21+
22+
echo "=== Build Cache Setup ==="
23+
echo " Cache key: $_cache_key"
24+
echo " Cache dir: $_cache_dir"
25+
26+
# Replace any existing build/ (real dir or stale symlink) with a symlink
27+
# to our runner-specific cache directory.
28+
# Use unlink for symlinks to avoid rm -rf following the link and deleting
29+
# the shared cache contents (which another runner may be using).
30+
if [ -L "build" ]; then
31+
unlink "build"
32+
elif [ -e "build" ]; then
33+
rm -rf "build"
34+
fi
35+
36+
ln -s "$_cache_dir" "build"
37+
38+
echo " Symlink: build -> $_cache_dir"
39+
echo "========================="

.github/scripts/submit_and_monitor_bench.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,13 @@ fi
3737
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
3838

3939
# Use the monitoring script from PR (where this script lives)
40-
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"
41-
42-
echo "[$dir] Monitoring complete for job $job_id"
40+
monitor_exit=0
41+
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
42+
if [ "$monitor_exit" -ne 0 ]; then
43+
echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
44+
else
45+
echo "[$dir] Monitoring complete for job $job_id"
46+
fi
4347

4448
# Verify the YAML output file was created
4549
yaml_file="${job_slug}.yaml"

.github/workflows/bench.yml

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,21 +46,42 @@ jobs:
4646
else
4747
# Get PR number from workflow_run
4848
PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
49+
if [ -z "$PR_NUMBER" ]; then
50+
# Cross-repo PRs don't populate pull_requests[]. Search by head SHA.
51+
HEAD_SHA="${{ github.event.workflow_run.head_sha }}"
52+
PR_NUMBER=$(gh api "repos/${{ github.repository }}/pulls?state=open&sort=updated&direction=desc&per_page=30" \
53+
--jq ".[] | select(.head.sha == \"$HEAD_SHA\") | .number" | head -1)
54+
fi
55+
if [ -z "$PR_NUMBER" ]; then
56+
# workflow_run may report the merge/base SHA for forks. Fall back to branch name.
57+
HEAD_BRANCH="${{ github.event.workflow_run.head_branch }}"
58+
if [ -n "$HEAD_BRANCH" ] && [ "$HEAD_BRANCH" != "master" ]; then
59+
PR_NUMBER=$(gh api "repos/${{ github.repository }}/pulls?state=open&sort=updated&direction=desc&per_page=30" \
60+
--jq ".[] | select(.head.ref == \"$HEAD_BRANCH\") | .number" | head -1)
61+
fi
62+
fi
63+
4964
if [ -n "$PR_NUMBER" ]; then
5065
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
5166
5267
# Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
5368
PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
5469
echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
5570
56-
# Check if PR is approved
57-
APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
58-
--jq '[.[] | select(.state == "APPROVED")] | length')
59-
if [ "$APPROVED" -gt 0 ]; then
60-
echo "approved=true" >> $GITHUB_OUTPUT
61-
else
62-
echo "approved=false" >> $GITHUB_OUTPUT
63-
fi
71+
# Check if PR is approved by a maintainer/admin (ignore AI bot approvals)
72+
APPROVERS=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
73+
--jq '[.[] | select(.state == "APPROVED") | .user.login] | unique | .[]')
74+
APPROVED="false"
75+
for approver in $APPROVERS; do
76+
PERM=$(gh api "repos/${{ github.repository }}/collaborators/$approver/permission" \
77+
--jq '.permission' 2>/dev/null || echo "none")
78+
if [ "$PERM" = "admin" ] || [ "$PERM" = "maintain" ] || [ "$PERM" = "write" ]; then
79+
echo " Approved by $approver (permission: $PERM)"
80+
APPROVED="true"
81+
break
82+
fi
83+
done
84+
echo "approved=$APPROVED" >> $GITHUB_OUTPUT
6485
else
6586
echo "pr_number=" >> $GITHUB_OUTPUT
6687
echo "approved=false" >> $GITHUB_OUTPUT
@@ -76,8 +97,7 @@ jobs:
7697
(
7798
github.event_name == 'workflow_dispatch' ||
7899
needs.file-changes.outputs.pr_approved == 'true' ||
79-
needs.file-changes.outputs.pr_author == 'sbryngelson' ||
80-
needs.file-changes.outputs.pr_author == 'wilfonba'
100+
needs.file-changes.outputs.pr_author == 'sbryngelson'
81101
)
82102
needs: [file-changes]
83103
strategy:
@@ -164,6 +184,7 @@ jobs:
164184
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
165185

166186
- name: Generate & Post Comment
187+
if: always()
167188
run: |
168189
(cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)
169190
(cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml)

.github/workflows/coverage.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ jobs:
3535
- name: Checkouts
3636
uses: actions/checkout@v4
3737

38+
- name: Restore Build Cache
39+
uses: actions/cache@v4
40+
with:
41+
path: build
42+
key: mfc-coverage-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
43+
3844
- name: Setup Ubuntu
3945
run: |
4046
sudo apt update -y

.github/workflows/docs.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,12 @@ jobs:
6767
echo "url-count = ${{ steps.sitemap.outputs.url-count }}"
6868
echo "excluded-count = ${{ steps.sitemap.outputs.excluded-count }}"
6969
70+
- name: Linkcheck - Lychee
71+
uses: lycheeverse/lychee-action@v2
72+
with:
73+
args: -c .lychee.toml build/install/docs/mfc/
74+
fail: true
75+
7076
- name: Publish Documentation
7177
if: github.repository == 'MFlowCode/MFC' && github.ref == 'refs/heads/master' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' )
7278
run: |
@@ -83,11 +89,5 @@ jobs:
8389
git -C ../www commit -m "Docs @ ${GITHUB_SHA::7}" || true
8490
git -C ../www push
8591
86-
- name: Linkcheck - Lychee
87-
uses: lycheeverse/lychee-action@v2
88-
with:
89-
args: -c .lychee.toml build/install/docs/mfc/
90-
fail: true
91-
9292
# DOC_PUSH_URL should be of the format:
9393
# --> https://<username>:<token>@github.com/<username>/<repository>

.github/workflows/frontier/build.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ fi
1818

1919
. ./mfc.sh load -c f -m g
2020

21+
# Only set up build cache for test suite, not benchmarks
22+
if [ "$run_bench" != "bench" ]; then
23+
source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
24+
fi
25+
2126
max_attempts=3
2227
attempt=1
2328
while [ $attempt -le $max_attempts ]; do
@@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
4550
fi
4651

4752
if [ $attempt -lt $max_attempts ]; then
48-
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
49-
./mfc.sh clean
53+
echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
54+
rm -rf build/staging build/install build/lock.yaml
5055
sleep 30
5156
fi
5257
attempt=$((attempt + 1))

.github/workflows/frontier_amd/build.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ fi
1818

1919
. ./mfc.sh load -c famd -m g
2020

21+
# Only set up build cache for test suite, not benchmarks
22+
if [ "$run_bench" != "bench" ]; then
23+
source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
24+
fi
25+
2126
max_attempts=3
2227
attempt=1
2328
while [ $attempt -le $max_attempts ]; do
@@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
4550
fi
4651

4752
if [ $attempt -lt $max_attempts ]; then
48-
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
49-
./mfc.sh clean
53+
echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
54+
rm -rf build/staging build/install build/lock.yaml
5055
sleep 30
5156
fi
5257
attempt=$((attempt + 1))

.github/workflows/phoenix/test.sh

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,39 @@ if [ "$job_device" = "gpu" ]; then
1414
fi
1515
fi
1616

17+
# Set up persistent build cache
18+
source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
19+
1720
max_attempts=3
1821
attempt=1
1922
while [ $attempt -le $max_attempts ]; do
2023
echo "Build attempt $attempt of $max_attempts..."
2124
if ./mfc.sh test -v --dry-run -j 8 $build_opts; then
2225
echo "Build succeeded on attempt $attempt."
26+
27+
# Smoke-test the cached binaries to catch architecture mismatches
28+
# (SIGILL from binaries compiled on a different compute node).
29+
syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1)
30+
if [ -n "$syscheck_bin" ] && ! "$syscheck_bin" > /dev/null 2>&1; then
31+
echo "WARNING: syscheck binary crashed — cached install is stale."
32+
if [ $attempt -lt $max_attempts ]; then
33+
echo "Clearing cache and rebuilding..."
34+
rm -rf build/staging build/install build/lock.yaml
35+
sleep 5
36+
attempt=$((attempt + 1))
37+
continue
38+
else
39+
echo "ERROR: syscheck still failing after $max_attempts attempts."
40+
exit 1
41+
fi
42+
fi
43+
2344
break
2445
fi
2546

2647
if [ $attempt -lt $max_attempts ]; then
27-
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
28-
./mfc.sh clean
48+
echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
49+
rm -rf build/staging build/install build/lock.yaml
2950
sleep 30
3051
else
3152
echo "Build failed after $max_attempts attempts."
@@ -44,4 +65,3 @@ if [ "$job_device" = "gpu" ]; then
4465
fi
4566

4667
./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
47-

.github/workflows/test.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ jobs:
9797
- name: Clone
9898
uses: actions/checkout@v4
9999

100+
- name: Restore Build Cache
101+
uses: actions/cache@v4
102+
with:
103+
path: build
104+
key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
105+
100106
- name: Setup MacOS
101107
if: matrix.os == 'macos'
102108
run: |
@@ -248,6 +254,8 @@ jobs:
248254
steps:
249255
- name: Clone
250256
uses: actions/checkout@v4
257+
with:
258+
clean: false
251259

252260
- name: Build
253261
if: matrix.cluster != 'phoenix'

0 commit comments

Comments
 (0)