Skip to content

Commit 3738e14

Browse files
sbryngelson and claude committed
Add CI build caching for GitHub-hosted and self-hosted HPC runners
GitHub-hosted runners: Add actions/cache@v4 to test.yml and coverage.yml, caching the build/ directory keyed by matrix config and source file hashes. Partial cache hits via restore-keys enable incremental builds.

Self-hosted HPC runners (Phoenix, Frontier, Frontier AMD): Add a persistent build cache that symlinks build/ to $HOME/scratch/.mfc-ci-cache/<config>/build. This ensures cached artifacts persist across CI runs regardless of which runner instance picks up the job.

Key details:
- Cross-runner workspace path fixup via sed on CMake files
- flock-based locking prevents concurrent builds from corrupting the cache
- Retry logic uses a targeted rm (build/staging, build/install and build/lock.yaml only) instead of mfc.sh clean
- Phoenix releases the lock after build, before tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 21347c1 commit 3738e14

6 files changed

Lines changed: 123 additions & 6 deletions

File tree

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
#!/bin/bash
# Sets up a persistent build cache for self-hosted CI runners.
# Creates a symlink: ./build -> <resolved scratch path>/.mfc-ci-cache/<key>/build
#
# This ensures that every run of the same config (cluster/device/interface) finds
# cached build artifacts regardless of which runner instance picks up the job.
#
# Concurrent safety: uses flock to serialize access per cache directory. If
# multiple PRs trigger the same config simultaneously, the second job waits
# for the first to finish (up to 1 hour), then gets a warm cache. If the lock
# times out, or the cache directory cannot be created, falls back to a local
# build (same as no caching).
#
# This script must be sourced, not executed: the lock is held on fd 9 of the
# calling shell, and callers read $_cache_locked afterwards.
#
# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> <interface>
#
# Arguments:
#   <cluster>    required; first component of the cache key
#   <device>     required; second component of the cache key
#   <interface>  optional; defaults to "none"
#
# Variables left in the calling shell:
#   _cache_locked  "true" when the cache lock is held on fd 9, otherwise "false"
#   _cache_dir     physical path of the cache directory (empty if unavailable)

_cache_cluster="${1:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
_cache_device="${2:?}"
_cache_interface="${3:-none}"

_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}"
_cache_base="$HOME/scratch/.mfc-ci-cache/${_cache_key}/build"

# Captured once so every message, the sed fixup and the marker file agree.
_cache_workspace="$(pwd)"

# Set before any fallback path so callers always see a defined value.
_cache_locked=false

# Prepare ./build as a real directory inside the workspace.
# A symlink left behind by an earlier cached run must be dropped first:
# otherwise `mkdir -p build` succeeds through the link and the "local" build
# would write into the shared cache without holding the lock.
_cache_use_local_build() {
    if [ -L "build" ]; then
        rm -f "build"
    fi
    mkdir -p "build"
}

echo "=== Build Cache Setup ==="
echo " Cache key: $_cache_key"

# Create the cache dir, then resolve to a physical path (no symlinks).
# $HOME/scratch is typically a symlink to a scratch filesystem — resolving
# it ensures the build symlink target remains valid even if intermediate
# symlinks change.
_cache_dir=""
if mkdir -p "$_cache_base"; then
    _cache_dir="$(cd "$_cache_base" && pwd -P)" || _cache_dir=""
fi

if [ -z "$_cache_dir" ]; then
    # Without a usable cache directory there is nothing to lock or link to.
    echo " WARNING: Cannot set up cache dir $_cache_base, building locally without cache"
    _cache_use_local_build
    echo "========================="
    # `return` only works when sourced; if executed directly, stop here too
    # instead of falling through to the symlink setup below.
    return 0 2>/dev/null || exit 0
fi

echo " Cache dir: $_cache_dir"

# Acquire an exclusive lock on the cache directory to prevent concurrent
# builds from corrupting it. The lock is fd-based (flock on fd 9), so it
# auto-releases when the calling process exits — no stale locks.
#
# Timeout: 1 hour. If another build holds the lock, we wait. This is fine
# because the waiting job will get a warm cache when it finally acquires.
# If the lock can't be acquired after 1 hour, something is wrong — fall
# back to a local build in the workspace.
_lock_file="$_cache_dir/.cache.lock"
exec 9>"$_lock_file"
echo " Acquiring cache lock..."
if flock --timeout 3600 9; then
    _cache_locked=true
    echo " Cache lock acquired"
else
    echo " WARNING: Cache lock timeout (1h), building locally without cache"
    exec 9>&-
    _cache_use_local_build
    echo "========================="
    return 0 2>/dev/null || exit 0
fi

# If build/ exists (real dir or stale symlink), remove it.
# rm -rf on a symlink removes the symlink, not the target — cache is safe.
if [ -e "build" ] || [ -L "build" ]; then
    rm -rf "build"
fi

ln -s "$_cache_dir" "build"

# Handle cross-runner workspace path changes.
# CMakeCache.txt stores absolute paths from whichever runner instance
# originally configured the build. If we're on a different runner, sed-replace
# the old workspace path with the current one so CMake can do incremental builds.
_workspace_marker="$_cache_dir/.workspace_path"
if [ -f "$_workspace_marker" ]; then
    _old_workspace=$(cat "$_workspace_marker")
    # An empty marker would produce an invalid `s||...|` expression; skip it.
    if [ -n "$_old_workspace" ] && [ "$_old_workspace" != "$_cache_workspace" ]; then
        echo " Workspace path changed: $_old_workspace -> $_cache_workspace"
        echo " Updating cached CMake paths..."
        if [ -d "$_cache_dir/staging" ]; then
            # Escape the paths so they are matched and inserted literally:
            # regex metacharacters (and the `|` delimiter) on the pattern side,
            # `\`, `&` and the delimiter on the replacement side.
            _old_pattern=$(printf '%s\n' "$_old_workspace" | sed 's/[][\.*^$|]/\\&/g')
            _new_replacement=$(printf '%s\n' "$_cache_workspace" | sed 's/[\&|]/\\&/g')
            # Best effort: a failed rewrite must not abort the CI job.
            find "$_cache_dir/staging" -type f \
                \( -name "CMakeCache.txt" -o -name "*.cmake" \
                   -o -name "*.make" -o -name "Makefile" \
                   -o -name "build.ninja" \) \
                -exec sed -i "s|${_old_pattern}|${_new_replacement}|g" {} + 2>/dev/null || true
        fi
    fi
fi
# Record the workspace that now owns the cached paths for the next run.
printf '%s\n' "$_cache_workspace" > "$_workspace_marker"

echo " Symlink: build -> $_cache_dir"
echo "========================="

.github/workflows/coverage.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,14 @@ jobs:
3535
- name: Checkouts
3636
uses: actions/checkout@v4
3737

38+
- name: Restore Build Cache
39+
uses: actions/cache@v4
40+
with:
41+
path: build
42+
key: mfc-coverage-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
43+
restore-keys: |
44+
mfc-coverage-
45+
3846
- name: Setup Ubuntu
3947
run: |
4048
sudo apt update -y

.github/workflows/frontier/build.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ fi
1818

1919
. ./mfc.sh load -c f -m g
2020

21+
# Set up persistent build cache
22+
source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
23+
2124
max_attempts=3
2225
attempt=1
2326
while [ $attempt -le $max_attempts ]; do
@@ -45,8 +48,8 @@ while [ $attempt -le $max_attempts ]; do
4548
fi
4649

4750
if [ $attempt -lt $max_attempts ]; then
48-
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
49-
./mfc.sh clean
51+
echo "Build failed on attempt $attempt. Clearing staging/install and retrying in 30s..."
52+
rm -rf build/staging build/install build/lock.yaml
5053
sleep 30
5154
fi
5255
attempt=$((attempt + 1))

.github/workflows/frontier_amd/build.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ fi
1818

1919
. ./mfc.sh load -c famd -m g
2020

21+
# Set up persistent build cache
22+
source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
23+
2124
max_attempts=3
2225
attempt=1
2326
while [ $attempt -le $max_attempts ]; do
@@ -45,8 +48,8 @@ while [ $attempt -le $max_attempts ]; do
4548
fi
4649

4750
if [ $attempt -lt $max_attempts ]; then
48-
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
49-
./mfc.sh clean
51+
echo "Build failed on attempt $attempt. Clearing staging/install and retrying in 30s..."
52+
rm -rf build/staging build/install build/lock.yaml
5053
sleep 30
5154
fi
5255
attempt=$((attempt + 1))

.github/workflows/phoenix/test.sh

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ if [ "$job_device" = "gpu" ]; then
1010
fi
1111
fi
1212

13+
# Set up persistent build cache
14+
source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
15+
1316
max_attempts=3
1417
attempt=1
1518
while [ $attempt -le $max_attempts ]; do
@@ -20,8 +23,8 @@ while [ $attempt -le $max_attempts ]; do
2023
fi
2124

2225
if [ $attempt -lt $max_attempts ]; then
23-
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
24-
./mfc.sh clean
26+
echo "Build failed on attempt $attempt. Clearing staging/install and retrying in 30s..."
27+
rm -rf build/staging build/install build/lock.yaml
2528
sleep 30
2629
else
2730
echo "Build failed after $max_attempts attempts."
@@ -30,6 +33,14 @@ while [ $attempt -le $max_attempts ]; do
3033
attempt=$((attempt + 1))
3134
done
3235

36+
# Release the cache lock before running tests. Tests only read installed
37+
# binaries and can take hours — no need to block other builds.
38+
if [ "${_cache_locked:-false}" = true ]; then
39+
flock --unlock 9
40+
exec 9>&-
41+
echo "Released build cache lock before tests"
42+
fi
43+
3344
n_test_threads=8
3445

3546
if [ "$job_device" = "gpu" ]; then

.github/workflows/test.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,14 @@ jobs:
9494
- name: Clone
9595
uses: actions/checkout@v4
9696

97+
- name: Restore Build Cache
98+
uses: actions/cache@v4
99+
with:
100+
path: build
101+
key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
102+
restore-keys: |
103+
mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-
104+
97105
- name: Setup MacOS
98106
if: matrix.os == 'macos'
99107
run: |

0 commit comments

Comments
 (0)