Skip to content

Commit f6cd29c

Browse files
committed
Merge branch 'fix/hpc-bugfixes-batch' of https://github.com/sbryngelson/MFC into fix/hpc-bugfixes-batch
2 parents 7c92596 + 239e520 commit f6cd29c

120 files changed

Lines changed: 1875 additions & 620 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.claude/settings.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"allowedTools": [
3+
"Bash(gh pr view:*)",
4+
"Bash(gh pr diff:*)",
5+
"Bash(gh pr comment:*)",
6+
"Bash(gh api:*)",
7+
"Bash(gh search code:*)",
8+
"Bash(cat:*)",
9+
"Bash(ls:*)",
10+
"Bash(grep:*)",
11+
"Bash(python3:*)",
12+
"Bash(git:*)"
13+
]
14+
}

.github/scripts/monitor_slurm_job.sh

Lines changed: 88 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,17 @@
44

55
set -euo pipefail
66

7-
# Cleanup handler to prevent orphaned tail processes
7+
# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
88
cleanup() {
    # EXIT-trap handler: releases resources the monitor may still hold.
    #   tail_pid        - PID of the background tail; may be unset or empty
    #   job_id          - SLURM job being monitored; may be unset or empty
    #   monitor_success - set to 1 by the script only on a clean finish

    # Stop the background tail if one was started; a stale PID is not an error.
    [ -z "${tail_pid:-}" ] || kill "${tail_pid}" 2>/dev/null || true

    # Any exit path that did not mark success (error, runner being killed)
    # must not leave the SLURM job running unattended. A successful run
    # leaves the job alone.
    if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
        echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
        scancel "$job_id" 2>/dev/null || true
    fi
}
1319
trap cleanup EXIT
1420

@@ -23,30 +29,78 @@ output_file="$2"
2329
echo "Submitted batch job $job_id"
2430
echo "Monitoring output file: $output_file"
2531

26-
# Wait for file to appear with retry logic for transient squeue failures
32+
# Robustly check SLURM job state using squeue with sacct fallback.
33+
# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
34+
# or "UNKNOWN" if both commands fail.
35+
get_job_state() {
36+
local jid="$1"
37+
local state
38+
39+
# Try squeue first (fast, works for active jobs)
40+
state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
41+
if [ -n "$state" ]; then
42+
echo "$state"
43+
return
44+
fi
45+
46+
# Fallback to sacct (works for completed/historical jobs)
47+
if command -v sacct >/dev/null 2>&1; then
48+
state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
49+
if [ -n "$state" ]; then
50+
echo "$state"
51+
return
52+
fi
53+
fi
54+
55+
echo "UNKNOWN"
56+
}
57+
58+
# Check if a state is terminal (job is done, for better or worse)
59+
is_terminal_state() {
    # Succeed (return 0) when the SLURM state in $1 means the job has finished,
    # whether it succeeded or not; return 1 for any other value, including
    # "UNKNOWN" and the empty string.
    #
    # CANCELLED is matched as a prefix because sacct reports cancelled jobs as
    # "CANCELLED by <uid>" (or "CANCELLED+" when the column is truncated).
    case "$1" in
        COMPLETED|FAILED|CANCELLED*|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
            return 0 ;;
        *)
            return 1 ;;
    esac
}
67+
68+
# Wait for file to appear, using robust state checking.
69+
# Never give up due to transient squeue/sacct failures — the CI job timeout
70+
# is the ultimate backstop.
2771
echo "Waiting for job to start..."
28-
squeue_retries=0
29-
max_squeue_retries=5
72+
unknown_count=0
3073
while [ ! -f "$output_file" ]; do
31-
# Check if job is still queued/running
32-
if squeue -j "$job_id" &>/dev/null; then
33-
squeue_retries=0 # Reset on success
34-
sleep 5
35-
else
36-
squeue_retries=$((squeue_retries + 1))
37-
if [ $squeue_retries -ge $max_squeue_retries ]; then
38-
# Job not in queue and output file doesn't exist
39-
if [ ! -f "$output_file" ]; then
40-
echo "ERROR: Job $job_id not in queue and output file not created"
74+
state=$(get_job_state "$job_id")
75+
76+
case "$state" in
77+
PENDING|CONFIGURING)
78+
unknown_count=0
79+
sleep 5
80+
;;
81+
RUNNING|COMPLETING)
82+
unknown_count=0
83+
# Job is running but output file not yet visible (NFS delay)
84+
sleep 2
85+
;;
86+
UNKNOWN)
87+
unknown_count=$((unknown_count + 1))
88+
# Only print warning periodically to avoid log spam
89+
if [ $((unknown_count % 12)) -eq 1 ]; then
90+
echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
91+
fi
92+
sleep 5
93+
;;
94+
*)
95+
# Terminal state — job finished without creating output
96+
if is_terminal_state "$state"; then
97+
echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
4198
exit 1
4299
fi
43-
break
44-
fi
45-
# Exponential backoff
46-
sleep_time=$((2 ** squeue_retries))
47-
echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
48-
sleep $sleep_time
49-
fi
100+
# Unrecognized state, keep waiting
101+
sleep 5
102+
;;
103+
esac
50104
done
51105

52106
echo "=== Streaming output for job $job_id ==="
@@ -57,14 +111,13 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
57111
tail_pid=$!
58112

59113
# Monitor job status and stream output simultaneously
60-
squeue_failures=0
61114
last_heartbeat=$(date +%s)
62115

63116
while true; do
64117
# Try to read from tail output (non-blocking via timeout)
65118
# Read multiple lines if available to avoid falling behind
66119
lines_read=0
67-
while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
120+
while IFS= read -r -t 1 line <&3 2>/dev/null; do
68121
echo "$line"
69122
lines_read=$((lines_read + 1))
70123
last_heartbeat=$(date +%s)
@@ -73,49 +126,30 @@ while true; do
73126
break
74127
fi
75128
done
76-
129+
77130
# Check job status
78131
current_time=$(date +%s)
79-
if ! squeue -j "$job_id" &>/dev/null; then
80-
squeue_failures=$((squeue_failures + 1))
81-
# Check if job actually completed using sacct (if available)
82-
if [ $squeue_failures -ge 3 ]; then
83-
if command -v sacct >/dev/null 2>&1; then
84-
state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
85-
# Consider job done only if it reached a terminal state
86-
case "$state" in
87-
COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
88-
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
89-
break
90-
;;
91-
*)
92-
# treat as transient failure, reset failures and continue polling
93-
squeue_failures=0
94-
;;
95-
esac
96-
else
97-
# No sacct: assume job completed after 3 failures
98-
echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
99-
break
100-
fi
101-
fi
132+
state=$(get_job_state "$job_id")
133+
134+
if is_terminal_state "$state"; then
135+
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
136+
break
102137
else
103-
squeue_failures=0
104138
# Print heartbeat if no output for 60 seconds
105139
if [ $((current_time - last_heartbeat)) -ge 60 ]; then
106-
echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
140+
echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
107141
last_heartbeat=$current_time
108142
fi
109143
fi
110-
144+
111145
# Sleep briefly between status checks
112146
sleep 1
113147
done
114148

115149
# Drain any remaining output from tail after job completes
116150
echo "Draining remaining output..."
117151
drain_count=0
118-
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
152+
while IFS= read -r -t 1 line <&3 2>/dev/null; do
119153
echo "$line"
120154
drain_count=$((drain_count + 1))
121155
# Safety limit to avoid infinite loop
@@ -128,8 +162,9 @@ done
128162
# Close the file descriptor and kill tail
129163
exec 3<&-
130164
kill "${tail_pid}" 2>/dev/null || true
165+
tail_pid=""
131166

132-
# Wait for output file to finish growing (stabilize) before stopping tail
167+
# Wait for output file to stabilize (NFS flush) before final read
133168
if [ -f "$output_file" ]; then
134169
last_size=-1
135170
same_count=0
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
149184
done
150185
fi
151186

152-
# Stop tailing (trap will also handle this on exit)
153-
kill "${tail_pid}" 2>/dev/null || true
154-
155187
echo ""
156188
echo "=== Final output ==="
157189
cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
187219
exit 1
188220
fi
189221

222+
monitor_success=1
190223
echo "Job $job_id completed successfully"
191224
exit 0
192-

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,16 @@ else
5252
echo "Master job completed successfully"
5353
fi
5454

55-
# Check if either job failed
55+
# Warn if either job failed (partial results may still be usable)
5656
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
57-
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
58-
exit 1
57+
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
58+
echo "Checking for partial results..."
59+
else
60+
echo "=========================================="
61+
echo "Both benchmark jobs completed successfully!"
62+
echo "=========================================="
5963
fi
6064

61-
echo "=========================================="
62-
echo "Both benchmark jobs completed successfully!"
63-
echo "=========================================="
64-
6565
# Final verification that output files exist before proceeding
6666
pr_yaml="pr/bench-${device}-${interface}.yaml"
6767
master_yaml="master/bench-${device}-${interface}.yaml"
.github/scripts/setup-build-cache.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash
# Sets up a persistent build cache for self-hosted CI runners.
# Creates a symlink: ./build -> /storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/<key>/build
#
# Each runner gets its own cache keyed by (cluster, device, interface, runner).
# This avoids cross-runner path issues entirely — CMake's absolute paths are
# always correct because the same runner always uses the same workspace path.
#
# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> <interface>
#
# Requires RUNNER_NAME to be set in the environment. On failure the script
# stops with a non-zero status (return when sourced, exit when executed)
# and leaves any existing ./build untouched.

_cache_cluster="${1:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
_cache_device="${2:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
_cache_interface="${3:-none}"
_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"

_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
_cache_base="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/${_cache_key}/build"

# Create and resolve the cache directory BEFORE touching ./build: if the
# storage location is unavailable, the existing build directory must not be
# deleted and replaced by nothing.
if ! mkdir -p "$_cache_base" \
    || ! _cache_dir="$(cd "$_cache_base" && pwd -P)" \
    || [ -z "$_cache_dir" ]; then
    echo "ERROR: could not create or resolve build cache directory: $_cache_base" >&2
    # 'return' is only valid when sourced; fall back to 'exit' when executed.
    return 1 2>/dev/null || exit 1
fi

echo "=== Build Cache Setup ==="
echo "  Cache key: $_cache_key"
echo "  Cache dir: $_cache_dir"

# Replace any existing build/ (real dir or stale symlink) with a symlink
# to our runner-specific cache directory.
# Use unlink for symlinks to avoid rm -rf following the link and deleting
# the shared cache contents (which another runner may be using).
if [ -L "build" ]; then
    unlink "build"
elif [ -e "build" ]; then
    rm -rf -- "build"
fi

if ! ln -s "$_cache_dir" "build"; then
    echo "ERROR: could not create symlink build -> $_cache_dir" >&2
    return 1 2>/dev/null || exit 1
fi

echo "  Symlink:   build -> $_cache_dir"
echo "========================="

.github/scripts/submit_and_monitor_bench.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,13 @@ fi
3737
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
3838

3939
# Use the monitoring script from PR (where this script lives)
40-
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"
41-
42-
echo "[$dir] Monitoring complete for job $job_id"
40+
monitor_exit=0
41+
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
42+
if [ "$monitor_exit" -ne 0 ]; then
43+
echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
44+
else
45+
echo "[$dir] Monitoring complete for job $job_id"
46+
fi
4347

4448
# Verify the YAML output file was created
4549
yaml_file="${job_slug}.yaml"

0 commit comments

Comments
 (0)