Skip to content

Commit dd72e48

Browse files
committed
Merge remote-tracking branch 'upstream/master' into ci-test
2 parents 13db810 + 3781b98 commit dd72e48

3 files changed

Lines changed: 112 additions & 140 deletions

File tree

.github/scripts/monitor_slurm_job.sh

Lines changed: 88 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,17 @@
44

55
set -euo pipefail
66

7-
# Cleanup handler to prevent orphaned tail processes
7+
# Cleanup handler to prevent orphaned tail processes and cancel orphaned jobs
88
cleanup() {
99
if [ -n "${tail_pid:-}" ]; then
1010
kill "${tail_pid}" 2>/dev/null || true
1111
fi
12+
# Cancel the SLURM job if the monitor is exiting due to an error
13+
# (e.g., the CI runner is being killed). Don't cancel on success.
14+
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
15+
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
16+
scancel "$job_id" 2>/dev/null || true
17+
fi
1218
}
1319
trap cleanup EXIT
1420

@@ -23,30 +29,78 @@ output_file="$2"
2329
echo "Submitted batch job $job_id"
2430
echo "Monitoring output file: $output_file"
2531

26-
# Wait for file to appear with retry logic for transient squeue failures
32+
# Robustly check SLURM job state using squeue with sacct fallback.
33+
# Returns the state string (PENDING, RUNNING, COMPLETED, FAILED, etc.)
34+
# or "UNKNOWN" if both commands fail.
35+
get_job_state() {
36+
local jid="$1"
37+
local state
38+
39+
# Try squeue first (fast, works for active jobs)
40+
state=$(squeue -j "$jid" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || true)
41+
if [ -n "$state" ]; then
42+
echo "$state"
43+
return
44+
fi
45+
46+
# Fallback to sacct (works for completed/historical jobs)
47+
if command -v sacct >/dev/null 2>&1; then
48+
state=$(sacct -j "$jid" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 || true)
49+
if [ -n "$state" ]; then
50+
echo "$state"
51+
return
52+
fi
53+
fi
54+
55+
echo "UNKNOWN"
56+
}
57+
58+
# Check if a state is terminal (job is done, for better or worse)
59+
is_terminal_state() {
60+
case "$1" in
61+
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
62+
return 0 ;;
63+
*)
64+
return 1 ;;
65+
esac
66+
}
67+
68+
# Wait for file to appear, using robust state checking.
69+
# Never give up due to transient squeue/sacct failures — the CI job timeout
70+
# is the ultimate backstop.
2771
echo "Waiting for job to start..."
28-
squeue_retries=0
29-
max_squeue_retries=5
72+
unknown_count=0
3073
while [ ! -f "$output_file" ]; do
31-
# Check if job is still queued/running
32-
if squeue -j "$job_id" &>/dev/null; then
33-
squeue_retries=0 # Reset on success
34-
sleep 5
35-
else
36-
squeue_retries=$((squeue_retries + 1))
37-
if [ $squeue_retries -ge $max_squeue_retries ]; then
38-
# Job not in queue and output file doesn't exist
39-
if [ ! -f "$output_file" ]; then
40-
echo "ERROR: Job $job_id not in queue and output file not created"
74+
state=$(get_job_state "$job_id")
75+
76+
case "$state" in
77+
PENDING|CONFIGURING)
78+
unknown_count=0
79+
sleep 5
80+
;;
81+
RUNNING|COMPLETING)
82+
unknown_count=0
83+
# Job is running but output file not yet visible (NFS delay)
84+
sleep 2
85+
;;
86+
UNKNOWN)
87+
unknown_count=$((unknown_count + 1))
88+
# Only print warning periodically to avoid log spam
89+
if [ $((unknown_count % 12)) -eq 1 ]; then
90+
echo "Warning: Could not query job $job_id state (SLURM may be temporarily unavailable)..."
91+
fi
92+
sleep 5
93+
;;
94+
*)
95+
# Terminal state — job finished without creating output
96+
if is_terminal_state "$state"; then
97+
echo "ERROR: Job $job_id reached terminal state ($state) without creating output file"
4198
exit 1
4299
fi
43-
break
44-
fi
45-
# Exponential backoff
46-
sleep_time=$((2 ** squeue_retries))
47-
echo "Warning: squeue check failed, retrying in ${sleep_time}s..."
48-
sleep $sleep_time
49-
fi
100+
# Unrecognized state, keep waiting
101+
sleep 5
102+
;;
103+
esac
50104
done
51105

52106
echo "=== Streaming output for job $job_id ==="
@@ -57,14 +111,13 @@ exec 3< <(stdbuf -oL -eL tail -f "$output_file" 2>&1)
57111
tail_pid=$!
58112

59113
# Monitor job status and stream output simultaneously
60-
squeue_failures=0
61114
last_heartbeat=$(date +%s)
62115

63116
while true; do
64117
# Try to read from tail output (non-blocking via timeout)
65118
# Read multiple lines if available to avoid falling behind
66119
lines_read=0
67-
while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
120+
while IFS= read -r -t 1 line <&3 2>/dev/null; do
68121
echo "$line"
69122
lines_read=$((lines_read + 1))
70123
last_heartbeat=$(date +%s)
@@ -73,49 +126,30 @@ while true; do
73126
break
74127
fi
75128
done
76-
129+
77130
# Check job status
78131
current_time=$(date +%s)
79-
if ! squeue -j "$job_id" &>/dev/null; then
80-
squeue_failures=$((squeue_failures + 1))
81-
# Check if job actually completed using sacct (if available)
82-
if [ $squeue_failures -ge 3 ]; then
83-
if command -v sacct >/dev/null 2>&1; then
84-
state=$(sacct -j "$job_id" --format=State --noheader 2>/dev/null | head -n1 | awk '{print $1}')
85-
# Consider job done only if it reached a terminal state
86-
case "$state" in
87-
COMPLETED|FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY)
88-
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
89-
break
90-
;;
91-
*)
92-
# treat as transient failure, reset failures and continue polling
93-
squeue_failures=0
94-
;;
95-
esac
96-
else
97-
# No sacct: assume job completed after 3 failures
98-
echo "[$(date +%H:%M:%S)] Job $job_id no longer in queue"
99-
break
100-
fi
101-
fi
132+
state=$(get_job_state "$job_id")
133+
134+
if is_terminal_state "$state"; then
135+
echo "[$(date +%H:%M:%S)] Job $job_id reached terminal state: $state"
136+
break
102137
else
103-
squeue_failures=0
104138
# Print heartbeat if no output for 60 seconds
105139
if [ $((current_time - last_heartbeat)) -ge 60 ]; then
106-
echo "[$(date +%H:%M:%S)] Job $job_id still running (no new output for 60s)..."
140+
echo "[$(date +%H:%M:%S)] Job $job_id state=$state (no new output for 60s)..."
107141
last_heartbeat=$current_time
108142
fi
109143
fi
110-
144+
111145
# Sleep briefly between status checks
112146
sleep 1
113147
done
114148

115149
# Drain any remaining output from tail after job completes
116150
echo "Draining remaining output..."
117151
drain_count=0
118-
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
152+
while IFS= read -r -t 1 line <&3 2>/dev/null; do
119153
echo "$line"
120154
drain_count=$((drain_count + 1))
121155
# Safety limit to avoid infinite loop
@@ -128,8 +162,9 @@ done
128162
# Close the file descriptor and kill tail
129163
exec 3<&-
130164
kill "${tail_pid}" 2>/dev/null || true
165+
tail_pid=""
131166

132-
# Wait for output file to finish growing (stabilize) before stopping tail
167+
# Wait for output file to stabilize (NFS flush) before final read
133168
if [ -f "$output_file" ]; then
134169
last_size=-1
135170
same_count=0
@@ -149,9 +184,6 @@ if [ -f "$output_file" ]; then
149184
done
150185
fi
151186

152-
# Stop tailing (trap will also handle this on exit)
153-
kill "${tail_pid}" 2>/dev/null || true
154-
155187
echo ""
156188
echo "=== Final output ==="
157189
cat "$output_file"
@@ -187,6 +219,6 @@ if [ "$exit_code" != "0:0" ]; then
187219
exit 1
188220
fi
189221

222+
monitor_success=1
190223
echo "Job $job_id completed successfully"
191224
exit 0
192-

.github/workflows/bench.yml

Lines changed: 23 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,105 +1,35 @@
11
name: 'Benchmark'
22

33
on:
4-
# Trigger when Test Suite completes (no polling needed)
5-
workflow_run:
6-
workflows: ["Test Suite"]
7-
types: [completed]
4+
pull_request:
5+
pull_request_review:
6+
types: [submitted]
87
workflow_dispatch:
98

109
concurrency:
11-
group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch || github.ref }}
10+
group: ${{ github.workflow }}-${{ github.ref }}
1211
cancel-in-progress: true
1312

1413
jobs:
1514
file-changes:
1615
name: Detect File Changes
17-
# Only run if Test Suite passed (or manual dispatch)
18-
if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'
1916
runs-on: 'ubuntu-latest'
2017
outputs:
2118
checkall: ${{ steps.changes.outputs.checkall }}
22-
pr_number: ${{ steps.pr-info.outputs.pr_number }}
23-
pr_approved: ${{ steps.pr-info.outputs.approved }}
24-
pr_author: ${{ steps.pr-info.outputs.author }}
2519
steps:
2620
- name: Clone
2721
uses: actions/checkout@v4
28-
with:
29-
ref: ${{ github.event.workflow_run.head_sha || github.sha }}
3022

3123
- name: Detect Changes
3224
uses: dorny/paths-filter@v3
3325
id: changes
3426
with:
3527
filters: ".github/file-filter.yml"
3628

37-
- name: Get PR Info
38-
id: pr-info
39-
env:
40-
GH_TOKEN: ${{ github.token }}
41-
run: |
42-
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
43-
echo "pr_number=" >> $GITHUB_OUTPUT
44-
echo "approved=true" >> $GITHUB_OUTPUT
45-
echo "author=${{ github.actor }}" >> $GITHUB_OUTPUT
46-
else
47-
# Get PR number from workflow_run
48-
PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
49-
if [ -z "$PR_NUMBER" ]; then
50-
# Cross-repo PRs don't populate pull_requests[]. Search by head SHA.
51-
HEAD_SHA="${{ github.event.workflow_run.head_sha }}"
52-
PR_NUMBER=$(gh api "repos/${{ github.repository }}/pulls?state=open&sort=updated&direction=desc&per_page=30" \
53-
--jq ".[] | select(.head.sha == \"$HEAD_SHA\") | .number" | head -1)
54-
fi
55-
if [ -z "$PR_NUMBER" ]; then
56-
# workflow_run may report the merge/base SHA for forks. Fall back to branch name.
57-
HEAD_BRANCH="${{ github.event.workflow_run.head_branch }}"
58-
if [ -n "$HEAD_BRANCH" ] && [ "$HEAD_BRANCH" != "master" ]; then
59-
PR_NUMBER=$(gh api "repos/${{ github.repository }}/pulls?state=open&sort=updated&direction=desc&per_page=30" \
60-
--jq ".[] | select(.head.ref == \"$HEAD_BRANCH\") | .number" | head -1)
61-
fi
62-
fi
63-
64-
if [ -n "$PR_NUMBER" ]; then
65-
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
66-
67-
# Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
68-
PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
69-
echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT
70-
71-
# Check if PR is approved by a maintainer/admin (ignore AI bot approvals)
72-
APPROVERS=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
73-
--jq '[.[] | select(.state == "APPROVED") | .user.login] | unique | .[]')
74-
APPROVED="false"
75-
for approver in $APPROVERS; do
76-
PERM=$(gh api "repos/${{ github.repository }}/collaborators/$approver/permission" \
77-
--jq '.permission' 2>/dev/null || echo "none")
78-
if [ "$PERM" = "admin" ] || [ "$PERM" = "maintain" ] || [ "$PERM" = "write" ]; then
79-
echo " Approved by $approver (permission: $PERM)"
80-
APPROVED="true"
81-
break
82-
fi
83-
done
84-
echo "approved=$APPROVED" >> $GITHUB_OUTPUT
85-
else
86-
echo "pr_number=" >> $GITHUB_OUTPUT
87-
echo "approved=false" >> $GITHUB_OUTPUT
88-
echo "author=" >> $GITHUB_OUTPUT
89-
fi
90-
fi
91-
9229
self:
9330
name: "${{ matrix.name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
94-
if: >
95-
github.repository == 'MFlowCode/MFC' &&
96-
needs.file-changes.outputs.checkall == 'true' &&
97-
(
98-
github.event_name == 'workflow_dispatch' ||
99-
needs.file-changes.outputs.pr_approved == 'true' ||
100-
needs.file-changes.outputs.pr_author == 'sbryngelson'
101-
)
102-
needs: [file-changes]
31+
if: ${{ github.repository=='MFlowCode/MFC' && needs.file-changes.outputs.checkall=='true' && ((github.event_name=='pull_request_review' && github.event.review.state=='approved') || (github.event_name=='pull_request' && (github.event.pull_request.user.login=='sbryngelson' || github.event.pull_request.user.login=='wilfonba')) || github.event_name=='workflow_dispatch') }}
32+
needs: file-changes
10333
strategy:
10434
fail-fast: false
10535
matrix:
@@ -156,14 +86,10 @@ jobs:
15686
group: ${{ matrix.group }}
15787
labels: ${{ matrix.labels }}
15888
timeout-minutes: 480
159-
env:
160-
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
161-
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
16289
steps:
16390
- name: Clone - PR
16491
uses: actions/checkout@v4
16592
with:
166-
ref: ${{ github.event.workflow_run.head_sha || github.sha }}
16793
path: pr
16894

16995
- name: Clone - Master
@@ -175,10 +101,23 @@ jobs:
175101

176102
- name: Setup & Build
177103
if: matrix.build_script != ''
178-
run: |
179-
(cd pr && ${{ matrix.build_script }}) &
180-
(cd master && ${{ matrix.build_script }}) &
181-
wait %1 && wait %2
104+
uses: nick-fields/retry@v3
105+
with:
106+
max_attempts: 3
107+
retry_wait_seconds: 60
108+
timeout_minutes: 150
109+
command: |
110+
(cd pr && ${{ matrix.build_script }}) &
111+
pid1=$!
112+
(cd master && ${{ matrix.build_script }}) &
113+
pid2=$!
114+
wait $pid1; e1=$?
115+
wait $pid2; e2=$?
116+
[ $e1 -eq 0 ] && [ $e2 -eq 0 ]
117+
on_retry_command: |
118+
(cd pr && ./mfc.sh clean) &
119+
(cd master && ./mfc.sh clean) &
120+
wait
182121
183122
- name: Bench (Master v. PR)
184123
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

.github/workflows/phoenix/submit-bench.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ sbatch <<EOT
4444
$sbatch_device_opts
4545
#SBATCH -t 04:00:00 # Duration of the job (Ex: 15 mins)
4646
#SBATCH -q embers # QOS Name
47+
#SBATCH --requeue # Auto-requeue on preemption
4748
#SBATCH -o$job_slug.out # Combined output and error messages file
4849
4950
set -e

0 commit comments

Comments
 (0)