Skip to content

Commit 0f85e38

Browse files
committed
collect-evals
1 parent dfba2ff commit 0f85e38

4 files changed

Lines changed: 353 additions & 1 deletion

File tree

.github/workflows/benchmark-tmpl.yml

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ jobs:
135135
env:
136136
RUNNER_NAME: ${{ runner.name }}
137137
RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }}
138+
# Suppress per-job eval markdown from being appended to the step summary.
139+
# We'll publish a single combined eval table in the collection job instead.
140+
GITHUB_STEP_SUMMARY: ''
138141
run: |
139142
bash ./runners/launch_${RUNNER_NAME%%_*}.sh
140143
FOUND_RESULT_FILE=
@@ -162,4 +165,23 @@ jobs:
162165
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
163166
with:
164167
name: ${{ env.RESULT_FILENAME }}
165-
path: agg_${{ env.RESULT_FILENAME }}.json
168+
path: agg_${{ env.RESULT_FILENAME }}.json
169+
170+
# Publish this run's eval output directory as its own artifact, but only when
# RUN_EVAL is 'true'. The artifact name is prefixed with "eval_<EXP_NAME>_".
- name: Upload eval results (if any)
  if: env.RUN_EVAL == 'true'
  uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4  # v5.0.0
  with:
    name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
    path: eval_out/${{ env.RESULT_FILENAME }}
176+
177+
# Remove this run's eval output directory from the workspace.
# always() is required: without a status function the condition implicitly
# includes success(), so a failed earlier step would skip the cleanup and
# leave eval_out/ behind in the workspace.
- name: Cleanup eval outputs (post-upload)
  if: ${{ always() && env.RUN_EVAL == 'true' }}
  run: |
    # Only touch the path when RESULT_FILENAME is set, so we never delete eval_out/ wholesale.
    if [ -n "${RESULT_FILENAME:-}" ] && [ -e "eval_out/${RESULT_FILENAME}" ]; then
      echo "Removing eval dir: eval_out/${RESULT_FILENAME}"
      # Retry with plain rm -rf if the --one-file-system invocation fails; best-effort either way.
      rm -rf --one-file-system "eval_out/${RESULT_FILENAME}" || rm -rf "eval_out/${RESULT_FILENAME}" || true
    fi
    # Also remove empty parent folder if present
    if [ -d "eval_out" ]; then
      # rmdir only succeeds on an empty directory; other runs' outputs are left alone.
      rmdir eval_out 2>/dev/null || true
    fi
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Reusable workflow: downloads the per-job eval artifacts (eval_<exp-name>_*,
# or every eval_* artifact when exp-name is empty), writes a combined summary
# to the step summary, and uploads the aggregated JSON as a single artifact.
name: Template - Collect Evals

on:
  workflow_call:
    inputs:
      # Experiment name used to filter artifacts; empty means "all".
      exp-name:
        required: false
        type: string
        default: ''

permissions:
  contents: read

jobs:
  collect-evals:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
        with:
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 0

      - name: Download eval artifacts
        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53  # v6.0.0
        with:
          path: eval_results/
          pattern: ${{ inputs.exp-name && format('eval_{0}_*', inputs.exp-name) || 'eval_*' }}

      - name: Summarize evals
        env:
          # Pass the input through the environment instead of interpolating
          # ${{ }} into the script, so its value is never parsed as shell code.
          EXP_LABEL: ${{ inputs.exp-name || 'all' }}
        run: |
          echo "## 📋 Eval Summary - ${EXP_LABEL}" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          python3 utils/collect_eval_results.py eval_results/ "${EXP_LABEL}" >> "$GITHUB_STEP_SUMMARY"

      # NOTE(review): agg_eval_<exp>.json is presumably written by
      # collect_eval_results.py in the step above - confirm.
      - name: Upload aggregated evals
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4  # v5.0.0
        with:
          name: eval_results_${{ inputs.exp-name || 'all' }}
          path: agg_eval_${{ inputs.exp-name || 'all' }}.json

      # Runs regardless of earlier failures so downloaded artifacts never linger.
      - name: Cleanup downloaded eval artifacts
        if: ${{ always() }}
        run: |
          rm -rf eval_results/ || true

.github/workflows/full-sweep-test.yml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,14 @@ jobs:
140140
with:
141141
exp-name: "dsr1_1k1k"
142142

143+
# Collects the eval artifacts of the dsr1_1k1k benchmarks via the
# collect-evals reusable workflow. always() lets it run even when the
# benchmark job did not succeed.
collect-dsr1-1k1k-evals:
  # get-configs must be listed here: the `needs` context only exposes direct
  # dependencies, so without it the output below is empty and the
  # != '[]' guard is always true.
  needs: [get-configs, benchmark-dsr1-1k1k]
  if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }}
  uses: ./.github/workflows/collect-evals.yml
  secrets: inherit
  with:
    exp-name: "dsr1_1k1k"
150+
143151
# GPTOSS 1K1K Benchmarks
144152
benchmark-gptoss-1k1k:
145153
needs: get-configs
@@ -175,6 +183,14 @@ jobs:
175183
with:
176184
exp-name: "gptoss_1k1k"
177185

186+
# Collects the eval artifacts of the gptoss_1k1k benchmarks via the
# collect-evals reusable workflow. always() lets it run even when the
# benchmark job did not succeed.
collect-gptoss-1k1k-evals:
  # get-configs must be listed here: the `needs` context only exposes direct
  # dependencies, so without it the output below is empty and the
  # != '[]' guard is always true.
  needs: [get-configs, benchmark-gptoss-1k1k]
  if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }}
  uses: ./.github/workflows/collect-evals.yml
  secrets: inherit
  with:
    exp-name: "gptoss_1k1k"
193+
178194

179195
# DSR1 8K1K Benchmarks
180196
benchmark-dsr1-8k1k:
@@ -211,6 +227,14 @@ jobs:
211227
with:
212228
exp-name: "dsr1_8k1k"
213229

230+
# Collects the eval artifacts of the dsr1_8k1k benchmarks via the
# collect-evals reusable workflow. always() lets it run even when the
# benchmark job did not succeed.
collect-dsr1-8k1k-evals:
  # get-configs must be listed here: the `needs` context only exposes direct
  # dependencies, so without it the output below is empty and the
  # != '[]' guard is always true.
  needs: [get-configs, benchmark-dsr1-8k1k]
  if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }}
  uses: ./.github/workflows/collect-evals.yml
  secrets: inherit
  with:
    exp-name: "dsr1_8k1k"
237+
214238
# GPTOSS 8K1K Benchmarks
215239
benchmark-gptoss-8k1k:
216240
needs: get-configs
@@ -246,6 +270,14 @@ jobs:
246270
with:
247271
exp-name: "gptoss_8k1k"
248272

273+
# Collects the eval artifacts of the gptoss_8k1k benchmarks via the
# collect-evals reusable workflow. always() lets it run even when the
# benchmark job did not succeed.
collect-gptoss-8k1k-evals:
  # get-configs must be listed here: the `needs` context only exposes direct
  # dependencies, so without it the output below is empty and the
  # != '[]' guard is always true.
  needs: [get-configs, benchmark-gptoss-8k1k]
  if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }}
  uses: ./.github/workflows/collect-evals.yml
  secrets: inherit
  with:
    exp-name: "gptoss_8k1k"
280+
249281

250282
# DSR1 1K8K Benchmarks
251283
benchmark-dsr1-1k8k:
@@ -376,6 +408,14 @@ jobs:
376408
with:
377409
exp-name: "dsr1_1k8k"
378410

411+
# Collects the eval artifacts of the dsr1_1k8k benchmarks via the
# collect-evals reusable workflow. always() lets it run even when the
# benchmark job did not succeed.
collect-dsr1-1k8k-evals:
  # get-configs must be listed here: the `needs` context only exposes direct
  # dependencies, so without it the output below is empty and the
  # != '[]' guard is always true.
  needs: [get-configs, benchmark-dsr1-1k8k]
  if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }}
  uses: ./.github/workflows/collect-evals.yml
  secrets: inherit
  with:
    exp-name: "dsr1_1k8k"
418+
379419

380420
# GPTOSS 1K8K Benchmarks
381421
benchmark-gptoss-1k8k:
@@ -412,6 +452,14 @@ jobs:
412452
with:
413453
exp-name: "gptoss_1k8k"
414454

455+
# Collects the eval artifacts of the gptoss_1k8k benchmarks via the
# collect-evals reusable workflow. always() lets it run even when the
# benchmark job did not succeed.
collect-gptoss-1k8k-evals:
  # get-configs must be listed here: the `needs` context only exposes direct
  # dependencies, so without it the output below is empty and the
  # != '[]' guard is always true.
  needs: [get-configs, benchmark-gptoss-1k8k]
  if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }}
  uses: ./.github/workflows/collect-evals.yml
  secrets: inherit
  with:
    exp-name: "gptoss_1k8k"
462+
415463

416464
calc-success-rate:
417465
needs:

0 commit comments

Comments
 (0)