collect-evals

Oseltamivir · Oseltamivir · commit 0f85e38d22f9 · 2025-12-02T21:14:15.000+08:00
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -135,6 +135,9 @@ jobs:
         env:
           RUNNER_NAME: ${{ runner.name }}
           RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }}
+          # Suppress per-job eval markdown from being appended to the step summary.
+          # We'll publish a single combined eval table in the collection job instead.
+          GITHUB_STEP_SUMMARY: ''
         run: |
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
           FOUND_RESULT_FILE=
@@ -162,4 +165,23 @@ jobs:
         uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
         with:
           name: ${{ env.RESULT_FILENAME }}
-          path: agg_${{ env.RESULT_FILENAME }}.json
+          path: agg_${{ env.RESULT_FILENAME }}.json
+
+      - name: Upload eval results (if any)
+        if: ${{ env.RUN_EVAL == 'true' }}  # runs only when RUN_EVAL is the string 'true'
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
+        with:
+          name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}  # artifact name: eval_<exp>_<result file name>
+          path: eval_out/${{ env.RESULT_FILENAME }}  # this job's entry under eval_out/
+
+      - name: Cleanup eval outputs (post-upload)
+        if: ${{ always() && env.RUN_EVAL == 'true' }}  # always(): still clean up when an earlier step failed
+        run: |
+          if [ -n "${RESULT_FILENAME:-}" ] && [ -e "eval_out/${RESULT_FILENAME}" ]; then
+            echo "Removing eval dir: eval_out/${RESULT_FILENAME}"
+            rm -rf --one-file-system "eval_out/${RESULT_FILENAME}" || rm -rf "eval_out/${RESULT_FILENAME}" || true  # best effort; never fails the step
+          fi
+          # Also remove the parent folder, but only when it is empty (rmdir refuses non-empty directories)
+          if [ -d "eval_out" ]; then
+            rmdir eval_out 2>/dev/null || true
+          fi
diff --git a/.github/workflows/collect-evals.yml b/.github/workflows/collect-evals.yml
@@ -0,0 +1,56 @@
+name: Template - Collect Evals
+
+# Reusable workflow: downloads the eval_* artifacts of a run, appends one
+# combined summary to the step summary, and uploads the aggregated JSON.
+on:
+  workflow_call:
+    inputs:
+      # Experiment whose evals to collect; empty selects every eval_* artifact.
+      exp-name:
+        required: false
+        type: string
+        default: ''
+
+permissions:
+  contents: read
+
+jobs:
+  collect-evals:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          token: ${{ secrets.REPO_PAT }}
+          fetch-depth: 0
+
+      - name: Download eval artifacts
+        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
+        with:
+          path: eval_results/
+          # Narrow to one experiment's artifacts when exp-name is non-empty.
+          pattern: ${{ inputs.exp-name && format('eval_{0}_*', inputs.exp-name) || 'eval_*' }}
+
+      - name: Summarize evals
+        env:
+          # Passed via the environment so the script never embeds the raw input.
+          EVAL_LABEL: ${{ inputs.exp-name || 'all' }}
+        run: |
+          # Group the writes so the summary file is opened once, with a quoted path.
+          {
+            echo "## 📋 Eval Summary - ${EVAL_LABEL}"
+            echo ""
+            python3 utils/collect_eval_results.py eval_results/ "${EVAL_LABEL}"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload aggregated evals
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
+        with:
+          name: eval_results_${{ inputs.exp-name || 'all' }}
+          # NOTE(review): presumably written by collect_eval_results.py; confirm.
+          path: agg_eval_${{ inputs.exp-name || 'all' }}.json
+
+      - name: Cleanup downloaded eval artifacts
+        if: ${{ always() }}
+        run: |
+          rm -rf eval_results/ || true
diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml
@@ -140,6 +140,14 @@ jobs:
         with:
             exp-name: "dsr1_1k1k"
 
+    collect-dsr1-1k1k-evals:
+        needs: [get-configs, benchmark-dsr1-1k1k]  # get-configs must be a direct need for the `if` to read its outputs
+        if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }}  # run even if benchmarks failed; skip on empty config
+        uses: ./.github/workflows/collect-evals.yml
+        secrets: inherit
+        with:
+            exp-name: "dsr1_1k1k"
+
     # GPTOSS 1K1K Benchmarks
     benchmark-gptoss-1k1k:
         needs: get-configs
@@ -175,6 +183,14 @@ jobs:
         with:
             exp-name: "gptoss_1k1k"
 
+    collect-gptoss-1k1k-evals:
+        needs: [get-configs, benchmark-gptoss-1k1k]  # get-configs must be a direct need for the `if` to read its outputs
+        if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }}  # run even if benchmarks failed; skip on empty config
+        uses: ./.github/workflows/collect-evals.yml
+        secrets: inherit
+        with:
+            exp-name: "gptoss_1k1k"
+
 
     # DSR1 8K1K Benchmarks
     benchmark-dsr1-8k1k:
@@ -211,6 +227,14 @@ jobs:
         with:
             exp-name: "dsr1_8k1k"
 
+    collect-dsr1-8k1k-evals:
+        needs: [get-configs, benchmark-dsr1-8k1k]  # get-configs must be a direct need for the `if` to read its outputs
+        if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }}  # run even if benchmarks failed; skip on empty config
+        uses: ./.github/workflows/collect-evals.yml
+        secrets: inherit
+        with:
+            exp-name: "dsr1_8k1k"
+
     # GPTOSS 8K1K Benchmarks
     benchmark-gptoss-8k1k:
         needs: get-configs
@@ -246,6 +270,14 @@ jobs:
         with:
             exp-name: "gptoss_8k1k"
 
+    collect-gptoss-8k1k-evals:
+        needs: [get-configs, benchmark-gptoss-8k1k]  # get-configs must be a direct need for the `if` to read its outputs
+        if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }}  # run even if benchmarks failed; skip on empty config
+        uses: ./.github/workflows/collect-evals.yml
+        secrets: inherit
+        with:
+            exp-name: "gptoss_8k1k"
+
 
     # DSR1 1K8K Benchmarks
     benchmark-dsr1-1k8k:
@@ -376,6 +408,14 @@ jobs:
         with:
             exp-name: "dsr1_1k8k"
 
+    collect-dsr1-1k8k-evals:
+        needs: [get-configs, benchmark-dsr1-1k8k]  # get-configs must be a direct need for the `if` to read its outputs
+        if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }}  # run even if benchmarks failed; skip on empty config
+        uses: ./.github/workflows/collect-evals.yml
+        secrets: inherit
+        with:
+            exp-name: "dsr1_1k8k"
+
 
     # GPTOSS 1K8K Benchmarks
     benchmark-gptoss-1k8k:
@@ -412,6 +452,14 @@ jobs:
         with:
             exp-name: "gptoss_1k8k"
 
+    collect-gptoss-1k8k-evals:
+        needs: [get-configs, benchmark-gptoss-1k8k]  # get-configs must be a direct need for the `if` to read its outputs
+        if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }}  # run even if benchmarks failed; skip on empty config
+        uses: ./.github/workflows/collect-evals.yml
+        secrets: inherit
+        with:
+            exp-name: "gptoss_1k8k"
+
 
     calc-success-rate:
         needs:
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py