SemiAnalysisAI
diff --git a/‎.github/workflows/benchmark-tmpl.yml‎
Lines changed: 33 additions & 1 deletion b/‎.github/workflows/benchmark-tmpl.yml‎
Lines changed: 33 additions & 1 deletion
diff --git a/‎.github/workflows/collect-evals.yml‎
Lines changed: 46 additions & 0 deletions b/‎.github/workflows/collect-evals.yml‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎.github/workflows/collect-results.yml‎
Lines changed: 3 additions & 1 deletion b/‎.github/workflows/collect-results.yml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎.github/workflows/e2e-tests.yml‎
Lines changed: 10 additions & 1 deletion b/‎.github/workflows/e2e-tests.yml‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎.github/workflows/run-sweep.yml‎
Lines changed: 16 additions & 0 deletions b/‎.github/workflows/run-sweep.yml‎
Lines changed: 16 additions & 0 deletions
@@ -50,6 +50,14 @@ on:
       disagg:
         required: true
         type: string
+      run-eval:  # when true, the eval artifact upload and cleanup steps of this job run
+        required: false  # optional, so `default` actually applies when a caller omits it
+        type: boolean
+        default: false
+      random-range-ratio:  # NOTE(review): not referenced in the visible hunks; confirm it is consumed
+        required: false
+        type: string
+        default: '0.8'
       ref:
         description: "Git ref (branch/sha) to checkout"
         required: false
@@ -74,6 +82,7 @@ env:
   CONC: ${{ inputs.conc }}
   SPEC_DECODING: ${{ inputs.spec-decoding }}
   DISAGG: ${{ inputs.disagg }}
+  RUN_EVAL: ${{ inputs.run-eval }}
 
 permissions:
   contents: read
@@ -82,7 +91,7 @@ jobs:
   benchmark:
     runs-on: ${{ inputs.runner }}
     timeout-minutes: 180
-    name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} spec=${{ inputs.spec-decoding }}'
+    name: "${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} ${{ inputs.run-eval && 'eval ' || '' }}tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }} spec=${{ inputs.spec-decoding }}"
     steps:
       - name: Resource cleanup
         run: |
@@ -113,7 +122,11 @@ jobs:
       - name: Launch job script
         env:
           RUNNER_NAME: ${{ runner.name }}
+          RUNNER_TYPE: ${{ inputs.runner }}
           RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_specdecode_${{ env.SPEC_DECODING }}_${{ runner.name }}
+          # Suppress per-job eval markdown from being appended to the step summary.
+          # We'll publish a single combined eval table in the collection job instead.
+          GITHUB_STEP_SUMMARY: ''
         run: |
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
           FOUND_RESULT_FILE=
@@ -137,8 +150,27 @@ jobs:
           RUNNER_TYPE: ${{ inputs.runner }}
         run: |
           python3 utils/process_result.py
+
       - name: Upload result
         uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
         with:
           name: bmk_${{ env.RESULT_FILENAME }}
           path: agg_${{ env.RESULT_FILENAME }}.json
+
+      - name: Upload eval results (if any)  # same upload-artifact pin as "Upload result"
+        if: ${{ env.RUN_EVAL == 'true' }}  # RUN_EVAL is the stringified run-eval input
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+        with:
+          name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
+          path: |
+            meta_env.json
+            results*.json
+            sample*.jsonl
+          if-no-files-found: ignore  # a run that produced no eval files must not fail here
+
+      - name: Cleanup eval outputs (post-upload)
+        # always(): still clean the workspace when an earlier step (e.g. the upload) failed.
+        if: ${{ always() && env.RUN_EVAL == 'true' }}
+        run: |
+          # Best-effort removal of every eval output the upload step collects.
+          rm -f meta_env.json results*.json sample*.jsonl || true
@@ -0,0 +1,56 @@
+name: Template - Collect Evals
+
+# Reusable workflow: downloads per-job eval artifacts, publishes one combined
+# summary, and uploads the aggregated JSON.
+on:
+  workflow_call:
+    inputs:
+      # Optional filter; when empty every eval_* artifact is collected and the
+      # aggregate is labelled 'all'.
+      result-prefix:
+        required: false
+        type: string
+        default: ''
+
+permissions:
+  contents: read
+
+jobs:
+  collect-evals:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          token: ${{ secrets.REPO_PAT }}
+          fetch-depth: 0
+
+      - name: Download eval artifacts
+        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
+        with:
+          path: eval_results/
+          # Narrow to eval_<prefix>_* when a prefix was given, else take every eval_* artifact.
+          pattern: ${{ inputs.result-prefix && format('eval_{0}_*', inputs.result-prefix) || 'eval_*' }}
+
+      - name: Summarize evals
+        env:
+          # Passed through the environment rather than interpolated into the
+          # script, so the input can never be parsed as shell syntax.
+          RESULT_PREFIX: ${{ inputs.result-prefix || 'all' }}
+        run: |
+          pip install tabulate
+          echo "## Eval Summary" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          python3 utils/collect_eval_results.py eval_results/ "$RESULT_PREFIX" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload aggregated evals
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+        with:
+          name: eval_results_${{ inputs.result-prefix || 'all' }}
+          # presumably written by collect_eval_results.py; verify against the script
+          path: agg_eval_${{ inputs.result-prefix || 'all' }}.json
+
+      - name: Cleanup downloaded eval artifacts
+        if: ${{ always() }}
+        run: |
+          rm -rf eval_results/ || true
@@ -34,7 +34,9 @@ jobs:
           python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY
 
       - name: Aggregate results
-        run: python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }}
+        run: |  # installs tabulate first; presumably collect_results.py imports it (TODO confirm)
+          pip install tabulate
+          python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }}
 
       - name: Upload aggregated results
         uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
 
@@ -122,16 +122,25 @@ jobs:
             conc: ${{ matrix.config.conc }}
             spec-decoding: ${{ matrix.config.spec-decoding }}
             disagg: ${{ matrix.config.disagg }}
+            run-eval: ${{ matrix.config.run-eval }}
             ref: ${{ inputs.ref }}
 
     collect-results:
         needs: [test-sweep-multi-node, test-sweep-single-node]
         if: ${{ always() }}
         uses: ./.github/workflows/collect-results.yml
         secrets: inherit
+        with:
+            result-prefix: "bmk"
+
+    collect-evals:  # reusable-workflow call that aggregates the eval artifacts
+        needs: [test-sweep-multi-node, test-sweep-single-node]
+        if: always()  # run whatever the outcome of the sweep jobs
+        uses: ./.github/workflows/collect-evals.yml
+        secrets: inherit
 
     calc-success-rate:
-        needs: collect-results
+        needs: [collect-results, collect-evals]
         if: ${{ always() }}
         runs-on: ubuntu-latest
 
 
@@ -142,6 +142,7 @@ jobs:
             conc: ${{ matrix.config.conc }}
             spec-decoding: ${{ matrix.config.spec-decoding }}
             disagg: ${{ matrix.config.disagg }}
+            run-eval: ${{ matrix.config.run-eval }}
 
     sweep-single-node-1k8k:
         needs: setup
@@ -184,6 +185,21 @@ jobs:
         with:
             result-prefix: "bmk"
 
+    collect-evals:
+        # Waits for every sweep variant, then runs whatever their outcome,
+        # as long as setup itself was not skipped.
+        needs:
+            - sweep-single-node-1k1k
+            - sweep-single-node-1k8k
+            - sweep-single-node-8k1k
+            - sweep-multi-node-1k1k
+            - sweep-multi-node-1k8k
+            - sweep-multi-node-8k1k
+            - setup
+        if: always() && needs.setup.result != 'skipped'
+        uses: ./.github/workflows/collect-evals.yml
+        secrets: inherit
+
     upload-changelog-metadata:
         needs: [setup, collect-results]
         if: ${{ always() && needs.setup.result != 'skipped' }}