speedbench-al: parameterize model + relocate collector script

qiching · qiching · commit d595d49ab931 · 2026-06-04T10:17:28.000-07:00
Address review:
- Model is now a workflow input (model + model-prefix, default
  deepseek-ai/DeepSeek-V4-Pro / dsv4). MODEL, MODEL_PREFIX, EXP_NAME,
  BENCH_SCRIPT_OVERRIDE, artifact names and the Create-PR branch/title/body
  are all derived from those inputs. The emitted YAML top-level key is now
  derived from the model (MODEL_KEY, defaults to the model basename lowercased).
- Move the collector to benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
  and fix its benchmark_lib.sh source path (../ -> ../../) for the deeper dir.
diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml
@@ -1,9 +1,10 @@
 name: SpeedBench AL Collection
 
-# Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro SPEED-Bench
-# acceptance-length (AL) matrix: thinking_on/off x MTP levels. Produces the
-# golden reference consumed by the synthetic-acceptance framework and (optionally)
-# opens a PR updating benchmarks/speedbench-reference-al.yaml.
+# Push-button (workflow_dispatch) collection of a SPEED-Bench acceptance-length
+# (AL) matrix: thinking_on/off x MTP levels, for the given model (defaults to
+# DeepSeek-V4-Pro). Produces the golden reference consumed by the
+# synthetic-acceptance framework and (optionally) opens a PR updating
+# benchmarks/speedbench-reference-al.yaml.
 
 on:
   workflow_dispatch:
@@ -13,6 +14,16 @@ on:
         required: false
         type: string
         default: 'b300'
+      model:
+        description: "HF model id (basename must be in launcher STAGED_MODELS for pre-staged local weights)"
+        required: false
+        type: string
+        default: 'deepseek-ai/DeepSeek-V4-Pro'
+      model-prefix:
+        description: "Model prefix; drives launcher MODEL_PATH resolution, exp name, collector script, and artifact names"
+        required: false
+        type: string
+        default: 'dsv4'
       image:
         description: "vLLM container image"
         required: false
@@ -64,22 +75,22 @@ permissions:
 env:
   HF_TOKEN: ${{ secrets.HF_TOKEN }}
   HF_HUB_CACHE: '/mnt/hf_hub_cache/'
-  # Drive the dsv4 single-node path in runners/launch_b300-nv.sh. MODEL is the
-  # HF id; its basename (DeepSeek-V4-Pro) is in the launcher's STAGED_MODELS, so
+  # Drive the single-node path in runners/launch_b300-nv.sh. MODEL is the HF id;
+  # its basename (e.g. DeepSeek-V4-Pro) must be in the launcher's STAGED_MODELS so
   # the launcher resolves MODEL_PATH to the pre-staged local weights and mounts
   # them. The collector serves from MODEL_PATH (see SERVE_MODEL), so no download.
-  MODEL: deepseek-ai/DeepSeek-V4-Pro
-  MODEL_PREFIX: dsv4
+  MODEL: ${{ inputs.model }}
+  MODEL_PREFIX: ${{ inputs.model-prefix }}
   PRECISION: fp4
   FRAMEWORK: vllm
-  EXP_NAME: dsv4_speedbench
+  EXP_NAME: ${{ inputs.model-prefix }}_speedbench
   IMAGE: ${{ inputs.image }}
   TP: '8'
   EP_SIZE: '1'
   DP_ATTENTION: 'false'
   SPEC_DECODING: mtp
   # Run the AL-matrix collector instead of the auto-selected throughput script.
-  BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+  BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/speedbench/${{ inputs.model-prefix }}_fp4_b300_vllm.sh
   SALLOC_TIME_LIMIT: ${{ inputs.salloc-time }}
   # Matrix-collector tunables (propagated into the container via srun --export=ALL).
   MTP_LIST: ${{ inputs.mtp-list }}
@@ -158,7 +169,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: speedbench-reference-al
+          name: speedbench-reference-al-${{ inputs.model-prefix }}
           path: speedbench-reference-al.yaml
           if-no-files-found: warn
 
@@ -168,9 +179,12 @@ jobs:
           GH_TOKEN: ${{ secrets.REPO_PAT }}
         run: |
           set -euo pipefail
+          # NOTE: the reference yaml is keyed by model at the top level. This
+          # overwrites it with the current model's matrix; when more than one
+          # model is collected, replace this cp with a per-model-key YAML merge.
           cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml
 
-          BRANCH="speedbench-al/auto-${{ github.run_id }}"
+          BRANCH="speedbench-al/${{ inputs.model-prefix }}-auto-${{ github.run_id }}"
           git config user.name "github-actions"
           git config user.email "github-actions@github.com"
           git checkout -b "$BRANCH"
@@ -179,19 +193,19 @@ jobs:
             echo "No change in reference yaml; skipping PR."
             exit 0
           fi
-          git commit -m "Update SpeedBench AL reference matrix (auto, run ${{ github.run_id }})"
+          git commit -m "Update SpeedBench AL reference matrix for ${{ inputs.model }} (auto, run ${{ github.run_id }})"
           git push -u origin "$BRANCH"
           gh pr create \
-            --title "Update SpeedBench AL reference matrix (auto)" \
-            --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Category: \`${{ inputs.category }}\`, MTP: \`${{ inputs.mtp-list }}\`, thinking: \`${{ inputs.thinking-modes }}\`, output_len: \`${{ inputs.output-len }}\`. Please review the measured values before merging." \
+            --title "Update SpeedBench AL reference matrix for ${{ inputs.model-prefix }} (auto)" \
+            --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Model: \`${{ inputs.model }}\`, category: \`${{ inputs.category }}\`, MTP: \`${{ inputs.mtp-list }}\`, thinking: \`${{ inputs.thinking-modes }}\`, output_len: \`${{ inputs.output-len }}\`. Please review the measured values before merging." \
             --base main \
             --head "$BRANCH"
 
       - name: Upload server logs
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: speedbench_server_logs
+          name: speedbench_server_logs-${{ inputs.model-prefix }}
           path: speedbench_results/server_*.log
           if-no-files-found: ignore
 
diff --git a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
@@ -13,7 +13,7 @@
 #
 # Usage (inside the vLLM container, on a B300 node):
 #   export MODEL=/data/models/dsv4-pro
-#   bash benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+#   bash benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
 #
 # Tunables (env):
 #   MTP_LIST          space-separated MTP levels   (default "1 2 3 4 5 6 7 8")
@@ -23,7 +23,7 @@
 #   OUT_YAML          output matrix path            (default $RESULTS_DIR/speedbench-reference-al.yaml)
 
 set -uo pipefail
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsv4-pro)}"
 # Serve from the local weights dir resolved by the launcher (MODEL_PATH points
@@ -39,6 +39,9 @@ PORT="${PORT:-8888}"
 MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}"
 THINKING_MODES="${THINKING_MODES:-off on}"
 CATEGORY="${CATEGORY:-coding}"
+# Top-level key in the emitted YAML matrix. Derived from the model by the
+# workflow (e.g. deepseek-v4-pro); falls back to the SERVE_MODEL basename, lowercased.
+MODEL_KEY="${MODEL_KEY:-$(basename "$SERVE_MODEL" | tr '[:upper:]' '[:lower:]')}"
 SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}"
 CONCURRENCY="${CONCURRENCY:-1}"
 TEMPERATURE="${TEMPERATURE:-1.0}"
@@ -315,11 +318,11 @@ emit_mode_block() {
     echo "# Acceptance Length (AL) reference values measured with SPEED-Bench."
     echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | output_len: $SPEEDBENCH_OUTPUT_LEN"
     echo "# thinking_on chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_ON"
-    echo "# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens."
-    echo "# Auto-generated by dsv4_fp4_b300_vllm_speedbench_matrix.sh (speedbench-al.yml)."
+    echo "# Measured on $MODEL_KEY (B300, vLLM MTP), per num_speculative_tokens."
+    echo "# Auto-generated by benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh (speedbench-al.yml)."
     echo "#"
     echo "# key = num_speculative_tokens (MTP level); value = golden AL"
-    echo "deepseek-v4-pro:"
+    echo "${MODEL_KEY}:"
     if [[ " $THINKING_MODES " == *" on "* ]]; then
         echo "  thinking_on:"
         emit_mode_block on