feat(profile): add Flash vLLM MTP3 run

Oseltamivir · Oseltamivir · commit 39f914bf06ed · 2026-05-21T13:37:39.000-07:00
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -2073,6 +2073,23 @@ dsv4-flash-fp4-b300-vllm:
       search-space:
       - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }
 
+# Targeted Flash vLLM MTP profile: same 1k1k conc=64 point as the non-MTP Flash
+# vLLM profile above. The shared vLLM MTP launcher uses 3 speculative tokens here.
+dsv4-flash-fp4-b300-vllm-mtp:
+  image: vllm/vllm-openai:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 64, conc-end: 64, spec-decoding: mtp }
+
 # Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the
 # non-MTP Flash profile above. The shared SGLang MTP launcher selects the
 # Flash-only (steps=3, draft-tokens=3) speculative settings for this model.
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -62,8 +62,12 @@ else
     SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN"
 fi
 
-# use 2 speculative tokens for all configs for now
+# Default to 2 speculative tokens, which keeps the existing Pro MTP profile
+# unchanged; only deepseek-ai/DeepSeek-V4-Flash is overridden to 3 below.
 NUM_SPEC_TOKENS=2
+if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
+    NUM_SPEC_TOKENS=3
+fi
 
 start_gpu_monitor