fix: limit gb200 flash profile to one decode token

Oseltamivir · Oseltamivir · commit 80e394ea02e3 · 2026-05-28T22:02:04.000-07:00
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -8694,7 +8694,7 @@ dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile:
   scenarios:
     fixed-seq-len:
     - isl: 8192
-      osl: 256
+      osl: 1
       search-space:
       - conc-list: [16]
         spec-decoding: mtp
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-eplp-dep16-conc16-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-eplp-dep16-conc16-mtp3.yaml
@@ -66,7 +66,7 @@ backend:
       speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
       attention-config: '{"use_fp4_indexer_cache":true}'
       tokenizer-mode: deepseek_v4
-      max-model-len: 8704
+      max-model-len: 8449
       max-num-seqs: 16
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 16
@@ -77,21 +77,21 @@ backend:
       block-size: 256
       compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","mode":3}'
       gpu-memory-utilization: 0.9
-      stream-interval: 50
+      stream-interval: 1
       no-disable-hybrid-kv-cache-manager: true
       enable-sleep-mode: true
       all2all-backend: "flashinfer_nvlink_one_sided"
 
 profiling:
   type: "torch"
   aggregated:
-    start_step: 384
-    stop_step: 385
+    start_step: 1
+    stop_step: 2
 
 benchmark:
   type: "sa-bench"
   isl: 8192
-  osl: 256
+  osl: 1
   concurrencies: "16"
   req_rate: "inf"
   num_prompts_mult: 1