fix(profile): capture later GB200 decode step

Oseltamivir · Oseltamivir · commit 27ddec5ba9d2 · 2026-05-26T11:12:02.000-07:00
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -8694,20 +8694,20 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
   scenarios:
     fixed-seq-len:
     - isl: 8192
-      osl: 1024
+      osl: 2048
       search-space:
       - conc-list: [256]
         spec-decoding: mtp
         prefill:
-          num-worker: 1
-          tp: 16
+          num-worker: 16
+          tp: 1
           ep: 16
           dp-attn: true
           additional-settings:
           - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml"
         decode:
           num-worker: 0
-          tp: 16
+          tp: 1
           ep: 1
           dp-attn: false
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml
@@ -65,11 +65,11 @@ backend:
       speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
       attention-config: '{"use_fp4_indexer_cache":true}'
       tokenizer-mode: deepseek_v4
-      max-model-len: 9472
+      max-model-len: 10496
       max-num-seqs: 256
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 256
-      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":3,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
+      profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":2304,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
@@ -84,17 +84,17 @@ backend:
 profiling:
   type: "torch"
   aggregated:
-    start_step: 3
-    stop_step: 4
+    start_step: 2304
+    stop_step: 2305
 
 benchmark:
   type: "sa-bench"
   isl: 8192
-  osl: 1024
+  osl: 2048
   concurrencies: "256"
   req_rate: "inf"
   num_prompts_mult: 1
-  num_warmup_mult: 1
+  num_warmup_mult: 4
   use_chat_template: true
   custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"