Skip to content

Commit 27ddec5

Browse files
committed
fix(profile): capture later GB200 decode step
1 parent: eb885ff · commit: 27ddec5

2 files changed

Lines changed: 10 additions & 10 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8694,20 +8694,20 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
86948694
scenarios:
86958695
fixed-seq-len:
86968696
- isl: 8192
8697-
osl: 1024
8697+
osl: 2048
86988698
search-space:
86998699
- conc-list: [256]
87008700
spec-decoding: mtp
87018701
prefill:
8702-
num-worker: 1
8703-
tp: 16
8702+
num-worker: 16
8703+
tp: 1
87048704
ep: 16
87058705
dp-attn: true
87068706
additional-settings:
87078707
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml"
87088708
decode:
87098709
num-worker: 0
8710-
tp: 16
8710+
tp: 1
87118711
ep: 1
87128712
dp-attn: false
87138713

benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -65,11 +65,11 @@ backend:
6565
speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
6666
attention-config: '{"use_fp4_indexer_cache":true}'
6767
tokenizer-mode: deepseek_v4
68-
max-model-len: 9472
68+
max-model-len: 10496
6969
max-num-seqs: 256
7070
max-num-batched-tokens: 256
7171
max-cudagraph-capture-size: 256
72-
profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":3,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
72+
profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":2304,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}'
7373
trust-remote-code: true
7474
no-enable-prefix-caching: true
7575
no-enable-flashinfer-autotune: true
@@ -84,17 +84,17 @@ backend:
8484
profiling:
8585
type: "torch"
8686
aggregated:
87-
start_step: 3
88-
stop_step: 4
87+
start_step: 2304
88+
stop_step: 2305
8989

9090
benchmark:
9191
type: "sa-bench"
9292
isl: 8192
93-
osl: 1024
93+
osl: 2048
9494
concurrencies: "256"
9595
req_rate: "inf"
9696
num_prompts_mult: 1
97-
num_warmup_mult: 1
97+
num_warmup_mult: 4
9898
use_chat_template: true
9999
custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"
100100

0 commit comments

Comments
 (0)