Skip to content

Commit 80e394e

Browse files
committed
fix: limit gb200 flash profile to one decode token
1 parent 2ede19e commit 80e394e

2 files changed

Lines changed: 6 additions & 6 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8694,7 +8694,7 @@ dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile:
86948694
scenarios:
86958695
fixed-seq-len:
86968696
- isl: 8192
8697-
osl: 256
8697+
osl: 1
86988698
search-space:
86998699
- conc-list: [16]
87008700
spec-decoding: mtp

benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-eplp-dep16-conc16-mtp3.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ backend:
6666
speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
6767
attention-config: '{"use_fp4_indexer_cache":true}'
6868
tokenizer-mode: deepseek_v4
69-
max-model-len: 8704
69+
max-model-len: 8449
7070
max-num-seqs: 16
7171
max-num-batched-tokens: 2048
7272
max-cudagraph-capture-size: 16
@@ -77,21 +77,21 @@ backend:
7777
block-size: 256
7878
compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","mode":3}'
7979
gpu-memory-utilization: 0.9
80-
stream-interval: 50
80+
stream-interval: 1
8181
no-disable-hybrid-kv-cache-manager: true
8282
enable-sleep-mode: true
8383
all2all-backend: "flashinfer_nvlink_one_sided"
8484

8585
profiling:
8686
type: "torch"
8787
aggregated:
88-
start_step: 384
89-
stop_step: 385
88+
start_step: 1
89+
stop_step: 2
9090

9191
benchmark:
9292
type: "sa-bench"
9393
isl: 8192
94-
osl: 256
94+
osl: 1
9595
concurrencies: "16"
9696
req_rate: "inf"
9797
num_prompts_mult: 1

0 commit comments

Comments
 (0)