Skip to content

Commit 2f300a3

Browse files
committed
fix(profile): use aggregate GB200 DSV4 profile
1 parent 58d423e commit 2f300a3

2 files changed

Lines changed: 19 additions & 70 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -8681,7 +8681,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
86818681
dp-attn: true
86828682

86838683
# Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch
8684-
# 256 shape: 1 prefill DEP8 + 1 decode DEP8 on GB200, MTP3, conc=256.
8684+
# 256 shape: aggregated DEP16 on GB200, MTP3, conc=256.
86858685
dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
86868686
image: vllm/vllm-openai:v0.21.0-ubuntu2404
86878687
model: deepseek-ai/DeepSeek-V4-Pro
@@ -8690,7 +8690,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
86908690
precision: fp4
86918691
framework: dynamo-vllm
86928692
multinode: true
8693-
disagg: true
8693+
disagg: false
86948694
scenarios:
86958695
fixed-seq-len:
86968696
- isl: 8192
@@ -8700,16 +8700,16 @@ dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
87008700
spec-decoding: mtp
87018701
prefill:
87028702
num-worker: 1
8703-
tp: 8
8704-
ep: 8
8703+
tp: 16
8704+
ep: 16
87058705
dp-attn: true
87068706
additional-settings:
8707-
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml"
8707+
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml"
87088708
decode:
8709-
num-worker: 1
8710-
tp: 8
8711-
ep: 8
8712-
dp-attn: true
8709+
num-worker: 0
8710+
tp: 16
8711+
ep: 1
8712+
dp-attn: false
87138713

87148714
dsv4-fp4-b300-dynamo-vllm:
87158715
image: vllm/vllm-openai:v0.20.1

benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml renamed to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-profile-16gpu-conc256-mtp3.yaml

Lines changed: 10 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
name: "svf-vllm-disagg-gb200-profile-16gpu-conc256-mtp3"
1+
name: "svf-vllm-agg-gb200-profile-16gpu-conc256-mtp3"
22

33
model:
44
path: "deepseek-v4-pro"
@@ -21,15 +21,9 @@ health_check:
2121
resources:
2222
gpu_type: "gb200"
2323
gpus_per_node: 4
24-
prefill_nodes: 2
25-
decode_nodes: 2
26-
prefill_workers: 1
27-
decode_workers: 1
28-
gpus_per_prefill: 8
29-
gpus_per_decode: 8
30-
31-
infra:
32-
etcd_nats_dedicated_node: true
24+
agg_nodes: 4
25+
agg_workers: 1
26+
gpus_per_agg: 16
3327

3428
frontend:
3529
type: dynamo
@@ -38,7 +32,7 @@ frontend:
3832
backend:
3933
type: vllm
4034
connector: null
41-
prefill_environment:
35+
aggregated_environment:
4236
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
4337
TILELANG_CLEANUP_TEMP_FILES: "1"
4438
VLLM_USE_NCCL_SYMM_MEM: "1"
@@ -54,67 +48,25 @@ backend:
5448
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
5549
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
5650
NCCL_P2P_LEVEL: NVL
57-
decode_environment:
58-
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
59-
TILELANG_CLEANUP_TEMP_FILES: "1"
60-
VLLM_USE_NCCL_SYMM_MEM: "1"
61-
TORCH_SYMMMEM: "NVSHMEM"
62-
NCCL_CUMEM_ENABLE: "1"
63-
NCCL_MNNVL_ENABLE: "1"
64-
NCCL_NVLS_ENABLE: "1"
65-
VLLM_SERVER_DEV_MODE: "1"
66-
UCX_MEMTYPE_CACHE: "n"
67-
UCX_MEMTYPE_REG_WHOLE: "n"
68-
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
69-
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
70-
NCCL_P2P_LEVEL: NVL
7151
vllm_config:
72-
prefill:
73-
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
52+
aggregated:
7453
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
7554
kv-cache-dtype: "fp8"
7655
tensor-parallel-size: 1
7756
pipeline-parallel-size: 1
7857
data-parallel-hybrid-lb: true
79-
data-parallel-size: 8
58+
data-parallel-size: 16
8059
data-parallel-rpc-port: 13345
8160
enable-expert-parallel: true
8261
enable-ep-weight-filter: true
8362
moe-backend: deep_gemm_mega_moe
84-
enforce-eager: true
8563
speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
8664
attention-config: '{"use_fp4_indexer_cache":true}'
87-
max-model-len: 9472
88-
max-num-seqs: 8
89-
max-num-batched-tokens: 16384
90-
trust-remote-code: true
91-
no-enable-prefix-caching: true
92-
no-enable-flashinfer-autotune: true
93-
no-async-scheduling: true
94-
block-size: 256
95-
gpu-memory-utilization: 0.9
96-
no-disable-hybrid-kv-cache-manager: true
97-
enable-sleep-mode: true
98-
numa-bind: true
9965
tokenizer-mode: deepseek_v4
100-
decode:
101-
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
102-
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
103-
kv-cache-dtype: "fp8"
104-
tensor-parallel-size: 1
105-
pipeline-parallel-size: 1
106-
data-parallel-hybrid-lb: true
107-
data-parallel-size: 8
108-
data-parallel-rpc-port: 13345
109-
enable-expert-parallel: true
110-
enable-ep-weight-filter: true
111-
moe-backend: deep_gemm_mega_moe
112-
speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
113-
attention-config: '{"use_fp4_indexer_cache":true}'
11466
max-model-len: 9472
11567
max-num-seqs: 256
116-
max-cudagraph-capture-size: 256
11768
max-num-batched-tokens: 256
69+
max-cudagraph-capture-size: 256
11870
trust-remote-code: true
11971
no-enable-prefix-caching: true
12072
no-enable-flashinfer-autotune: true
@@ -124,14 +76,11 @@ backend:
12476
stream-interval: 50
12577
no-disable-hybrid-kv-cache-manager: true
12678
enable-sleep-mode: true
127-
tokenizer-mode: deepseek_v4
79+
all2all-backend: "flashinfer_nvlink_one_sided"
12880

12981
profiling:
13082
type: "torch"
131-
prefill:
132-
start_step: 100000
133-
stop_step: 100001
134-
decode:
83+
aggregated:
13584
start_step: 3
13685
stop_step: 4
13786

0 commit comments

Comments
 (0)