1- name : " svf-vllm-disagg -gb200-profile-16gpu-conc256-mtp3"
1+ name : " svf-vllm-agg -gb200-profile-16gpu-conc256-mtp3"
22
33model :
44 path : " deepseek-v4-pro"
@@ -21,15 +21,9 @@ health_check:
2121resources :
2222 gpu_type : " gb200"
2323 gpus_per_node : 4
24- prefill_nodes : 2
25- decode_nodes : 2
26- prefill_workers : 1
27- decode_workers : 1
28- gpus_per_prefill : 8
29- gpus_per_decode : 8
30-
31- infra :
32- etcd_nats_dedicated_node : true
24+ agg_nodes : 4
25+ agg_workers : 1
26+ gpus_per_agg : 16
3327
3428frontend :
3529 type : dynamo
@@ -38,7 +32,7 @@ frontend:
3832backend :
3933 type : vllm
4034 connector : null
41- prefill_environment :
35+ aggregated_environment :
4236 VLLM_ENGINE_READY_TIMEOUT_S : " 3600"
4337 TILELANG_CLEANUP_TEMP_FILES : " 1"
4438 VLLM_USE_NCCL_SYMM_MEM : " 1"
@@ -54,67 +48,25 @@ backend:
5448 UCX_TLS : " cuda_copy,cuda_ipc,tcp"
5549 UCX_CUDA_IPC_ENABLE_MNNVL : " y"
5650 NCCL_P2P_LEVEL : NVL
57- decode_environment :
58- VLLM_ENGINE_READY_TIMEOUT_S : " 3600"
59- TILELANG_CLEANUP_TEMP_FILES : " 1"
60- VLLM_USE_NCCL_SYMM_MEM : " 1"
61- TORCH_SYMMMEM : " NVSHMEM"
62- NCCL_CUMEM_ENABLE : " 1"
63- NCCL_MNNVL_ENABLE : " 1"
64- NCCL_NVLS_ENABLE : " 1"
65- VLLM_SERVER_DEV_MODE : " 1"
66- UCX_MEMTYPE_CACHE : " n"
67- UCX_MEMTYPE_REG_WHOLE : " n"
68- UCX_TLS : " cuda_copy,cuda_ipc,tcp"
69- UCX_CUDA_IPC_ENABLE_MNNVL : " y"
70- NCCL_P2P_LEVEL : NVL
7151 vllm_config :
72- prefill :
73- kv-transfer-config : ' {"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
52+ aggregated :
7453 served-model-name : " deepseek-ai/DeepSeek-V4-Pro"
7554 kv-cache-dtype : " fp8"
7655 tensor-parallel-size : 1
7756 pipeline-parallel-size : 1
7857 data-parallel-hybrid-lb : true
79- data-parallel-size : 8
58+ data-parallel-size : 16
8059 data-parallel-rpc-port : 13345
8160 enable-expert-parallel : true
8261 enable-ep-weight-filter : true
8362 moe-backend : deep_gemm_mega_moe
84- enforce-eager : true
8563 speculative-config : ' {"method":"mtp","num_speculative_tokens":3}'
8664 attention-config : ' {"use_fp4_indexer_cache":true}'
87- max-model-len : 9472
88- max-num-seqs : 8
89- max-num-batched-tokens : 16384
90- trust-remote-code : true
91- no-enable-prefix-caching : true
92- no-enable-flashinfer-autotune : true
93- no-async-scheduling : true
94- block-size : 256
95- gpu-memory-utilization : 0.9
96- no-disable-hybrid-kv-cache-manager : true
97- enable-sleep-mode : true
98- numa-bind : true
9965 tokenizer-mode : deepseek_v4
100- decode :
101- kv-transfer-config : ' {"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
102- served-model-name : " deepseek-ai/DeepSeek-V4-Pro"
103- kv-cache-dtype : " fp8"
104- tensor-parallel-size : 1
105- pipeline-parallel-size : 1
106- data-parallel-hybrid-lb : true
107- data-parallel-size : 8
108- data-parallel-rpc-port : 13345
109- enable-expert-parallel : true
110- enable-ep-weight-filter : true
111- moe-backend : deep_gemm_mega_moe
112- speculative-config : ' {"method":"mtp","num_speculative_tokens":3}'
113- attention-config : ' {"use_fp4_indexer_cache":true}'
11466 max-model-len : 9472
11567 max-num-seqs : 256
116- max-cudagraph-capture-size : 256
11768 max-num-batched-tokens : 256
69+ max-cudagraph-capture-size : 256
11870 trust-remote-code : true
11971 no-enable-prefix-caching : true
12072 no-enable-flashinfer-autotune : true
@@ -124,14 +76,11 @@ backend:
12476 stream-interval : 50
12577 no-disable-hybrid-kv-cache-manager : true
12678 enable-sleep-mode : true
127- tokenizer-mode : deepseek_v4
79+ all2all-backend : " flashinfer_nvlink_one_sided "
12880
12981profiling :
13082 type : " torch"
131- prefill :
132- start_step : 100000
133- stop_step : 100001
134- decode :
83+ aggregated :
13584 start_step : 3
13685 stop_step : 4
13786
0 commit comments