diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3d6a17ff7..e24ec38c0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1756,7 +1756,7 @@ dsv4-fp4-b200-sglang: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } dsv4-fp4-b200-vllm: - image: vllm/vllm-openai:v0.22.0 + image: vllm/vllm-openai:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh index 8021e54e7..2cafd4b32 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh @@ -43,8 +43,10 @@ fi GMU_ARGS=() MOE_ARGS=() +EPLB_ARGS=() if [ "${DP_ATTENTION}" = "true" ]; then MOE_ARGS=(--moe-backend deep_gemm_mega_moe) + EPLB_ARGS=(--enable-eplb --eplb-config '{"communicator":"torch_nccl", "use_async": false}') fi if [ "${ISL}" -eq 8192 ] && [ "${CONC}" -le 128 ]; then @@ -78,6 +80,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ "${EP_ARGS[@]}" \ "${GMU_ARGS[@]}" \ "${MOE_ARGS[@]}" \ + "${EPLB_ARGS[@]}" \ --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \ --attention_config.use_fp4_indexer_cache=True \ --tokenizer-mode deepseek_v4 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d69f528a8..eb23a6cc8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3445,6 +3445,12 @@ - "Add 1k1k/8k1k minimax recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1641 +- config-keys: + - dsv4-fp4-b200-vllm + description: + - "Enable EPLB for DEP configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1655 + - config-keys: - minimaxm2.5-fp8-gb300-dynamo-vllm description: