Skip to content

Commit 9b534f7

Browse files
committed
fix(profile): disable Flash vLLM MTP cudagraphs
1 parent cd160ee commit 9b534f7

1 file changed

Lines changed: 9 additions & 2 deletions

File tree

benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,14 @@ if [[ "${PROFILE:-}" == "1" ]]; then
52 52
)
53 53
fi
54 54

55+
COMPILATION_ARGS=(
56+
--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
57+
--max-cudagraph-capture-size 2048
58+
)
59+
if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
60+
COMPILATION_ARGS=(--compilation-config '{"cudagraph_mode":"NONE","custom_ops":["all"]}')
61+
fi
62+
55 63
BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
56 64

57 65
if [ "${EVAL_ONLY}" = "true" ]; then
@@ -82,13 +90,12 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
82 90
"${EP_ARGS[@]}" \
83 91
"${MOE_ARGS[@]}" \
84 92
"${PROFILE_ARGS[@]}" \
85-
--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
93+
"${COMPILATION_ARGS[@]}" \
86 94
--attention_config.use_fp4_indexer_cache True \
87 95
--tokenizer-mode deepseek_v4 \
88 96
--tool-call-parser deepseek_v4 \
89 97
--enable-auto-tool-choice \
90 98
--reasoning-parser deepseek_v4 \
91-
--max-cudagraph-capture-size 2048 \
92 99
--speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
93 100
--max-model-len "$SERVE_MAX_MODEL_LEN" \
94 101
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &

0 commit comments

Comments
 (0)