File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -52,6 +52,14 @@ if [[ "${PROFILE:-}" == "1" ]]; then
5252 )
5353fi
5454
55+ COMPILATION_ARGS=(
56+ --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
57+ --max-cudagraph-capture-size 2048
58+ )
59+ if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then
60+ COMPILATION_ARGS=(--compilation-config '{"cudagraph_mode":"NONE","custom_ops":["all"]}')
61+ fi
62+
5563BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN
5664
5765if [ "${EVAL_ONLY}" = "true" ]; then
@@ -82,13 +90,12 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
8290 "${EP_ARGS[@]}" \
8391 "${MOE_ARGS[@]}" \
8492 "${PROFILE_ARGS[@]}" \
85- --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
93+ "${COMPILATION_ARGS[@]}" \
8694 --attention_config.use_fp4_indexer_cache True \
8795 --tokenizer-mode deepseek_v4 \
8896 --tool-call-parser deepseek_v4 \
8997 --enable-auto-tool-choice \
9098 --reasoning-parser deepseek_v4 \
91- --max-cudagraph-capture-size 2048 \
9299 --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
93100 --max-model-len "$SERVE_MAX_MODEL_LEN" \
94101 --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &
You can’t perform that action at this time.
0 commit comments