File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -2883,7 +2883,7 @@ dsr1-fp8-h200-sglang-mtp:
28832883# Uses the stock vllm/vllm-openai:v0.21.0 image. H200 has no FP4 path, so the FP4 indexer cache
28842884# flag is omitted. Max-model-len is pinned at 800k per the recipe.
28852885dsv4-fp8-h200-vllm :
2886- image : vllm/vllm-openai:deepseekv4-cu129
2886+ image : vllm/vllm-openai:v0.21.0
28872887 model : deepseek-ai/DeepSeek-V4-Pro
28882888 model-prefix : dsv4
28892889 runner : h200
@@ -2907,7 +2907,7 @@ dsv4-fp8-h200-vllm:
29072907# (the non-MTP entry above now uses the same v0.21.0 tag) and adds
29082908# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
29092909dsv4-fp8-h200-vllm-mtp :
2910- image : vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4
2910+ image : vllm/vllm-openai:v0.21.0
29112911 model : deepseek-ai/DeepSeek-V4-Pro
29122912 model-prefix : dsv4
29132913 runner : h200
Original file line number Diff line number Diff line change @@ -13,6 +13,7 @@ check_env_vars \
1313 CONC \
1414 ISL \
1515 OSL \
16+ MAX_MODEL_LEN \
1617 RANDOM_RANGE_RATIO \
1718 RESULT_FILENAME
1819
@@ -35,7 +36,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then
3536 setup_eval_context
3637 MAX_MODEL_LEN_ARG=" --max-model-len $EVAL_MAX_MODEL_LEN "
3738else
38- MAX_MODEL_LEN_ARG=" --max-model-len 800000 "
39+ MAX_MODEL_LEN_ARG=" --max-model-len $MAX_MODEL_LEN "
3940fi
4041
4142# DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP);
@@ -62,6 +63,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
6263" ${PARALLEL_ARGS[@]} " \
6364" ${EP_ARGS[@]} " \
6465$MAX_MODEL_LEN_ARG \
66+ --quantization deepseek_v4_fp8 \
6567--gpu-memory-utilization 0.95 \
6668--max-num-seqs 512 \
6769--max-num-batched-tokens 512 \
Original file line number Diff line number Diff line change @@ -70,6 +70,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
7070" ${PARALLEL_ARGS[@]} " \
7171" ${EP_ARGS[@]} " \
7272$MAX_MODEL_LEN_ARG \
73+ --quantization deepseek_v4_fp8 \
7374--gpu-memory-utilization 0.95 \
7475--max-num-seqs 512 \
7576--max-num-batched-tokens 512 \
Original file line number Diff line number Diff line change 26622662 - " Update SGLang image from v0.5.9-cu129-amd64 (74d old) to v0.5.12-cu130"
26632663 pr-link : https://github.com/SemiAnalysisAI/InferenceX/pull/1458
26642664
2665+ - config-keys :
2666+ - dsv4-fp8-h200-vllm
2667+ - dsv4-fp8-h200-vllm-mtp
2668+ description :
2669+ - " Update vLLM image to v0.21.0 (from custom deepseekv4-cu129 / v0.20.1@sha256-pinned)"
2670+ - " Pass --quantization deepseek_v4_fp8 to vllm serve in dsv4_fp8_h200.sh and dsv4_fp8_h200_mtp.sh, and replace the hard-coded --max-model-len 800000 with the MAX_MODEL_LEN env var (now required); --gpu-memory-utilization stays at 0.95"
2671+ pr-link : https://github.com/SemiAnalysisAI/InferenceX/pull/1461
2672+
26652673- config-keys :
26662674 - dsr1-fp8-mi325x-sglang
26672675 description :
You can’t perform that action at this time.
0 commit comments