Skip to content

Commit 542a246

Browse files
Update dsv4-fp8-h200-vllm (+mtp) vLLM image to v0.21.0 (#1461)
1 parent 21a4ab0 commit 542a246

4 files changed

Lines changed: 14 additions & 3 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2883,7 +2883,7 @@ dsr1-fp8-h200-sglang-mtp:
28832883
# Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache
28842884
# flag is omitted. Max-model-len is pinned at 800k per the recipe.
28852885
dsv4-fp8-h200-vllm:
2886-
image: vllm/vllm-openai:deepseekv4-cu129
2886+
image: vllm/vllm-openai:v0.21.0
28872887
model: deepseek-ai/DeepSeek-V4-Pro
28882888
model-prefix: dsv4
28892889
runner: h200
@@ -2907,7 +2907,7 @@ dsv4-fp8-h200-vllm:
29072907
# (the non-MTP entry above is on the v0.21.0 tag) and adds
29082908
# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
29092909
dsv4-fp8-h200-vllm-mtp:
2910-
image: vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4
2910+
image: vllm/vllm-openai:v0.21.0
29112911
model: deepseek-ai/DeepSeek-V4-Pro
29122912
model-prefix: dsv4
29132913
runner: h200

benchmarks/single_node/dsv4_fp8_h200.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ check_env_vars \
1313
CONC \
1414
ISL \
1515
OSL \
16+
MAX_MODEL_LEN \
1617
RANDOM_RANGE_RATIO \
1718
RESULT_FILENAME
1819

@@ -35,7 +36,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then
3536
setup_eval_context
3637
MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN"
3738
else
38-
MAX_MODEL_LEN_ARG="--max-model-len 800000"
39+
MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN"
3940
fi
4041

4142
# DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP);
@@ -62,6 +63,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
6263
"${PARALLEL_ARGS[@]}" \
6364
"${EP_ARGS[@]}" \
6465
$MAX_MODEL_LEN_ARG \
66+
--quantization deepseek_v4_fp8 \
6567
--gpu-memory-utilization 0.95 \
6668
--max-num-seqs 512 \
6769
--max-num-batched-tokens 512 \

benchmarks/single_node/dsv4_fp8_h200_mtp.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
7070
"${PARALLEL_ARGS[@]}" \
7171
"${EP_ARGS[@]}" \
7272
$MAX_MODEL_LEN_ARG \
73+
--quantization deepseek_v4_fp8 \
7374
--gpu-memory-utilization 0.95 \
7475
--max-num-seqs 512 \
7576
--max-num-batched-tokens 512 \

perf-changelog.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2662,6 +2662,14 @@
26622662
- "Update SGLang image from v0.5.9-cu129-amd64 (74d old) to v0.5.12-cu130"
26632663
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1458
26642664

2665+
- config-keys:
2666+
- dsv4-fp8-h200-vllm
2667+
- dsv4-fp8-h200-vllm-mtp
2668+
description:
2669+
- "Update vLLM image to v0.21.0 (from custom deepseekv4-cu129 / v0.20.1@sha256-pinned)"
2670+
- "Pass --quantization deepseek_v4_fp8 in dsv4_fp8_h200.sh and dsv4_fp8_h200_mtp.sh; dsv4_fp8_h200.sh now requires MAX_MODEL_LEN and uses it for --max-model-len instead of the hard-coded 800000 (--gpu-memory-utilization remains 0.95 in both scripts)"
2671+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1461
2672+
26652673
- config-keys:
26662674
- dsr1-fp8-mi325x-sglang
26672675
description:

0 commit comments

Comments
 (0)