Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2883,7 +2883,7 @@ dsr1-fp8-h200-sglang-mtp:
# Uses the v0.21.0 image. H200 has no FP4 path, so the FP4 indexer cache
# flag is omitted. Max-model-len is pinned at 800k per the recipe.
dsv4-fp8-h200-vllm:
- image: vllm/vllm-openai:deepseekv4-cu129
+ image: vllm/vllm-openai:v0.21.0
Comment thread
Oseltamivir marked this conversation as resolved.
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: h200
Expand All @@ -2907,7 +2907,7 @@ dsv4-fp8-h200-vllm:
# (the non-MTP entry above is on the v0.21.0 tag) and adds
# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
dsv4-fp8-h200-vllm-mtp:
- image: vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4
+ image: vllm/vllm-openai:v0.21.0
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: h200
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/single_node/dsv4_fp8_h200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ check_env_vars \
CONC \
ISL \
OSL \
MAX_MODEL_LEN \
RANDOM_RANGE_RATIO \
RESULT_FILENAME

Expand All @@ -35,7 +36,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN"
else
- MAX_MODEL_LEN_ARG="--max-model-len 800000"
+ MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN"
fi

# DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP);
Expand All @@ -62,6 +63,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
"${PARALLEL_ARGS[@]}" \
"${EP_ARGS[@]}" \
$MAX_MODEL_LEN_ARG \
--quantization deepseek_v4_fp8 \
--gpu-memory-utilization 0.95 \
--max-num-seqs 512 \
--max-num-batched-tokens 512 \
Expand Down
1 change: 1 addition & 0 deletions benchmarks/single_node/dsv4_fp8_h200_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
"${PARALLEL_ARGS[@]}" \
"${EP_ARGS[@]}" \
$MAX_MODEL_LEN_ARG \
--quantization deepseek_v4_fp8 \
--gpu-memory-utilization 0.95 \
--max-num-seqs 512 \
--max-num-batched-tokens 512 \
Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2662,6 +2662,14 @@
- "Update SGLang image from v0.5.9-cu129-amd64 (74d old) to v0.5.12-cu130"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1458

- config-keys:
- dsv4-fp8-h200-vllm
- dsv4-fp8-h200-vllm-mtp
description:
- "Update vLLM image to v0.21.0 (from custom deepseekv4-cu129 / v0.20.1@sha256-pinned)"
- "Pass --quantization deepseek_v4_fp8 to vllm serve in dsv4_fp8_h200.sh and dsv4_fp8_h200_mtp.sh"
- "Read --max-model-len from MAX_MODEL_LEN in dsv4_fp8_h200.sh instead of hard-coding 800000"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1461

- config-keys:
- dsr1-fp8-mi325x-sglang
description:
Expand Down