diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a683d372..02aebe0af 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2883,7 +2883,7 @@ dsr1-fp8-h200-sglang-mtp: # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache # flag is omitted. Max-model-len is pinned at 800k per the recipe. dsv4-fp8-h200-vllm: - image: vllm/vllm-openai:deepseekv4-cu129 + image: vllm/vllm-openai:v0.21.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -2907,7 +2907,7 @@ dsv4-fp8-h200-vllm: # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp8-h200-vllm-mtp: - image: vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4 + image: vllm/vllm-openai:v0.21.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/dsv4_fp8_h200.sh index ed67d316e..51e4a72d2 100644 --- a/benchmarks/single_node/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/dsv4_fp8_h200.sh @@ -13,6 +13,7 @@ check_env_vars \ CONC \ ISL \ OSL \ + MAX_MODEL_LEN \ RANDOM_RANGE_RATIO \ RESULT_FILENAME @@ -35,7 +36,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN" else - MAX_MODEL_LEN_ARG="--max-model-len 800000" + MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN" fi # DP_ATTENTION=true runs DP-attention with expert parallel (DP size = TP); @@ -62,6 +63,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ "${PARALLEL_ARGS[@]}" \ "${EP_ARGS[@]}" \ $MAX_MODEL_LEN_ARG \ +--quantization deepseek_v4_fp8 \ --gpu-memory-utilization 0.95 \ --max-num-seqs 512 \ --max-num-batched-tokens 512 \ diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh index a5d7b7738..0446ac6d9 100755 --- 
a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh +++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh @@ -70,6 +70,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ "${PARALLEL_ARGS[@]}" \ "${EP_ARGS[@]}" \ $MAX_MODEL_LEN_ARG \ +--quantization deepseek_v4_fp8 \ --gpu-memory-utilization 0.95 \ --max-num-seqs 512 \ --max-num-batched-tokens 512 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9c4894c47..9df99abc5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2662,6 +2662,14 @@ - "Update SGLang image from v0.5.9-cu129-amd64 (74d old) to v0.5.12-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1458 +- config-keys: + - dsv4-fp8-h200-vllm + - dsv4-fp8-h200-vllm-mtp + description: + - "Update vLLM image to v0.21.0 (from custom deepseekv4-cu129 / v0.20.1@sha256-pinned)" + - "Pass --quantization deepseek_v4_fp8 in dsv4_fp8_h200.sh and dsv4_fp8_h200_mtp.sh; dsv4_fp8_h200.sh now takes --max-model-len from the required MAX_MODEL_LEN env var instead of hard-coding 800000 (--gpu-memory-utilization stays at 0.95)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1461 + - config-keys: - dsr1-fp8-mi325x-sglang description: