Skip to content

Commit 4e52ed4

Browse files
Ankur-singh, nvpohanh, cquil11
authored
Update vLLM version to v0.13.0 for NVIDIA configs (#327)
* Update vLLM version to v0.12.0
* Fix H100/H200 perf regression
* check and install git before use
* add container writable to h200 nv runner launch script
* add sudo to apt-get
* add container-remap-root to h200 nv and nb runner launchers
* make changes to perf changelog
* fix typo, use correct env var for h100
* update to v0.13.0
* make changes to perf changelog
* fix perf-changelog fix perf-changelog fix perf-changelog fix
* fix compilation configs
* make num prompts conc * 10
* add --container-writable to h200 nb
* add --container-remap-root to b200 nb
* add --container-remap-root to b200 nv

---------

Co-authored-by: Po-Han Huang <pohanh@nvidia.com>
Co-authored-by: Cam Quilici <cjquilici@gmail.com>
1 parent d27433d commit 4e52ed4

12 files changed

Lines changed: 39 additions & 8 deletions

.github/configs/nvidia-master.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ gptoss-fp4-b200-trt:
209209
- { tp: 8, conc-start: 4, conc-end: 8 }
210210

211211
gptoss-fp4-b200-vllm:
212-
image: vllm/vllm-openai:v0.11.2
212+
image: vllm/vllm-openai:v0.13.0
213213
model: openai/gpt-oss-120b
214214
model-prefix: gptoss
215215
runner: b200
@@ -240,7 +240,7 @@ gptoss-fp4-b200-vllm:
240240
- { tp: 8, conc-start: 4, conc-end: 4 }
241241

242242
gptoss-fp4-h100-vllm:
243-
image: vllm/vllm-openai:v0.11.2
243+
image: vllm/vllm-openai:v0.13.0
244244
model: openai/gpt-oss-120b
245245
model-prefix: gptoss
246246
runner: h100
@@ -300,7 +300,7 @@ gptoss-fp4-h200-trt:
300300
- { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 }
301301

302302
gptoss-fp4-h200-vllm:
303-
image: vllm/vllm-openai:v0.11.2
303+
image: vllm/vllm-openai:v0.13.0
304304
model: openai/gpt-oss-120b
305305
model-prefix: gptoss
306306
runner: h200

benchmarks/benchmark_lib.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,17 @@ run_benchmark_serving() {
208208
echo "Error: --result-dir is required"
209209
return 1
210210
fi
211+
212+
# Check if git is installed, install if missing
213+
if ! command -v git &> /dev/null; then
214+
echo "git not found, installing..."
215+
if command -v apt-get &> /dev/null; then
216+
sudo apt-get update && sudo apt-get install -y git
217+
else
218+
echo "Error: Could not install git. Package manager not found."
219+
return 1
220+
fi
221+
fi
211222

212223
# Clone benchmark serving repo
213224
local BENCH_SERVING_DIR=$(mktemp -d /tmp/bmk-XXXXXX)

benchmarks/gptoss_fp4_b200_docker.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ fi
3030

3131
cat > config.yaml << EOF
3232
kv-cache-dtype: fp8
33-
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true}}'
33+
compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}'
3434
async-scheduling: true
3535
no-enable-prefix-caching: true
3636
max-cudagraph-capture-size: 2048

benchmarks/gptoss_fp4_b200_slurm.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ fi
2727

2828
cat > config.yaml << EOF
2929
kv-cache-dtype: fp8
30-
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true}}'
30+
compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}'
3131
async-scheduling: true
3232
no-enable-prefix-caching: true
3333
max-cudagraph-capture-size: 2048
@@ -64,7 +64,7 @@ run_benchmark_serving \
6464
--input-len "$ISL" \
6565
--output-len "$OSL" \
6666
--random-range-ratio "$RANDOM_RANGE_RATIO" \
67-
--num-prompts "$NUM_PROMPTS" \
67+
--num-prompts $(( CONC * 10 )) \
6868
--max-concurrency "$CONC" \
6969
--result-filename "$RESULT_FILENAME" \
7070
--result-dir /workspace/

benchmarks/gptoss_fp4_h100_docker.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ max-model-len: 10240
2020
EOF
2121

2222
export PYTHONNOUSERSITE=1
23+
export VLLM_MXFP4_USE_MARLIN=1
2324
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
2425

2526
set -x

benchmarks/gptoss_fp4_h100_slurm.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ EOF
2222

2323
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
2424
export TORCH_CUDA_ARCH_LIST="9.0"
25+
export VLLM_MXFP4_USE_MARLIN=1
2526

2627
set -x
2728
PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \

benchmarks/gptoss_fp4_h200_slurm.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
3838
PORT=$(( 8888 + $PORT_OFFSET ))
3939

4040
export TORCH_CUDA_ARCH_LIST="9.0"
41+
export VLLM_MXFP4_USE_MARLIN=1
4142

4243
PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
4344
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \

perf-changelog.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,12 @@
124124
description:
125125
- "Update NVIDIA DeepSeek sglang Docker image from v0.5.5 to v0.5.6"
126126
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/276
127+
128+
- config-keys:
129+
- gptoss-fp4-b200-vllm
130+
- gptoss-fp4-h100-vllm
131+
- gptoss-fp4-h200-vllm
132+
description:
133+
- "Update vLLM image from v0.11.2 to v0.13.0"
134+
- "Add VLLM_MXFP4_USE_MARLIN=1 to H100 and H200 benchmark scripts"
135+
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/327

runners/launch_b200-nb.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive \
1414
--container-image=$IMAGE \
1515
--container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER: -1} \
1616
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
17-
--no-container-mount-home --container-writable \
17+
--no-container-mount-home \
18+
--container-remap-root \
19+
--container-writable \
1820
--container-workdir=/workspace/ \
1921
--no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1},UCX_NET_DEVICES=$UCX_NET_DEVICES \
2022
bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh

runners/launch_b200-nv.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
1717
srun --jobid=$JOB_ID \
1818
--container-image=$SQUASH_FILE \
1919
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
20-
--no-container-mount-home --container-writable \
20+
--no-container-mount-home \
21+
--container-remap-root \
22+
--container-writable \
2123
--container-workdir=/workspace/ \
2224
--no-container-entrypoint --export=ALL \
2325
bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh

0 commit comments

Comments
 (0)