Skip to content

Commit 25506e8

Browse files
Ankur-singh, nvpohanh, cquil11
authored
Upgrade vLLM to v0.11.2 (#273)
* Upgrade vLLM to v0.11.2

  Updated configs:
  - Use FP8 kv-cache for GPT-OSS B200.
  - Remove "custom_ops" from compilation-config for GPT-OSS.
  - Remove "cudagraph_mode" from compilation-config for GPT-OSS.
  - Remove VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB env var for GPT-OSS.
  - Remove deprecated "--disable-log-requests" flag.
  - Rename "cuda-graph-sizes" flag.

  Signed-off-by: Po-Han Huang <pohanh@nvidia.com>

* make cw runners container writable
* undo make cw runners container writable
* coreweave cleanup
* coreweave cleanup pt 2

---------

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Co-authored-by: Po-Han Huang <pohanh@nvidia.com>
Co-authored-by: Cameron Quilici <cjquilici@gmail.com>
1 parent 93e1b3c commit 25506e8

6 files changed

Lines changed: 16 additions & 19 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -209,7 +209,7 @@ gptoss-fp4-b200-trt:
209209
- { tp: 8, conc-start: 4, conc-end: 8 }
210210

211211
gptoss-fp4-b200-vllm:
212-
image: vllm/vllm-openai:v0.11.0
212+
image: vllm/vllm-openai:v0.11.2
213213
model: openai/gpt-oss-120b
214214
model-prefix: gptoss
215215
runner: b200
@@ -240,7 +240,7 @@ gptoss-fp4-b200-vllm:
240240
- { tp: 8, conc-start: 4, conc-end: 4 }
241241

242242
gptoss-fp4-h100-vllm:
243-
image: vllm/vllm-openai:v0.11.0
243+
image: vllm/vllm-openai:v0.11.2
244244
model: openai/gpt-oss-120b
245245
model-prefix: gptoss
246246
runner: h100
@@ -300,7 +300,7 @@ gptoss-fp4-h200-trt:
300300
- { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 }
301301

302302
gptoss-fp4-h200-vllm:
303-
image: vllm/vllm-openai:v0.11.0
303+
image: vllm/vllm-openai:v0.11.2
304304
model: openai/gpt-oss-120b
305305
model-prefix: gptoss
306306
runner: h200

benchmarks/gptoss_fp4_b200_docker.sh

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -29,16 +29,16 @@ else
2929
fi
3030

3131
cat > config.yaml << EOF
32-
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+rms_norm"],"cudagraph_mode":"FULL_AND_PIECEWISE"}'
32+
kv-cache-dtype: fp8
33+
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true}}'
3334
async-scheduling: true
3435
no-enable-prefix-caching: true
35-
cuda-graph-sizes: 2048
36+
max-cudagraph-capture-size: 2048
3637
max-num-batched-tokens: 8192
3738
max-model-len: $CALCULATED_MAX_MODEL_LEN
3839
EOF
3940

4041
export TORCH_CUDA_ARCH_LIST="10.0"
41-
export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
4242
export PYTHONNOUSERSITE=1
4343
export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
4444

@@ -47,7 +47,7 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
4747
set -x
4848
vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
4949
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
50-
--disable-log-requests > $SERVER_LOG 2>&1 &
50+
> $SERVER_LOG 2>&1 &
5151

5252
SERVER_PID=$!
5353

@@ -69,4 +69,4 @@ run_benchmark_serving \
6969
--num-prompts "$NUM_PROMPTS" \
7070
--max-concurrency "$CONC" \
7171
--result-filename "$RESULT_FILENAME" \
72-
--result-dir /workspace/
72+
--result-dir /workspace/

benchmarks/gptoss_fp4_h100_docker.sh

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,9 @@
1212

1313

1414
cat > config.yaml << EOF
15-
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
1615
async-scheduling: true
1716
no-enable-prefix-caching: true
18-
cuda-graph-sizes: 2048
17+
max-cudagraph-capture-size: 2048
1918
max-num-batched-tokens: 8192
2019
max-model-len: 10240
2120
EOF
@@ -29,7 +28,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
2928
--gpu-memory-utilization=0.9 \
3029
--tensor-parallel-size=$TP \
3130
--max-num-seqs=$CONC \
32-
--disable-log-requests > $SERVER_LOG 2>&1 &
31+
> $SERVER_LOG 2>&1 &
3332

3433
SERVER_PID=$!
3534

@@ -51,4 +50,4 @@ run_benchmark_serving \
5150
--num-prompts $(( $CONC * 10 )) \
5251
--max-concurrency 512 \
5352
--result-filename "$RESULT_FILENAME" \
54-
--result-dir /workspace/
53+
--result-dir /workspace/

benchmarks/gptoss_fp4_h100_slurm.sh

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,9 @@
1313
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
1414

1515
cat > config.yaml << EOF
16-
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
1716
async-scheduling: true
1817
no-enable-prefix-caching: true
19-
cuda-graph-sizes: 2048
18+
max-cudagraph-capture-size: 2048
2019
max-num-batched-tokens: 8192
2120
max-model-len: 10240
2221
EOF
@@ -30,7 +29,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
3029
--gpu-memory-utilization=0.9 \
3130
--tensor-parallel-size=$TP \
3231
--max-num-seqs=$CONC \
33-
--disable-log-requests > $SERVER_LOG 2>&1 &
32+
> $SERVER_LOG 2>&1 &
3433

3534
SERVER_PID=$!
3635

benchmarks/gptoss_fp4_h200_slurm.sh

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,10 +27,9 @@ fi
2727

2828
# Create config.yaml
2929
cat > config.yaml << EOF
30-
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
3130
async-scheduling: true
3231
no-enable-prefix-caching: true
33-
cuda-graph-sizes: 2048
32+
max-cudagraph-capture-size: 2048
3433
max-num-batched-tokens: 8192
3534
max-model-len: $CALCULATED_MAX_MODEL_LEN
3635
EOF
@@ -42,7 +41,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
4241

4342
PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
4443
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
45-
--disable-log-requests > $SERVER_LOG 2>&1 &
44+
> $SERVER_LOG 2>&1 &
4645

4746
SERVER_PID=$!
4847

runners/launch_h200-cw.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,6 @@ srun --jobid=$JOB_ID \
2727
--container-mount-home \
2828
--container-workdir=/workspace/ \
2929
--no-container-entrypoint --export=ALL \
30-
bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
30+
bash -c 'bash benchmarks/'"${EXP_NAME%%_*}_${PRECISION}"'_h200_slurm.sh; rm -rf /dev/shm/sagemaker_sessions'
3131

3232
scancel $JOB_ID

0 commit comments

Comments
 (0)