Skip to content

Commit 7da989d

Browse files
nvpohanh authored and Ankur-singh committed
Upgrade vLLM to v0.11.2
Updated configs:
- Use FP8 kv-cache for GPT-OSS B200.
- Remove "custom_ops" from compilation-config for GPT-OSS.
- Remove "cudagraph_mode" from compilation-config for GPT-OSS.
- Remove VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB env var for GPT-OSS.
- Remove deprecated "--disable-log-requests" flag.
- Rename "cuda-graph-sizes" flag to "max-cudagraph-capture-size".

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
1 parent 343d193 commit 7da989d

6 files changed

Lines changed: 16 additions & 19 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -202,7 +202,7 @@ gptoss-fp4-b200-trt:
202202
- { tp: 8, conc-start: 4, conc-end: 8 }
203203

204204
gptoss-fp4-b200-vllm:
205-
image: vllm/vllm-openai:v0.11.0
205+
image: vllm/vllm-openai:v0.11.2
206206
model: openai/gpt-oss-120b
207207
model-prefix: gptoss
208208
runner: b200
@@ -232,7 +232,7 @@ gptoss-fp4-b200-vllm:
232232
- { tp: 8, conc-start: 4, conc-end: 4 }
233233

234234
gptoss-fp4-h100-vllm:
235-
image: vllm/vllm-openai:v0.11.0
235+
image: vllm/vllm-openai:v0.11.2
236236
model: openai/gpt-oss-120b
237237
model-prefix: gptoss
238238
runner: h100
@@ -290,7 +290,7 @@ gptoss-fp4-h200-trt:
290290
- { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 }
291291

292292
gptoss-fp4-h200-vllm:
293-
image: vllm/vllm-openai:v0.11.0
293+
image: vllm/vllm-openai:v0.11.2
294294
model: openai/gpt-oss-120b
295295
model-prefix: gptoss
296296
runner: h200

.github/workflows/README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -115,7 +115,7 @@ full-sweep --model-prefix dsr1 --runner-type b200 --precision fp4 --framework sg
115115

116116
Use the `custom` command to specify all parameters manually:
117117
```
118-
custom --runner-label b200-nb_0 --image vllm/vllm-openai:v0.11.0 --model meta-llama/Llama-3.1-70B --framework vllm --precision fp8 --exp-name llama70b_test --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml
118+
custom --runner-label b200-nb_0 --image vllm/vllm-openai:v0.11.2 --model meta-llama/Llama-3.1-70B --framework vllm --precision fp8 --exp-name llama70b_test --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml
119119
```
120120

121121
This runs a single 1k1k test job with your custom parameters on the specified runner node. Useful for:

benchmarks/gptoss_fp4_b200_docker.sh

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -29,16 +29,16 @@ else
2929
fi
3030

3131
cat > config.yaml << EOF
32-
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+rms_norm"],"cudagraph_mode":"FULL_AND_PIECEWISE"}'
32+
kv-cache-dtype: fp8
33+
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true}}'
3334
async-scheduling: true
3435
no-enable-prefix-caching: true
35-
cuda-graph-sizes: 2048
36+
max-cudagraph-capture-size: 2048
3637
max-num-batched-tokens: 8192
3738
max-model-len: $CALCULATED_MAX_MODEL_LEN
3839
EOF
3940

4041
export TORCH_CUDA_ARCH_LIST="10.0"
41-
export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
4242
export PYTHONNOUSERSITE=1
4343
export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
4444

@@ -47,7 +47,7 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
4747
set -x
4848
vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
4949
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
50-
--disable-log-requests > $SERVER_LOG 2>&1 &
50+
> $SERVER_LOG 2>&1 &
5151

5252
SERVER_PID=$!
5353

@@ -69,4 +69,4 @@ run_benchmark_serving \
6969
--num-prompts "$NUM_PROMPTS" \
7070
--max-concurrency "$CONC" \
7171
--result-filename "$RESULT_FILENAME" \
72-
--result-dir /workspace/
72+
--result-dir /workspace/

benchmarks/gptoss_fp4_h100_docker.sh

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,9 @@
1212

1313

1414
cat > config.yaml << EOF
15-
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
1615
async-scheduling: true
1716
no-enable-prefix-caching: true
18-
cuda-graph-sizes: 2048
17+
max-cudagraph-capture-size: 2048
1918
max-num-batched-tokens: 8192
2019
max-model-len: 10240
2120
EOF
@@ -29,7 +28,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
2928
--gpu-memory-utilization=0.9 \
3029
--tensor-parallel-size=$TP \
3130
--max-num-seqs=$CONC \
32-
--disable-log-requests > $SERVER_LOG 2>&1 &
31+
> $SERVER_LOG 2>&1 &
3332

3433
SERVER_PID=$!
3534

@@ -51,4 +50,4 @@ run_benchmark_serving \
5150
--num-prompts $(( $CONC * 10 )) \
5251
--max-concurrency 512 \
5352
--result-filename "$RESULT_FILENAME" \
54-
--result-dir /workspace/
53+
--result-dir /workspace/

benchmarks/gptoss_fp4_h100_slurm.sh

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,9 @@
1313
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
1414

1515
cat > config.yaml << EOF
16-
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
1716
async-scheduling: true
1817
no-enable-prefix-caching: true
19-
cuda-graph-sizes: 2048
18+
max-cudagraph-capture-size: 2048
2019
max-num-batched-tokens: 8192
2120
max-model-len: 10240
2221
EOF
@@ -30,7 +29,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
3029
--gpu-memory-utilization=0.9 \
3130
--tensor-parallel-size=$TP \
3231
--max-num-seqs=$CONC \
33-
--disable-log-requests > $SERVER_LOG 2>&1 &
32+
> $SERVER_LOG 2>&1 &
3433

3534
SERVER_PID=$!
3635

benchmarks/gptoss_fp4_h200_slurm.sh

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,10 +27,9 @@ fi
2727

2828
# Create config.yaml
2929
cat > config.yaml << EOF
30-
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
3130
async-scheduling: true
3231
no-enable-prefix-caching: true
33-
cuda-graph-sizes: 2048
32+
max-cudagraph-capture-size: 2048
3433
max-num-batched-tokens: 8192
3534
max-model-len: $CALCULATED_MAX_MODEL_LEN
3635
EOF
@@ -42,7 +41,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
4241

4342
PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
4443
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
45-
--disable-log-requests > $SERVER_LOG 2>&1 &
44+
> $SERVER_LOG 2>&1 &
4645

4746
SERVER_PID=$!
4847

0 commit comments

Comments
 (0)