Skip to content

Commit ba211aa

Browse files
committed
Update vLLM to v0.11.0 and apply the changes from PR 102
1 parent 17d5e20 commit ba211aa

4 files changed

Lines changed: 8 additions & 5 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -198,7 +198,7 @@ gptoss-fp4-b200-trt:
198198
- { tp: 8, conc-start: 4, conc-end: 8 }
199199

200200
gptoss-fp4-b200-vllm:
201-
image: vllm/vllm-openai:v0.10.2
201+
image: vllm/vllm-openai:v0.11.0
202202
model: openai/gpt-oss-120b
203203
model-prefix: gptoss
204204
runner: b200
@@ -228,7 +228,7 @@ gptoss-fp4-b200-vllm:
228228
- { tp: 8, conc-start: 4, conc-end: 4 }
229229

230230
gptoss-fp4-h100-vllm:
231-
image: vllm/vllm-openai:v0.10.2
231+
image: vllm/vllm-openai:v0.11.0
232232
model: openai/gpt-oss-120b
233233
model-prefix: gptoss
234234
runner: h100
@@ -286,7 +286,7 @@ gptoss-fp4-h200-trt:
286286
- { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 }
287287

288288
gptoss-fp4-h200-vllm:
289-
image: vllm/vllm-openai:v0.10.2
289+
image: vllm/vllm-openai:v0.11.0
290290
model: openai/gpt-oss-120b
291291
model-prefix: gptoss
292292
runner: h200

benchmarks/gptoss_fp4_h100_docker.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# CONC
1010

1111
cat > config.yaml << EOF
12+
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
1213
async-scheduling: true
1314
no-enable-prefix-caching: true
1415
cuda-graph-sizes: 2048
@@ -20,7 +21,7 @@ export PYTHONNOUSERSITE=1
2021

2122
set -x
2223
vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
23-
--config=config.yaml \
24+
--config config.yaml \
2425
--gpu-memory-utilization=0.9 \
2526
--tensor-parallel-size=$TP \
2627
--max-num-seqs=$CONC \

benchmarks/gptoss_fp4_h100_slurm.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
1818

1919
cat > config.yaml << EOF
20+
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
2021
async-scheduling: true
2122
no-enable-prefix-caching: true
2223
cuda-graph-sizes: 2048
@@ -29,7 +30,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
2930

3031
set -x
3132
PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
32-
--config=config.yaml \
33+
--config config.yaml \
3334
--gpu-memory-utilization=0.9 \
3435
--tensor-parallel-size=$TP \
3536
--max-num-seqs=$CONC \

benchmarks/gptoss_fp4_h200_slurm.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ fi
3131

3232
# Create config.yaml
3333
cat > config.yaml << EOF
34+
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
3435
async-scheduling: true
3536
no-enable-prefix-caching: true
3637
cuda-graph-sizes: 2048

0 commit comments

Comments
 (0)