diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e9af1ce19..e08778cc5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -198,7 +198,7 @@ gptoss-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.10.2 + image: vllm/vllm-openai:v0.11.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200 @@ -228,7 +228,7 @@ gptoss-fp4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.10.2 + image: vllm/vllm-openai:v0.11.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -286,7 +286,7 @@ gptoss-fp4-h200-trt: - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.10.2 + image: vllm/vllm-openai:v0.11.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h200 diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh index eceff904f..a8bb57c16 100644 --- a/benchmarks/gptoss_fp4_h100_docker.sh +++ b/benchmarks/gptoss_fp4_h100_docker.sh @@ -9,6 +9,7 @@ # CONC cat > config.yaml << EOF +compilation-config: '{"cudagraph_mode":"PIECEWISE"}' async-scheduling: true no-enable-prefix-caching: true cuda-graph-sizes: 2048 @@ -20,7 +21,7 @@ export PYTHONNOUSERSITE=1 set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ +--config config.yaml \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh index f2b5e3a05..d2819b5b3 100644 --- a/benchmarks/gptoss_fp4_h100_slurm.sh +++ b/benchmarks/gptoss_fp4_h100_slurm.sh @@ -17,6 +17,7 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" cat > config.yaml << EOF +compilation-config: '{"cudagraph_mode":"PIECEWISE"}' async-scheduling: true no-enable-prefix-caching: true cuda-graph-sizes: 2048 @@ -29,7 +30,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" set -x PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ +--config config.yaml \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 23ac0bfa1..f92c60425 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -31,6 +31,7 @@ fi # Create config.yaml cat > config.yaml << EOF +compilation-config: '{"cudagraph_mode":"PIECEWISE"}' async-scheduling: true no-enable-prefix-caching: true cuda-graph-sizes: 2048