Skip to content

Commit 5c85c32

Browse files
authored
Merge branch 'main' into diff-only-runs
2 parents 9acd1e7 + c7e4b1d commit 5c85c32

10 files changed

Lines changed: 37 additions & 27 deletions

.github/configs/nvidia-master.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -209,7 +209,7 @@ gptoss-fp4-b200-trt:
209209
- { tp: 8, conc-start: 4, conc-end: 8 }
210210

211211
gptoss-fp4-b200-vllm:
212-
image: vllm/vllm-openai:v0.11.0
212+
image: vllm/vllm-openai:v0.11.2
213213
model: openai/gpt-oss-120b
214214
model-prefix: gptoss
215215
runner: b200
@@ -240,7 +240,7 @@ gptoss-fp4-b200-vllm:
240240
- { tp: 8, conc-start: 4, conc-end: 4 }
241241

242242
gptoss-fp4-h100-vllm:
243-
image: vllm/vllm-openai:v0.11.0
243+
image: vllm/vllm-openai:v0.11.2
244244
model: openai/gpt-oss-120b
245245
model-prefix: gptoss
246246
runner: h100
@@ -300,7 +300,7 @@ gptoss-fp4-h200-trt:
300300
- { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 }
301301

302302
gptoss-fp4-h200-vllm:
303-
image: vllm/vllm-openai:v0.11.0
303+
image: vllm/vllm-openai:v0.11.2
304304
model: openai/gpt-oss-120b
305305
model-prefix: gptoss
306306
runner: h200

.github/workflows/full-sweep-1k1k-scheduler.yml

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@ name: "Full Sweep Scheduler - 1k1k"
22

33
on:
44
workflow_dispatch:
5+
schedule:
6+
- cron: "0 0 * * *"
57

68
jobs:
79
get-dsr1-configs:
@@ -49,6 +51,7 @@ jobs:
4951
config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }}
5052
secrets: inherit
5153
with:
54+
exp-name: "dsr1_1k1k"
5255
isl: 1024
5356
osl: 1024
5457
max-model-len: 2248
@@ -58,7 +61,6 @@ jobs:
5861
model-prefix: ${{ matrix.config.model-prefix }}
5962
framework: ${{ matrix.config.framework }}
6063
precision: ${{ matrix.config.precision }}
61-
exp-name: "dsr1_1k1k"
6264
conc-list: ${{ toJson(matrix.config.conc) }}
6365
spec-decoding: ${{ matrix.config.spec-decoding }}
6466
disagg: ${{ matrix.config.disagg }}
@@ -114,6 +116,7 @@ jobs:
114116
config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
115117
secrets: inherit
116118
with:
119+
exp-name: "gptoss_1k1k"
117120
isl: 1024
118121
osl: 1024
119122
max-model-len: 2248
@@ -123,7 +126,6 @@ jobs:
123126
model-prefix: ${{ matrix.config.model-prefix }}
124127
framework: ${{ matrix.config.framework }}
125128
precision: ${{ matrix.config.precision }}
126-
exp-name: "dsr1_1k1k"
127129
conc-list: ${{ toJson(matrix.config.conc) }}
128130
spec-decoding: ${{ matrix.config.spec-decoding }}
129131
disagg: ${{ matrix.config.disagg }}

.github/workflows/full-sweep-1k8k-scheduler.yml

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@ name: "Full Sweep Scheduler - 1k8k"
22

33
on:
44
workflow_dispatch:
5+
schedule:
6+
- cron: "0 0 * * *"
57

68
jobs:
79
get-dsr1-configs:
@@ -49,6 +51,7 @@ jobs:
4951
config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }}
5052
secrets: inherit
5153
with:
54+
exp-name: "dsr1_1k8k"
5255
isl: 1024
5356
osl: 8192
5457
max-model-len: 9416
@@ -58,7 +61,6 @@ jobs:
5861
model-prefix: ${{ matrix.config.model-prefix }}
5962
framework: ${{ matrix.config.framework }}
6063
precision: ${{ matrix.config.precision }}
61-
exp-name: "dsr1_1k8k"
6264
conc-list: ${{ toJson(matrix.config.conc) }}
6365
spec-decoding: ${{ matrix.config.spec-decoding }}
6466
disagg: ${{ matrix.config.disagg }}
@@ -114,6 +116,7 @@ jobs:
114116
config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
115117
secrets: inherit
116118
with:
119+
exp-name: "gptoss_1k8k"
117120
isl: 1024
118121
osl: 8192
119122
max-model-len: 9416
@@ -123,7 +126,6 @@ jobs:
123126
model-prefix: ${{ matrix.config.model-prefix }}
124127
framework: ${{ matrix.config.framework }}
125128
precision: ${{ matrix.config.precision }}
126-
exp-name: "dsr1_1k8k"
127129
conc-list: ${{ toJson(matrix.config.conc) }}
128130
spec-decoding: ${{ matrix.config.spec-decoding }}
129131
disagg: ${{ matrix.config.disagg }}

.github/workflows/full-sweep-8k1k-scheduler.yml

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@ name: "Full Sweep Scheduler - 8k1k"
22

33
on:
44
workflow_dispatch:
5+
schedule:
6+
- cron: "0 0 * * *"
57

68
jobs:
79
get-dsr1-configs:
@@ -49,6 +51,7 @@ jobs:
4951
config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }}
5052
secrets: inherit
5153
with:
54+
exp-name: "dsr1_8k1k"
5255
isl: 8192
5356
osl: 1024
5457
max-model-len: 9416
@@ -58,7 +61,6 @@ jobs:
5861
model-prefix: ${{ matrix.config.model-prefix }}
5962
framework: ${{ matrix.config.framework }}
6063
precision: ${{ matrix.config.precision }}
61-
exp-name: "dsr1_8k1k"
6264
conc-list: ${{ toJson(matrix.config.conc) }}
6365
spec-decoding: ${{ matrix.config.spec-decoding }}
6466
disagg: ${{ matrix.config.disagg }}
@@ -114,6 +116,7 @@ jobs:
114116
config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
115117
secrets: inherit
116118
with:
119+
exp-name: "gptoss_8k1k"
117120
isl: 8192
118121
osl: 1024
119122
max-model-len: 9416
@@ -123,7 +126,6 @@ jobs:
123126
model-prefix: ${{ matrix.config.model-prefix }}
124127
framework: ${{ matrix.config.framework }}
125128
precision: ${{ matrix.config.precision }}
126-
exp-name: "dsr1_8k1k"
127129
conc-list: ${{ toJson(matrix.config.conc) }}
128130
spec-decoding: ${{ matrix.config.spec-decoding }}
129131
disagg: ${{ matrix.config.disagg }}

benchmarks/gptoss_fp4_b200_docker.sh

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -29,16 +29,16 @@ else
2929
fi
3030

3131
cat > config.yaml << EOF
32-
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+rms_norm"],"cudagraph_mode":"FULL_AND_PIECEWISE"}'
32+
kv-cache-dtype: fp8
33+
compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_noop":true}}'
3334
async-scheduling: true
3435
no-enable-prefix-caching: true
35-
cuda-graph-sizes: 2048
36+
max-cudagraph-capture-size: 2048
3637
max-num-batched-tokens: 8192
3738
max-model-len: $CALCULATED_MAX_MODEL_LEN
3839
EOF
3940

4041
export TORCH_CUDA_ARCH_LIST="10.0"
41-
export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}'
4242
export PYTHONNOUSERSITE=1
4343
export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
4444

@@ -47,7 +47,7 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
4747
set -x
4848
vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
4949
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \
50-
--disable-log-requests > $SERVER_LOG 2>&1 &
50+
> $SERVER_LOG 2>&1 &
5151

5252
SERVER_PID=$!
5353

@@ -69,4 +69,4 @@ run_benchmark_serving \
6969
--num-prompts "$NUM_PROMPTS" \
7070
--max-concurrency "$CONC" \
7171
--result-filename "$RESULT_FILENAME" \
72-
--result-dir /workspace/
72+
--result-dir /workspace/

benchmarks/gptoss_fp4_h100_docker.sh

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,9 @@
1212

1313

1414
cat > config.yaml << EOF
15-
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
1615
async-scheduling: true
1716
no-enable-prefix-caching: true
18-
cuda-graph-sizes: 2048
17+
max-cudagraph-capture-size: 2048
1918
max-num-batched-tokens: 8192
2019
max-model-len: 10240
2120
EOF
@@ -29,7 +28,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
2928
--gpu-memory-utilization=0.9 \
3029
--tensor-parallel-size=$TP \
3130
--max-num-seqs=$CONC \
32-
--disable-log-requests > $SERVER_LOG 2>&1 &
31+
> $SERVER_LOG 2>&1 &
3332

3433
SERVER_PID=$!
3534

@@ -51,4 +50,4 @@ run_benchmark_serving \
5150
--num-prompts $(( $CONC * 10 )) \
5251
--max-concurrency 512 \
5352
--result-filename "$RESULT_FILENAME" \
54-
--result-dir /workspace/
53+
--result-dir /workspace/

benchmarks/gptoss_fp4_h100_slurm.sh

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,9 @@
1313
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
1414

1515
cat > config.yaml << EOF
16-
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
1716
async-scheduling: true
1817
no-enable-prefix-caching: true
19-
cuda-graph-sizes: 2048
18+
max-cudagraph-capture-size: 2048
2019
max-num-batched-tokens: 8192
2120
max-model-len: 10240
2221
EOF
@@ -30,7 +29,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
3029
--gpu-memory-utilization=0.9 \
3130
--tensor-parallel-size=$TP \
3231
--max-num-seqs=$CONC \
33-
--disable-log-requests > $SERVER_LOG 2>&1 &
32+
> $SERVER_LOG 2>&1 &
3433

3534
SERVER_PID=$!
3635

benchmarks/gptoss_fp4_h200_slurm.sh

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,10 +27,9 @@ fi
2727

2828
# Create config.yaml
2929
cat > config.yaml << EOF
30-
compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
3130
async-scheduling: true
3231
no-enable-prefix-caching: true
33-
cuda-graph-sizes: 2048
32+
max-cudagraph-capture-size: 2048
3433
max-num-batched-tokens: 8192
3534
max-model-len: $CALCULATED_MAX_MODEL_LEN
3635
EOF
@@ -42,7 +41,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
4241

4342
PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
4443
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
45-
--disable-log-requests > $SERVER_LOG 2>&1 &
44+
> $SERVER_LOG 2>&1 &
4645

4746
SERVER_PID=$!
4847

runners/launch_b200-nb.sh

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,20 @@
11
#!/usr/bin/bash
22

3-
HF_HUB_CACHE_MOUNT="/root/hf_hub_cache-${USER: -1}/"
3+
HF_HUB_CACHE_MOUNT="/mnt/data/hf-hub-cache-${USER: -1}/"
44
PARTITION="main"
55
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
66

7+
UCX_NET_DEVICES=eth0
8+
9+
# Cleanup any stale enroot locks from previous runs
10+
# Remove leftover enroot *.lock files for the current user.
# `-exec rm -f -- {} +` is used instead of piping to `xargs rm` because GNU
# xargs still runs `rm` once when find matches nothing ("rm: missing operand"),
# and xargs word-splits filenames that contain whitespace or quotes.
find "/var/cache/enroot-container-images/$UID" -type f -name "*.lock" -exec rm -f -- {} +
11+
712
set -x
813
srun --partition=$PARTITION --gres=gpu:$TP --exclusive \
914
--container-image=$IMAGE \
1015
--container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER: -1} \
1116
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
1217
--no-container-mount-home --container-writable \
1318
--container-workdir=/workspace/ \
14-
--no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1} \
19+
--no-container-entrypoint --export=ALL,PORT_OFFSET=${USER: -1},UCX_NET_DEVICES=$UCX_NET_DEVICES \
1520
bash benchmarks/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh

runners/launch_h200-cw.sh

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,12 +21,14 @@ else
2121
CONTAINER_IMAGE=$(realpath $SQUASH_FILE)
2222
fi
2323

24+
# The trailing 'rm -rf /dev/shm/sagemaker_sessions' cleans up shared memory used by SageMaker sessions inside the container.
25+
# This behavior seems to have been introduced in vLLM 0.11.2, although the issue is specific to CoreWeave runners.
2426
srun --jobid=$JOB_ID \
2527
--container-image=$CONTAINER_IMAGE \
2628
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
2729
--container-mount-home \
2830
--container-workdir=/workspace/ \
2931
--no-container-entrypoint --export=ALL \
30-
bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh
32+
bash -c "bash benchmarks/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}_slurm.sh; rm -rf /dev/shm/sagemaker_sessions"
3133

3234
scancel $JOB_ID

0 commit comments

Comments
 (0)