Skip to content

Commit d8fe8f7

Browse files
Update H200/B200 SGLang image to v0.5.5-cu129-amd64 and fix deprecated flags (#204)
* Update h200/b200 sglang image tags to v0.5.5-cu129-amd64

  Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>

* Fix deprecated SGLang flags: replace --enable-ep-moe with --ep-size 8 and --enable-flashinfer-trtllm-moe with --moe-runner-backend flashinfer_trtllm

  Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>

* Use $EP_SIZE variable instead of hardcoded 8 and add ep: 8 to nvidia-master.yaml for B200 SGLang configs

  Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>

* Add ep: 4 for tp=4 entries in dsr1-fp4-b200-sglang config

  Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>

* Pass EP_SIZE environment variable to Docker containers in B200 runner scripts

  Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
1 parent 167b724 commit d8fe8f7

5 files changed

Lines changed: 17 additions & 17 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
dsr1-fp4-b200-sglang:
2-
image: lmsysorg/sglang:v0.5.3rc1-cu129-b200
2+
image: lmsysorg/sglang:v0.5.5-cu129-amd64
33
model: nvidia/DeepSeek-R1-0528-FP4-V2
44
model-prefix: dsr1
55
runner: b200
@@ -9,18 +9,18 @@ dsr1-fp4-b200-sglang:
99
- isl: 1024
1010
osl: 1024
1111
search-space:
12-
- { tp: 4, conc-start: 4, conc-end: 128 }
13-
- { tp: 8, conc-start: 4, conc-end: 128 }
12+
- { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
13+
- { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
1414
- isl: 1024
1515
osl: 8192
1616
search-space:
17-
- { tp: 4, conc-start: 4, conc-end: 128 }
18-
- { tp: 8, conc-start: 4, conc-end: 128 }
17+
- { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
18+
- { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
1919
- isl: 8192
2020
osl: 1024
2121
search-space:
22-
- { tp: 4, conc-start: 4, conc-end: 128 }
23-
- { tp: 8, conc-start: 4, conc-end: 16 }
22+
- { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
23+
- { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
2424

2525
dsr1-fp4-b200-trt:
2626
image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
@@ -73,7 +73,7 @@ dsr1-fp4-b200-trt:
7373
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 }
7474

7575
dsr1-fp8-b200-sglang:
76-
image: lmsysorg/sglang:v0.5.3rc1-cu129-b200
76+
image: lmsysorg/sglang:v0.5.5-cu129-amd64
7777
model: deepseek-ai/DeepSeek-R1-0528
7878
model-prefix: dsr1
7979
runner: b200
@@ -83,15 +83,15 @@ dsr1-fp8-b200-sglang:
8383
- isl: 1024
8484
osl: 1024
8585
search-space:
86-
- { tp: 8, conc-start: 4, conc-end: 64 }
86+
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
8787
- isl: 1024
8888
osl: 8192
8989
search-space:
90-
- { tp: 8, conc-start: 4, conc-end: 64 }
90+
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
9191
- isl: 8192
9292
osl: 1024
9393
search-space:
94-
- { tp: 8, conc-start: 4, conc-end: 64 }
94+
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
9595

9696
dsr1-fp8-b200-trt:
9797
image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
@@ -120,7 +120,7 @@ dsr1-fp8-b200-trt:
120120
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
121121

122122
dsr1-fp8-h200-sglang:
123-
image: lmsysorg/sglang:v0.5.2rc2-cu126
123+
image: lmsysorg/sglang:v0.5.5-cu129-amd64
124124
model: deepseek-ai/DeepSeek-R1-0528
125125
model-prefix: dsr1
126126
runner: h200

benchmarks/dsr1_fp4_b200_docker.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.
2121
--tensor-parallel-size=$TP --data-parallel-size=1 \
2222
--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
2323
--chunked-prefill-size 16384 \
24-
--enable-ep-moe --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
25-
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 10
24+
--ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
25+
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10
2626

benchmarks/dsr1_fp8_b200_docker.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,4 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
3434
--cuda-graph-max-bs 128 --max-running-requests 128 \
3535
--mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 --chunked-prefill-size 32768 --max-prefill-tokens 32768 \
3636
--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \
37-
--attention-backend trtllm_mla --stream-interval 30 --enable-flashinfer-trtllm-moe --quantization fp8
37+
--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8

runners/launch_b200-nvd.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ docker run --rm -d --init --network host --name $server_name \
3030
--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
3131
-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
3232
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
33-
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \
33+
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
3434
-e NCCL_GRAPH_REGISTER=0 \
3535
-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
3636
--entrypoint=/bin/bash \

runners/launch_b200-tg.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ docker run --rm -d --network host --name $server_name \
1212
--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
1313
-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
1414
-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
15-
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \
15+
-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
1616
-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
1717
--entrypoint=/bin/bash \
1818
$(echo "$IMAGE" | sed 's/#/\//') \

0 commit comments

Comments
 (0)