Skip to content

Commit f3dbc40

Browse files
authored
[NV] GLM5 FP8 B200 SGLang disagg config (#1372)
* Add GLM5 B200 Dynamo SGLang disagg config
* Update GLM5 B200 disagg changelog PR link
* Overlay vendored glm5-fp8 sglang recipes in b200-dgxc launcher

  The new glm5-fp8-b200-dynamo-sglang config references recipes/sglang/glm5/b200-fp8/... paths inside the cloned srt-slurm repo, but srt-slurm@sa-submission-q2-2026 does not ship those recipes. Add a parallel branch to the existing dsv4 overlay so the vendored recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/ are copied into the clone before srtctl apply runs.
* Pin glm5-fp8-b200-dynamo-sglang to b200-dgxc runner group
1 parent a0f295e commit f3dbc40

18 files changed

Lines changed: 1718 additions & 0 deletions

.github/configs/nvidia-master.yaml

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8819,6 +8819,219 @@ dsv4-fp4-gb300-dynamo-sglang:
88198819
ep: 12
88208820
dp-attn: true
88218821

8822+
glm5-fp8-b200-dynamo-sglang:
8823+
image: lmsysorg/sglang:v0.5.11-cu130
8824+
model: zai-org/GLM-5-FP8
8825+
model-prefix: glm5
8826+
runner: b200-dgxc
8827+
precision: fp8
8828+
framework: dynamo-sglang
8829+
multinode: true
8830+
disagg: true
8831+
scenarios:
8832+
fixed-seq-len:
8833+
- isl: 1024
8834+
osl: 1024
8835+
search-space:
8836+
- conc-list: [2576]
8837+
prefill:
8838+
num-worker: 1
8839+
tp: 8
8840+
ep: 1
8841+
dp-attn: true
8842+
additional-settings:
8843+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml"
8844+
decode:
8845+
num-worker: 1
8846+
tp: 8
8847+
ep: 1
8848+
dp-attn: true
8849+
- conc-list: [1248]
8850+
prefill:
8851+
num-worker: 1
8852+
tp: 8
8853+
ep: 1
8854+
dp-attn: true
8855+
additional-settings:
8856+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml"
8857+
decode:
8858+
num-worker: 2
8859+
tp: 8
8860+
ep: 1
8861+
dp-attn: true
8862+
- conc-list: [800]
8863+
prefill:
8864+
num-worker: 1
8865+
tp: 8
8866+
ep: 1
8867+
dp-attn: true
8868+
additional-settings:
8869+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml"
8870+
decode:
8871+
num-worker: 3
8872+
tp: 8
8873+
ep: 1
8874+
dp-attn: true
8875+
- conc-list: [576]
8876+
prefill:
8877+
num-worker: 1
8878+
tp: 8
8879+
ep: 1
8880+
dp-attn: true
8881+
additional-settings:
8882+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_3.yaml"
8883+
decode:
8884+
num-worker: 4
8885+
tp: 8
8886+
ep: 1
8887+
dp-attn: true
8888+
- conc-list: [512, 256, 128, 64, 32]
8889+
prefill:
8890+
num-worker: 1
8891+
tp: 8
8892+
ep: 1
8893+
dp-attn: true
8894+
additional-settings:
8895+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml"
8896+
decode:
8897+
num-worker: 8
8898+
tp: 8
8899+
ep: 1
8900+
dp-attn: false
8901+
- conc-list: [16]
8902+
prefill:
8903+
num-worker: 1
8904+
tp: 8
8905+
ep: 1
8906+
dp-attn: true
8907+
additional-settings:
8908+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml"
8909+
decode:
8910+
num-worker: 8
8911+
tp: 8
8912+
ep: 1
8913+
dp-attn: false
8914+
- isl: 8192
8915+
osl: 1024
8916+
search-space:
8917+
- conc-list: [560]
8918+
prefill:
8919+
num-worker: 2
8920+
tp: 8
8921+
ep: 1
8922+
dp-attn: true
8923+
additional-settings:
8924+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml"
8925+
decode:
8926+
num-worker: 1
8927+
tp: 8
8928+
ep: 1
8929+
dp-attn: true
8930+
- conc-list: [240]
8931+
prefill:
8932+
num-worker: 1
8933+
tp: 8
8934+
ep: 1
8935+
dp-attn: true
8936+
additional-settings:
8937+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml"
8938+
decode:
8939+
num-worker: 1
8940+
tp: 8
8941+
ep: 1
8942+
dp-attn: true
8943+
- conc-list: [224]
8944+
prefill:
8945+
num-worker: 1
8946+
tp: 8
8947+
ep: 1
8948+
dp-attn: true
8949+
additional-settings:
8950+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml"
8951+
decode:
8952+
num-worker: 2
8953+
tp: 8
8954+
ep: 1
8955+
dp-attn: true
8956+
- conc-list: [256]
8957+
prefill:
8958+
num-worker: 1
8959+
tp: 8
8960+
ep: 1
8961+
dp-attn: true
8962+
additional-settings:
8963+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
8964+
decode:
8965+
num-worker: 2
8966+
tp: 8
8967+
ep: 1
8968+
dp-attn: false
8969+
- conc-list: [256]
8970+
prefill:
8971+
num-worker: 1
8972+
tp: 8
8973+
ep: 1
8974+
dp-attn: true
8975+
additional-settings:
8976+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
8977+
decode:
8978+
num-worker: 3
8979+
tp: 8
8980+
ep: 1
8981+
dp-attn: false
8982+
- conc-list: [200]
8983+
prefill:
8984+
num-worker: 1
8985+
tp: 8
8986+
ep: 1
8987+
dp-attn: true
8988+
additional-settings:
8989+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
8990+
decode:
8991+
num-worker: 4
8992+
tp: 8
8993+
ep: 1
8994+
dp-attn: false
8995+
- conc-list: [128]
8996+
prefill:
8997+
num-worker: 1
8998+
tp: 8
8999+
ep: 1
9000+
dp-attn: true
9001+
additional-settings:
9002+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml"
9003+
decode:
9004+
num-worker: 5
9005+
tp: 8
9006+
ep: 1
9007+
dp-attn: false
9008+
- conc-list: [64]
9009+
prefill:
9010+
num-worker: 1
9011+
tp: 8
9012+
ep: 1
9013+
dp-attn: true
9014+
additional-settings:
9015+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml"
9016+
decode:
9017+
num-worker: 7
9018+
tp: 8
9019+
ep: 1
9020+
dp-attn: false
9021+
- conc-list: [12]
9022+
prefill:
9023+
num-worker: 1
9024+
tp: 8
9025+
ep: 1
9026+
dp-attn: true
9027+
additional-settings:
9028+
- "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_5.yaml"
9029+
decode:
9030+
num-worker: 8
9031+
tp: 8
9032+
ep: 1
9033+
dp-attn: false
9034+
88229035
# MTP variant of dsv4-fp4-gb300-dynamo-sglang.
88239036
dsv4-fp4-gb300-dynamo-sglang-mtp:
88249037
image: lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
name: b200-fp8-glm5_1k1k_lowlat_0
2+
model:
3+
path: glm5-fp8
4+
container: "lmsysorg/sglang:v0.5.11-cu130"
5+
precision: fp8
6+
resources:
7+
gpu_type: b200
8+
gpus_per_node: 8
9+
prefill_nodes: 1
10+
prefill_workers: 1
11+
decode_nodes: 8
12+
decode_workers: 8
13+
frontend:
14+
type: dynamo
15+
dynamo:
16+
version: "1.1.0"
17+
backend:
18+
prefill_environment:
19+
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
20+
PYTHONUNBUFFERED: '1'
21+
DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
22+
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
23+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
24+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
25+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
26+
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
27+
NCCL_CUMEM_ENABLE: '1'
28+
DYN_REQUEST_PLANE: nats
29+
decode_environment:
30+
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
31+
PYTHONUNBUFFERED: '1'
32+
DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
33+
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
34+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
35+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
36+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
37+
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
38+
NCCL_CUMEM_ENABLE: '1'
39+
DYN_REQUEST_PLANE: nats
40+
sglang_config:
41+
prefill:
42+
served-model-name: GLM-5-FP8
43+
trust-remote-code: true
44+
quantization: fp8
45+
kv-cache-dtype: fp8_e4m3
46+
disaggregation-mode: prefill
47+
disaggregation-transfer-backend: nixl
48+
max-running-requests: 256
49+
cuda-graph-max-bs: 256
50+
mem-fraction-static: 0.7
51+
context-length: 9600
52+
chunked-prefill-size: 65536
53+
max-prefill-tokens: 8192
54+
tensor-parallel-size: 8
55+
data-parallel-size: 8
56+
expert-parallel-size: 1
57+
enable-dp-attention: true
58+
enable-dp-lm-head: true
59+
load-balance-method: total_tokens
60+
nsa-decode-backend: trtllm
61+
nsa-prefill-backend: trtllm
62+
moe-runner-backend: flashinfer_trtllm
63+
enable-flashinfer-allreduce-fusion: true
64+
weight-loader-prefetch-checkpoints: true
65+
disable-radix-cache: true
66+
stream-interval: 30
67+
model-loader-extra-config: '{"enable_multithread_load": true}'
68+
decode:
69+
served-model-name: GLM-5-FP8
70+
trust-remote-code: true
71+
quantization: fp8
72+
kv-cache-dtype: fp8_e4m3
73+
disaggregation-mode: decode
74+
disaggregation-transfer-backend: nixl
75+
mem-fraction-static: 0.8
76+
context-length: 9600
77+
tensor-parallel-size: 8
78+
expert-parallel-size: 1
79+
nsa-decode-backend: trtllm
80+
nsa-prefill-backend: trtllm
81+
moe-runner-backend: flashinfer_trtllm
82+
enable-flashinfer-allreduce-fusion: true
83+
weight-loader-prefetch-checkpoints: true
84+
disable-radix-cache: true
85+
stream-interval: 30
86+
model-loader-extra-config: '{"enable_multithread_load": true}'
87+
data-parallel-size: 1
88+
max-running-requests: 64
89+
cuda-graph-max-bs: 64
90+
health_check:
91+
max_attempts: 360
92+
interval_seconds: 10
93+
benchmark:
94+
type: sa-bench
95+
req_rate: inf
96+
isl: 1024
97+
osl: 1024
98+
concurrencies: 512x256x128x64x32

0 commit comments

Comments
 (0)