Skip to content

Commit c088658

Browse files
[GB300][SGLang] Add GLM5 FP8 dynamo-sglang disagg configs (#1557)
Port the GLM5 FP8 GB300 disaggregated SGLang recipes from PR69 to SA upstream and wire up gb300-nv launcher support, while keeping the SA-default SLURM account/partition and sqsh paths. Co-authored-by: Ankur-singh <ankusingh@nvidia.com>
1 parent c9798a7 commit c088658

18 files changed

Lines changed: 2269 additions & 7 deletions

.github/configs/nvidia-master.yaml

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9428,3 +9428,213 @@ glm5-fp4-gb300-dynamo-sglang:
94289428
tp: 4
94299429
ep: 1
94309430
dp-attn: false
9431+
9432+
glm5-fp8-gb300-dynamo-sglang:
9433+
image: lmsysorg/sglang:v0.5.11-cu130
9434+
model: zai-org/GLM-5-FP8
9435+
model-prefix: glm5
9436+
runner: gb300-nv
9437+
precision: fp8
9438+
framework: dynamo-sglang
9439+
multinode: true
9440+
disagg: true
9441+
scenarios:
9442+
fixed-seq-len:
9443+
# ---------- 8k1k high-throughput (wide-EP decode) ----------
9444+
- isl: 8192
9445+
osl: 1024
9446+
search-space:
9447+
- conc-list: [2800]
9448+
prefill:
9449+
num-worker: 14
9450+
tp: 4
9451+
ep: 1
9452+
dp-attn: true
9453+
additional-settings:
9454+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml"
9455+
decode:
9456+
num-worker: 1
9457+
tp: 16
9458+
ep: 16
9459+
dp-attn: true
9460+
- conc-list: [1700]
9461+
prefill:
9462+
num-worker: 12
9463+
tp: 4
9464+
ep: 1
9465+
dp-attn: true
9466+
additional-settings:
9467+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml"
9468+
decode:
9469+
num-worker: 1
9470+
tp: 24
9471+
ep: 24
9472+
dp-attn: true
9473+
- conc-list: [1300]
9474+
prefill:
9475+
num-worker: 10
9476+
tp: 4
9477+
ep: 1
9478+
dp-attn: true
9479+
additional-settings:
9480+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml"
9481+
decode:
9482+
num-worker: 1
9483+
tp: 32
9484+
ep: 32
9485+
dp-attn: true
9486+
- conc-list: [900]
9487+
prefill:
9488+
num-worker: 8
9489+
tp: 4
9490+
ep: 1
9491+
dp-attn: true
9492+
additional-settings:
9493+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml"
9494+
decode:
9495+
num-worker: 1
9496+
tp: 40
9497+
ep: 40
9498+
dp-attn: true
9499+
# ---------- 8k1k low-latency (per-node TP=4 decode workers) ----------
9500+
- isl: 8192
9501+
osl: 1024
9502+
search-space:
9503+
- conc-list: [150]
9504+
prefill:
9505+
num-worker: 1
9506+
tp: 4
9507+
ep: 1
9508+
dp-attn: true
9509+
additional-settings:
9510+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
9511+
decode:
9512+
num-worker: 9
9513+
tp: 4
9514+
ep: 1
9515+
dp-attn: false
9516+
- conc-list: [128, 64, 32]
9517+
prefill:
9518+
num-worker: 1
9519+
tp: 4
9520+
ep: 1
9521+
dp-attn: true
9522+
additional-settings:
9523+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
9524+
decode:
9525+
num-worker: 17
9526+
tp: 4
9527+
ep: 1
9528+
dp-attn: false
9529+
- conc-list: [24]
9530+
prefill:
9531+
num-worker: 1
9532+
tp: 4
9533+
ep: 1
9534+
dp-attn: true
9535+
additional-settings:
9536+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
9537+
decode:
9538+
num-worker: 17
9539+
tp: 4
9540+
ep: 1
9541+
dp-attn: false
9542+
# ---------- 1k1k high-throughput (wide-EP decode) ----------
9543+
- isl: 1024
9544+
osl: 1024
9545+
search-space:
9546+
- conc-list: [8192]
9547+
prefill:
9548+
num-worker: 12
9549+
tp: 4
9550+
ep: 1
9551+
dp-attn: true
9552+
additional-settings:
9553+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml"
9554+
decode:
9555+
num-worker: 1
9556+
tp: 24
9557+
ep: 24
9558+
dp-attn: true
9559+
- conc-list: [7500]
9560+
prefill:
9561+
num-worker: 10
9562+
tp: 4
9563+
ep: 1
9564+
dp-attn: true
9565+
additional-settings:
9566+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml"
9567+
decode:
9568+
num-worker: 1
9569+
tp: 32
9570+
ep: 32
9571+
dp-attn: true
9572+
- conc-list: [7300]
9573+
prefill:
9574+
num-worker: 8
9575+
tp: 4
9576+
ep: 1
9577+
dp-attn: true
9578+
additional-settings:
9579+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml"
9580+
decode:
9581+
num-worker: 1
9582+
tp: 40
9583+
ep: 40
9584+
dp-attn: true
9585+
- conc-list: [6500]
9586+
prefill:
9587+
num-worker: 6
9588+
tp: 4
9589+
ep: 1
9590+
dp-attn: true
9591+
additional-settings:
9592+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml"
9593+
decode:
9594+
num-worker: 1
9595+
tp: 48
9596+
ep: 48
9597+
dp-attn: true
9598+
- conc-list: [5700]
9599+
prefill:
9600+
num-worker: 4
9601+
tp: 4
9602+
ep: 1
9603+
dp-attn: true
9604+
additional-settings:
9605+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml"
9606+
decode:
9607+
num-worker: 1
9608+
tp: 56
9609+
ep: 56
9610+
dp-attn: true
9611+
# ---------- 1k1k low-latency (per-node TP=4 decode workers) ----------
9612+
- isl: 1024
9613+
osl: 1024
9614+
search-space:
9615+
- conc-list: [512, 256, 128, 64]
9616+
prefill:
9617+
num-worker: 1
9618+
tp: 4
9619+
ep: 1
9620+
dp-attn: true
9621+
additional-settings:
9622+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml"
9623+
decode:
9624+
num-worker: 17
9625+
tp: 4
9626+
ep: 1
9627+
dp-attn: false
9628+
- conc-list: [32]
9629+
prefill:
9630+
num-worker: 1
9631+
tp: 4
9632+
ep: 1
9633+
dp-attn: true
9634+
additional-settings:
9635+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml"
9636+
decode:
9637+
num-worker: 17
9638+
tp: 4
9639+
ep: 1
9640+
dp-attn: false
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
name: gb300-fp8-glm5_1k1k_hightpt_0
2+
3+
model:
4+
path: glm-5-fp8
5+
container: "lmsysorg/sglang:v0.5.11-cu130"
6+
precision: fp8
7+
8+
resources:
9+
gpu_type: gb300
10+
gpus_per_node: 4
11+
prefill_nodes: 12
12+
prefill_workers: 12
13+
decode_nodes: 6
14+
decode_workers: 1
15+
frontend:
16+
type: dynamo
17+
enable_multiple_frontends: true
18+
num_additional_frontends: 9
19+
dynamo:
20+
version: 1.1.0
21+
22+
backend:
23+
prefill_environment:
24+
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
25+
PYTHONUNBUFFERED: '1'
26+
DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
27+
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
28+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
29+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
30+
MC_TE_METRIC: 'true'
31+
MC_FORCE_MNNVL: '1'
32+
NCCL_MNNVL_ENABLE: '1'
33+
NCCL_CUMEM_ENABLE: '1'
34+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
35+
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
36+
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
37+
DYN_REQUEST_PLANE: nats
38+
39+
decode_environment:
40+
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
41+
PYTHONUNBUFFERED: '1'
42+
DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
43+
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
44+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
45+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
46+
MC_TE_METRIC: 'true'
47+
MC_FORCE_MNNVL: '1'
48+
NCCL_MNNVL_ENABLE: '1'
49+
NCCL_CUMEM_ENABLE: '1'
50+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
51+
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
52+
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
53+
DYN_REQUEST_PLANE: nats
54+
# DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size).
55+
# The default of 128 overflows when the batch is large relative to the DP size (e.g. 4096/24 ~= 171 > 128). Upper limit: 1024.
56+
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'
57+
58+
sglang_config:
59+
prefill:
60+
# Model configuration
61+
served-model-name: GLM-5-FP8
62+
trust-remote-code: true
63+
quantization: fp8
64+
kv-cache-dtype: fp8_e4m3
65+
66+
# Disaggregation mode
67+
disaggregation-mode: prefill
68+
disaggregation-transfer-backend: nixl
69+
70+
# Size limits
71+
max-running-requests: 256
72+
cuda-graph-max-bs: 256
73+
mem-fraction-static: 0.7
74+
context-length: 9600
75+
chunked-prefill-size: 32768
76+
max-prefill-tokens: 8192
77+
78+
# Parallelism
79+
tensor-parallel-size: 4
80+
data-parallel-size: 4
81+
expert-parallel-size: 1
82+
enable-dp-attention: true
83+
enable-dp-lm-head: true
84+
load-balance-method: total_tokens
85+
86+
# Backend
87+
nsa-decode-backend: trtllm
88+
nsa-prefill-backend: trtllm
89+
moe-runner-backend: flashinfer_trtllm
90+
91+
# Other flags
92+
enable-flashinfer-allreduce-fusion: true
93+
disable-radix-cache: true
94+
weight-loader-prefetch-checkpoints: true
95+
model-loader-extra-config: '{"enable_multithread_load": true}'
96+
97+
decode:
98+
# Model configuration
99+
served-model-name: GLM-5-FP8
100+
trust-remote-code: true
101+
102+
quantization: fp8
103+
kv-cache-dtype: fp8_e4m3
104+
105+
# Disaggregation mode
106+
disaggregation-mode: decode
107+
disaggregation-transfer-backend: nixl
108+
109+
# Memory and token limits
110+
mem-fraction-static: 0.8
111+
context-length: 9600
112+
113+
# Backend
114+
nsa-decode-backend: trtllm
115+
nsa-prefill-backend: trtllm
116+
# moe-runner-backend: "cutedsl"
117+
118+
# Detokenizer
119+
skip-tokenizer-init: true
120+
stream-interval: 30
121+
122+
# Other flags
123+
disable-radix-cache: true
124+
weight-loader-prefetch-checkpoints: true
125+
model-loader-extra-config: '{"enable_multithread_load": true}'
126+
tensor-parallel-size: 24
127+
expert-parallel-size: 24
128+
data-parallel-size: 24
129+
enable-dp-lm-head: true
130+
enable-dp-attention: true
131+
moe-dense-tp-size: 1
132+
ep-num-redundant-experts: 32
133+
ep-dispatch-algorithm: static
134+
moe-a2a-backend: deepep
135+
deepep-mode: low_latency
136+
deepep-config: /configs/deepep_config.json
137+
max-running-requests: 8192
138+
cuda-graph-max-bs: 512
139+
health_check:
140+
max_attempts: 360
141+
interval_seconds: 10
142+
143+
benchmark:
144+
type: sa-bench
145+
req_rate: inf
146+
isl: 1024
147+
osl: 1024
148+
concurrencies: '8192'

0 commit comments

Comments
 (0)