@@ -1682,13 +1682,13 @@ dsr1-fp4-b200-sglang:
16821682 - isl : 1024
16831683 osl : 1024
16841684 search-space :
1685- - { tp: 4, ep: 4, conc-start: 4 , conc-end: 128 }
1686- - { tp: 8, ep: 8, conc-start: 4 , conc-end: 128 }
1685+ - { tp: 4, ep: 4, conc-start: 1 , conc-end: 128 }
1686+ - { tp: 8, ep: 8, conc-start: 1 , conc-end: 128 }
16871687 - isl : 8192
16881688 osl : 1024
16891689 search-space :
1690- - { tp: 4, ep: 4, conc-start: 4 , conc-end: 128 }
1691- - { tp: 8, ep: 8, conc-start: 4 , conc-end: 16 }
1690+ - { tp: 4, ep: 4, conc-start: 1 , conc-end: 128 }
1691+ - { tp: 8, ep: 8, conc-start: 1 , conc-end: 16 }
16921692 # agentic-coding: temporarily disabled — blocked by e2e-tests.yml artifact
16931693 # name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads as
16941694 # `bmk_agentic_*`). Re-enable once that workflow is aligned.
@@ -1883,13 +1883,13 @@ dsr1-fp4-b300-sglang:
18831883 - isl : 1024
18841884 osl : 1024
18851885 search-space :
1886- - { tp: 4, ep: 4, conc-start: 4 , conc-end: 128 }
1887- - { tp: 8, ep: 8, conc-start: 4 , conc-end: 128 }
1886+ - { tp: 4, ep: 4, conc-start: 1 , conc-end: 128 }
1887+ - { tp: 8, ep: 8, conc-start: 1 , conc-end: 128 }
18881888 - isl : 8192
18891889 osl : 1024
18901890 search-space :
1891- - { tp: 4, ep: 4, conc-start: 4 , conc-end: 128 }
1892- - { tp: 8, ep: 8, conc-start: 4 , conc-end: 16 }
1891+ - { tp: 4, ep: 4, conc-start: 1 , conc-end: 128 }
1892+ - { tp: 8, ep: 8, conc-start: 1 , conc-end: 16 }
18931893
18941894dsr1-fp4-b200-trt :
18951895 image : nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14
@@ -1969,12 +1969,12 @@ dsr1-fp8-b200-sglang:
19691969 - isl : 1024
19701970 osl : 1024
19711971 search-space :
1972- - { tp: 8, ep: 1, conc-start: 4 , conc-end: 64 }
1972+ - { tp: 8, ep: 1, conc-start: 1 , conc-end: 64 }
19731973 - isl : 8192
19741974 osl : 1024
19751975 search-space :
1976- - { tp: 8, ep: 1, conc-start: 4 , conc-end: 4 }
1977- - { tp: 4, ep: 1, conc-start: 4 , conc-end: 32 }
1976+ - { tp: 8, ep: 1, conc-start: 1 , conc-end: 4 }
1977+ - { tp: 4, ep: 1, conc-start: 1 , conc-end: 32 }
19781978
19791979 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
19801980 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8
@@ -1992,12 +1992,12 @@ dsr1-fp8-b300-sglang:
19921992 - isl : 1024
19931993 osl : 1024
19941994 search-space :
1995- - { tp: 8, ep: 1, conc-start: 4 , conc-end: 64 }
1995+ - { tp: 8, ep: 1, conc-start: 1 , conc-end: 64 }
19961996 - isl : 8192
19971997 osl : 1024
19981998 search-space :
1999- - { tp: 8, ep: 1, conc-start: 4 , conc-end: 4 }
2000- - { tp: 4, ep: 1, conc-start: 4 , conc-end: 32 }
1999+ - { tp: 8, ep: 1, conc-start: 1 , conc-end: 4 }
2000+ - { tp: 4, ep: 1, conc-start: 1 , conc-end: 32 }
20012001
20022002# DeepSeek-V4-Pro on B300 with SGLang (non-MTP). This follows the 8k/1k
20032003# submission frontier from the 2026-05-19 Pareto HTML:
@@ -2725,11 +2725,11 @@ dsr1-fp8-b200-sglang-mtp:
27252725 - isl : 1024
27262726 osl : 1024
27272727 search-space :
2728- - { tp: 8, ep: 1, conc-start: 4 , conc-end: 512, spec-decoding: mtp }
2728+ - { tp: 8, ep: 1, conc-start: 1 , conc-end: 512, spec-decoding: mtp }
27292729 - isl : 8192
27302730 osl : 1024
27312731 search-space :
2732- - { tp: 8, ep: 1, conc-start: 4 , conc-end: 512, spec-decoding: mtp }
2732+ - { tp: 8, ep: 1, conc-start: 1 , conc-end: 512, spec-decoding: mtp }
27332733
27342734 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
27352735 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8
@@ -2748,11 +2748,11 @@ dsr1-fp8-b300-sglang-mtp:
27482748 - isl : 1024
27492749 osl : 1024
27502750 search-space :
2751- - { tp: 8, ep: 1, conc-start: 4 , conc-end: 512, spec-decoding: mtp }
2751+ - { tp: 8, ep: 1, conc-start: 1 , conc-end: 512, spec-decoding: mtp }
27522752 - isl : 8192
27532753 osl : 1024
27542754 search-space :
2755- - { tp: 8, ep: 1, conc-start: 4 , conc-end: 512, spec-decoding: mtp }
2755+ - { tp: 8, ep: 1, conc-start: 1 , conc-end: 512, spec-decoding: mtp }
27562756
27572757# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below;
27582758# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so
@@ -4503,7 +4503,7 @@ gptoss-fp4-h100-vllm:
45034503 - { tp: 8, conc-start: 4, conc-end: 16 }
45044504
45054505minimaxm2.5-fp8-h100-vllm :
4506- image : vllm/vllm-openai:v0.21.0
4506+ image : vllm/vllm-openai:v0.19.1-cu130
45074507 model : MiniMaxAI/MiniMax-M2.5
45084508 model-prefix : minimaxm2.5
45094509 runner : h100
@@ -4515,13 +4515,11 @@ minimaxm2.5-fp8-h100-vllm:
45154515 - isl : 1024
45164516 osl : 1024
45174517 search-space :
4518- # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
4519- - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
4518+ - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
45204519 - isl : 8192
45214520 osl : 1024
45224521 search-space :
4523- # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
4524- - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
4522+ - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
45254523
45264524# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
45274525# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
@@ -9222,3 +9220,199 @@ qwen3.5-fp8-h100-sglang-mtp:
92229220 osl : 1024
92239221 search-space :
92249222 - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
9223+
9224+ glm5-fp4-gb300-dynamo-sglang : # GLM-5 NVFP4 on the gb300-nv runner: multinode, disaggregated prefill/decode sweep using the dynamo-sglang framework.
9225+ image : lmsysorg/sglang:v0.5.11-cu130
9226+ model : nvidia/GLM-5-NVFP4
9227+ model-prefix : glm5
9228+ runner : gb300-nv
9229+ precision : fp4
9230+ framework : dynamo-sglang
9231+ multinode : true
9232+ disagg : true
9233+ scenarios :
9234+ fixed-seq-len :
9235+ # ---------- 8k1k high-throughput (wide-EP TP=32 decode) ----------
9236+ - isl : 8192
9237+ osl : 1024
9238+ search-space :
9239+ # 5p1d wide-EP: 5 prefill workers @ TP=4 + 1 decode worker @ TP=32/EP=32. 13 nodes total (the decode worker spans 8 nodes).
9240+ - conc-list : [2048]
9241+ prefill :
9242+ num-worker : 5
9243+ tp : 4
9244+ ep : 1
9245+ dp-attn : true
9246+ additional-settings :
9247+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml"
9248+ decode :
9249+ num-worker : 1
9250+ tp : 32
9251+ ep : 32
9252+ dp-attn : true
9253+ # 7p1d wide-EP: 7 prefill workers @ TP=4 + 1 decode worker @ TP=32/EP=32. 15 nodes.
9254+ - conc-list : [3072]
9255+ prefill :
9256+ num-worker : 7
9257+ tp : 4
9258+ ep : 1
9259+ dp-attn : true
9260+ additional-settings :
9261+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml"
9262+ decode :
9263+ num-worker : 1
9264+ tp : 32
9265+ ep : 32
9266+ dp-attn : true
9267+ # 10p1d wide-EP: 10 prefill workers @ TP=4 + 1 decode worker @ TP=32/EP=32. 18 nodes.
9268+ - conc-list : [4096]
9269+ prefill :
9270+ num-worker : 10
9271+ tp : 4
9272+ ep : 1
9273+ dp-attn : true
9274+ additional-settings :
9275+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml"
9276+ decode :
9277+ num-worker : 1
9278+ tp : 32
9279+ ep : 32
9280+ dp-attn : true
9281+ # ---------- 8k1k low-latency (per-node TP=4 decode workers) ----------
9282+ - isl : 8192
9283+ osl : 1024
9284+ search-space : # NOTE(review): every entry below uses conc-list [1024] regardless of decode worker count; confirm this is intended for a low-latency sweep.
9285+ # 1p3d: 1 prefill worker + 3 decode workers, each @ TP=4 on one node. 4 nodes.
9286+ - conc-list : [1024]
9287+ prefill :
9288+ num-worker : 1
9289+ tp : 4
9290+ ep : 1
9291+ dp-attn : true
9292+ additional-settings :
9293+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
9294+ decode :
9295+ num-worker : 3
9296+ tp : 4
9297+ ep : 1
9298+ dp-attn : false
9299+ # 1p5d: 1 prefill worker + 5 decode workers @ TP=4. 6 nodes.
9300+ - conc-list : [1024]
9301+ prefill :
9302+ num-worker : 1
9303+ tp : 4
9304+ ep : 1
9305+ dp-attn : true
9306+ additional-settings :
9307+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
9308+ decode :
9309+ num-worker : 5
9310+ tp : 4
9311+ ep : 1
9312+ dp-attn : false
9313+ # 1p9d: 1 prefill worker + 9 decode workers @ TP=4. 10 nodes.
9314+ - conc-list : [1024]
9315+ prefill :
9316+ num-worker : 1
9317+ tp : 4
9318+ ep : 1
9319+ dp-attn : true
9320+ additional-settings :
9321+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
9322+ decode :
9323+ num-worker : 9
9324+ tp : 4
9325+ ep : 1
9326+ dp-attn : false
9327+ # 1p15d: 1 prefill worker + 15 decode workers @ TP=4. 16 nodes.
9328+ - conc-list : [1024]
9329+ prefill :
9330+ num-worker : 1
9331+ tp : 4
9332+ ep : 1
9333+ dp-attn : true
9334+ additional-settings :
9335+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml"
9336+ decode :
9337+ num-worker : 15
9338+ tp : 4
9339+ ep : 1
9340+ dp-attn : false
9341+ # ---------- 1k1k high-throughput (wide-EP TP=32 decode) ----------
9342+ - isl : 1024
9343+ osl : 1024
9344+ search-space :
9345+ # 3p1d wide-EP: 3 prefill workers @ TP=4 + 1 decode worker @ TP=32/EP=32. 11 nodes. Single point at conc 16500.
9346+ - conc-list : [16500]
9347+ prefill :
9348+ num-worker : 3
9349+ tp : 4
9350+ ep : 1
9351+ dp-attn : true
9352+ additional-settings :
9353+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml"
9354+ decode :
9355+ num-worker : 1
9356+ tp : 32
9357+ ep : 32
9358+ dp-attn : true
9359+ # 2p1d wide-EP: 2 prefill workers @ TP=4 + 1 decode worker @ TP=32/EP=32. 10 nodes. Single point at conc 8300.
9360+ - conc-list : [8300]
9361+ prefill :
9362+ num-worker : 2
9363+ tp : 4
9364+ ep : 1
9365+ dp-attn : true
9366+ additional-settings :
9367+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml"
9368+ decode :
9369+ num-worker : 1
9370+ tp : 32
9371+ ep : 32
9372+ dp-attn : true
9373+ # 1p1d wide-EP: 1 prefill worker @ TP=4 + 1 decode worker @ TP=32/EP=32. 9 nodes. Conc sweep over 2500, 1024, 512, 256.
9374+ - conc-list : [2500, 1024, 512, 256]
9375+ prefill :
9376+ num-worker : 1
9377+ tp : 4
9378+ ep : 1
9379+ dp-attn : true
9380+ additional-settings :
9381+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml"
9382+ decode :
9383+ num-worker : 1
9384+ tp : 32
9385+ ep : 32
9386+ dp-attn : true
9387+ # ---------- 1k1k low-latency (per-node TP=4 decode workers) ----------
9388+ - isl : 1024
9389+ osl : 1024
9390+ search-space :
9391+ # 1p17d low-latency, bs=32 sweep: 1 prefill worker + 17 decode workers @ TP=4. 18 nodes. Conc sweep over 512, 256, 128, 64.
9392+ - conc-list : [512, 256, 128, 64]
9393+ prefill :
9394+ num-worker : 1
9395+ tp : 4
9396+ ep : 1
9397+ dp-attn : true
9398+ additional-settings :
9399+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml"
9400+ decode :
9401+ num-worker : 17
9402+ tp : 4
9403+ ep : 1
9404+ dp-attn : false
9405+ # 1p17d low-latency, bs=1 (single-stream): 1 prefill worker + 17 decode workers @ TP=4. 18 nodes. NOTE(review): conc-list is [32], not 1 — presumably "bs=1" is a per-worker setting in the recipe; confirm.
9406+ - conc-list : [32]
9407+ prefill :
9408+ num-worker : 1
9409+ tp : 4
9410+ ep : 1
9411+ dp-attn : true
9412+ additional-settings :
9413+ - " CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml"
9414+ decode :
9415+ num-worker : 17
9416+ tp : 4
9417+ ep : 1
9418+ dp-attn : false
0 commit comments