Skip to content

Commit 7baa914

Browse files
authored
Merge branch 'main' into cap-dsv4-b200-trt-conc
2 parents c8c6b3c + ea4f575 commit 7baa914

19 files changed

Lines changed: 1313 additions & 15 deletions

.github/configs/nvidia-master.yaml

Lines changed: 217 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9236,11 +9236,13 @@ qwen3.5-fp8-h100-sglang:
9236 9236
- isl: 1024
9237 9237
osl: 1024
9238 9238
search-space:
9239-
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
9239+
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
9240+
- { tp: 8, ep: 8, conc-start: 16, conc-end: 256 }
9240 9241
- isl: 8192
9241 9242
osl: 1024
9242 9243
search-space:
9243-
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
9244+
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
9245+
- { tp: 8, ep: 8, conc-start: 16, conc-end: 256 }
9244 9246

9245 9247
qwen3.5-fp8-h100-sglang-mtp:
9246 9248
image: lmsysorg/sglang:v0.5.12-cu130
@@ -11055,6 +11057,219 @@ minimaxm2.5-fp4-b300-dynamo-vllm:
11055 11057
ep: 8
11056 11058
dp-attn: true
11057 11059

11060+
minimaxm2.5-fp8-b300-dynamo-vllm:
11061+
image: vllm/vllm-openai:v0.20.1
11062+
model: MiniMaxAI/MiniMax-M2.5
11063+
model-prefix: minimaxm2.5
11064+
runner: b300
11065+
precision: fp8
11066+
framework: dynamo-vllm
11067+
multinode: true
11068+
disagg: true
11069+
scenarios:
11070+
fixed-seq-len:
11071+
- isl: 1024
11072+
osl: 1024
11073+
search-space:
11074+
- conc-list: [8, 16, 32, 64, 128]
11075+
prefill:
11076+
num-worker: 1
11077+
tp: 1
11078+
ep: 1
11079+
dp-attn: false
11080+
additional-settings:
11081+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-b300-1p1d-tp4.yaml"
11082+
decode:
11083+
num-worker: 1
11084+
tp: 4
11085+
ep: 1
11086+
dp-attn: false
11087+
- conc-list: [32, 64, 128, 256, 512]
11088+
prefill:
11089+
num-worker: 1
11090+
tp: 1
11091+
ep: 1
11092+
dp-attn: false
11093+
additional-settings:
11094+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-b300-1p2d-tp4.yaml"
11095+
decode:
11096+
num-worker: 2
11097+
tp: 4
11098+
ep: 1
11099+
dp-attn: false
11100+
- conc-list: [256, 512, 1024]
11101+
prefill:
11102+
num-worker: 1
11103+
tp: 1
11104+
ep: 1
11105+
dp-attn: false
11106+
additional-settings:
11107+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-b300-1p2d-tp4ep.yaml"
11108+
decode:
11109+
num-worker: 2
11110+
tp: 4
11111+
ep: 4
11112+
dp-attn: false
11113+
- conc-list: [256, 512, 1024]
11114+
prefill:
11115+
num-worker: 2
11116+
tp: 1
11117+
ep: 1
11118+
dp-attn: false
11119+
additional-settings:
11120+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-b300-2p1d-dep8.yaml"
11121+
decode:
11122+
num-worker: 1
11123+
tp: 8
11124+
ep: 8
11125+
dp-attn: true
11126+
- conc-list: [512, 1024, 2048]
11127+
prefill:
11128+
num-worker: 2
11129+
tp: 1
11130+
ep: 1
11131+
dp-attn: false
11132+
additional-settings:
11133+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-b300-2p2d-dep4.yaml"
11134+
decode:
11135+
num-worker: 2
11136+
tp: 4
11137+
ep: 4
11138+
dp-attn: true
11139+
- conc-list: [4096, 8192]
11140+
prefill:
11141+
num-worker: 2
11142+
tp: 1
11143+
ep: 1
11144+
dp-attn: false
11145+
additional-settings:
11146+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-b300-2p2d-dep4-hi-conc.yaml"
11147+
decode:
11148+
num-worker: 2
11149+
tp: 4
11150+
ep: 4
11151+
dp-attn: true
11152+
- conc-list: [1024]
11153+
prefill:
11154+
num-worker: 2
11155+
tp: 1
11156+
ep: 1
11157+
dp-attn: false
11158+
additional-settings:
11159+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-b300-2p3d-dep2.yaml"
11160+
decode:
11161+
num-worker: 3
11162+
tp: 2
11163+
ep: 2
11164+
dp-attn: true
11165+
- isl: 8192
11166+
osl: 1024
11167+
search-space:
11168+
- conc-list: [16, 64, 128]
11169+
prefill:
11170+
num-worker: 1
11171+
tp: 1
11172+
ep: 1
11173+
dp-attn: false
11174+
additional-settings:
11175+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-b300-1p1d-tp4ep.yaml"
11176+
decode:
11177+
num-worker: 1
11178+
tp: 4
11179+
ep: 4
11180+
dp-attn: false
11181+
- conc-list: [256, 512]
11182+
prefill:
11183+
num-worker: 1
11184+
tp: 1
11185+
ep: 1
11186+
dp-attn: false
11187+
additional-settings:
11188+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-b300-1p1d-tp4ep-hi-conc.yaml"
11189+
decode:
11190+
num-worker: 1
11191+
tp: 4
11192+
ep: 4
11193+
dp-attn: false
11194+
- conc-list: [32]
11195+
prefill:
11196+
num-worker: 2
11197+
tp: 1
11198+
ep: 1
11199+
dp-attn: false
11200+
additional-settings:
11201+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-b300-2p1d-tp2.yaml"
11202+
decode:
11203+
num-worker: 1
11204+
tp: 2
11205+
ep: 1
11206+
dp-attn: false
11207+
- conc-list: [64, 128, 256, 512]
11208+
prefill:
11209+
num-worker: 2
11210+
tp: 1
11211+
ep: 1
11212+
dp-attn: false
11213+
additional-settings:
11214+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-b300-2p1d-tp4ep.yaml"
11215+
decode:
11216+
num-worker: 1
11217+
tp: 4
11218+
ep: 4
11219+
dp-attn: false
11220+
- conc-list: [64]
11221+
prefill:
11222+
num-worker: 3
11223+
tp: 1
11224+
ep: 1
11225+
dp-attn: false
11226+
additional-settings:
11227+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-b300-3p1d-tp4.yaml"
11228+
decode:
11229+
num-worker: 1
11230+
tp: 4
11231+
ep: 1
11232+
dp-attn: false
11233+
- conc-list: [256, 512]
11234+
prefill:
11235+
num-worker: 3
11236+
tp: 1
11237+
ep: 1
11238+
dp-attn: false
11239+
additional-settings:
11240+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-b300-3p1d-dep4.yaml"
11241+
decode:
11242+
num-worker: 1
11243+
tp: 4
11244+
ep: 4
11245+
dp-attn: true
11246+
- conc-list: [1024, 2048]
11247+
prefill:
11248+
num-worker: 3
11249+
tp: 1
11250+
ep: 1
11251+
dp-attn: false
11252+
additional-settings:
11253+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-b300-3p1d-dep4-hi-conc.yaml"
11254+
decode:
11255+
num-worker: 1
11256+
tp: 4
11257+
ep: 4
11258+
dp-attn: true
11259+
- conc-list: [512, 1024, 2048]
11260+
prefill:
11261+
num-worker: 5
11262+
tp: 1
11263+
ep: 1
11264+
dp-attn: false
11265+
additional-settings:
11266+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-b300-5p2d-dep4.yaml"
11267+
decode:
11268+
num-worker: 2
11269+
tp: 4
11270+
ep: 4
11271+
dp-attn: true
11272+
11058 11273
minimaxm2.5-fp8-gb300-dynamo-vllm:
11059 11274
image: vllm/vllm-openai:v0.20.1
11060 11275
model: MiniMaxAI/MiniMax-M2.5
Lines changed: 65 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,65 @@
1+
name: "minimax-m2.5-vllm-disagg-b300-1p1d-tp4"
2+
3+
model:
4+
path: "minimax-m2.5-fp8"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp8"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "b300"
16+
gpus_per_node: 8
17+
prefill_nodes: 1
18+
decode_nodes: 1
19+
prefill_workers: 1
20+
decode_workers: 1
21+
gpus_per_prefill: 1
22+
gpus_per_decode: 4
23+
24+
frontend:
25+
type: dynamo
26+
enable_multiple_frontends: false
27+
28+
backend:
29+
type: vllm
30+
connector: null
31+
allow_prefill_decode_colocation: true
32+
33+
prefill_environment:
34+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
35+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
36+
37+
decode_environment:
38+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
39+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
40+
41+
vllm_config:
42+
prefill:
43+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
44+
kv-cache-dtype: "fp8"
45+
tensor-parallel-size: 1
46+
pipeline-parallel-size: 1
47+
safetensors-load-strategy: "prefetch"
48+
trust-remote-code: true
49+
no-enable-prefix-caching: true
50+
stream-interval: 32
51+
52+
decode:
53+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
54+
kv-cache-dtype: "fp8"
55+
tensor-parallel-size: 4
56+
safetensors-load-strategy: "prefetch"
57+
trust-remote-code: true
58+
no-enable-prefix-caching: true
59+
stream-interval: 32
60+
61+
benchmark:
62+
type: "sa-bench"
63+
isl: 1024
64+
osl: 1024
65+
concurrencies: "8x16x32x64x128"
Lines changed: 65 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,65 @@
1+
name: "minimax-m2.5-vllm-disagg-b300-1p2d-tp4"
2+
3+
model:
4+
path: "minimax-m2.5-fp8"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp8"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "b300"
16+
gpus_per_node: 8
17+
prefill_nodes: 1
18+
decode_nodes: 1
19+
prefill_workers: 1
20+
decode_workers: 2
21+
gpus_per_prefill: 1
22+
gpus_per_decode: 4
23+
24+
frontend:
25+
type: dynamo
26+
enable_multiple_frontends: false
27+
28+
backend:
29+
type: vllm
30+
connector: null
31+
allow_prefill_decode_colocation: true
32+
33+
prefill_environment:
34+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
35+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
36+
37+
decode_environment:
38+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
39+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
40+
41+
vllm_config:
42+
prefill:
43+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
44+
kv-cache-dtype: "fp8"
45+
tensor-parallel-size: 1
46+
pipeline-parallel-size: 1
47+
safetensors-load-strategy: "prefetch"
48+
trust-remote-code: true
49+
no-enable-prefix-caching: true
50+
stream-interval: 32
51+
52+
decode:
53+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
54+
kv-cache-dtype: "fp8"
55+
tensor-parallel-size: 4
56+
safetensors-load-strategy: "prefetch"
57+
trust-remote-code: true
58+
no-enable-prefix-caching: true
59+
stream-interval: 32
60+
61+
benchmark:
62+
type: "sa-bench"
63+
isl: 1024
64+
osl: 1024
65+
concurrencies: "32x64x128x256x512"

0 commit comments

Comments (0)