Skip to content

Commit 7d4063d

Browse files
[B300][vLLM] Add MiniMax-M2.5 FP4 disagg Dynamo configs (#1652)
* [B300][vLLM] Add MiniMax-M2.5 FP4 disagg Dynamo configs

  Split of #1560 — B300 half.

  - Add minimaxm2.5-fp4-b300-dynamo-vllm to nvidia-master.yaml (1k1k + 8k1k search spaces; image vllm/vllm-openai:v0.20.1, model nvidia/MiniMax-M2.5-NVFP4).
  - Add srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/.
  - Wire minimax + dynamo-vllm routing into runners/launch_b300-nv.sh.
  - Append perf-changelog entry.

* perf-changelog: link minimaxm2.5-fp4-b300 entry to PR #83

* perf-changelog: link minimaxm2.5-fp4-b300 entry to PR #1652

---------

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
1 parent 0316b19 commit 7d4063d

18 files changed

Lines changed: 1300 additions & 1 deletion

.github/configs/nvidia-master.yaml

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10532,3 +10532,216 @@ minimaxm2.5-fp4-b200-dynamo-vllm:
10532 10532
tp: 4
10533 10533
ep: 4
10534 10534
dp-attn: true
10535+
10536+
minimaxm2.5-fp4-b300-dynamo-vllm:
10537+
image: vllm/vllm-openai:v0.20.1
10538+
model: nvidia/MiniMax-M2.5-NVFP4
10539+
model-prefix: minimaxm2.5
10540+
runner: b300
10541+
precision: fp4
10542+
framework: dynamo-vllm
10543+
multinode: true
10544+
disagg: true
10545+
scenarios:
10546+
fixed-seq-len:
10547+
- isl: 1024
10548+
osl: 1024
10549+
search-space:
10550+
- conc-list: [4, 16]
10551+
prefill:
10552+
num-worker: 1
10553+
tp: 1
10554+
ep: 1
10555+
dp-attn: false
10556+
additional-settings:
10557+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml"
10558+
decode:
10559+
num-worker: 1
10560+
tp: 4
10561+
ep: 1
10562+
dp-attn: false
10563+
- conc-list: [4]
10564+
prefill:
10565+
num-worker: 1
10566+
tp: 1
10567+
ep: 1
10568+
dp-attn: false
10569+
additional-settings:
10570+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp8-1p1d.yaml"
10571+
decode:
10572+
num-worker: 1
10573+
tp: 8
10574+
ep: 1
10575+
dp-attn: false
10576+
- conc-list: [8, 16]
10577+
prefill:
10578+
num-worker: 1
10579+
tp: 1
10580+
ep: 1
10581+
dp-attn: false
10582+
additional-settings:
10583+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml"
10584+
decode:
10585+
num-worker: 2
10586+
tp: 4
10587+
ep: 1
10588+
dp-attn: false
10589+
- conc-list: [32, 64, 128]
10590+
prefill:
10591+
num-worker: 1
10592+
tp: 1
10593+
ep: 1
10594+
dp-attn: false
10595+
additional-settings:
10596+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml"
10597+
decode:
10598+
num-worker: 1
10599+
tp: 4
10600+
ep: 4
10601+
dp-attn: false
10602+
- conc-list: [64, 128, 256, 1024]
10603+
prefill:
10604+
num-worker: 1
10605+
tp: 1
10606+
ep: 1
10607+
dp-attn: false
10608+
additional-settings:
10609+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml"
10610+
decode:
10611+
num-worker: 3
10612+
tp: 4
10613+
ep: 4
10614+
dp-attn: false
10615+
- conc-list: [4096]
10616+
prefill:
10617+
num-worker: 1
10618+
tp: 1
10619+
ep: 1
10620+
dp-attn: false
10621+
additional-settings:
10622+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-1p2d.yaml"
10623+
decode:
10624+
num-worker: 2
10625+
tp: 2
10626+
ep: 2
10627+
dp-attn: true
10628+
- conc-list: [2048, 4096]
10629+
prefill:
10630+
num-worker: 2
10631+
tp: 1
10632+
ep: 1
10633+
dp-attn: false
10634+
additional-settings:
10635+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml"
10636+
decode:
10637+
num-worker: 3
10638+
tp: 2
10639+
ep: 2
10640+
dp-attn: true
10641+
- conc-list: [6144, 8192]
10642+
prefill:
10643+
num-worker: 2
10644+
tp: 1
10645+
ep: 1
10646+
dp-attn: false
10647+
additional-settings:
10648+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml"
10649+
decode:
10650+
num-worker: 3
10651+
tp: 2
10652+
ep: 2
10653+
dp-attn: true
10654+
- conc-list: [1024, 1536, 2048, 4096]
10655+
prefill:
10656+
num-worker: 2
10657+
tp: 1
10658+
ep: 1
10659+
dp-attn: false
10660+
additional-settings:
10661+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml"
10662+
decode:
10663+
num-worker: 1
10664+
tp: 8
10665+
ep: 8
10666+
dp-attn: true
10667+
- isl: 8192
10668+
osl: 1024
10669+
search-space:
10670+
- conc-list: [2, 4, 8, 16]
10671+
prefill:
10672+
num-worker: 1
10673+
tp: 1
10674+
ep: 1
10675+
dp-attn: false
10676+
additional-settings:
10677+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml"
10678+
decode:
10679+
num-worker: 1
10680+
tp: 4
10681+
ep: 1
10682+
dp-attn: false
10683+
- conc-list: [4]
10684+
prefill:
10685+
num-worker: 1
10686+
tp: 1
10687+
ep: 1
10688+
dp-attn: false
10689+
additional-settings:
10690+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp8-1p1d.yaml"
10691+
decode:
10692+
num-worker: 1
10693+
tp: 8
10694+
ep: 1
10695+
dp-attn: false
10696+
- conc-list: [32, 128]
10697+
prefill:
10698+
num-worker: 1
10699+
tp: 1
10700+
ep: 1
10701+
dp-attn: false
10702+
additional-settings:
10703+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml"
10704+
decode:
10705+
num-worker: 1
10706+
tp: 4
10707+
ep: 4
10708+
dp-attn: false
10709+
- conc-list: [64, 128, 256, 512]
10710+
prefill:
10711+
num-worker: 2
10712+
tp: 1
10713+
ep: 1
10714+
dp-attn: false
10715+
additional-settings:
10716+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml"
10717+
decode:
10718+
num-worker: 1
10719+
tp: 4
10720+
ep: 4
10721+
dp-attn: false
10722+
- conc-list: [384, 512]
10723+
prefill:
10724+
num-worker: 4
10725+
tp: 1
10726+
ep: 1
10727+
dp-attn: false
10728+
additional-settings:
10729+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml"
10730+
decode:
10731+
num-worker: 1
10732+
tp: 4
10733+
ep: 4
10734+
dp-attn: true
10735+
- conc-list: [384]
10736+
prefill:
10737+
num-worker: 4
10738+
tp: 1
10739+
ep: 1
10740+
dp-attn: false
10741+
additional-settings:
10742+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml"
10743+
decode:
10744+
num-worker: 1
10745+
tp: 8
10746+
ep: 8
10747+
dp-attn: true
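(File path not captured in this view. The content corresponds to the recipe referenced above as recipes/vllm/minimax-m2.5/1k1k/dep2-1p2d.yaml: 1 prefill worker, 2 decode workers, concurrency 4096.)

Lines changed: 72 additions & 0 deletions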
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
name: "minimax-m2.5-vllm-disagg-b300-decode-2xdep2"
2+
3+
model:
4+
path: "minimax-m2.5-nvfp4"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp4"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "b300"
16+
gpus_per_node: 8
17+
prefill_nodes: 1
18+
decode_nodes: 1
19+
prefill_workers: 1
20+
decode_workers: 2
21+
gpus_per_prefill: 1
22+
gpus_per_decode: 2
23+
frontend:
24+
type: dynamo
25+
enable_multiple_frontends: false
26+
27+
backend:
28+
type: vllm
29+
connector: null
30+
allow_prefill_decode_colocation: true
31+
32+
prefill_environment:
33+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
34+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
35+
UCX_TLS: "cuda_copy,rc"
36+
37+
decode_environment:
38+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
39+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
40+
UCX_TLS: "cuda_copy,rc"
41+
42+
vllm_config:
43+
prefill:
44+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
45+
kv-cache-dtype: "fp8"
46+
trust-remote-code: true
47+
no-enable-prefix-caching: true
48+
max-model-len: 2048
49+
max-cudagraph-capture-size: 2048
50+
max-num-batched-tokens: 2048
51+
stream-interval: 128
52+
53+
decode:
54+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
55+
kv-cache-dtype: "fp8"
56+
data-parallel-size: 2
57+
data-parallel-rpc-port: 13345
58+
enable-expert-parallel: true
59+
no-enable-prefix-caching: true
60+
max-model-len: 2048
61+
max-cudagraph-capture-size: 2048
62+
max-num-batched-tokens: 2048
63+
max-num-seqs: 864
64+
gpu-memory-utilization: 0.90
65+
stream-interval: 128
66+
67+
benchmark:
68+
type: "sa-bench"
69+
isl: 1024
70+
osl: 1024
71+
concurrencies: "4096"
72+
random_range_ratio: 0.8
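(File path not captured in this view. The content corresponds to the recipe referenced above as recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml: 2 prefill workers, 3 decode workers, concurrencies 6144 and 8192.)

Lines changed: 72 additions & 0 deletions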
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
name: "minimax-m2.5-vllm-disagg-b300-decode-2p3xdep2-c6144"
2+
3+
model:
4+
path: "minimax-m2.5-nvfp4"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp4"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "b300"
16+
gpus_per_node: 8
17+
prefill_nodes: 1
18+
decode_nodes: 1
19+
prefill_workers: 2
20+
decode_workers: 3
21+
gpus_per_prefill: 1
22+
gpus_per_decode: 2
23+
frontend:
24+
type: dynamo
25+
enable_multiple_frontends: false
26+
27+
backend:
28+
type: vllm
29+
connector: null
30+
allow_prefill_decode_colocation: true
31+
32+
prefill_environment:
33+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
34+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
35+
UCX_TLS: "cuda_copy,rc"
36+
37+
decode_environment:
38+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
39+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
40+
UCX_TLS: "cuda_copy,rc"
41+
42+
vllm_config:
43+
prefill:
44+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
45+
kv-cache-dtype: "fp8"
46+
trust-remote-code: true
47+
no-enable-prefix-caching: true
48+
max-model-len: 2048
49+
max-cudagraph-capture-size: 2048
50+
max-num-batched-tokens: 2048
51+
stream-interval: 128
52+
53+
decode:
54+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
55+
kv-cache-dtype: "fp8"
56+
data-parallel-size: 2
57+
data-parallel-rpc-port: 13345
58+
enable-expert-parallel: true
59+
no-enable-prefix-caching: true
60+
max-model-len: 2048
61+
max-cudagraph-capture-size: 2048
62+
max-num-batched-tokens: 2048
63+
max-num-seqs: 864
64+
gpu-memory-utilization: 0.90
65+
stream-interval: 128
66+
67+
benchmark:
68+
type: "sa-bench"
69+
isl: 1024
70+
osl: 1024
71+
concurrencies: "6144x8192"
72+
random_range_ratio: 0.8

0 commit comments

Comments
 (0)