Skip to content

Commit 0fc4f86

Browse files
[NV] Add MiniMax-M2.5 FP4 GB200 Dynamo vLLM recipes (#1642)
* Add MiniMax-M2.5 FP4 GB200 Dynamo vLLM recipes
* Update GB200 MiniMax changelog PR link
* Fix GB200 MiniMax shared workspace setup
1 parent 1b23499 commit 0fc4f86

20 files changed

Lines changed: 1580 additions & 8 deletions

.github/configs/nvidia-master.yaml

Lines changed: 239 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9905,3 +9905,242 @@ qwen3.5-fp8-h100-sglang-agentic:
9905 9905
search-space:
9906 9906
- { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] }
9907 9907
- { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }
9908+
9909+
minimaxm2.5-fp4-gb200-dynamo-vllm:
9910+
image: vllm/vllm-openai:v0.20.1
9911+
model: nvidia/MiniMax-M2.5-NVFP4
9912+
model-prefix: minimaxm2.5
9913+
runner: gb200
9914+
precision: fp4
9915+
framework: dynamo-vllm
9916+
multinode: true
9917+
disagg: true
9918+
scenarios:
9919+
fixed-seq-len:
9920+
- isl: 1024
9921+
osl: 1024
9922+
search-space:
9923+
- conc-list: [16]
9924+
prefill:
9925+
num-worker: 1
9926+
tp: 1
9927+
ep: 1
9928+
dp-attn: false
9929+
additional-settings:
9930+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p1d.yaml"
9931+
decode:
9932+
num-worker: 1
9933+
tp: 4
9934+
ep: 1
9935+
dp-attn: false
9936+
- conc-list: [4, 8, 16, 32, 64]
9937+
prefill:
9938+
num-worker: 1
9939+
tp: 1
9940+
ep: 1
9941+
dp-attn: false
9942+
additional-settings:
9943+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p2d.yaml"
9944+
decode:
9945+
num-worker: 2
9946+
tp: 4
9947+
ep: 1
9948+
dp-attn: false
9949+
- conc-list: [32, 64, 128]
9950+
prefill:
9951+
num-worker: 1
9952+
tp: 1
9953+
ep: 1
9954+
dp-attn: false
9955+
additional-settings:
9956+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d.yaml"
9957+
decode:
9958+
num-worker: 1
9959+
tp: 4
9960+
ep: 4
9961+
dp-attn: false
9962+
- conc-list: [256]
9963+
prefill:
9964+
num-worker: 1
9965+
tp: 1
9966+
ep: 1
9967+
dp-attn: false
9968+
additional-settings:
9969+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d-hi-conc.yaml"
9970+
decode:
9971+
num-worker: 1
9972+
tp: 4
9973+
ep: 4
9974+
dp-attn: false
9975+
- conc-list: [128, 256]
9976+
prefill:
9977+
num-worker: 1
9978+
tp: 1
9979+
ep: 1
9980+
dp-attn: false
9981+
additional-settings:
9982+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p2d.yaml"
9983+
decode:
9984+
num-worker: 2
9985+
tp: 4
9986+
ep: 4
9987+
dp-attn: false
9988+
- conc-list: [64, 128, 256, 512]
9989+
prefill:
9990+
num-worker: 1
9991+
tp: 1
9992+
ep: 1
9993+
dp-attn: false
9994+
additional-settings:
9995+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d.yaml"
9996+
decode:
9997+
num-worker: 3
9998+
tp: 4
9999+
ep: 4
10000+
dp-attn: false
10001+
- conc-list: [1024, 2048]
10002+
prefill:
10003+
num-worker: 1
10004+
tp: 1
10005+
ep: 1
10006+
dp-attn: false
10007+
additional-settings:
10008+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d-hi-conc.yaml"
10009+
decode:
10010+
num-worker: 3
10011+
tp: 4
10012+
ep: 4
10013+
dp-attn: false
10014+
- conc-list: [256, 1024]
10015+
prefill:
10016+
num-worker: 2
10017+
tp: 1
10018+
ep: 1
10019+
dp-attn: false
10020+
additional-settings:
10021+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-2p3d.yaml"
10022+
decode:
10023+
num-worker: 3
10024+
tp: 4
10025+
ep: 4
10026+
dp-attn: false
10027+
- conc-list: [2048, 4096, 8192]
10028+
prefill:
10029+
num-worker: 1
10030+
tp: 1
10031+
ep: 1
10032+
dp-attn: false
10033+
additional-settings:
10034+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-1p2d.yaml"
10035+
decode:
10036+
num-worker: 2
10037+
tp: 2
10038+
ep: 2
10039+
dp-attn: true
10040+
- conc-list: [4096]
10041+
prefill:
10042+
num-worker: 2
10043+
tp: 1
10044+
ep: 1
10045+
dp-attn: false
10046+
additional-settings:
10047+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d.yaml"
10048+
decode:
10049+
num-worker: 3
10050+
tp: 2
10051+
ep: 2
10052+
dp-attn: true
10053+
- conc-list: [6144]
10054+
prefill:
10055+
num-worker: 2
10056+
tp: 1
10057+
ep: 1
10058+
dp-attn: false
10059+
additional-settings:
10060+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d-c6144.yaml"
10061+
decode:
10062+
num-worker: 3
10063+
tp: 2
10064+
ep: 2
10065+
dp-attn: true
10066+
- conc-list: [1024, 2048]
10067+
prefill:
10068+
num-worker: 3
10069+
tp: 1
10070+
ep: 1
10071+
dp-attn: false
10072+
additional-settings:
10073+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/dep4-3p2d.yaml"
10074+
decode:
10075+
num-worker: 2
10076+
tp: 4
10077+
ep: 4
10078+
dp-attn: true
10079+
- conc-list: [1024, 2048, 4096]
10080+
prefill:
10081+
num-worker: 2
10082+
tp: 1
10083+
ep: 1
10084+
dp-attn: false
10085+
additional-settings:
10086+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/1k1k/dep8-2p1d.yaml"
10087+
decode:
10088+
num-worker: 1
10089+
tp: 8
10090+
ep: 8
10091+
dp-attn: true
10092+
- isl: 8192
10093+
osl: 1024
10094+
search-space:
10095+
- conc-list: [4, 8, 16]
10096+
prefill:
10097+
num-worker: 1
10098+
tp: 1
10099+
ep: 1
10100+
dp-attn: false
10101+
additional-settings:
10102+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/8k1k/tp4-1p1d.yaml"
10103+
decode:
10104+
num-worker: 1
10105+
tp: 4
10106+
ep: 1
10107+
dp-attn: false
10108+
- conc-list: [32, 64]
10109+
prefill:
10110+
num-worker: 1
10111+
tp: 1
10112+
ep: 1
10113+
dp-attn: false
10114+
additional-settings:
10115+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d.yaml"
10116+
decode:
10117+
num-worker: 1
10118+
tp: 4
10119+
ep: 4
10120+
dp-attn: false
10121+
- conc-list: [256, 512, 1024]
10122+
prefill:
10123+
num-worker: 1
10124+
tp: 1
10125+
ep: 1
10126+
dp-attn: false
10127+
additional-settings:
10128+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d-hi-conc.yaml"
10129+
decode:
10130+
num-worker: 1
10131+
tp: 4
10132+
ep: 4
10133+
dp-attn: false
10134+
- conc-list: [256, 512, 1024, 2048]
10135+
prefill:
10136+
num-worker: 2
10137+
tp: 1
10138+
ep: 1
10139+
dp-attn: false
10140+
additional-settings:
10141+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200/8k1k/dep4-2p1d.yaml"
10142+
decode:
10143+
num-worker: 1
10144+
tp: 4
10145+
ep: 4
10146+
dp-attn: true
Lines changed: 73 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,73 @@
1+
name: "minimax-m2.5-vllm-disagg-gb200-decode-2xdep2"
2+
3+
model:
4+
path: "minimax-m2.5-nvfp4"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp4"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "gb200"
16+
gpus_per_node: 4
17+
prefill_nodes: 1
18+
decode_nodes: 2
19+
prefill_workers: 1
20+
decode_workers: 2
21+
gpus_per_prefill: 1
22+
gpus_per_decode: 2
23+
spread_workers: true
24+
25+
frontend:
26+
type: dynamo
27+
enable_multiple_frontends: false
28+
29+
backend:
30+
type: vllm
31+
connector: null
32+
33+
prefill_environment:
34+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
35+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
36+
UCX_TLS: "cuda_copy,rc"
37+
38+
decode_environment:
39+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
40+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
41+
UCX_TLS: "cuda_copy,rc"
42+
43+
vllm_config:
44+
prefill:
45+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
46+
kv-cache-dtype: "fp8"
47+
trust-remote-code: true
48+
no-enable-prefix-caching: true
49+
max-model-len: 2048
50+
max-cudagraph-capture-size: 2048
51+
max-num-batched-tokens: 2048
52+
stream-interval: 128
53+
54+
decode:
55+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
56+
kv-cache-dtype: "fp8"
57+
data-parallel-size: 2
58+
data-parallel-rpc-port: 13345
59+
enable-expert-parallel: true
60+
no-enable-prefix-caching: true
61+
max-model-len: 2048
62+
max-cudagraph-capture-size: 2048
63+
max-num-batched-tokens: 2048
64+
max-num-seqs: 864
65+
gpu-memory-utilization: 0.90
66+
stream-interval: 128
67+
68+
benchmark:
69+
type: "sa-bench"
70+
isl: 1024
71+
osl: 1024
72+
concurrencies: "2048x4096x8192"
73+
random_range_ratio: 0.8
Lines changed: 73 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,73 @@
1+
name: "minimax-m2.5-vllm-disagg-gb200-decode-2p3xdep2-c6144"
2+
3+
model:
4+
path: "minimax-m2.5-nvfp4"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp4"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "gb200"
16+
gpus_per_node: 4
17+
prefill_nodes: 2
18+
decode_nodes: 3
19+
prefill_workers: 2
20+
decode_workers: 3
21+
gpus_per_prefill: 1
22+
gpus_per_decode: 2
23+
spread_workers: true
24+
25+
frontend:
26+
type: dynamo
27+
enable_multiple_frontends: false
28+
29+
backend:
30+
type: vllm
31+
connector: null
32+
33+
prefill_environment:
34+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
35+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
36+
UCX_TLS: "cuda_copy,rc"
37+
38+
decode_environment:
39+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
40+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
41+
UCX_TLS: "cuda_copy,rc"
42+
43+
vllm_config:
44+
prefill:
45+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
46+
kv-cache-dtype: "fp8"
47+
trust-remote-code: true
48+
no-enable-prefix-caching: true
49+
max-model-len: 2048
50+
max-cudagraph-capture-size: 2048
51+
max-num-batched-tokens: 2048
52+
stream-interval: 128
53+
54+
decode:
55+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
56+
kv-cache-dtype: "fp8"
57+
data-parallel-size: 2
58+
data-parallel-rpc-port: 13345
59+
enable-expert-parallel: true
60+
no-enable-prefix-caching: true
61+
max-model-len: 2048
62+
max-cudagraph-capture-size: 2048
63+
max-num-batched-tokens: 2048
64+
max-num-seqs: 864
65+
gpu-memory-utilization: 0.90
66+
stream-interval: 128
67+
68+
benchmark:
69+
type: "sa-bench"
70+
isl: 1024
71+
osl: 1024
72+
concurrencies: "6144"
73+
random_range_ratio: 0.8

0 commit comments

Comments
 (0)