Skip to content

Commit 90e5193

Browse files
Add MiniMax-M2.5 FP8 GB200 Dynamo vLLM recipes
1 parent 1b23499 commit 90e5193

12 files changed

Lines changed: 881 additions & 6 deletions

.github/configs/nvidia-master.yaml

Lines changed: 135 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9905,3 +9905,138 @@ qwen3.5-fp8-h100-sglang-agentic:
99059905
search-space:
99069906
- { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] }
99079907
- { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }
9908+
9909+
minimaxm2.5-fp8-gb200-dynamo-vllm:
9910+
image: vllm/vllm-openai:v0.20.1
9911+
model: MiniMaxAI/MiniMax-M2.5
9912+
model-prefix: minimaxm2.5
9913+
runner: gb200
9914+
precision: fp8
9915+
framework: dynamo-vllm
9916+
multinode: true
9917+
disagg: true
9918+
scenarios:
9919+
fixed-seq-len:
9920+
- isl: 1024
9921+
osl: 1024
9922+
search-space:
9923+
- conc-list: [1, 4, 8, 16, 32, 64]
9924+
prefill:
9925+
num-worker: 1
9926+
tp: 2
9927+
ep: 2
9928+
dp-attn: true
9929+
additional-settings:
9930+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml"
9931+
decode:
9932+
num-worker: 1
9933+
tp: 4
9934+
ep: 1
9935+
dp-attn: false
9936+
- conc-list: [2, 32, 64, 128, 256, 512]
9937+
prefill:
9938+
num-worker: 1
9939+
tp: 2
9940+
ep: 2
9941+
dp-attn: true
9942+
additional-settings:
9943+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml"
9944+
decode:
9945+
num-worker: 2
9946+
tp: 4
9947+
ep: 1
9948+
dp-attn: false
9949+
- conc-list: [1024]
9950+
prefill:
9951+
num-worker: 1
9952+
tp: 2
9953+
ep: 2
9954+
dp-attn: true
9955+
additional-settings:
9956+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml"
9957+
decode:
9958+
num-worker: 3
9959+
tp: 4
9960+
ep: 4
9961+
dp-attn: false
9962+
- conc-list: [512, 1024]
9963+
prefill:
9964+
num-worker: 2
9965+
tp: 2
9966+
ep: 2
9967+
dp-attn: true
9968+
additional-settings:
9969+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml"
9970+
decode:
9971+
num-worker: 1
9972+
tp: 8
9973+
ep: 8
9974+
dp-attn: true
9975+
- conc-list: [4096]
9976+
prefill:
9977+
num-worker: 1
9978+
tp: 2
9979+
ep: 2
9980+
dp-attn: true
9981+
additional-settings:
9982+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml"
9983+
decode:
9984+
num-worker: 4
9985+
tp: 2
9986+
ep: 2
9987+
dp-attn: true
9988+
- conc-list: [4096, 8192]
9989+
prefill:
9990+
num-worker: 2
9991+
tp: 2
9992+
ep: 2
9993+
dp-attn: true
9994+
additional-settings:
9995+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml"
9996+
decode:
9997+
num-worker: 3
9998+
tp: 4
9999+
ep: 4
10000+
dp-attn: true
10001+
- isl: 8192
10002+
osl: 1024
10003+
search-space:
10004+
- conc-list: [1, 4, 8, 16, 32, 64, 128]
10005+
prefill:
10006+
num-worker: 1
10007+
tp: 2
10008+
ep: 2
10009+
dp-attn: true
10010+
additional-settings:
10011+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml"
10012+
decode:
10013+
num-worker: 1
10014+
tp: 4
10015+
ep: 1
10016+
dp-attn: false
10017+
- conc-list: [256, 512]
10018+
prefill:
10019+
num-worker: 1
10020+
tp: 2
10021+
ep: 2
10022+
dp-attn: true
10023+
additional-settings:
10024+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml"
10025+
decode:
10026+
num-worker: 1
10027+
tp: 4
10028+
ep: 4
10029+
dp-attn: false
10030+
- conc-list: [1024, 2048, 4096]
10031+
prefill:
10032+
num-worker: 3
10033+
tp: 2
10034+
ep: 2
10035+
dp-attn: true
10036+
additional-settings:
10037+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml"
10038+
decode:
10039+
num-worker: 2
10040+
tp: 4
10041+
ep: 4
10042+
dp-attn: true
Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,67 @@
1+
name: "minimax-m2.5-vllm-disagg-gb200-1p1d-tp4"
2+
3+
model:
4+
path: "minimax-m2.5-fp8"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp8"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "gb200"
16+
gpus_per_node: 4
17+
prefill_nodes: 1
18+
decode_nodes: 1
19+
prefill_workers: 1
20+
decode_workers: 1
21+
gpus_per_prefill: 2
22+
gpus_per_decode: 4
23+
24+
frontend:
25+
type: dynamo
26+
enable_multiple_frontends: false
27+
28+
backend:
29+
type: vllm
30+
connector: null
31+
32+
prefill_environment:
33+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
34+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
35+
36+
decode_environment:
37+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
38+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
39+
40+
vllm_config:
41+
prefill:
42+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
43+
kv-cache-dtype: "fp8"
44+
tensor-parallel-size: 1
45+
pipeline-parallel-size: 1
46+
data-parallel-size: 2
47+
data-parallel-rpc-port: 13346
48+
enable-expert-parallel: true
49+
safetensors-load-strategy: "prefetch"
50+
trust-remote-code: true
51+
no-enable-prefix-caching: true
52+
stream-interval: 32
53+
54+
decode:
55+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
56+
kv-cache-dtype: "fp8"
57+
tensor-parallel-size: 4
58+
safetensors-load-strategy: "prefetch"
59+
trust-remote-code: true
60+
no-enable-prefix-caching: true
61+
stream-interval: 32
62+
63+
benchmark:
64+
type: "sa-bench"
65+
isl: 1024
66+
osl: 1024
67+
concurrencies: "1x4x8x16x32x64"
Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,67 @@
1+
name: "minimax-m2.5-vllm-disagg-gb200-1p2d-tp4"
2+
3+
model:
4+
path: "minimax-m2.5-fp8"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp8"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "gb200"
16+
gpus_per_node: 4
17+
prefill_nodes: 1
18+
decode_nodes: 2
19+
prefill_workers: 1
20+
decode_workers: 2
21+
gpus_per_prefill: 2
22+
gpus_per_decode: 4
23+
24+
frontend:
25+
type: dynamo
26+
enable_multiple_frontends: false
27+
28+
backend:
29+
type: vllm
30+
connector: null
31+
32+
prefill_environment:
33+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
34+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
35+
36+
decode_environment:
37+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
38+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
39+
40+
vllm_config:
41+
prefill:
42+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
43+
kv-cache-dtype: "fp8"
44+
tensor-parallel-size: 1
45+
pipeline-parallel-size: 1
46+
data-parallel-size: 2
47+
data-parallel-rpc-port: 13346
48+
enable-expert-parallel: true
49+
safetensors-load-strategy: "prefetch"
50+
trust-remote-code: true
51+
no-enable-prefix-caching: true
52+
stream-interval: 32
53+
54+
decode:
55+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
56+
kv-cache-dtype: "fp8"
57+
tensor-parallel-size: 4
58+
safetensors-load-strategy: "prefetch"
59+
trust-remote-code: true
60+
no-enable-prefix-caching: true
61+
stream-interval: 32
62+
63+
benchmark:
64+
type: "sa-bench"
65+
isl: 1024
66+
osl: 1024
67+
concurrencies: "2x32x64x128x256x512"
Lines changed: 72 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,72 @@
1+
name: "minimax-m2.5-vllm-disagg-gb200-1p3d-tp4ep"
2+
3+
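# Rate-matched prefill:decode worker split for the tp4ep decode configuration (FP8, GB200, ISL/OSL 1k/1k).
4+
# X_tp4ep_fp8_gb200 (throughput of one tp4ep decode worker) = 17.9k tok/s; P_per_worker (throughput of one prefill worker) = 48k tok/s; ideal prefill:decode worker ratio X/P = 0.37; 1P:3D = 1/3 ≈ 0.33 is the closest whole-worker split ✓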
5+
6+
model:
7+
path: "minimax-m2.5-fp8"
8+
container: "vllm/vllm-openai:v0.20.1"
9+
precision: "fp8"
10+
11+
dynamo:
12+
install: true
13+
wheel: "1.2.0.dev20260526"
14+
15+
setup_script: install-deps.sh
16+
17+
resources:
18+
gpu_type: "gb200"
19+
gpus_per_node: 4
20+
prefill_nodes: 1
21+
decode_nodes: 3
22+
prefill_workers: 1
23+
decode_workers: 3
24+
gpus_per_prefill: 2
25+
gpus_per_decode: 4
26+
27+
frontend:
28+
type: dynamo
29+
enable_multiple_frontends: false
30+
31+
backend:
32+
type: vllm
33+
connector: null
34+
35+
prefill_environment:
36+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
37+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
38+
39+
decode_environment:
40+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
41+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
42+
43+
vllm_config:
44+
prefill:
45+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
46+
kv-cache-dtype: "fp8"
47+
tensor-parallel-size: 1
48+
pipeline-parallel-size: 1
49+
data-parallel-size: 2
50+
data-parallel-rpc-port: 13346
51+
enable-expert-parallel: true
52+
safetensors-load-strategy: "prefetch"
53+
trust-remote-code: true
54+
no-enable-prefix-caching: true
55+
stream-interval: 32
56+
57+
decode:
58+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
59+
kv-cache-dtype: "fp8"
60+
tensor-parallel-size: 4
61+
pipeline-parallel-size: 1
62+
enable-expert-parallel: true
63+
safetensors-load-strategy: "prefetch"
64+
trust-remote-code: true
65+
no-enable-prefix-caching: true
66+
stream-interval: 32
67+
68+
benchmark:
69+
type: "sa-bench"
70+
isl: 1024
71+
osl: 1024
72+
concurrencies: "1024"

0 commit comments

Comments
 (0)