Skip to content

Commit 92c0fcb

Browse files
[NV] Add MiniMax-M2.5 FP8 B200 Dynamo vLLM recipes (#1649)
* Add MiniMax-M2.5 FP8 B200 Dynamo vLLM recipes
* Update MiniMax-M2.5 FP8 B200 PR link
* Fix B200 FP8 1k1k recipe input length

---------

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
1 parent 96f1598 commit 92c0fcb

14 files changed

Lines changed: 948 additions & 6 deletions

.github/configs/nvidia-master.yaml

Lines changed: 149 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9906,6 +9906,155 @@ qwen3.5-fp8-h100-sglang-agentic:
9906 9906
- { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] }
9907 9907
- { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }
9908 9908

9909+
9910+
minimaxm2.5-fp8-b200-dynamo-vllm:
9911+
image: vllm/vllm-openai:v0.20.1
9912+
model: MiniMaxAI/MiniMax-M2.5
9913+
model-prefix: minimaxm2.5
9914+
runner: b200-multinode
9915+
precision: fp8
9916+
framework: dynamo-vllm
9917+
multinode: true
9918+
disagg: true
9919+
scenarios:
9920+
fixed-seq-len:
9921+
- isl: 1024
9922+
osl: 1024
9923+
search-space:
9924+
- conc-list: [32, 64]
9925+
prefill:
9926+
num-worker: 1
9927+
tp: 2
9928+
ep: 2
9929+
dp-attn: true
9930+
additional-settings:
9931+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p1d-tp4ep.yaml"
9932+
decode:
9933+
num-worker: 1
9934+
tp: 4
9935+
ep: 4
9936+
dp-attn: false
9937+
- conc-list: [128]
9938+
prefill:
9939+
num-worker: 1
9940+
tp: 2
9941+
ep: 2
9942+
dp-attn: true
9943+
additional-settings:
9944+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/1k1k/tp4ep.yaml"
9945+
decode:
9946+
num-worker: 1
9947+
tp: 4
9948+
ep: 4
9949+
dp-attn: false
9950+
- conc-list: [1024]
9951+
prefill:
9952+
num-worker: 1
9953+
tp: 2
9954+
ep: 2
9955+
dp-attn: true
9956+
additional-settings:
9957+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p3d-tp4ep.yaml"
9958+
decode:
9959+
num-worker: 3
9960+
tp: 4
9961+
ep: 4
9962+
dp-attn: false
9963+
- conc-list: [512, 1024]
9964+
prefill:
9965+
num-worker: 2
9966+
tp: 2
9967+
ep: 2
9968+
dp-attn: true
9969+
additional-settings:
9970+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p1d-dep8.yaml"
9971+
decode:
9972+
num-worker: 1
9973+
tp: 8
9974+
ep: 8
9975+
dp-attn: true
9976+
- conc-list: [512]
9977+
prefill:
9978+
num-worker: 4
9979+
tp: 2
9980+
ep: 2
9981+
dp-attn: true
9982+
additional-settings:
9983+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/1k1k/dep8.yaml"
9984+
decode:
9985+
num-worker: 1
9986+
tp: 8
9987+
ep: 8
9988+
dp-attn: true
9989+
- conc-list: [4096]
9990+
prefill:
9991+
num-worker: 1
9992+
tp: 2
9993+
ep: 2
9994+
dp-attn: true
9995+
additional-settings:
9996+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p4d-dep2-hi-conc.yaml"
9997+
decode:
9998+
num-worker: 4
9999+
tp: 2
10000+
ep: 2
10001+
dp-attn: true
10002+
- conc-list: [4096, 8192]
10003+
prefill:
10004+
num-worker: 2
10005+
tp: 2
10006+
ep: 2
10007+
dp-attn: true
10008+
additional-settings:
10009+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p3d-dep4-hi-conc.yaml"
10010+
decode:
10011+
num-worker: 3
10012+
tp: 4
10013+
ep: 4
10014+
dp-attn: true
10015+
- isl: 8192
10016+
osl: 1024
10017+
search-space:
10018+
- conc-list: [16, 32, 64, 128]
10019+
prefill:
10020+
num-worker: 1
10021+
tp: 2
10022+
ep: 2
10023+
dp-attn: true
10024+
additional-settings:
10025+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep.yaml"
10026+
decode:
10027+
num-worker: 1
10028+
tp: 4
10029+
ep: 4
10030+
dp-attn: false
10031+
- conc-list: [256, 512]
10032+
prefill:
10033+
num-worker: 1
10034+
tp: 2
10035+
ep: 2
10036+
dp-attn: true
10037+
additional-settings:
10038+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep-hi-conc.yaml"
10039+
decode:
10040+
num-worker: 1
10041+
tp: 4
10042+
ep: 4
10043+
dp-attn: false
10044+
- conc-list: [1024, 2048]
10045+
prefill:
10046+
num-worker: 3
10047+
tp: 2
10048+
ep: 2
10049+
dp-attn: true
10050+
additional-settings:
10051+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-3p2d-dep4.yaml"
10052+
decode:
10053+
num-worker: 2
10054+
tp: 4
10055+
ep: 4
10056+
dp-attn: true
10057+
9909 10058
minimaxm2.5-fp4-gb200-dynamo-vllm:
9910 10059
image: vllm/vllm-openai:v0.20.1
9911 10060
model: nvidia/MiniMax-M2.5-NVFP4
Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,75 @@
1+
name: "minimax-m2.5-vllm-disagg-b200-fp8-decode-focus-dep8"
2+
3+
# Over-prefilled (4P:1D-dep8) at 1k/1k to measure X_dep8_fp8_b200.
4+
# 4P × 48k = 192k vs dep8 X ≈ 90k → 2.1× buffer.
5+
6+
model:
7+
path: "minimax-m2.5-fp8"
8+
container: "vllm/vllm-openai:v0.20.1"
9+
precision: "fp8"
10+
11+
dynamo:
12+
install: true
13+
wheel: "1.2.0.dev20260526"
14+
15+
setup_script: install-deps.sh
16+
17+
resources:
18+
gpu_type: "b200"
19+
gpus_per_node: 8
20+
prefill_nodes: 2
21+
decode_nodes: 2
22+
prefill_workers: 4
23+
decode_workers: 1
24+
gpus_per_prefill: 2
25+
gpus_per_decode: 8
26+
27+
frontend:
28+
type: dynamo
29+
enable_multiple_frontends: false
30+
31+
backend:
32+
type: vllm
33+
connector: null
34+
35+
prefill_environment:
36+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
37+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
38+
39+
decode_environment:
40+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
41+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
42+
43+
vllm_config:
44+
prefill:
45+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
46+
kv-cache-dtype: "fp8"
47+
tensor-parallel-size: 1
48+
pipeline-parallel-size: 1
49+
data-parallel-size: 2
50+
data-parallel-rpc-port: 13346
51+
enable-expert-parallel: true
52+
safetensors-load-strategy: "prefetch"
53+
trust-remote-code: true
54+
no-enable-prefix-caching: true
55+
stream-interval: 32
56+
57+
decode:
58+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
59+
kv-cache-dtype: "fp8"
60+
tensor-parallel-size: 1
61+
pipeline-parallel-size: 1
62+
data-parallel-size: 8
63+
data-parallel-rpc-port: 13345
64+
enable-expert-parallel: true
65+
safetensors-load-strategy: "prefetch"
66+
trust-remote-code: true
67+
no-enable-prefix-caching: true
68+
stream-interval: 32
69+
70+
benchmark:
71+
type: "sa-bench"
72+
isl: 1024
73+
osl: 1024
74+
concurrencies: "512"
75+
random_range_ratio: 0.8
Lines changed: 69 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,69 @@
1+
name: "minimax-m2.5-vllm-disagg-b200-1p1d-tp4ep"
2+
3+
model:
4+
path: "minimax-m2.5-fp8"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp8"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "b200"
16+
gpus_per_node: 8
17+
prefill_nodes: 1
18+
decode_nodes: 1
19+
prefill_workers: 1
20+
decode_workers: 1
21+
gpus_per_prefill: 2
22+
gpus_per_decode: 4
23+
24+
frontend:
25+
type: dynamo
26+
enable_multiple_frontends: false
27+
28+
backend:
29+
type: vllm
30+
connector: null
31+
32+
prefill_environment:
33+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
34+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
35+
36+
decode_environment:
37+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
38+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
39+
40+
vllm_config:
41+
prefill:
42+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
43+
kv-cache-dtype: "fp8"
44+
tensor-parallel-size: 1
45+
pipeline-parallel-size: 1
46+
data-parallel-size: 2
47+
data-parallel-rpc-port: 13346
48+
enable-expert-parallel: true
49+
safetensors-load-strategy: "prefetch"
50+
trust-remote-code: true
51+
no-enable-prefix-caching: true
52+
stream-interval: 32
53+
54+
decode:
55+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
56+
kv-cache-dtype: "fp8"
57+
tensor-parallel-size: 4
58+
pipeline-parallel-size: 1
59+
enable-expert-parallel: true
60+
safetensors-load-strategy: "prefetch"
61+
trust-remote-code: true
62+
no-enable-prefix-caching: true
63+
stream-interval: 32
64+
65+
benchmark:
66+
type: "sa-bench"
67+
isl: 1024
68+
osl: 1024
69+
concurrencies: "32x64"
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
name: "minimax-m2.5-vllm-disagg-b200-1p3d-tp4ep"
2+
3+
# Rate-matched tp4ep for FP8 B200 1k/1k.
4+
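# X_tp4ep_fp8_gb200 = 17.9k tok/s; P_per_worker = 48k; ideal X/P = 0.37; 1P:3D = 0.33 ✓ (NOTE(review): rates are labelled gb200 but this recipe sets gpu_type "b200" — confirm they were re-measured on B200)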
5+
6+
model:
7+
path: "minimax-m2.5-fp8"
8+
container: "vllm/vllm-openai:v0.20.1"
9+
precision: "fp8"
10+
11+
dynamo:
12+
install: true
13+
wheel: "1.2.0.dev20260526"
14+
15+
setup_script: install-deps.sh
16+
17+
resources:
18+
gpu_type: "b200"
19+
gpus_per_node: 8
20+
prefill_nodes: 1
21+
decode_nodes: 3
22+
prefill_workers: 1
23+
decode_workers: 3
24+
gpus_per_prefill: 2
25+
gpus_per_decode: 4
26+
27+
frontend:
28+
type: dynamo
29+
enable_multiple_frontends: false
30+
31+
backend:
32+
type: vllm
33+
connector: null
34+
35+
prefill_environment:
36+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
37+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
38+
39+
decode_environment:
40+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
41+
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
42+
43+
vllm_config:
44+
prefill:
45+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
46+
kv-cache-dtype: "fp8"
47+
tensor-parallel-size: 1
48+
pipeline-parallel-size: 1
49+
data-parallel-size: 2
50+
data-parallel-rpc-port: 13346
51+
enable-expert-parallel: true
52+
safetensors-load-strategy: "prefetch"
53+
trust-remote-code: true
54+
no-enable-prefix-caching: true
55+
stream-interval: 32
56+
57+
decode:
58+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
59+
kv-cache-dtype: "fp8"
60+
tensor-parallel-size: 4
61+
pipeline-parallel-size: 1
62+
enable-expert-parallel: true
63+
safetensors-load-strategy: "prefetch"
64+
trust-remote-code: true
65+
no-enable-prefix-caching: true
66+
stream-interval: 32
67+
68+
benchmark:
69+
type: "sa-bench"
70+
isl: 1024
71+
osl: 1024
72+
concurrencies: "1024"

0 commit comments

Comments
 (0)