Skip to content

Commit 5a795b1

Browse files
Add MiniMax-M2.5 FP4 GB300 Dynamo vLLM recipes
1 parent 1b23499 commit 5a795b1

15 files changed

Lines changed: 1029 additions & 1 deletion

File tree

.github/configs/nvidia-master.yaml

Lines changed: 174 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9261,6 +9261,180 @@ qwen3.5-fp8-h100-sglang-mtp:
9261 9261
search-space:
9262 9262
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
9263 9263

9264+
minimaxm2.5-fp4-gb300-dynamo-vllm:
9265+
image: vllm/vllm-openai:v0.20.1
9266+
model: nvidia/MiniMax-M2.5-NVFP4
9267+
model-prefix: minimaxm2.5
9268+
runner: gb300
9269+
precision: fp4
9270+
framework: dynamo-vllm
9271+
multinode: true
9272+
disagg: true
9273+
scenarios:
9274+
fixed-seq-len:
9275+
- isl: 1024
9276+
osl: 1024
9277+
search-space:
9278+
- conc-list: [2, 4, 16]
9279+
prefill:
9280+
num-worker: 1
9281+
tp: 1
9282+
ep: 1
9283+
dp-attn: false
9284+
additional-settings:
9285+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml"
9286+
decode:
9287+
num-worker: 1
9288+
tp: 4
9289+
ep: 1
9290+
dp-attn: false
9291+
- conc-list: [4, 8, 16, 64]
9292+
prefill:
9293+
num-worker: 1
9294+
tp: 1
9295+
ep: 1
9296+
dp-attn: false
9297+
additional-settings:
9298+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml"
9299+
decode:
9300+
num-worker: 2
9301+
tp: 4
9302+
ep: 1
9303+
dp-attn: false
9304+
- conc-list: [32, 64, 128]
9305+
prefill:
9306+
num-worker: 1
9307+
tp: 1
9308+
ep: 1
9309+
dp-attn: false
9310+
additional-settings:
9311+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml"
9312+
decode:
9313+
num-worker: 1
9314+
tp: 4
9315+
ep: 4
9316+
dp-attn: false
9317+
- conc-list: [64, 128, 256, 512, 1024]
9318+
prefill:
9319+
num-worker: 1
9320+
tp: 1
9321+
ep: 1
9322+
dp-attn: false
9323+
additional-settings:
9324+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml"
9325+
decode:
9326+
num-worker: 3
9327+
tp: 4
9328+
ep: 4
9329+
dp-attn: false
9330+
- conc-list: [2048]
9331+
prefill:
9332+
num-worker: 2
9333+
tp: 1
9334+
ep: 1
9335+
dp-attn: false
9336+
additional-settings:
9337+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml"
9338+
decode:
9339+
num-worker: 3
9340+
tp: 2
9341+
ep: 2
9342+
dp-attn: true
9343+
- conc-list: [6144, 8192]
9344+
prefill:
9345+
num-worker: 2
9346+
tp: 1
9347+
ep: 1
9348+
dp-attn: false
9349+
additional-settings:
9350+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml"
9351+
decode:
9352+
num-worker: 3
9353+
tp: 2
9354+
ep: 2
9355+
dp-attn: true
9356+
- conc-list: [1024, 2048, 4096]
9357+
prefill:
9358+
num-worker: 2
9359+
tp: 1
9360+
ep: 1
9361+
dp-attn: false
9362+
additional-settings:
9363+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml"
9364+
decode:
9365+
num-worker: 1
9366+
tp: 8
9367+
ep: 8
9368+
dp-attn: true
9369+
- isl: 8192
9370+
osl: 1024
9371+
search-space:
9372+
- conc-list: [2, 4, 8, 16]
9373+
prefill:
9374+
num-worker: 1
9375+
tp: 1
9376+
ep: 1
9377+
dp-attn: false
9378+
additional-settings:
9379+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml"
9380+
decode:
9381+
num-worker: 1
9382+
tp: 4
9383+
ep: 1
9384+
dp-attn: false
9385+
- conc-list: [32, 64, 128, 256]
9386+
prefill:
9387+
num-worker: 1
9388+
tp: 1
9389+
ep: 1
9390+
dp-attn: false
9391+
additional-settings:
9392+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml"
9393+
decode:
9394+
num-worker: 1
9395+
tp: 4
9396+
ep: 4
9397+
dp-attn: false
9398+
- conc-list: [64, 128]
9399+
prefill:
9400+
num-worker: 2
9401+
tp: 1
9402+
ep: 1
9403+
dp-attn: false
9404+
additional-settings:
9405+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml"
9406+
decode:
9407+
num-worker: 1
9408+
tp: 4
9409+
ep: 4
9410+
dp-attn: false
9411+
- conc-list: [256]
9412+
prefill:
9413+
num-worker: 4
9414+
tp: 1
9415+
ep: 1
9416+
dp-attn: false
9417+
additional-settings:
9418+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml"
9419+
decode:
9420+
num-worker: 1
9421+
tp: 4
9422+
ep: 4
9423+
dp-attn: true
9424+
- conc-list: [1024, 2048]
9425+
prefill:
9426+
num-worker: 4
9427+
tp: 1
9428+
ep: 1
9429+
dp-attn: false
9430+
additional-settings:
9431+
- "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml"
9432+
decode:
9433+
num-worker: 1
9434+
tp: 8
9435+
ep: 8
9436+
dp-attn: true
9437+
9264 9438
glm5-fp4-gb300-dynamo-sglang:
9265 9439
image: lmsysorg/sglang:v0.5.11-cu130
9266 9440
model: nvidia/GLM-5-NVFP4
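recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml (file name missing from the capture; inferred from the matching CONFIG_FILE entry and the recipe name below — repository path prefix not shown)

Lines changed: 73 additions & 0 deletions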
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,73 @@
1+
name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2-c6144"
2+
3+
model:
4+
path: "minimax-m2.5-nvfp4"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp4"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "gb300"
16+
gpus_per_node: 4
17+
prefill_nodes: 2
18+
decode_nodes: 3
19+
prefill_workers: 2
20+
decode_workers: 3
21+
gpus_per_prefill: 1
22+
gpus_per_decode: 2
23+
spread_workers: true
24+
25+
frontend:
26+
type: dynamo
27+
enable_multiple_frontends: false
28+
29+
backend:
30+
type: vllm
31+
connector: null
32+
33+
prefill_environment:
34+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
35+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
36+
UCX_TLS: "cuda_copy,rc"
37+
38+
decode_environment:
39+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
40+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
41+
UCX_TLS: "cuda_copy,rc"
42+
43+
vllm_config:
44+
prefill:
45+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
46+
kv-cache-dtype: "fp8"
47+
trust-remote-code: true
48+
no-enable-prefix-caching: true
49+
max-model-len: 2048
50+
max-cudagraph-capture-size: 2048
51+
max-num-batched-tokens: 2048
52+
stream-interval: 128
53+
54+
decode:
55+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
56+
kv-cache-dtype: "fp8"
57+
data-parallel-size: 2
58+
data-parallel-rpc-port: 13345
59+
enable-expert-parallel: true
60+
no-enable-prefix-caching: true
61+
max-model-len: 2048
62+
max-cudagraph-capture-size: 2048
63+
max-num-batched-tokens: 2048
64+
max-num-seqs: 864
65+
gpu-memory-utilization: 0.90
66+
stream-interval: 128
67+
68+
benchmark:
69+
type: "sa-bench"
70+
isl: 1024
71+
osl: 1024
72+
concurrencies: "6144x8192"
73+
random_range_ratio: 0.8
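recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml (file name missing from the capture; inferred from the matching CONFIG_FILE entry and the recipe name below — repository path prefix not shown)

Lines changed: 73 additions & 0 deletions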
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,73 @@
1+
name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2"
2+
3+
model:
4+
path: "minimax-m2.5-nvfp4"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp4"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "gb300"
16+
gpus_per_node: 4
17+
prefill_nodes: 2
18+
decode_nodes: 3
19+
prefill_workers: 2
20+
decode_workers: 3
21+
gpus_per_prefill: 1
22+
gpus_per_decode: 2
23+
spread_workers: true
24+
25+
frontend:
26+
type: dynamo
27+
enable_multiple_frontends: false
28+
29+
backend:
30+
type: vllm
31+
connector: null
32+
33+
prefill_environment:
34+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
35+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
36+
UCX_TLS: "cuda_copy,rc"
37+
38+
decode_environment:
39+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
40+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
41+
UCX_TLS: "cuda_copy,rc"
42+
43+
vllm_config:
44+
prefill:
45+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
46+
kv-cache-dtype: "fp8"
47+
trust-remote-code: true
48+
no-enable-prefix-caching: true
49+
max-model-len: 2048
50+
max-cudagraph-capture-size: 2048
51+
max-num-batched-tokens: 2048
52+
stream-interval: 32
53+
54+
decode:
55+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
56+
kv-cache-dtype: "fp8"
57+
data-parallel-size: 2
58+
data-parallel-rpc-port: 13345
59+
enable-expert-parallel: true
60+
no-enable-prefix-caching: true
61+
max-model-len: 2048
62+
max-cudagraph-capture-size: 2048
63+
max-num-batched-tokens: 2048
64+
max-num-seqs: 864
65+
gpu-memory-utilization: 0.90
66+
stream-interval: 32
67+
68+
benchmark:
69+
type: "sa-bench"
70+
isl: 1024
71+
osl: 1024
72+
concurrencies: "2048"
73+
random_range_ratio: 0.8
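recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml (file name missing from the capture; inferred from the matching CONFIG_FILE entry and the recipe name below — repository path prefix not shown)

Lines changed: 70 additions & 0 deletions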
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,70 @@
1+
name: "minimax-m2.5-vllm-disagg-gb300-decode-2p1xdep8"
2+
3+
model:
4+
path: "minimax-m2.5-nvfp4"
5+
container: "vllm/vllm-openai:v0.20.1"
6+
precision: "fp4"
7+
8+
dynamo:
9+
install: true
10+
wheel: "1.2.0.dev20260526"
11+
12+
setup_script: install-deps.sh
13+
14+
resources:
15+
gpu_type: "gb300"
16+
gpus_per_node: 4
17+
prefill_nodes: 1
18+
decode_nodes: 2
19+
prefill_workers: 2
20+
decode_workers: 1
21+
gpus_per_prefill: 1
22+
gpus_per_decode: 8
23+
24+
frontend:
25+
type: dynamo
26+
enable_multiple_frontends: false
27+
28+
backend:
29+
type: vllm
30+
connector: null
31+
32+
prefill_environment:
33+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
34+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
35+
36+
decode_environment:
37+
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
38+
VLLM_FLOAT32_MATMUL_PRECISION: "high"
39+
40+
vllm_config:
41+
prefill:
42+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
43+
kv-cache-dtype: "fp8"
44+
trust-remote-code: true
45+
no-enable-prefix-caching: true
46+
max-model-len: 2048
47+
max-cudagraph-capture-size: 2048
48+
max-num-batched-tokens: 2048
49+
stream-interval: 128
50+
51+
decode:
52+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
53+
kv-cache-dtype: "fp8"
54+
data-parallel-size: 8
55+
data-parallel-rpc-port: 13345
56+
enable-expert-parallel: true
57+
no-enable-prefix-caching: true
58+
max-model-len: 2048
59+
max-cudagraph-capture-size: 2048
60+
max-num-batched-tokens: 2048
61+
max-num-seqs: 864
62+
gpu-memory-utilization: 0.90
63+
stream-interval: 128
64+
65+
benchmark:
66+
type: "sa-bench"
67+
isl: 1024
68+
osl: 1024
69+
concurrencies: "1024x2048x4096"
70+
random_range_ratio: 0.8

0 commit comments

Comments
 (0)