Skip to content

Commit 66b7e04

Browse files
Merge remote-tracking branch 'inferencex/main' into merge-gb300-fp4-pr1641
# Conflicts:
#   perf-changelog.yaml
2 parents 45759a7 + 7d4063d commit 66b7e04

47 files changed

Lines changed: 3693 additions & 9 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/configs/nvidia-master.yaml

Lines changed: 601 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 72 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,72 @@
# Disaggregated prefill/decode vLLM recipe on B200 for model path
# "minimax-m2.5-nvfp4" at fp4 precision: 1 prefill worker and 2 decode
# workers, with decode running data-parallel-size 2 plus expert parallelism.
# NOTE(review): the key nesting below was reconstructed from a flattened diff
# view (gutter markers removed, indentation restored) — verify it against the
# recipe schema before relying on it.
name: "minimax-m2.5-vllm-disagg-b200-decode-2xdep2"

model:
  path: "minimax-m2.5-nvfp4"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp4"

dynamo:
  install: true
  # Quoted so the dev-build version is kept as a string.
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

resources:
  gpu_type: "b200"
  gpus_per_node: 8
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 2
  # presumably GPUs per worker; gpus_per_decode equals the decode
  # data-parallel-size (2) configured further down — TODO confirm.
  gpus_per_prefill: 1
  gpus_per_decode: 2

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  # Environment values are quoted so they stay strings.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  # Keys here look like vLLM engine flags — presumably forwarded as
  # `--<key>` arguments; confirm with the launcher.
  vllm_config:
    prefill:
      # JSON kept in single quotes so the inner double quotes need no escaping.
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      trust-remote-code: true
      no-enable-prefix-caching: true
      # 2048 equals benchmark isl + osl (1024 + 1024).
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      stream-interval: 128

    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      data-parallel-size: 2
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      max-num-seqs: 864
      gpu-memory-utilization: 0.90
      stream-interval: 128

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # presumably an "x"-separated sweep of concurrency levels — confirm with
  # the benchmark runner.
  concurrencies: "2048x4096x8192"
  random_range_ratio: 0.8
Lines changed: 72 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,72 @@
# Disaggregated prefill/decode vLLM recipe on B200 for model path
# "minimax-m2.5-nvfp4" at fp4 precision: 2 prefill workers and 3 decode
# workers (decode uses data-parallel-size 2 plus expert parallelism),
# benchmarked at a single concurrency value of 6144.
# NOTE(review): the key nesting below was reconstructed from a flattened diff
# view (gutter markers removed, indentation restored) — verify it against the
# recipe schema before relying on it.
name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2-c6144"

model:
  path: "minimax-m2.5-nvfp4"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp4"

dynamo:
  install: true
  # Quoted so the dev-build version is kept as a string.
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

resources:
  gpu_type: "b200"
  gpus_per_node: 8
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 2
  decode_workers: 3
  # presumably GPUs per worker; gpus_per_decode equals the decode
  # data-parallel-size (2) configured further down — TODO confirm.
  gpus_per_prefill: 1
  gpus_per_decode: 2

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  # Environment values are quoted so they stay strings.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  # Keys here look like vLLM engine flags — presumably forwarded as
  # `--<key>` arguments; confirm with the launcher.
  vllm_config:
    prefill:
      # JSON kept in single quotes so the inner double quotes need no escaping.
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      trust-remote-code: true
      no-enable-prefix-caching: true
      # 2048 equals benchmark isl + osl (1024 + 1024).
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      stream-interval: 128

    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      data-parallel-size: 2
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      max-num-seqs: 864
      gpu-memory-utilization: 0.90
      stream-interval: 128

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # Single concurrency point; kept as a quoted string like the multi-value
  # "AxBxC" form used by sibling recipes.
  concurrencies: "6144"
  random_range_ratio: 0.8
Lines changed: 72 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,72 @@
# Disaggregated prefill/decode vLLM recipe on B200 for model path
# "minimax-m2.5-nvfp4" at fp4 precision: 2 prefill workers and 3 decode
# workers (decode uses data-parallel-size 2 plus expert parallelism),
# benchmarked at a single concurrency value of 4096.
# NOTE(review): the key nesting below was reconstructed from a flattened diff
# view (gutter markers removed, indentation restored) — verify it against the
# recipe schema before relying on it.
name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2"

model:
  path: "minimax-m2.5-nvfp4"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp4"

dynamo:
  install: true
  # Quoted so the dev-build version is kept as a string.
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

resources:
  gpu_type: "b200"
  gpus_per_node: 8
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 2
  decode_workers: 3
  # presumably GPUs per worker; gpus_per_decode equals the decode
  # data-parallel-size (2) configured further down — TODO confirm.
  gpus_per_prefill: 1
  gpus_per_decode: 2

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  # Environment values are quoted so they stay strings.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"
    UCX_TLS: "cuda_copy,rc"

  # Keys here look like vLLM engine flags — presumably forwarded as
  # `--<key>` arguments; confirm with the launcher.
  vllm_config:
    prefill:
      # JSON kept in single quotes so the inner double quotes need no escaping.
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      trust-remote-code: true
      no-enable-prefix-caching: true
      # 2048 equals benchmark isl + osl (1024 + 1024).
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      stream-interval: 128

    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      data-parallel-size: 2
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      max-num-seqs: 864
      gpu-memory-utilization: 0.90
      stream-interval: 128

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # Single concurrency point; kept as a quoted string like the multi-value
  # "AxBxC" form used by sibling recipes.
  concurrencies: "4096"
  random_range_ratio: 0.8
Lines changed: 74 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,74 @@
# Disaggregated prefill/decode vLLM recipe on B200 for model path
# "minimax-m2.5-nvfp4" at fp4 precision: 3 prefill workers and 2 decode
# workers, with decode running data-parallel-size 4 plus expert parallelism.
# NOTE(review): the key nesting below was reconstructed from a flattened diff
# view (gutter markers removed, indentation restored) — verify it against the
# recipe schema before relying on it.
name: "minimax-m2.5-vllm-disagg-b200-1k1k-3p2xdep4"

# Rate-matched dep4 at 1k/1k.
# Measured X_dep4/P = 56.8k / 38k = 1.49; 3P:2D ratio = 1.5 ✓


model:
  path: "minimax-m2.5-nvfp4"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp4"

dynamo:
  install: true
  # Quoted so the dev-build version is kept as a string.
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

resources:
  gpu_type: "b200"
  gpus_per_node: 8
  prefill_nodes: 1
  decode_nodes: 1
  # 3 prefill : 2 decode workers, i.e. the 1.5 ratio cited in the
  # rate-matching note above.
  prefill_workers: 3
  decode_workers: 2
  # presumably GPUs per worker; gpus_per_decode equals the decode
  # data-parallel-size (4) configured further down — TODO confirm.
  gpus_per_prefill: 1
  gpus_per_decode: 4

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  # Environment values are quoted so they stay strings.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  # Keys here look like vLLM engine flags — presumably forwarded as
  # `--<key>` arguments; confirm with the launcher.
  vllm_config:
    prefill:
      # JSON kept in single quotes so the inner double quotes need no escaping.
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      trust-remote-code: true
      no-enable-prefix-caching: true
      # 2048 equals benchmark isl + osl (1024 + 1024).
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      stream-interval: 32

    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      data-parallel-size: 4
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      max-num-seqs: 864
      gpu-memory-utilization: 0.90
      stream-interval: 32

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # presumably an "x"-separated sweep of concurrency levels — confirm with
  # the benchmark runner.
  concurrencies: "1024x2048"
  random_range_ratio: 0.8
Lines changed: 70 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,70 @@
# Disaggregated prefill/decode vLLM recipe on B200 for model path
# "minimax-m2.5-nvfp4" at fp4 precision: 2 prefill workers and 1 decode
# worker, with decode running data-parallel-size 8 plus expert parallelism.
# NOTE(review): the key nesting below was reconstructed from a flattened diff
# view (gutter markers removed, indentation restored) — verify it against the
# recipe schema before relying on it.
name: "minimax-m2.5-vllm-disagg-b200-decode-2p1xdep8"

model:
  path: "minimax-m2.5-nvfp4"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp4"

dynamo:
  install: true
  # Quoted so the dev-build version is kept as a string.
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

resources:
  gpu_type: "b200"
  gpus_per_node: 8
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 2
  decode_workers: 1
  # presumably GPUs per worker; gpus_per_decode equals both gpus_per_node (8)
  # and the decode data-parallel-size (8) configured below — TODO confirm.
  gpus_per_prefill: 1
  gpus_per_decode: 8

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

  # Environment values are quoted so they stay strings.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLOAT32_MATMUL_PRECISION: "high"

  # Keys here look like vLLM engine flags — presumably forwarded as
  # `--<key>` arguments; confirm with the launcher.
  vllm_config:
    prefill:
      # JSON kept in single quotes so the inner double quotes need no escaping.
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      trust-remote-code: true
      no-enable-prefix-caching: true
      # 2048 equals benchmark isl + osl (1024 + 1024).
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      stream-interval: 128

    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      data-parallel-size: 8
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      no-enable-prefix-caching: true
      max-model-len: 2048
      max-cudagraph-capture-size: 2048
      max-num-batched-tokens: 2048
      max-num-seqs: 864
      gpu-memory-utilization: 0.90
      stream-interval: 128

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # presumably an "x"-separated sweep of concurrency levels — confirm with
  # the benchmark runner.
  concurrencies: "1024x2048x4096"
  random_range_ratio: 0.8

0 commit comments

Comments
 (0)