SemiAnalysisAI
diff --git a/‎.github/configs/nvidia-master.yaml‎
Lines changed: 601 additions & 0 deletions b/‎.github/configs/nvidia-master.yaml‎
Lines changed: 601 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml‎
Lines changed: 72 additions & 0 deletions b/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml‎
Lines changed: 72 additions & 0 deletions b/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml‎
Lines changed: 72 additions & 0 deletions b/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml‎
Lines changed: 74 additions & 0 deletions b/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml‎
Lines changed: 70 additions & 0 deletions b/‎benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml‎
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,72 @@
+name: "minimax-m2.5-vllm-disagg-b200-decode-2xdep2"  # 1 prefill + 2 decode workers; ISL/OSL 1024/1024
+
+model:
+  path: "minimax-m2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.20.1"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"  # quoted so the version stays a string
+
+setup_script: install-deps.sh
+
+resources:
+  gpu_type: "b200"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1  # NOTE(review): name has no "1p" prefix, unlike sibling "2p3xdep2" — confirm
+  decode_workers: 2
+  gpus_per_prefill: 1  # presumably per worker: 1 worker x 1 GPU = 1 prefill GPU — confirm
+  gpus_per_decode: 2  # presumably per worker: 2 workers x 2 GPUs = 4 decode GPUs — confirm
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:  # same three variables as decode_environment below
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # string value; presumably seconds (per the _S suffix) — confirm
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:  # no data-parallel / expert-parallel keys here; see resources.gpus_per_prefill
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      stream-interval: 128
+
+    decode:  # NOTE(review): trust-remote-code is not set here, unlike prefill — confirm intentional
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      data-parallel-size: 2  # same value as resources.gpus_per_decode
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      max-num-seqs: 864
+      gpu-memory-utilization: 0.90
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2048x4096x8192"  # presumably an "x"-separated list of concurrency levels — confirm
+  random_range_ratio: 0.8
@@ -0,0 +1,72 @@
+name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2-c6144"  # 2 prefill + 3 decode workers; ISL/OSL 1024/1024
+
+model:
+  path: "minimax-m2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.20.1"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"  # quoted so the version stays a string
+
+setup_script: install-deps.sh
+
+resources:
+  gpu_type: "b200"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 2
+  decode_workers: 3
+  gpus_per_prefill: 1  # presumably per worker: 2 workers x 1 GPU = 2 prefill GPUs — confirm
+  gpus_per_decode: 2  # presumably per worker: 3 workers x 2 GPUs = 6 decode GPUs — confirm
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:  # same three variables as decode_environment below
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # string value; presumably seconds (per the _S suffix) — confirm
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:  # no data-parallel / expert-parallel keys here; see resources.gpus_per_prefill
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      stream-interval: 128
+
+    decode:  # NOTE(review): trust-remote-code is not set here, unlike prefill — confirm intentional
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      data-parallel-size: 2  # same value as resources.gpus_per_decode
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      max-num-seqs: 864
+      gpu-memory-utilization: 0.90
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "6144"  # single value, matching the "-c6144" suffix in the recipe name
+  random_range_ratio: 0.8
@@ -0,0 +1,72 @@
+name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2"  # 2 prefill + 3 decode workers; ISL/OSL 1024/1024
+
+model:
+  path: "minimax-m2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.20.1"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"  # quoted so the version stays a string
+
+setup_script: install-deps.sh
+
+resources:
+  gpu_type: "b200"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 2
+  decode_workers: 3
+  gpus_per_prefill: 1  # presumably per worker: 2 workers x 1 GPU = 2 prefill GPUs — confirm
+  gpus_per_decode: 2  # presumably per worker: 3 workers x 2 GPUs = 6 decode GPUs — confirm
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:  # same three variables as decode_environment below
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # string value; presumably seconds (per the _S suffix) — confirm
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:  # no data-parallel / expert-parallel keys here; see resources.gpus_per_prefill
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      stream-interval: 128
+
+    decode:  # NOTE(review): trust-remote-code is not set here, unlike prefill — confirm intentional
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      data-parallel-size: 2  # same value as resources.gpus_per_decode
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      max-num-seqs: 864
+      gpu-memory-utilization: 0.90
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4096"  # single value; quoted, so it is a string rather than an integer
+  random_range_ratio: 0.8
@@ -0,0 +1,74 @@
+name: "minimax-m2.5-vllm-disagg-b200-1k1k-3p2xdep4"
+
+# Rate-matched dep4 recipe for 1k/1k (ISL 1024 / OSL 1024).
+# Measured X_dep4/P = 56.8k / 38k ≈ 1.49, so 3 prefill : 2 decode workers (3P:2D, ratio 1.5) is rate-matched.
+
+
+model:
+  path: "minimax-m2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.20.1"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"  # quoted so the version stays a string
+
+setup_script: install-deps.sh
+
+resources:
+  gpu_type: "b200"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 3
+  decode_workers: 2
+  gpus_per_prefill: 1  # presumably per worker: 3 workers x 1 GPU = 3 prefill GPUs — confirm
+  gpus_per_decode: 4  # presumably per worker: 2 workers x 4 GPUs = 8 decode GPUs — confirm
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:  # NOTE(review): no UCX_TLS here, unlike the dep2 recipes — confirm intentional
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # string value; presumably seconds (per the _S suffix) — confirm
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:  # same two variables as prefill_environment above
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:  # no data-parallel / expert-parallel keys here; see resources.gpus_per_prefill
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      stream-interval: 32  # NOTE(review): 32 here; the dep2 and dep8 recipes use 128 — confirm
+
+    decode:  # NOTE(review): trust-remote-code is not set here, unlike prefill — confirm intentional
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      data-parallel-size: 4  # same value as resources.gpus_per_decode
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      max-num-seqs: 864
+      gpu-memory-utilization: 0.90
+      stream-interval: 32  # kept equal to the prefill stream-interval
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1024x2048"  # presumably an "x"-separated list of concurrency levels — confirm
+  random_range_ratio: 0.8
@@ -0,0 +1,70 @@
+name: "minimax-m2.5-vllm-disagg-b200-decode-2p1xdep8"  # 2 prefill + 1 decode worker; ISL/OSL 1024/1024
+
+model:
+  path: "minimax-m2.5-nvfp4"
+  container: "vllm/vllm-openai:v0.20.1"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"  # quoted so the version stays a string
+
+setup_script: install-deps.sh
+
+resources:
+  gpu_type: "b200"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 1  # presumably per worker: 2 workers x 1 GPU = 2 prefill GPUs — confirm
+  gpus_per_decode: 8  # presumably per worker: 1 worker x 8 GPUs = 8 decode GPUs — confirm
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:  # NOTE(review): no UCX_TLS here, unlike the dep2 recipes — confirm intentional
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"  # string value; presumably seconds (per the _S suffix) — confirm
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:  # same two variables as prefill_environment above
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:  # no data-parallel / expert-parallel keys here; see resources.gpus_per_prefill
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      stream-interval: 128
+
+    decode:  # NOTE(review): trust-remote-code is not set here, unlike prefill — confirm intentional
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      kv-cache-dtype: "fp8"
+      data-parallel-size: 8  # same value as resources.gpus_per_decode
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      no-enable-prefix-caching: true
+      max-model-len: 2048  # equals benchmark isl + osl (1024 + 1024)
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+      max-num-seqs: 864
+      gpu-memory-utilization: 0.90
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1024x2048x4096"  # presumably an "x"-separated list of concurrency levels — confirm
+  random_range_ratio: 0.8