B200 Minimax FP8 vllm upgrade (#947)

kedarpotdar-nv · github-actions[bot] · functionstackx · web-flow · commit bddbf4031771 · 2026-04-03T11:31:35.000-05:00
* Update nvidia-master.yaml

* vllm version bump

* add perf changelog

* update search space and configs

* fix typo in VLLM_USE_DEEP_GEMM

* Remove ISL 1024 / OSL 8192 seq-len config for minimaxm2.5-fp8-b200-vllm

Co-authored-by: functionstackx <functionstackx@users.noreply.github.com>

* update image

* update config and remove DEEPGEMM flag

* test tep

* fix typo in ep bash script

* add max cudagraph size

* upgrade to vllm 0.19

* typo

* revert h200 change

* fix: update perf-changelog version to v0.19.0

Co-authored-by: Cameron Quilici <cquil11@users.noreply.github.com>

* Remove commented-out tp:8 search-space entry

Co-authored-by: Cameron Quilici <cquil11@users.noreply.github.com>

---------

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: functionstackx <functionstackx@users.noreply.github.com>
Co-authored-by: Cameron Quilici <cquil11@users.noreply.github.com>
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -3101,7 +3101,7 @@ gptoss-fp4-b200-vllm:
     - { tp: 8, conc-start: 4, conc-end: 4 }
 
 minimaxm2.5-fp8-b200-vllm:
-  image: vllm/vllm-openai:v0.17.0-cu130
+  image: vllm/vllm-openai:v0.19.0-cu130
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: b200
@@ -3112,13 +3112,15 @@ minimaxm2.5-fp8-b200-vllm:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 2, conc-start: 4, conc-end: 64 }
-    - { tp: 4, conc-start: 4, conc-end: 64 }
+    - { tp: 2, conc-start: 4, conc-end: 512 }
+    - { tp: 2, ep: 2, conc-start: 4, conc-end: 256 }
+    - { tp: 4, conc-start: 4, conc-end: 512 }
+    - { tp: 4, ep: 4, conc-start: 16, conc-end: 64 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 2, conc-start: 4, conc-end: 64 }
-    - { tp: 4, conc-start: 4, conc-end: 64 }
+    - { tp: 2, conc-start: 4, conc-end: 256 }
+    - { tp: 4, conc-start: 4, conc-end: 256 }
 
 gptoss-fp4-h100-vllm:
   image: vllm/vllm-openai:v0.18.0
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh
@@ -24,10 +24,9 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
-export VLLM_USE_FLASHINFER_MOE_FP8=0
-export VLLM_MOE_USE_DEEP_GEMM=0
+export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl
 
-if [ "$EP_SIZE" -ge 1 ]; then
+if [ "$EP_SIZE" -gt 1 ]; then
   EP=" --enable-expert-parallel"
 else
   EP=" "
@@ -44,10 +43,13 @@ set -x
 vllm serve $MODEL --port $PORT \
 --tensor-parallel-size=$TP \
 $EP \
---gpu-memory-utilization 0.95 \
+--gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --block-size=32 \
---no-enable-prefix-caching \
+--kv-cache-dtype fp8 \
+--max-cudagraph-capture-size 2048 \
+--max-num-batched-tokens "$((ISL * 2 ))" \
+--stream-interval 20 --no-enable-prefix-caching \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -1143,7 +1143,7 @@
   description:
     - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966
-  
+
 - config-keys:
     # NVIDIA single-node
     - dsr1-fp4-b200-sglang
@@ -1235,3 +1235,13 @@
     - "New model support on ATOM framework"
     - "Kimi-K2.5 FP4, and MiniMax-M2.5 FP8 configs added for MI355X ATOM"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/963
+
+- config-keys:
+    - minimaxm2.5-fp8-b200-vllm
+  description:
+    - "Update vLLM image from v0.17.0 to v0.19.0 for MiniMax-M2.5 FP8 B200"
+    - "Add tp2 ep2 (conc 4-256) and tp4 ep4 (conc 16-64) search-space entries for the ISL 1024 / OSL 1024 config"
+    - "Remove ISL 1024 / OSL 8192 seq-len config"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947
+
+