[NV] update Minimax2.5 fp8 h100 vllm (#1516)

hshrivastava-droid · functionstackx · web-flow · commit 298d8f94cfa9 · 2026-05-22T16:34:42.000-04:00
* update h100 minimax

* Update PR link in perf-changelog.yaml

---------

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -4515,7 +4515,7 @@ gptoss-fp4-h100-vllm:
       - { tp: 8, conc-start: 4, conc-end: 16 }
 
 minimaxm2.5-fp8-h100-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.19.1-cu130
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: h100
@@ -4527,13 +4527,11 @@ minimaxm2.5-fp8-h100-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
     - isl: 8192
       osl: 1024
       search-space:
-      # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
 
 # Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
 # identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
@@ -9,7 +9,6 @@ check_env_vars \
     CONC \
     ISL \
     OSL \
-    MAX_MODEL_LEN \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
@@ -28,7 +27,6 @@ PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
-    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
 if [ "$EP_SIZE" -gt 1 ]; then
@@ -44,12 +42,13 @@ set -x
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --tensor-parallel-size=$TP \
 $EP \
---gpu-memory-utilization 0.90 \
---max-model-len $MAX_MODEL_LEN \
---max-num-seqs 256 \
---no-enable-prefix-caching \
 --trust-remote-code \
---compilation-config '{"cudagraph_mode":"PIECEWISE"}' > $SERVER_LOG 2>&1 &
+--enable-auto-tool-choice \
+--tool-call-parser minimax_m2 \
+--reasoning-parser minimax_m2_append_think \
+--compilation-config '{"mode":3,"pass_config":{"fuse_minimax_qk_norm":true}}' \
+--gpu-memory-utilization 0.9 \
+> $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3113,3 +3113,13 @@
     - "1k1k and 8k1k STP low-latency and max-throughput srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/ (ported from upstream srt-slurm PR #152)"
     - "Wire glm5/fp4 model + dynamo-sglang framework branches into runners/launch_gb300-nv.sh"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1514
+
+- config-keys:
+    - minimaxm2.5-fp8-h100-vllm
+  description:
+    - "Update minimaxm2.5-fp8-h100-vllm recipe (v0.19.1)"
+    - "Image: vllm/vllm-openai:v0.21.0 -> v0.19.1-cu130"
+    - "Replace recipe flags: drop PIECEWISE cudagraph mode/256 max-num-seqs/no-prefix-caching/explicit max-model-len; keep --gpu-memory-utilization at 0.9; add --enable-auto-tool-choice, --tool-call-parser minimax_m2, --reasoning-parser minimax_m2_append_think, --compilation-config mode:3+fuse_minimax_qk_norm"
+    - "Search-space: tp:8 ep:8 (TEP=8), conc-end 128 chosen at saturation per local sweep"
+    - "Local bench: TEP=8 peaks at C=128 with 26923 tot tps (+178% vs TEP=4 peak at C=32 in May 6 j11600242 sweep)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1516