SemiAnalysisAI
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
Lines changed: 5 additions & 4 deletions
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
Lines changed: 29 additions & 4 deletions
@@ -2851,10 +2851,11 @@ minimaxm3-fp8-mi355x-vllm-mtp:
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
 
-# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
-# MI355X serving shape, but retain the default BF16 KV cache because this
-# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
-# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency.
+# MiniMax-M3 MXFP8 MI300X recipe. Convert the checkpoint's MXFP8 MoE weights to
+# 128x128 block FP8 at load time and use the regular Triton block-FP8 backend.
+# Retain the default BF16 KV cache because this checkpoint lacks calibrated
+# ROCm FP8 attention scales. Use TP8 for latency and TP8+EP8 at high
+# concurrency.
 minimaxm3-fp8-mi300x-vllm:
   image: vllm/vllm-openai-rocm:minimax-m3
   model: MiniMaxAI/MiniMax-M3-MXFP8
 
@@ -1,10 +1,12 @@
 #!/usr/bin/env bash
 
 # MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe.
-# Reuses the dedicated ROCm image and the MI355X serving shape. Block size 128
-# is mandatory for MSA sparse attention. Keep the default BF16 KV cache on
-# gfx942: the checkpoint has no calibrated q/prob scales for ROCm FP8
-# attention, and vLLM's fallback scale of 1.0 corrupts model accuracy.
+# Reuses the dedicated ROCm image and converts MXFP8 MoE weights to 128x128
+# block FP8 at load time. Block size 128 is mandatory for MSA sparse attention.
+# Keep the default BF16 KV cache on gfx942: the checkpoint has no calibrated
+# q/prob scales for ROCm FP8 attention, and vLLM's fallback scale of 1.0
+# corrupts model accuracy.
+# Target image vLLM revision: 4a560dd8db67c270f5e2afb614558271b76f2294.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -24,6 +26,29 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
+VLLM_PACKAGE_ROOT="$(
+    python - <<'PY'
+from pathlib import Path
+
+import vllm
+
+print(Path(vllm.__file__).resolve().parent.parent)
+PY
+)"
+MXFP8_PATCH="$(dirname "$0")/minimaxm3_mi300x_mxfp8.patch"
+MXFP8_ORACLE="$VLLM_PACKAGE_ROOT/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py"
+MXFP8_MARKER="MXFP8 MoE weights will be converted to 128x128 block FP8"
+if ! grep -q "$MXFP8_MARKER" "$MXFP8_ORACLE"; then
+    if ! patch --batch --forward -d "$VLLM_PACKAGE_ROOT" -p1 < "$MXFP8_PATCH"; then
+        echo "Failed to apply the MI300X MXFP8 patch" >&2
+        exit 1
+    fi
+fi
+if ! grep -q "$MXFP8_MARKER" "$MXFP8_ORACLE"; then
+    echo "MI300X MXFP8 conversion marker is missing after patching" >&2
+    exit 1
+fi
+
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 if [ -n "$ROCR_VISIBLE_DEVICES" ]; then