From 27510c41e1a59c1de10972da665c1d861a6b0195 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 18 Jun 2026 04:51:00 +0800
Subject: [PATCH 1/2] perf(mi300x): use load-time block FP8 MoE conversion

Co-authored-by: OpenAI Codex <codex@openai.com>
Signed-off-by: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
---
 .github/configs/amd-master.yaml               |   9 +-
 .../fixed_seq_len/minimaxm3_fp8_mi300x.sh     |  50 ++-
 .../minimaxm3_mi300x_mxfp8.patch              | 406 ++++++++++++++++++
 perf-changelog.yaml                           |  10 +-
 utils/process_changelog.py                    |  29 ++
 utils/test_process_changelog.py               |  62 +++
 6 files changed, 557 insertions(+), 9 deletions(-)
 create mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch
 create mode 100644 utils/test_process_changelog.py

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 606e3c2af..d0a20bef8 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2851,10 +2851,11 @@ minimaxm3-fp8-mi355x-vllm-mtp:
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
 
-# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
-# MI355X serving shape, but retain the default BF16 KV cache because this
-# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
-# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency.
+# MiniMax-M3 MXFP8 MI300X recipe. Convert the checkpoint's MXFP8 MoE weights to
+# 128x128 block FP8 at load time and use the regular Triton block-FP8 backend.
+# Retain the default BF16 KV cache because this checkpoint lacks calibrated
+# ROCm FP8 attention scales. Use TP8 for latency and TP8+EP8 at high
+# concurrency.
 minimaxm3-fp8-mi300x-vllm:
   image: vllm/vllm-openai-rocm:minimax-m3
   model: MiniMaxAI/MiniMax-M3-MXFP8
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
index f2cdaf284..d2b01a291 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
@@ -1,10 +1,12 @@
 #!/usr/bin/env bash
 
 # MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe.
-# Reuses the dedicated ROCm image and the MI355X serving shape. Block size 128
-# is mandatory for MSA sparse attention. Keep the default BF16 KV cache on
-# gfx942: the checkpoint has no calibrated q/prob scales for ROCm FP8
-# attention, and vLLM's fallback scale of 1.0 corrupts model accuracy.
+# Reuses the dedicated ROCm image and converts MXFP8 MoE weights to 128x128
+# block FP8 at load time. Block size 128 is mandatory for MSA sparse attention.
+# Keep the default BF16 KV cache on gfx942: the checkpoint has no calibrated
+# q/prob scales for ROCm FP8 attention, and vLLM's fallback scale of 1.0
+# corrupts model accuracy.
+# Target image vLLM revision: 4a560dd8db67c270f5e2afb614558271b76f2294.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -24,6 +26,52 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
+# Find the directory that contains the installed "vllm" package; the patch
+# below is applied relative to it with -p1.
+if ! VLLM_PACKAGE_ROOT="$(
+    python3 - <<'PY'
+from pathlib import Path
+
+import vllm
+
+print(Path(vllm.__file__).resolve().parent.parent)
+PY
+)"; then
+    echo "Failed to locate the installed vLLM package" >&2
+    exit 1
+fi
+if [[ -z "$VLLM_PACKAGE_ROOT" || ! -d "$VLLM_PACKAGE_ROOT/vllm" ]]; then
+    echo "Invalid installed vLLM package root: $VLLM_PACKAGE_ROOT" >&2
+    exit 1
+fi
+
+MXFP8_PATCH="$(dirname "$0")/minimaxm3_mi300x_mxfp8.patch"
+if [[ ! -f "$MXFP8_PATCH" ]]; then
+    echo "MI300X MXFP8 patch is missing: $MXFP8_PATCH" >&2
+    exit 1
+fi
+
+# Keep patching idempotent: a successful reverse dry run means the patch is
+# already applied; otherwise require a clean forward dry run before any file
+# is modified.
+PATCH_CHECK_ARGS=(--batch --silent -d "$VLLM_PACKAGE_ROOT" -p1 --dry-run)
+if patch "${PATCH_CHECK_ARGS[@]}" --reverse --forward < "$MXFP8_PATCH"; then
+    echo "MI300X MXFP8 patch is already fully applied"
+elif patch "${PATCH_CHECK_ARGS[@]}" --forward < "$MXFP8_PATCH"; then
+    if ! patch --batch --forward -d "$VLLM_PACKAGE_ROOT" -p1 < "$MXFP8_PATCH"; then
+        echo "Failed to apply the MI300X MXFP8 patch" >&2
+        exit 1
+    fi
+else
+    echo "Installed vLLM is neither cleanly patchable nor fully patched" >&2
+    exit 1
+fi
+# Final check: the patch must now reverse cleanly, i.e. it is fully applied.
+if ! patch "${PATCH_CHECK_ARGS[@]}" --reverse --forward < "$MXFP8_PATCH"; then
+    echo "MI300X MXFP8 patch verification failed" >&2
+    exit 1
+fi
+
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch b/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch
new file mode 100644
index 000000000..1e83f431d
--- /dev/null
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch
@@ -0,0 +1,406 @@
+diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+index c275cecc1591f16e91791e9b007cdb6fcaac40b4..f20c20c4d2a475ca00926c98608edc6b645dd4c1 100644
+--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
++++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+@@ -1,28 +1,28 @@
+ {
+     "1": {
+-        "BLOCK_SIZE_M": 16,
+-        "BLOCK_SIZE_N": 128,
+-        "BLOCK_SIZE_K": 256,
+-        "GROUP_SIZE_M": 1,
+-        "num_warps": 8,
++        "BLOCK_SIZE_M": 32,
++        "BLOCK_SIZE_N": 16,
++        "BLOCK_SIZE_K": 128,
++        "GROUP_SIZE_M": 16,
++        "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "2": {
+-        "BLOCK_SIZE_M": 16,
+-        "BLOCK_SIZE_N": 128,
+-        "BLOCK_SIZE_K": 256,
++        "BLOCK_SIZE_M": 32,
++        "BLOCK_SIZE_N": 16,
++        "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+-        "num_warps": 8,
++        "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "4": {
+-        "BLOCK_SIZE_M": 16,
+-        "BLOCK_SIZE_N": 128,
+-        "BLOCK_SIZE_K": 256,
+-        "GROUP_SIZE_M": 1,
+-        "num_warps": 8,
++        "BLOCK_SIZE_M": 32,
++        "BLOCK_SIZE_N": 16,
++        "BLOCK_SIZE_K": 128,
++        "GROUP_SIZE_M": 8,
++        "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+@@ -36,11 +36,11 @@
+         "waves_per_eu": 0
+     },
+     "16": {
+-        "BLOCK_SIZE_M": 16,
+-        "BLOCK_SIZE_N": 128,
++        "BLOCK_SIZE_M": 32,
++        "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+-        "num_warps": 8,
++        "num_warps": 1,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+new file mode 100644
+index 0000000000000000000000000000000000000000..b1bfee7fc386385fa8b8f06e561d528ea4762d87
+--- /dev/null
++++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+@@ -0,0 +1,74 @@
++{
++    "64": {
++        "BLOCK_SIZE_M": 32,
++        "BLOCK_SIZE_N": 128,
++        "BLOCK_SIZE_K": 128,
++        "GROUP_SIZE_M": 4,
++        "num_warps": 2,
++        "num_stages": 2,
++        "waves_per_eu": 2
++    },
++    "128": {
++        "BLOCK_SIZE_M": 32,
++        "BLOCK_SIZE_N": 128,
++        "BLOCK_SIZE_K": 256,
++        "GROUP_SIZE_M": 1,
++        "num_warps": 2,
++        "num_stages": 2,
++        "waves_per_eu": 1
++    },
++    "256": {
++        "BLOCK_SIZE_M": 64,
++        "BLOCK_SIZE_N": 128,
++        "BLOCK_SIZE_K": 256,
++        "GROUP_SIZE_M": 1,
++        "num_warps": 4,
++        "num_stages": 2,
++        "waves_per_eu": 2
++    },
++    "512": {
++        "BLOCK_SIZE_M": 64,
++        "BLOCK_SIZE_N": 128,
++        "BLOCK_SIZE_K": 128,
++        "GROUP_SIZE_M": 1,
++        "num_warps": 4,
++        "num_stages": 2,
++        "waves_per_eu": 2
++    },
++    "1024": {
++        "BLOCK_SIZE_M": 64,
++        "BLOCK_SIZE_N": 128,
++        "BLOCK_SIZE_K": 128,
++        "GROUP_SIZE_M": 1,
++        "num_warps": 4,
++        "num_stages": 2,
++        "waves_per_eu": 2
++    },
++    "2048": {
++        "BLOCK_SIZE_M": 64,
++        "BLOCK_SIZE_N": 128,
++        "BLOCK_SIZE_K": 128,
++        "GROUP_SIZE_M": 1,
++        "num_warps": 4,
++        "num_stages": 2,
++        "waves_per_eu": 2
++    },
++    "4096": {
++        "BLOCK_SIZE_M": 256,
++        "BLOCK_SIZE_N": 128,
++        "BLOCK_SIZE_K": 128,
++        "GROUP_SIZE_M": 1,
++        "num_warps": 4,
++        "num_stages": 2,
++        "waves_per_eu": 0
++    },
++    "8192": {
++        "BLOCK_SIZE_M": 128,
++        "BLOCK_SIZE_N": 256,
++        "BLOCK_SIZE_K": 128,
++        "GROUP_SIZE_M": 1,
++        "num_warps": 4,
++        "num_stages": 2,
++        "waves_per_eu": 0
++    }
++}
+diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
+index d0d7c76481b0a315e9c57810d40394822f62594c..e82429b8ecddc9b8e44f003a537de08b5cb7a045 100644
+--- a/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
++++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
+@@ -7,8 +7,11 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
+ from vllm.model_executor.layers.fused_moe.oracle.fp8 import (
+     Fp8MoeBackend,
+     backend_to_kernel_cls,
++    select_fp8_moe_backend,
+ )
+ from vllm.model_executor.layers.quantization.utils.quant_utils import (
++    kFp8Dynamic128Sym,
++    kFp8Static128BlockSym,
+     kMxfp8Dynamic,
+     kMxfp8Static,
+ )
+@@ -101,13 +104,31 @@ def _select_rocm_mxfp8_backend() -> tuple[Fp8MoeBackend, type[mk.FusedMoEExperts
+
+ def select_mxfp8_moe_backend(
+     config: FusedMoEConfig,
++    *,
++    block_fp8_on_fnuz: bool = False,
+ ) -> tuple[Fp8MoeBackend, type[mk.FusedMoEExperts]]:
+     """Select the MXFP8 MoE backend and the best expert class.
+
++    ``block_fp8_on_fnuz`` must only be enabled by quantization methods that
++    convert their MXFP8 weights to 128x128 block FP8 before kernel setup.
++
+     Returns:
+         A tuple of (fp8_backend, experts_cls).
+     """
+
++    if block_fp8_on_fnuz and current_platform.is_fp8_fnuz():
++        logger.info_once(
++            "MXFP8 MoE weights will be converted to 128x128 block FP8 at load time."
++        )
++        block_fp8_backend, experts_cls = select_fp8_moe_backend(
++            config=config,
++            weight_key=kFp8Static128BlockSym,
++            activation_key=kFp8Dynamic128Sym,
++        )
++        assert block_fp8_backend is not None
++        assert experts_cls is not None
++        return block_fp8_backend, experts_cls
++
+     runner_backend = config.moe_backend
+     if runner_backend != "auto":
+         backend = _BACKEND_NAME_MAP.get(runner_backend)
+diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
+index 33c7c7532a0ba823e4e7a23538300a5977a4553e..9b9d73f7b5fc138cac3dc3349a24a473d2c1faf6 100644
+--- a/vllm/model_executor/layers/quantization/modelopt.py
++++ b/vllm/model_executor/layers/quantization/modelopt.py
+@@ -69,6 +69,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp8_utils import (
+     MXFP8_BLOCK_SIZE,
+     MXFP8_SCALE_DTYPE,
+     MXFP8_VALUE_DTYPE,
++    convert_mxfp8_to_block_fp8,
+ )
+ from vllm.model_executor.layers.quantization.utils.quant_utils import (
+     GroupShape,
+@@ -92,6 +93,7 @@ from vllm.model_executor.parameter import (
+     PerTensorScaleParameter,
+ )
+ from vllm.model_executor.utils import replace_parameter, set_weight_attrs
++from vllm.platforms import current_platform
+
+ if TYPE_CHECKING:
+     from vllm.model_executor.models.utils import WeightsMapper
+@@ -1876,7 +1878,7 @@ class ModelOptMxFp8LinearMethod(LinearMethodBase):
+
+
+ class ModelOptMxFp8FusedMoE(FusedMoEMethodBase):
+-    """FlashInfer TRTLLM MXFP8 block-scale MoE for ModelOpt checkpoints."""
++    """ModelOpt MXFP8 MoE quantization."""
+
+     def __init__(
+         self,
+@@ -1884,11 +1886,17 @@ class ModelOptMxFp8FusedMoE(FusedMoEMethodBase):
+         moe_config: FusedMoEConfig,
+     ) -> None:
+         super().__init__(moe_config)
+-        self.weight_block_size = [1, MXFP8_BLOCK_SIZE]
++        self.requantize_mxfp8_to_block_fp8 = current_platform.is_fp8_fnuz()
++        self.weight_block_size = (
++            [128, 128] if self.requantize_mxfp8_to_block_fp8 else [1, MXFP8_BLOCK_SIZE]
++        )
+         self.quant_config = quant_config
+         assert self.quant_config.is_checkpoint_mxfp8_serialized
+
+-        self.mxfp8_backend, self.experts_cls = select_mxfp8_moe_backend(config=self.moe)
++        self.mxfp8_backend, self.experts_cls = select_mxfp8_moe_backend(
++            config=self.moe,
++            block_fp8_on_fnuz=self.requantize_mxfp8_to_block_fp8,
++        )
+
+     def create_weights(
+         self,
+@@ -2129,15 +2137,35 @@ class ModelOptMxFp8FusedMoE(FusedMoEMethodBase):
+
+         self._check_weight_dtypes(layer)
+
++        if self.requantize_mxfp8_to_block_fp8:
++            w13, w13_scale = convert_mxfp8_to_block_fp8(
++                layer.w13_weight,
++                layer.w13_weight_scale,
++                block_size=(128, 128),
++            )
++            w2, w2_scale = convert_mxfp8_to_block_fp8(
++                layer.w2_weight,
++                layer.w2_weight_scale,
++                block_size=(128, 128),
++            )
++            logger.info_once(
++                "Converted MXFP8 MoE weights to 128x128 block FP8 at load time."
++            )
++        else:
++            w13 = layer.w13_weight
++            w2 = layer.w2_weight
++            w13_scale = layer.w13_weight_scale
++            w2_scale = layer.w2_weight_scale
++
+         layer.weight_block_size = self.weight_block_size
+
+         w13, w2, w13_scale, w2_scale = convert_to_fp8_moe_kernel_format(
+             fp8_backend=self.mxfp8_backend,
+             layer=layer,
+-            w13=layer.w13_weight,
+-            w2=layer.w2_weight,
+-            w13_scale=layer.w13_weight_scale,
+-            w2_scale=layer.w2_weight_scale,
++            w13=w13,
++            w2=w2,
++            w13_scale=w13_scale,
++            w2_scale=w2_scale,
+             w13_input_scale=None,
+             w2_input_scale=None,
+         )
+diff --git a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
+index e6063b463284f912f6cd923d9cf65ff9515d56f2..27d7be0a1740630269c305cfe7377eebdbd54285 100644
+--- a/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
++++ b/vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
+@@ -227,6 +227,122 @@ def dequant_mxfp8_to_bf16(x: torch.Tensor, scales: torch.Tensor) -> torch.Tensor
+     return dequantized.to(torch.bfloat16)
+
+
++def _quantize_block_fp8_ocp(
++    weight: torch.Tensor,
++    block_size: tuple[int, int],
++) -> tuple[torch.Tensor, torch.Tensor]:
++    """Quantize a 2-D matrix to OCP E4M3 with one FP32 scale per block."""
++    block_m, block_k = block_size
++    n, k = weight.shape
++    padded_n = ((n + block_m - 1) // block_m) * block_m  # round up to block_m
++    padded_k = ((k + block_k - 1) // block_k) * block_k  # round up to block_k
++    padded = torch.zeros(
++        (padded_n, padded_k),
++        dtype=torch.float32,
++        device=weight.device,
++    )
++    padded[:n, :k] = weight.float()  # zero padding cannot raise a block amax
++
++    blocks = padded.view(  # (row blocks, block_m, col blocks, block_k)
++        padded_n // block_m,
++        block_m,
++        padded_k // block_k,
++        block_k,
++    )
++    amax = blocks.abs().amax(dim=(1, 3), keepdim=True).clamp(min=1e-4)
++    scales = amax / torch.finfo(torch.float8_e4m3fn).max  # amax maps to E4M3 max
++    quantized = (blocks / scales).to(torch.float8_e4m3fn)
++    return (
++        quantized.view(padded_n, padded_k)[:n, :k].contiguous(),  # drop padding
++        scales.view(padded_n // block_m, padded_k // block_k).contiguous(),
++    )
++
++
++def convert_mxfp8_to_block_fp8(
++    weight: torch.Tensor,
++    scales: torch.Tensor,
++    block_size: tuple[int, int] = (128, 128),
++) -> tuple[torch.Tensor, torch.Tensor]:
++    """Convert MXFP8 matrices to block-scaled FP8.
++
++    Each (N, K) matrix is dequantized and requantized on its own to bound the
++    temporary BF16/FP32 memory used while loading MoE weights.
++    """
++    from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
++        normalize_e4m3fn_to_e4m3fnuz,
++    )
++    from vllm.platforms import current_platform
++
++    if weight.ndim < 2:
++        raise ValueError(
++            f"MXFP8 weight must have at least 2 dimensions, got {weight.ndim}."
++        )
++    if weight.dtype != MXFP8_VALUE_DTYPE:
++        raise ValueError(
++            f"MXFP8 weight must use {MXFP8_VALUE_DTYPE}, got {weight.dtype}."
++        )
++    if scales.dtype != MXFP8_SCALE_DTYPE:
++        raise ValueError(
++            f"MXFP8 scales must use {MXFP8_SCALE_DTYPE}, got {scales.dtype}."
++        )
++    if weight.shape[-1] % MXFP8_BLOCK_SIZE != 0:
++        raise ValueError(
++            f"MXFP8 weight K dimension must be divisible by {MXFP8_BLOCK_SIZE}, "
++            f"got {weight.shape[-1]}."
++        )
++
++    expected_scale_shape = (*weight.shape[:-1], weight.shape[-1] // MXFP8_BLOCK_SIZE)
++    if scales.shape != expected_scale_shape:
++        raise ValueError(
++            f"Expected MXFP8 scale shape {expected_scale_shape}, "
++            f"got {tuple(scales.shape)}."
++        )
++
++    block_m, block_k = block_size
++    if block_m <= 0 or block_k <= 0:
++        raise ValueError(f"Block dimensions must be positive, got {block_size}.")
++
++    n, k = weight.shape[-2:]  # trailing two dims form one matrix
++    scale_shape = (
++        *weight.shape[:-2],
++        (n + block_m - 1) // block_m,
++        (k + block_k - 1) // block_k,
++    )
++    quantized = torch.empty_like(weight, dtype=current_platform.fp8_dtype())
++    block_scales = torch.empty(
++        scale_shape,
++        dtype=torch.float32,
++        device=weight.device,
++    )
++
++    weight_view = weight.reshape(-1, n, k)  # fold leading dims into a batch
++    scales_view = scales.reshape(-1, n, k // MXFP8_BLOCK_SIZE)
++    quantized_view = quantized.reshape(-1, n, k)  # must alias quantized (see copy_)
++    block_scales_view = block_scales.reshape(
++        -1,
++        scale_shape[-2],
++        scale_shape[-1],
++    )
++    for matrix_idx in range(weight_view.shape[0]):
++        dequantized = dequant_mxfp8_to_bf16(
++            weight_view[matrix_idx],
++            scales_view[matrix_idx],
++        )
++        matrix, matrix_scales = _quantize_block_fp8_ocp(
++            dequantized,
++            block_size=(block_m, block_k),
++        )
++        if current_platform.is_fp8_fnuz():
++            matrix, matrix_scales, _ = normalize_e4m3fn_to_e4m3fnuz(
++                matrix,
++                matrix_scales,
++            )
++        quantized_view[matrix_idx].copy_(matrix)
++        block_scales_view[matrix_idx].copy_(matrix_scales)
++
++    return quantized.contiguous(), block_scales.contiguous()
++
++
+ def mxfp8_e4m3_quantize_fake(
+     x: torch.Tensor,
+     is_sf_swizzled_layout: bool = False,
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index dec0d280c..0fb914c37 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3926,4 +3926,12 @@
     - "Container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13"
     - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026 (gb300_nvfp4 STP recipes)"
     - "Runner script launch_gb300-nv.sh: added dynamo-trt-specific glm5-fp4 case with SERVED_MODEL_NAME and SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798
\ No newline at end of file
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798
+
+- config-keys:
+    - minimaxm3-fp8-mi300x-vllm
+  description:
+    - "Convert checkpoint MXFP8 MoE weights once at load time to 128x128 block FP8 on gfx942."
+    - "Normalize OCP E4M3 values to FNUZ and use the regular Triton block-FP8 backend."
+    - "Use measured low-token TP tiles and a tuned local-expert table without changing the TP8/TP8+EP8 matrix."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1753
diff --git a/utils/process_changelog.py b/utils/process_changelog.py
index 1514f8d36..9de8c8c6a 100644
--- a/utils/process_changelog.py
+++ b/utils/process_changelog.py
@@ -3,6 +3,7 @@
 import re
 import subprocess
 from collections import defaultdict
+from pathlib import Path
 
 import yaml
 from constants import GENERATE_SWEEPS_PY_SCRIPT, MASTER_CONFIGS
@@ -15,6 +16,34 @@
 
 
 def get_added_lines(base_ref: str, head_ref: str, filepath: str) -> str:
+    repo_root = subprocess.run(
+        ["git", "rev-parse", "--show-toplevel"],
+        capture_output=True,
+        text=True,
+    )
+    git_filepath = filepath
+    if repo_root.returncode == 0:
+        resolved_path = Path(filepath).resolve()
+        resolved_root = Path(repo_root.stdout.strip()).resolve()
+        if resolved_path == resolved_root or resolved_root in resolved_path.parents:
+            git_filepath = resolved_path.relative_to(resolved_root).as_posix()
+
+    base_file = subprocess.run(
+        ["git", "show", f"{base_ref}:{git_filepath}"],
+        capture_output=True,
+    )
+    head_file = subprocess.run(
+        ["git", "show", f"{head_ref}:{git_filepath}"],
+        capture_output=True,
+    )
+
+    if (
+        base_file.returncode == 0
+        and head_file.returncode == 0
+        and head_file.stdout.startswith(base_file.stdout)
+    ):
+        return head_file.stdout[len(base_file.stdout) :].decode()
+
     result = subprocess.run(
         ["git", "diff", base_ref, head_ref, "--", filepath],
         capture_output=True,
diff --git a/utils/test_process_changelog.py b/utils/test_process_changelog.py
new file mode 100644
index 000000000..7af5d2d77
--- /dev/null
+++ b/utils/test_process_changelog.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+import pytest
+
+import process_changelog
+
+
+def _git(repo: Path, *args: str) -> str:
+    """Run ``git <args>`` inside ``repo`` and return its stripped stdout.
+
+    ``check=True`` makes a failing git command raise
+    ``subprocess.CalledProcessError``.
+    """
+    result = subprocess.run(
+        ["git", *args],
+        cwd=repo,
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    return result.stdout.strip()
+
+
+def _commit(repo: Path, content: bytes, message: str) -> str:
+    """Commit ``content`` as perf-changelog.yaml and return the new HEAD hash.
+
+    The file is written as raw bytes so a test controls whether it ends
+    with a newline.
+    """
+    (repo / "perf-changelog.yaml").write_bytes(content)
+    _git(repo, "add", "perf-changelog.yaml")
+    _git(repo, "commit", "-m", message)
+    return _git(repo, "rev-parse", "HEAD")
+
+
+@pytest.fixture
+def changelog_repo(tmp_path: Path) -> Path:
+    """Initialize a git repository in ``tmp_path`` with a local commit identity."""
+    _git(tmp_path, "init")
+    _git(tmp_path, "config", "user.name", "Test User")
+    _git(tmp_path, "config", "user.email", "test@example.com")
+    return tmp_path
+
+
+def test_get_added_lines_accepts_append_after_missing_newline(
+    changelog_repo: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Return exactly the appended bytes when the base lacks a final newline."""
+    base_content = b"- config-keys:\n    - old\n  pr-link: old"
+    appended = b"\n\n- config-keys:\n    - new\n  pr-link: new\n"
+    base_ref = _commit(changelog_repo, base_content, "base")
+    head_ref = _commit(changelog_repo, base_content + appended, "append")
+    monkeypatch.chdir(changelog_repo)
+
+    assert (
+        process_changelog.get_added_lines(
+            base_ref, head_ref, str(changelog_repo / "perf-changelog.yaml")
+        ).encode()
+        == appended
+    )
+
+
+def test_get_added_lines_rejects_non_whitespace_deletion(
+    changelog_repo: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Expect ``ValueError`` when the head commit replaces existing content."""
+    base_ref = _commit(changelog_repo, b"- old\n", "base")
+    head_ref = _commit(changelog_repo, b"- new\n", "replace")
+    monkeypatch.chdir(changelog_repo)
+
+    with pytest.raises(ValueError, match="Deletions are not allowed"):
+        process_changelog.get_added_lines(base_ref, head_ref, "perf-changelog.yaml")

From 75213943ac60b044284d0fa81ec720bf0c8c017d Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 18 Jun 2026 07:00:49 +0800
Subject: [PATCH 2/2] fix(mi300x): preserve M3 SwiGLU parameters in FP8 patch

Signed-off-by: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
---
 .../minimaxm3_mi300x_mxfp8.patch              | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch b/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch
index 1e83f431d..7fe8fe6af 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_mi300x_mxfp8.patch
@@ -1,3 +1,24 @@
+diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
+index 0755699d1a4545649e8f5af5de77bbf2c6b24fab..905a9bea3c59ee3ef14a5acede345ffc2fd4a36d 100644
+--- a/vllm/model_executor/layers/fused_moe/config.py
++++ b/vllm/model_executor/layers/fused_moe/config.py
+@@ -603,6 +603,8 @@ def fp8_w8a8_moe_quant_config(
+     a2_gscale: torch.Tensor | None = None,
+     g1_alphas: torch.Tensor | None = None,
+     g2_alphas: torch.Tensor | None = None,
++    gemm1_alpha: float | None = None,
++    gemm1_beta: float | None = None,
+     gemm1_clamp_limit: float | None = None,
+ ) -> FusedMoEQuantConfig:
+     """
+@@ -623,5 +625,7 @@ def fp8_w8a8_moe_quant_config(
+         per_act_token_quant=per_act_token_quant,
+         per_out_ch_quant=per_out_ch_quant,
+         block_shape=block_shape,
++        gemm1_alpha=gemm1_alpha,
++        gemm1_beta=gemm1_beta,
+         gemm1_clamp_limit=gemm1_clamp_limit,
+     )
 diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
 index c275cecc1591f16e91791e9b007cdb6fcaac40b4..f20c20c4d2a475ca00926c98608edc6b645dd4c1 100644
 --- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -188,6 +209,18 @@ index d0d7c76481b0a315e9c57810d40394822f62594c..e82429b8ecddc9b8e44f003a537de08b
      runner_backend = config.moe_backend
      if runner_backend != "auto":
          backend = _BACKEND_NAME_MAP.get(runner_backend)
+diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
+index acbf2cb46ad42927fa344363059fe37a970d132b..1b5030b190960dd3758a25d156389be749f31530 100644
+--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py
++++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
+@@ -568,5 +568,7 @@ def make_fp8_moe_quant_config(
+         block_shape=block_shape,
+         per_act_token_quant=per_act_token_quant,
+         per_out_ch_quant=per_out_ch_quant,
++        gemm1_alpha=gemm1_alpha,
++        gemm1_beta=gemm1_beta,
+         gemm1_clamp_limit=swiglu_limit,
+     )
 diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
 index 33c7c7532a0ba823e4e7a23538300a5977a4553e..9b9d73f7b5fc138cac3dc3349a24a473d2c1faf6 100644
 --- a/vllm/model_executor/layers/quantization/modelopt.py