[ROCm][gpt-oss] Pass GateMode.INTERLEAVE for MXFP4 W4A16 fused MoE

Rohan138 · Rohan138 · commit cfd608ee88e8 · 2026-06-08T16:11:50.000Z
The MXFP4 W4A16 weight-load path in oracle/mxfp4.py uses shuffle_weight_a16w4 (is_guinterleave=True), which interleaves gate/up columns within each weight tile. The CK/FlyDSL MoE kernels in aiter must be told this via gate_mode=GateMode.INTERLEAVE so they decode the gate/up packing correctly. Without the explicit gate_mode, aiter defaults to SEPARATED and (since ROCm/aiter#3123) dispatches the (SEPARATED + Swiglu + per_1x32 + fp4x2) case to a path that returns garbage for shuffled weights or crashes during CK2stages JIT for the unshuffled Quark variant (amd/gpt-oss-20b-w-mxfp4-a-bf16). This was the root cause of ROCM-25517 (gpt-oss-120b W4A16 gsm8k acc = 0) and ROCM-25478 (gpt-oss-20b Quark JIT crash). Other paths are unaffected: - FP8 W8A8 (DeepSeek-V4-Pro, DeepSeek-V3.2): shuffled with quark_ocp_mx.py:shuffle_weight(layout=(16,16)) — non-interleaved. use_mxfp4_w4a16 is False, default SEPARATED preserved. - MXFP4 W4A4 (amd/DeepSeek-R1-0528-MXFP4): shuffled via rocm_aiter_ops.shuffle_weights — non-interleaved. use_mxfp4_w4a16 is False, default SEPARATED preserved. The gate_mode kwarg was added to aiter.fused_moe in ROCm/aiter#3123 (aiter>=0.1.14). To stay compatible with older aiter shipping with vllm (e.g. aiter 0.1.13.post1 in the vllm-rocm:nightly image), we probe the aiter signature and drop the kwarg when unsupported — pre-ROCm/aiter#3123 aiter tolerated the implicit SEPARATED default for interleave-shuffled weights, so dropping the kwarg is safe there. GateMode itself only exists on aiter>=0.1.14 and is imported under try/except for the same reason. 
Validation on MI355X (gfx950): vllm@main + aiter@main (6aeba41) openai/gpt-oss-120b W4A16 gsm8k: TP=1: 0.000 -> 0.905 TP=8: 0.000 -> 0.905 vllm@main + aiter@main amd/gpt-oss-20b-w-mxfp4-a-bf16 TP=2 enforce-eager: CK2stages JIT crash -> serves cleanly vllm-rocm:nightly + aiter 0.1.13.post1 openai/gpt-oss-120b W4A16 gsm8k: TP=1: 0.910 (backward-compat — gate_mode kwarg silently dropped) vllm-rocm:v0.22.0 + aiter@main openai/gpt-oss-120b W4A16 gsm8k: TP=1: 0.895 amd/gpt-oss120b-w-mxfp4-a-fp8 W4A8 (this PR composes with vllm-project#44804): TP=8 mc=1: 326, mc=8: 2087, mc=32: 6523, mc=64: 11610 tok/s Reference: sgl-project/sglang#25580 (sglang's equivalent fix). Recommended by aiter maintainer (XiaobingZhang) on ROCm/aiter#3586. Signed-off-by: Rohan Potdar <rohan.potdar@amd.com>
diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py
@@ -152,6 +152,7 @@ def _rocm_aiter_fused_moe_impl(
     output_dtype: torch.dtype | None = None,
     hidden_pad: int = 0,
     intermediate_pad: int = 0,
+    gate_mode: str = "",
     bias1: torch.Tensor | None = None,
     bias2: torch.Tensor | None = None,
     moe_sorting_dispatch_policy: int = 0,
@@ -162,6 +163,11 @@ def _rocm_aiter_fused_moe_impl(
     activation = ActivationType(activation_method)
     quant_type = QuantType(quant_method)
 
+    extra_kwargs: dict = {}
+    # `gate_mode` was added to aiter.fused_moe in ROCm/aiter#3123 (aiter>=0.1.14).
+    if gate_mode and rocm_aiter_ops.fused_moe_supports_gate_mode():
+        extra_kwargs["gate_mode"] = gate_mode
+
     return fused_moe(
         hidden_states,
         w1,
@@ -183,6 +189,7 @@ def _rocm_aiter_fused_moe_impl(
         bias1=bias1,
         bias2=bias2,
         moe_sorting_dispatch_policy=moe_sorting_dispatch_policy,
+        **extra_kwargs,
     )
 
 
@@ -204,6 +211,7 @@ def _rocm_aiter_fused_moe_fake(
     output_dtype: torch.dtype | None = None,
     hidden_pad: int = 0,
     intermediate_pad: int = 0,
+    gate_mode: str = "",
     bias1: torch.Tensor | None = None,
     bias2: torch.Tensor | None = None,
     moe_sorting_dispatch_policy: int = 0,
@@ -1643,6 +1651,20 @@ def are_gdn_triton_kernels_available(cls) -> bool:
         except (ImportError, ModuleNotFoundError):
             return False
 
+    @classmethod
+    @if_aiter_supported
+    @functools.cache
+    def fused_moe_supports_gate_mode(cls) -> bool:
+        """Probe whether the installed aiter.fused_moe accepts `gate_mode`.
+
+        Added in aiter#3123 (>=0.1.14). Builds with older aiter must omit the kwarg.
+        """
+        import inspect
+
+        from aiter.fused_moe import fused_moe
+
+        return "gate_mode" in inspect.signature(fused_moe).parameters
+
     @staticmethod
     @if_aiter_supported
     def register_ops_once() -> None:
@@ -1976,6 +1998,7 @@ def fused_moe(
         output_dtype: torch.dtype | None = None,
         hidden_pad: int = 0,
         intermediate_pad: int = 0,
+        gate_mode: str = "",
         bias1: torch.Tensor | None = None,
         bias2: torch.Tensor | None = None,
         moe_sorting_dispatch_policy: int = 0,
@@ -1998,6 +2021,7 @@ def fused_moe(
             output_dtype,
             hidden_pad,
             intermediate_pad,
+            gate_mode,
             bias1,
             bias2,
             moe_sorting_dispatch_policy,
diff --git a/vllm/model_executor/layers/fused_moe/experts/rocm_aiter_moe.py b/vllm/model_executor/layers/fused_moe/experts/rocm_aiter_moe.py
@@ -341,6 +341,18 @@ def rocm_aiter_fused_experts(
             - moe_config.intermediate_size_per_partition_unpadded
         )
 
+        # MXFP4 W4A16 weights are interleave-shuffled in oracle/mxfp4.py;
+        # match with GateMode.INTERLEAVE or aiter#3123 dispatch returns
+        # garbage / fails JIT.
+        gate_mode = ""
+        if quant_config.use_mxfp4_w4a16:
+            try:
+                from aiter.ops.flydsl.moe_common import GateMode
+
+                gate_mode = GateMode.INTERLEAVE.value
+            except ImportError:
+                pass
+
         return rocm_aiter_ops.fused_moe(
             hidden_states,
             w1,
@@ -359,6 +371,7 @@ def rocm_aiter_fused_experts(
             output_dtype=output_dtype,
             hidden_pad=hidden_pad // 128 * 128,
             intermediate_pad=intermediate_pad // 64 * 64 * 2,
+            gate_mode=gate_mode,
             bias1=quant_config.w1_bias if quant_config.use_mxfp4_w4a16 else None,
             bias2=quant_config.w2_bias if quant_config.use_mxfp4_w4a16 else None,
             moe_sorting_dispatch_policy=moe_sorting_dispatch_policy,