Update

manuelcandales · manuelcandales · commit 47cbe767a72f · 2026-04-14T12:25:25.000-04:00
[ghstack-poisoned]
diff --git a/backends/apple/metal/CMakeLists.txt b/backends/apple/metal/CMakeLists.txt
@@ -45,6 +45,7 @@ set(_aoti_metal_sources
     runtime/ops/common.mm
     runtime/ops/op_bmm.mm
     runtime/ops/op_convolution.mm
+    runtime/ops/op_gather_qmv.mm
     runtime/ops/op_linear_4bit.mm
     runtime/ops/op_mm.mm
     runtime/ops/op_sdpa.mm
diff --git a/backends/apple/metal/metal_backend.py b/backends/apple/metal/metal_backend.py
@@ -37,6 +37,7 @@ def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
             "at::_ops::_scaled_dot_product_attention_math_for_mps::call": None,
             "torchao::_linear_fp_act_4bit_weight": None,
             "at::_ops::topk::call": None,
+            "metal::gather_qmv": None,
         }
 
     @classmethod
@@ -76,6 +77,17 @@ def get_aoti_compile_options(
 
         from torchao.experimental.ops.mps.cshim import torchao_op_c_shim
 
-        inductor_configs["aot_inductor.custom_ops_to_c_shims"] = torchao_op_c_shim
+        custom_c_shims = {**torchao_op_c_shim}
+
+        try:
+            from executorch.backends.apple.metal.ops.gather_qmv import (
+                metal_gather_qmv_c_shim,
+            )
+
+            custom_c_shims.update(metal_gather_qmv_c_shim)
+        except ImportError:
+            pass
+
+        inductor_configs["aot_inductor.custom_ops_to_c_shims"] = custom_c_shims
 
         return inductor_configs
diff --git a/backends/apple/metal/ops/__init__.py b/backends/apple/metal/ops/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/backends/apple/metal/ops/gather_qmv.py b/backends/apple/metal/ops/gather_qmv.py
@@ -0,0 +1,84 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+metal::gather_qmv custom op for MoE expert-indexed quantized matmul.
+
+Performs y[i] = W[expert_indices[i]] @ x[i] with INT4 quantized expert weights.
+The Metal fallback kernel is in runtime/ops/op_gather_qmv.mm.
+"""
+
+import torch
+from torch import Tensor
+
+
+@torch.library.custom_op("metal::gather_qmv", mutates_args=())
+def gather_qmv(
+    x: Tensor,  # [P, K] — activations (P = num token-expert pairs)
+    w: Tensor,  # [E, N, K_packed] — packed INT4 expert weights
+    scales: Tensor,  # [E, N, K/gs] — per-group scales (gs = group_size)
+    biases: Tensor,  # [E, N, K/gs] — per-group biases (gs = group_size)
+    expert_indices: Tensor,  # [P] — expert index per pair; selects the slice of w/scales/biases
+    group_size: int,  # number of consecutive K columns that share one scale/bias
+) -> Tensor:
+    """Reference implementation for tracing and CPU testing."""
+    P, K = x.shape  # x must be 2-D
+    E, N, K_packed = w.shape  # w must be 3-D; E and K_packed are not used below
+
+    y = torch.zeros(P, N, dtype=x.dtype, device=x.device)  # [P, N], same dtype/device as x
+    for i in range(P):
+        eidx = expert_indices[i].item()  # Python scalar used to index dim 0
+        w_e = w[eidx]  # [N, K_packed]
+        s_e = scales[eidx]  # [N, K/gs]
+        b_e = biases[eidx]  # [N, K/gs]
+
+        # Dequantize: unpack INT4, apply affine dequant (redone for every pair, even if the expert repeats)
+        w_unpacked = _dequantize_int4_affine(w_e, s_e, b_e, K, group_size)
+        y[i] = w_unpacked @ x[i]  # [N, K] @ [K] -> [N]; w_unpacked carries scales' dtype
+
+    return y
+
+
+def _dequantize_int4_affine(
+    w_packed: Tensor, scales: Tensor, biases: Tensor, K: int, group_size: int
+) -> Tensor:
+    """Dequantize packed INT4 weights using MLX affine format."""
+    N = w_packed.shape[0]  # number of weight rows
+    w_bytes = w_packed.to(torch.int16)  # widen so the shift/mask below run on int16 values
+    low = w_bytes & 0x0F  # low nibble of each packed element
+    high = (w_bytes >> 4) & 0x0F  # high nibble of each packed element
+    w_int = torch.stack([low, high], dim=-1).reshape(N, K).float()  # low nibble first; needs 2 * packed columns == K
+
+    scales_expanded = scales.float().repeat_interleave(group_size, dim=-1)[:, :K]  # one scale per group_size columns, cropped to K
+    biases_expanded = biases.float().repeat_interleave(group_size, dim=-1)[:, :K]  # one bias per group_size columns, cropped to K
+
+    return (w_int * scales_expanded + biases_expanded).to(scales.dtype)  # computed in float32, returned in scales' dtype
+
+
+@torch.library.register_fake("metal::gather_qmv")  # fake implementation: produces output metadata only
+def gather_qmv_fake(
+    x: Tensor,
+    w: Tensor,
+    scales: Tensor,
+    biases: Tensor,
+    expert_indices: Tensor,
+    group_size: int,
+) -> Tensor:
+    P = x.shape[0]  # rows of x
+    N = w.shape[1]  # dim 1 of w
+    return torch.empty(P, N, dtype=x.dtype, device=x.device)  # uninitialized [P, N]; matches gather_qmv's output shape, dtype and device
+
+
+# C shim mapping for AOTInductor code generation.
+# Maps the torch op to the C declaration of the shim function that the generated wrapper calls.
+metal_gather_qmv_c_shim = {
+    torch.ops.metal.gather_qmv.default: [
+        "AOTITorchError aoti_torch_mps_gather_qmv("
+        "AtenTensorHandle X, AtenTensorHandle W, AtenTensorHandle S, "
+        "AtenTensorHandle Z, AtenTensorHandle ExpertIndices, "  # Z sits in the position of gather_qmv's `biases` argument
+        "int64_t group_size, AtenTensorHandle* ret)"
+    ],
+}
diff --git a/backends/apple/metal/runtime/ops/op_gather_qmv.mm b/backends/apple/metal/runtime/ops/op_gather_qmv.mm
diff --git a/backends/apple/metal/tests/test_modules.py b/backends/apple/metal/tests/test_modules.py