pytorch
diff --git a/‎backends/cuda/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion b/‎backends/cuda/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/cuda/cuda_backend.py‎
Lines changed: 16 additions & 0 deletions b/‎backends/cuda/cuda_backend.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎backends/cuda/int4_dispatch.py‎
Lines changed: 101 additions & 0 deletions b/‎backends/cuda/int4_dispatch.py‎
Lines changed: 101 additions & 0 deletions
diff --git a/‎backends/cuda/runtime/shims/int4_plain_mm.cu‎
Lines changed: 81 additions & 0 deletions b/‎backends/cuda/runtime/shims/int4_plain_mm.cu‎
Lines changed: 81 additions & 0 deletions
@@ -110,7 +110,8 @@ set(_aoti_cuda_shim_sources runtime/shims/memory.cpp
 # Only build CUDA shims when CUDA language/toolchain is available.
 if(CMAKE_CUDA_COMPILER)
   list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu
-       runtime/shims/sort.cu runtime/shims/rand.cu
+       runtime/shims/int4_plain_mm.cu runtime/shims/sort.cu
+       runtime/shims/rand.cu
   )
 endif()
 
 
@@ -226,6 +226,8 @@ def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
             "at::_ops::_weight_int4pack_mm::call": None,
             "at::_ops::sort_stable::call": None,
             "aoti_torch_cuda_randint_low_out": None,
+            "executorch_cuda::int4_plain_mm": None,
+            "aoti_torch_cuda_int4_plain_mm": None,
         }
 
     @classmethod
@@ -298,6 +300,20 @@ def get_aoti_compile_options(
             "aot_inductor.emit_multi_arch_kernel": emit_multi_arch_kernel,
         }
 
+        try:
+            import torch
+
+            options["aot_inductor.custom_ops_to_c_shims"] = {
+                torch.ops.executorch_cuda.int4_plain_mm.default: [
+                    "AOTITorchError aoti_torch_cuda_int4_plain_mm("
+                    "AtenTensorHandle, AtenTensorHandle, AtenTensorHandle, "
+                    "AtenTensorHandle, int64_t, AtenTensorHandle*)"
+                ],
+            }
+        except AttributeError:
+            # int4_dispatch.py not imported — op not registered, skip C shim mapping
+            pass
+
         # Parse compile_specs to check for platform
 
         platform = "linux"
 
@@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Int4Tensor F.linear dispatch for CUDA.
+
+Decode (M<=4): Custom op ``executorch_cuda::int4_plain_mm`` — in eager this
+               dequants + calls F.linear; in .pte runtime the C shim runs a
+               W4A8 dp4a matvec kernel.
+Prefill (M>4): Inline dequant + F.linear — AOTI compiles this into the .so
+               using inductor's own cuBLAS codegen, so no explicit cuBLAS
+               dependency in our shim library.
+
+Import this module before using nn.Linear with Int4Tensor weights::
+
+    import executorch.backends.cuda.int4_dispatch  # noqa: F401
+"""
+
+import torch
+import torch.nn.functional as F
+from torch.library import impl, Library
+from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
+
+# ---------------------------------------------------------------------------
+# Custom op for decode (M<=4): dp4a matvec in C shim, dequant+F.linear in eager
+# ---------------------------------------------------------------------------
+
+# Declare the ``executorch_cuda`` op library and the schema of its single op.
+# Implementations for the "Meta" and "CUDA" dispatch keys are attached below.
+_lib = Library("executorch_cuda", "DEF")
+_lib.define(
+    "int4_plain_mm(Tensor self, Tensor qdata, Tensor scale, Tensor zero, int group_size) -> Tensor"
+)
+
+
+@impl(_lib, "int4_plain_mm", "Meta")
+def _meta(self, qdata, scale, zero, group_size):
+    return torch.empty(
+        self.shape[0], qdata.shape[0], dtype=self.dtype, device=self.device
+    )
+
+
+@impl(_lib, "int4_plain_mm", "CUDA")
+def _cuda(self, qdata, scale, zero, group_size):
+    return _dequant_matmul(self, qdata, scale, zero, group_size)
+
+
+def _dequant_matmul(x, qdata, scale, zero, group_size):
+    """Dequant INT4 weights to input dtype and call F.linear.
+
+    Each byte of ``qdata`` is split into a low and a high nibble. Within each
+    pair of weights along K, the low nibble comes first and the high nibble
+    second. Every run of ``group_size`` weights is then dequantized as
+    ``(q - zero) * scale``.
+
+    Args:
+        x: Activations; its dtype is used for the dequantized weight.
+        qdata: Packed weight with shape ``(N, K // 2)``.
+        scale: Per-group scales. Transposed before use, so presumably laid
+            out as ``(K // group_size, N)`` -- TODO confirm against Int4Tensor.
+        zero: Per-group zero points, handled the same way as ``scale``.
+        group_size: Number of consecutive weights sharing one scale/zero pair.
+
+    Returns:
+        ``F.linear(x, w_deq)`` where ``w_deq`` has shape ``(N, K)``.
+    """
+    N, K_half = qdata.shape
+    K = K_half * 2  # two 4-bit values per packed byte
+    n_groups = K // group_size
+    gs_half = group_size // 2  # packed bytes per quantization group
+    dtype = x.dtype
+
+    p = qdata.to(torch.uint8).reshape(N, n_groups, gs_half)
+    low = (p & 0x0F).to(dtype)
+    high = ((p >> 4) & 0x0F).to(dtype)
+    data = torch.stack([low, high], dim=-1).reshape(N, n_groups, group_size)  # interleave low, high
+
+    s = scale.to(dtype).t().unsqueeze(-1)  # trailing 1 broadcasts over group_size
+    z = zero.to(dtype).t().unsqueeze(-1)
+    w_deq = ((data - z) * s).reshape(N, K)
+
+    return F.linear(x, w_deq)
+
+
+# ---------------------------------------------------------------------------
+# Int4Tensor F.linear dispatch
+# ---------------------------------------------------------------------------
+
+aten = torch.ops.aten  # short alias for the aten op namespace
+_implements = Int4Tensor.implements  # decorator applied with [aten.linear.default] below
+_implements_torch_function = Int4Tensor.implements_torch_function  # decorator applied with [F.linear] below
+
+
+@_implements([aten.linear.default])
+@_implements_torch_function([F.linear])
+def _(func, types, args, kwargs):
+    input_tensor = args[0]
+    weight_tensor = args[1]
+    bias = args[2] if len(args) > 2 else None
+
+    orig_shape = input_tensor.shape
+    x_2d = input_tensor.reshape(-1, orig_shape[-1])
+
+    qdata = weight_tensor.qdata
+    scale = weight_tensor.scale
+    zero = weight_tensor.zero_point
+    gs = weight_tensor.block_size[-1]
+
+    M = x_2d.shape[0]
+    if M <= 4:
+        out = torch.ops.executorch_cuda.int4_plain_mm(x_2d, qdata, scale, zero, gs)
+    else:
+        out = _dequant_matmul(x_2d, qdata, scale, zero, gs)
+
+    out = out.reshape(*orig_shape[:-1], -1)
+    if bias is not None:
+        out = out + bias
+    return out
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <executorch/backends/aoti/utils.h>
+#include <executorch/backends/cuda/runtime/shims/int4_plain_mm.h>
+#include <executorch/backends/cuda/runtime/shims/int4_plain_mm.cuh>
+#include <executorch/backends/cuda/runtime/shims/memory.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch::backends::cuda {
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// C shim for executorch_cuda::int4_plain_mm.
+//
+// Allocates a contiguous [self->size(0), qdata->size(0)] BFloat16 CUDA tensor
+// on device 0, runs _int4_plain_mm_cuda into it, and hands it back via *ret0.
+//
+// Returns InvalidArgument if any pointer is null or group_size is not
+// positive, the allocator's error if the output cannot be created, and
+// whatever ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR reports after launch.
+// *ret0 is written only on success.
+//
+// NOTE(review): the output dtype is fixed to BFloat16 here, while the Python
+// Meta kernel reports the dtype of `self` -- confirm callers only pass bf16.
+AOTITorchError aoti_torch_cuda_int4_plain_mm(
+    Tensor* self,
+    Tensor* qdata,
+    Tensor* scale,
+    Tensor* zero,
+    int64_t group_size,
+    Tensor** ret0) {
+  ET_CHECK_OR_RETURN_ERROR(
+      self != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda_int4_plain_mm: self is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      qdata != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda_int4_plain_mm: qdata is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      scale != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda_int4_plain_mm: scale is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      zero != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda_int4_plain_mm: zero is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      ret0 != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda_int4_plain_mm: ret0 is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      group_size > 0,
+      InvalidArgument,
+      "aoti_torch_cuda_int4_plain_mm: group_size must be positive, got %lld",
+      static_cast<long long>(group_size));
+
+  // Keep the sizes in 64 bits; they feed int64_t shape/stride arrays below.
+  const int64_t M = self->size(0);
+  const int64_t N = qdata->size(0);
+  Tensor* C = nullptr;
+  std::array<int64_t, 2> c_shape = {M, N};
+  std::array<int64_t, 2> c_stride = {N, 1};
+  const AOTITorchError alloc_err = aoti_torch_empty_strided(
+      2,
+      c_shape.data(),
+      c_stride.data(),
+      static_cast<int32_t>(
+          executorch::backends::aoti::slim::c10::ScalarType::BFloat16),
+      static_cast<int32_t>(
+          executorch::backends::aoti::slim::c10::DeviceType::CUDA),
+      0,
+      &C);
+  // Do not launch the kernel on a missing output buffer.
+  if (alloc_err != Error::Ok) {
+    return alloc_err;
+  }
+  ET_CHECK_OR_RETURN_ERROR(
+      C != nullptr,
+      Internal,
+      "aoti_torch_cuda_int4_plain_mm: output allocation returned null");
+
+  _int4_plain_mm_cuda(*self, *qdata, *scale, *zero, group_size, C);
+  // NOTE(review): on a launch error this macro returns before C is handed to
+  // the caller -- confirm whether C needs to be released on that path.
+  ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR();
+
+  *ret0 = C;
+  return Error::Ok;
+}
+
+#ifdef __cplusplus
+}
+#endif
+} // namespace executorch::backends::cuda
Original file line number	Diff line number	Diff line change
`@@ -110,7 +110,8 @@ set(_aoti_cuda_shim_sources runtime/shims/memory.cpp`
`110`	`110`	`# Only build CUDA shims when CUDA language/toolchain is available.`
`111`	`111`	`if(CMAKE_CUDA_COMPILER)`
`112`	`112`	`list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu`
`113`		`- runtime/shims/sort.cu runtime/shims/rand.cu`
	`113`	`+ runtime/shims/int4_plain_mm.cu runtime/shims/sort.cu`
	`114`	`+ runtime/shims/rand.cu`
`114`	`115`	`)`
`115`	`116`	`endif()`
`116`	`117`