Commit e7375a1

Harden int4_plain_mm: dtype checks, scale hoist, docstrings

- Add dtype checks for qdata (uint8/int8), scale (bf16), zero (bf16) in C shim
- Hoist weight scale/zero loads outside inner loop (reload only on group change)
- Clarify int4_dispatch.py docblock: runs at eager/trace time, not .pte runtime
- Clarify test docblock: tests eager dispatch, not C shim runtime

1 parent 6273bb2 commit e7375a1

3 files changed: 40 additions & 12 deletions

File tree

backends/cuda/int4_dispatch.py

Lines changed: 16 additions & 8 deletions
@@ -4,14 +4,22 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-"""Int4Tensor F.linear dispatch for CUDA.
-
-Decode (M<=4): Custom op ``executorch_cuda::int4_plain_mm`` — in eager this
-               dequants + calls F.linear; in .pte runtime the C shim runs a
-               W4A8 dp4a matvec kernel.
-Prefill (M>4): Inline dequant + F.linear — AOTI compiles this into the .so
-               using inductor's own cuBLAS codegen, so no explicit cuBLAS
-               dependency in our shim library.
+"""Int4Tensor F.linear dispatch for CUDA — runs at eager / export trace time.
+
+This module overrides Int4Tensor's F.linear dispatch so that torch.export
+traces through our custom op and dequant logic instead of torchao's default
+(mslk/tinygemm). The code here executes during eager inference and during
+AOTI export tracing — it does NOT run at .pte runtime.
+
+At .pte runtime, the captured graph is executed by the AOTI-generated .so:
+- The custom op ``executorch_cuda::int4_plain_mm`` maps to a C shim that
+  runs the W4A8 dp4a matvec kernel (backends/cuda/runtime/shims/).
+- The inline dequant + F.linear is compiled by inductor into fused Triton
+  dequant + cuBLAS matmul kernels.
+
+Dispatch strategy (determines what gets captured in the export graph):
+Decode (M<=4): Custom op ``executorch_cuda::int4_plain_mm``
+Prefill (M>4): Inline dequant + F.linear (standard PyTorch ops)

 Import this module before using nn.Linear with Int4Tensor weights::
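The decode/prefill split in the new docblock can be modeled as a shape-only sketch. The helper name `pick_branch` is hypothetical; the real dispatch additionally dequantizes and calls F.linear in eager mode, or emits the custom op / inline ops into the export graph.

```python
import torch

DECODE_MAX_M = 4  # boundary from the docblock: M <= 4 -> decode, M > 4 -> prefill


def pick_branch(x: torch.Tensor) -> str:
    """Return which dispatch branch the docblock's strategy would capture.

    Hypothetical shape-only model of the Int4Tensor F.linear override:
    batch dims are flattened, and the resulting row count M selects the path.
    """
    m = x.reshape(-1, x.shape[-1]).shape[0]  # rows after flattening batch dims
    return "int4_plain_mm" if m <= DECODE_MAX_M else "inline_dequant_linear"


print(pick_branch(torch.zeros(1, 4096)))     # single decode token -> custom op
print(pick_branch(torch.zeros(2, 7, 4096)))  # 14 rows -> inline dequant path
```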

backends/cuda/runtime/shims/int4_plain_mm.cuh

Lines changed: 14 additions & 2 deletions
@@ -130,6 +130,9 @@ __global__ void __launch_bounds__(MV_THREADS)

   float sum = 0.0f;

+  int32_t prev_g = -1;
+  float ws = 0.0f, wz = 0.0f;
+
   for (int32_t i = lane_id; i < K_half_16; i += MV_WARP_SIZE) {
     uint4 packed16 = __ldg(&qrow16[i]);
     int32_t k_base = i * 32;
@@ -141,6 +144,12 @@
       int32_t k_word = k_base + w * 8;
       int32_t g = k_word >> gs_shift;

+      if (g != prev_g) {
+        ws = __bfloat162float(__ldg(&scale_base[g * scale_stride]));
+        wz = __bfloat162float(__ldg(&zero_base[g * scale_stride]));
+        prev_g = g;
+      }
+
       int32_t vi_lo = packed & 0x0F0F0F0F;
       int32_t vi_hi = (packed >> 4) & 0x0F0F0F0F;

@@ -156,8 +165,6 @@
       int32_t dp = __dp4a(vi_lo, a_even, 0);
       dp = __dp4a(vi_hi, a_odd, dp);

-      float ws = __bfloat162float(__ldg(&scale_base[g * scale_stride]));
-      float wz = __bfloat162float(__ldg(&zero_base[g * scale_stride]));
       float a_scale = qb->d;

       int32_t a_sum8 = __dp4a(0x01010101, a_even, 0);
@@ -212,6 +219,11 @@ void _int4_plain_mm_cuda(
  int32_t N = qdata.size(0);

  ET_CHECK(A.dtype() == c10::ScalarType::BFloat16);
+  ET_CHECK(
+      qdata.dtype() == c10::ScalarType::Byte ||
+      qdata.dtype() == c10::ScalarType::Char);
+  ET_CHECK(scale.dtype() == c10::ScalarType::BFloat16);
+  ET_CHECK(zero.dtype() == c10::ScalarType::BFloat16);
  ET_CHECK(A.dim() == 2);
  ET_CHECK(qdata.dim() == 2);
  ET_CHECK(qdata.size(1) == K / 2);
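A reference model of the per-group dequant that the hoisted `ws`/`wz` loads implement. This is a sketch: the affine form `w = q * scale[g] + zero[g]` is inferred from the kernel's `a_sum8` zero-correction term, and the nibble layout (low nibble holds even k, high nibble odd k) is an assumption, not the confirmed packing. Since `g = k // group_size` changes only once per group, caching the last (scale, zero) pair and reloading on group change avoids redundant global loads, which is exactly what the hunk above does.

```python
import numpy as np


def int4_matvec_ref(qdata, scale, zero, a, group_size):
    """Reference for the W4A8-style matvec: y[n] = sum_k w[n, k] * a[k].

    qdata: (N, K//2) uint8, two 4-bit weights per byte (assumed layout:
    low nibble = even k, high nibble = odd k).
    scale, zero: (N, K // group_size) per-group dequant parameters.
    """
    N, K_half = qdata.shape
    K = K_half * 2
    w = np.empty((N, K), dtype=np.float32)
    w[:, 0::2] = (qdata & 0x0F).astype(np.float32)  # low nibbles -> even k
    w[:, 1::2] = (qdata >> 4).astype(np.float32)    # high nibbles -> odd k
    g = np.arange(K) // group_size                  # group index per column
    w = w * scale[:, g] + zero[:, g]                # assumed affine dequant
    return w @ a


# Tiny example: N=1, K=8, group_size=4 (weights 1..8 packed pairwise).
q = np.array([[0x21, 0x43, 0x65, 0x87]], dtype=np.uint8)
s = np.array([[1.0, 2.0]], dtype=np.float32)
z = np.array([[0.0, -1.0]], dtype=np.float32)
a = np.ones(8, dtype=np.float32)
print(int4_matvec_ref(q, s, z, a, group_size=4))  # -> [58.]
```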

backends/cuda/tests/test_int4_dispatch.py

Lines changed: 10 additions & 2 deletions
@@ -7,10 +7,18 @@

 """Tests for Int4Tensor F.linear dispatch via int4_dispatch.

+These tests validate the eager / trace-time dispatch path — the same code
+that torch.export traces through when building the AOTI graph. They do NOT
+test the .pte runtime C shim (dp4a kernel); that is covered by
+test_aoti_torch_cuda_int4_plain_mm.cpp (C++ unit tests) and
+test_cuda_pipeline.py::TestCudaExport (end-to-end export + lower).
+
 The API contract: after importing int4_dispatch, F.linear and nn.Linear
 with Int4Tensor weights produce numerically correct results. Tests verify
-this across decode (M=1), prefill (M>1), batched (3D), bias, group sizes,
-and symmetric/asymmetric quantization.
+this across decode (M<=4), prefill (M>4), batched (3D), bias, group sizes,
+and symmetric/asymmetric quantization. Correctness is measured as mean
+relative error against the unquantized bf16 reference (not per-element
+atol/rtol, which is too strict for INT4 quantization noise).

 Usage:
     python -m pytest backends/cuda/tests/test_int4_dispatch.py -v
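The mean-relative-error criterion from the test docblock can be sketched as follows. The helper name and the epsilon guard are hypothetical; the actual threshold used by the tests is not shown in this diff.

```python
import torch


def mean_rel_error(actual: torch.Tensor, ref: torch.Tensor, eps: float = 1e-6) -> float:
    """Mean of |actual - ref| / (|ref| + eps) over all elements.

    Unlike per-element atol/rtol checks (torch.testing.assert_close), this
    averages the quantization noise, so a handful of poorly represented
    elements cannot fail an otherwise accurate INT4 result.
    """
    return ((actual - ref).abs() / (ref.abs() + eps)).mean().item()


ref = torch.ones(8)
quantized = torch.full((8,), 1.02)  # stand-in for a dequantized INT4 output
print(mean_rel_error(quantized, ref))  # ≈ 0.02
```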
