[Gemm,Sm100] Implement TMA gather for varlen_k

tridao · tridao · commit 07f67b6b7d02 · 2026-04-05T02:00:30.000-04:00
diff --git a/quack/copy_utils.py b/quack/copy_utils.py
@@ -13,9 +13,10 @@
 import cutlass.pipeline
 from cutlass._mlir.dialects import llvm
 from cutlass._mlir import ir
+from cutlass._mlir.dialects import cute_nvgpu as _cute_nvgpu_ir
 
+from quack import layout_utils
 from quack.utils import make_vector
-from cutlass._mlir.dialects import cute_nvgpu as _cute_nvgpu_ir
 
 
 Sm100MmaPeerBitMask = 0xFEFFFFFF
@@ -1023,16 +1024,83 @@ def gather_m_get_tma_copy_fn(
     tma_gather4_load_fn = partial(tma_gather4_load, tma_desc_ptr, num_cta=cta_group)
 
     def copy_fn(src_idx, dst_idx, tma_bar_ptr: cute.Pointer):
+        tSR_sA_cur = tSR_sA[None, None, None, dst_idx]
         col_idx = tile_K * src_idx
         for m in cutlass.range(cute.size(tSR_rAIdx, mode=[1]), unroll_full=True):
             row_indices = [tSR_rAIdx[v, m] for v in range(4)]
-            smem_ptr = tSR_sA[None, m, None, dst_idx].iterator
+            smem_ptr = tSR_sA_cur[None, m, None].iterator
             with cute.arch.elect_one():
                 tma_gather4_load_fn(smem_ptr, tma_bar_ptr, col_idx, row_indices)
 
     return copy_fn
 
 
+@cute.jit
+def gather_k_get_tma_copy_fn(
+    tma_atom: cute.CopyAtom,
+    sA: cute.Tensor,  # ((4, tile_K/4), (tile_M,), STAGE) — K-grouped load layout
+    sAIdx: cute.Tensor,  # (tile_K, a_prefetch_stage) — K indices in smem
+    col_idx: Int32,  # M offset in global tensor (contiguous dim for M-major)
+    warp_idx: Int32,
+    num_warps: int,
+    num_cta: int = 1,
+) -> Tuple[Callable, Callable]:
+    """Build the copy and index-prefetch functions for TMA gather4 in K dimension (M-major A).
+
+    Each gather4 call loads 4 K-indices × a chunk of contiguous M-elements (typically 64).
+    col_idx is the absolute M position in the global tensor.
+    K indices come from sAIdx (prefetched to smem by the scheduler warp).
+
+    Returns (copy_fn, prefetch_from_smem_fn). prefetch_from_smem_fn loads the K indices from
+    smem into registers; copy_fn(src_idx, dst_idx, tSR_rAIdx, tma_bar_ptr) issues gather4.
+    """
+    tile_K = cute.size(sAIdx, mode=[0])
+    assert tile_K % 4 == 0
+    cta_group = num_cta
+
+    # Tiled copy for loading K indices from smem to registers (4 per vector, across warps)
+    copy_AIdx_s2r = cute.make_tiled_copy_tv(
+        cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), Int32, num_bits_per_copy=128),
+        cute.make_layout(num_warps),  # thr_layout
+        cute.make_layout(4),  # val_layout — 4 K indices per gather4
+    )
+    warp_idx = cute.arch.make_warp_uniform(warp_idx)
+    warp_copy_AIdx_s2r = copy_AIdx_s2r.get_slice(warp_idx)
+    tSR_sAIdx = warp_copy_AIdx_s2r.partition_S(sAIdx)  # (((4,1),4,4))
+    # ((4,1),4,(64,2),(1,4)):((64,0),1024,(1,4096),(0,8192))
+    tSR_sA = warp_copy_AIdx_s2r.partition_S(layout_utils.transpose_view(sA))
+    tma_desc_ptr = get_tma_desc_addr(tma_atom)
+    tma_gather4_load_fn = partial(tma_gather4_load, tma_desc_ptr, num_cta=cta_group)
+
+    def prefetch_from_smem_fn(
+        a_prefetch_pipeline,
+        src_idx,
+        dst_idx,
+        a_prefetch_consumer_state,
+    ) -> cute.Tensor:
+        a_prefetch_pipeline.consumer_wait(a_prefetch_consumer_state)
+        tSR_rAIdx = load_s2r(tSR_sAIdx[None, None, dst_idx])
+        cute.arch.sync_warp()
+        with cute.arch.elect_one():
+            a_prefetch_pipeline.consumer_release(a_prefetch_consumer_state)
+        return tSR_rAIdx
+
+    def copy_fn(src_idx, dst_idx, tSR_rAIdx, tma_bar_ptr: cute.Pointer):
+        # Issue gather4: col_idx = M position, row_indices = 4 K positions
+        tSR_sA_cur = tSR_sA[None, None, None, dst_idx]
+        gather_dim = cute.size(tSR_sA_cur, mode=[2, 0])  # Typically 64
+        for k in cutlass.range(cute.size(tSR_rAIdx, mode=[1]), unroll_full=True):
+            row_indices = [tSR_rAIdx[v, k] for v in range(4)]
+            for m in cutlass.range(cute.size(tSR_sA_cur, mode=[2, 1]), unroll_full=True):
+                smem_ptr = tSR_sA_cur[None, k, (None, m)].iterator
+                with cute.arch.elect_one():
+                    tma_gather4_load_fn(
+                        smem_ptr, tma_bar_ptr, col_idx + m * gather_dim, row_indices
+                    )
+
+    return copy_fn, prefetch_from_smem_fn
+
+
 # ---------------------------------------------------------------------------
 # Store helpers
 # ---------------------------------------------------------------------------
diff --git a/quack/gemm.py b/quack/gemm.py
@@ -188,9 +188,6 @@ def gemm(
     assert device_capacity[0] in [9, 10, 11, 12], "Only SM90, SM100, SM110, and SM120 are supported"
     if use_tma_gather:
         assert device_capacity[0] in [10, 11], "TMA gather currently requires SM100/SM110"
-        assert gather_A and varlen_m and not varlen_k, (
-            "TMA gather currently only supports varlen_m + gather_A"
-        )
     if rounding_mode == RoundingMode.RS:
         assert device_capacity[0] == 10, "Stochastic rounding (RoundingMode.RS) requires SM100"
     if is_dynamic_persistent and device_capacity[0] == 9:
diff --git a/quack/gemm_sm100.py b/quack/gemm_sm100.py
@@ -504,8 +504,6 @@ def __call__(
         assert (varlen_args.mAIdx is not None) == self.gather_A
         varlen_m = varlen_args.mCuSeqlensM is not None
         varlen_k = varlen_args.mCuSeqlensK is not None
-        if const_expr(self.use_tma_gather):
-            assert varlen_m and not varlen_k, "TMA gather currently only supports varlen_m"
 
         # Setup attributes that dependent on gemm inputs
         self._setup_attributes(epilogue_args, varlen_args)
@@ -794,8 +792,6 @@ def kernel(
         assert not (varlen_m and varlen_k)
         if const_expr(self.gather_A):
             assert varlen_m or varlen_k
-        if const_expr(self.use_tma_gather):
-            assert varlen_m and not varlen_k
         has_D = const_expr(mD_mnl is not None)
         has_C = const_expr(mC_mnl is not None)
 
@@ -1561,7 +1557,7 @@ def _make_gather_A_copy(
                 )
             elif const_expr(varlen_k):
                 col_idx = Int32(tile_coord_mnkl[0] * self.cta_tile_shape_mnk[0])
-                copy_A = copy_utils.gather_k_get_tma_copy_fn(
+                copy_A, prefetch_A = copy_utils.gather_k_get_tma_copy_fn(
                     tma_atom_a,
                     sA,
                     sAIdx,
@@ -1686,7 +1682,7 @@ def load_AB_tma_gather(
             tma_bar_ptr = ab_pipeline.producer_get_barrier(ab_producer_state)
             if is_tma_warp:
                 copy_B(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr)
-            copy_A(k_tile, smem_idx, tma_bar_ptr=tma_bar_ptr, *prefetch_out)
+            copy_A(k_tile, smem_idx, *prefetch_out, tma_bar_ptr=tma_bar_ptr)
             ab_pipeline.producer_commit(ab_producer_state)
             ab_producer_state.advance()
             peek_ab_empty_status = Boolean(True)
diff --git a/tests/test_linear_varlen_k.py b/tests/test_linear_varlen_k.py
@@ -3,6 +3,8 @@
 import pytest
 import torch
 
+from quack.cute_dsl_utils import get_device_capacity
+from quack.gemm import gemm as quack_gemm
 from quack.gemm_interface import (
     gemm,
     gemm_ref,
@@ -11,6 +13,11 @@
     gemm_add_inplace,
 )
 
+sm100_tma_gather_only = pytest.mark.skipif(
+    not torch.cuda.is_available() or get_device_capacity(torch.device("cuda"))[0] not in (10, 11),
+    reason="TMA gather tests require SM100/SM110",
+)
+
 
 def generate_A_with_gather(m, total_k, device, dtype, gather_A=False):
     """Generate A matrix and optionally A_idx for gather_A case with varlen_k.
@@ -42,6 +49,105 @@ def generate_A_with_gather(m, total_k, device, dtype, gather_A=False):
     return A, A_idx
 
 
+def run_lowlevel_varlen_k_gemm(
+    A,
+    B,
+    out,
+    cu_seqlens_k,
+    A_idx,
+    *,
+    dynamic_persistent=False,
+    use_tma_gather=False,
+):
+    device_capacity = get_device_capacity(A.device)[0]
+    tile_count_semaphore = (
+        torch.zeros(1, dtype=torch.int32, device=A.device)
+        if dynamic_persistent and device_capacity == 9
+        else None
+    )
+    quack_gemm(
+        A,
+        B,
+        out,
+        C=None,
+        tile_count_semaphore=tile_count_semaphore,
+        tile_M=256,
+        tile_N=256,
+        cluster_M=2,
+        cluster_N=1,
+        persistent=True,
+        is_dynamic_persistent=dynamic_persistent,
+        cu_seqlens_k=cu_seqlens_k,
+        A_idx=A_idx,
+        use_tma_gather=use_tma_gather,
+    )
+
+
+@sm100_tma_gather_only
+@pytest.mark.parametrize("dynamic_persistent", [False, True])
+@pytest.mark.parametrize("input_dtype", [torch.bfloat16])
+@pytest.mark.parametrize("n", [1024])
+@pytest.mark.parametrize("m", [2048])
+def test_gemm_varlen_k_tma_gather_matches_cpasync(
+    m,
+    n,
+    input_dtype,
+    dynamic_persistent,
+):
+    """Compare TMA gather vs cp.async gather for varlen_k."""
+    device = "cuda"
+    torch.random.manual_seed(42)
+    num_groups = 4
+    # Use K values divisible by tile_K (64 for bf16) to avoid partial-tile edge cases
+    seq_lens = torch.randint(2, 6, (num_groups,), device="cpu") * 64
+    total_k = seq_lens.sum().item()
+    cu_seqlens_k = torch.cat(
+        [torch.zeros(1, dtype=torch.int32), seq_lens.cumsum(0).to(torch.int32)]
+    ).to(device)
+    A, A_idx = generate_A_with_gather(m, total_k, device, input_dtype, gather_A=True)
+    # B for quack_gemm varlen_k: 2D (n, total_k), n-major (stride(-2)==1)
+    B_ref = torch.randn((total_k, n), device=device, dtype=input_dtype) / math.sqrt(
+        total_k / num_groups
+    )
+    B = B_ref.T  # (n, total_k) with n contiguous — stride(-2)==1
+
+    out_cpasync = torch.empty((num_groups, m, n), device=device, dtype=input_dtype)
+    out_tma = torch.empty_like(out_cpasync)
+
+    run_lowlevel_varlen_k_gemm(
+        A,
+        B,
+        out_cpasync,
+        cu_seqlens_k,
+        A_idx,
+        dynamic_persistent=dynamic_persistent,
+        use_tma_gather=False,
+    )
+    run_lowlevel_varlen_k_gemm(
+        A,
+        B,
+        out_tma,
+        cu_seqlens_k,
+        A_idx,
+        dynamic_persistent=dynamic_persistent,
+        use_tma_gather=True,
+    )
+
+    # gemm_ref expects B as (total_K, N)
+    out_ref = gemm_ref(
+        A.float(),
+        B_ref.float(),
+        cu_seqlens_k=cu_seqlens_k,
+        A_idx=A_idx,
+    )
+    out_pt = gemm_ref(A, B_ref, cu_seqlens_k=cu_seqlens_k, A_idx=A_idx)
+
+    assert out_tma.shape == (num_groups, m, n)
+    assert (out_tma - out_ref).abs().max() < 2 * (out_pt - out_ref).abs().max() + 1e-5
+    assert (out_cpasync - out_ref).abs().max() < 2 * (out_pt - out_ref).abs().max() + 1e-5
+    torch.testing.assert_close(out_tma, out_cpasync, atol=3e-2, rtol=1e-3)
+
+
 @pytest.mark.parametrize("permute_batch", [False, True])
 @pytest.mark.parametrize("gather_A", [False, True])
 # @pytest.mark.parametrize("gather_A", [False])