From 73589fcbb89ac158c063ae4b2dc796db4fc10b34 Mon Sep 17 00:00:00 2001
From: mygitljf <2410316423@qq.com>
Date: Wed, 20 May 2026 18:55:41 +0000
Subject: [PATCH] [Bug][TMA] Skip OOB gate for 1D TMA bulk-copy eligibility
 (#2180)

When the global tensor has a dynamic outer shape and the index cannot
be statically proven within bounds, `buffer_oob` is set to true. The
existing logic in `AnalyzeCopyFacts` then masked out 1D bulk-copy
eligibility along with the descriptor-based 2D path, causing
`InferLayout` to install a swizzle-shaped shared layout via
`InferBulkLayout` and forcing `Copy::Lower` to fall through to the
2D `LowerBulk`. There the `inner_box_dim > 256` branch issues
`(K / 256)` separate `tl::tma_load` calls instead of a single 1D
bulk copy.

The 1D bulk-copy path emits `cp.async.bulk` (not `cp.async.bulk.tensor`)
and has the same OOB semantics as a plain `T.copy()` - it does not
need the descriptor-only OOB gate. Drop `!ctx.buffer_oob` from the
1D eligibility check so dynamic-outer-shape 1D copies that already
satisfy `CheckBulkCopy1D` (contiguous innermost slice, full-extent
trailing dim, element count match) keep the single-issue path.

Repro: see issue #2180 - `T.tma_copy(A[var, 0:K], a_shared, barrier=mbar)`
with `M=T.dynamic('M')`, `K=T.const('K')` on `A[M, K]` was lowering to
4 split `tl::tma_load` calls; with this patch it lowers to a single
`tl::tma_load(smem, gmem, mbar, total_bytes)` and no `CUtensorMap`
descriptor. Verified locally on A100 at the codegen-string level
(end-to-end launch requires sm_90+ and is left to CI).

Tests: 3 new regression cases in
`testing/python/language/test_tilelang_language_tma_1d.py` covering
fp32/K=1024, fp32/K=512, fp16/K=1024 with `T.dynamic` outer shape.
All existing TMA tests pass without regression.
---
 src/backend/cuda/op/copy_analysis.cc          |  9 ++++-
 .../language/test_tilelang_language_tma_1d.py | 37 +++++++++++++++++++
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/src/backend/cuda/op/copy_analysis.cc b/src/backend/cuda/op/copy_analysis.cc
index bbb672b187..52cd723bda 100644
--- a/src/backend/cuda/op/copy_analysis.cc
+++ b/src/backend/cuda/op/copy_analysis.cc
@@ -493,8 +493,13 @@ CopyFacts AnalyzeCopyFacts(const CopyNode &op, const CopyAnalysisContext &ctx) {
   const LayoutMap &layout_map =
       ctx.layout_map != nullptr ? *ctx.layout_map : empty_layout_map;
   bool is_cutedsl = TargetIsCuTeDSL(ctx.target);
-  facts.layout_dependent_tma_available =
-      facts.has_layout_map && !is_cutedsl && !ctx.buffer_oob;
+  // Issue #2180: only the descriptor-based 2D TMA path needs the OOB gate.
+  // The 1D bulk-copy path emits `cp.async.bulk`, which has the same OOB
+  // semantics as plain T.copy(); gating it on `buffer_oob` causes
+  // InferLayout to fall through to the 2D path for dynamic-outer-shape
+  // tensors and install a swizzle-shaped shared layout, which then forces
+  // Lower() into LowerBulk and triggers the 256-element splitting.
+  facts.layout_dependent_tma_available = facts.has_layout_map && !is_cutedsl;
 
   if (facts.layout_dependent_tma_available) {
     facts.can_bulk_load_1d =
diff --git a/testing/python/language/test_tilelang_language_tma_1d.py b/testing/python/language/test_tilelang_language_tma_1d.py
index 9cb79c10c6..fdb783a2d3 100644
--- a/testing/python/language/test_tilelang_language_tma_1d.py
+++ b/testing/python/language/test_tilelang_language_tma_1d.py
@@ -46,10 +46,47 @@ def run_elementwise_add(M, N):
         assert "tma_load" in code and "CUtensorMap" in code
 
 
+def _lower_issue_2180_kernel(K, dtype):
+    M = T.dynamic("M")
+
+    @T.prim_func
+    def gemm(A: T.Tensor([M, K], dtype)):
+        with T.Kernel(M, threads=256):
+            var = T.alloc_var(T.int32, init=0)
+            a_shared = T.alloc_shared(K, dtype=dtype)
+            mbar = T.alloc_barrier(256)
+            T.tma_copy(A[var, 0:K], a_shared, barrier=mbar)
+
+    artifact = tilelang.lower(gemm, target={"kind": "cuda", "arch": "sm_90a"})
+    return artifact.kernel_source
+
+
+def _check_single_1d_tma(code):
+    n_tma_load = code.count("tl::tma_load(")
+    has_desc = "CUtensorMap" in code
+    assert n_tma_load == 1, f"Issue #2180: expected exactly 1 tl::tma_load, got {n_tma_load}.\nGenerated source:\n{code}"
+    assert not has_desc, f"Issue #2180: expected 1D bulk-copy without CUtensorMap descriptor.\nGenerated source:\n{code}"
+
+
+def test_issue_2180_full_row_fp32_k1024():
+    _check_single_1d_tma(_lower_issue_2180_kernel(K=1024, dtype=T.float32))
+
+
+def test_issue_2180_full_row_fp32_k512():
+    _check_single_1d_tma(_lower_issue_2180_kernel(K=512, dtype=T.float32))
+
+
+def test_issue_2180_full_row_fp16_k1024():
+    _check_single_1d_tma(_lower_issue_2180_kernel(K=1024, dtype=T.float16))
+
+
 def main():
     run_elementwise_add(128, 128)
     run_elementwise_add(256, 128)
     run_elementwise_add(256, 256)
+    test_issue_2180_full_row_fp32_k1024()
+    test_issue_2180_full_row_fp32_k512()
+    test_issue_2180_full_row_fp16_k1024()
 
 
 if __name__ == "__main__":