From 73589fcbb89ac158c063ae4b2dc796db4fc10b34 Mon Sep 17 00:00:00 2001 From: mygitljf <2410316423@qq.com> Date: Wed, 20 May 2026 18:55:41 +0000 Subject: [PATCH] [Bug][TMA] Skip OOB gate for 1D TMA bulk-copy eligibility (#2180) When the global tensor has a dynamic outer shape and the index cannot be statically proven within bounds, `buffer_oob` is set to true. The existing logic in `AnalyzeCopyFacts` then masked out 1D bulk-copy eligibility along with the descriptor-based 2D path, causing `InferLayout` to install a swizzle-shaped shared layout via `InferBulkLayout` and forcing `Copy::Lower` to fall through to the 2D `LowerBulk`. There the `inner_box_dim > 256` branch issues `(K / 256)` separate `tl::tma_load` calls instead of a single 1D bulk copy. The 1D bulk-copy path emits `cp.async.bulk` (not `cp.async.bulk.tensor`) and has the same OOB semantics as a plain `T.copy()` - it does not need the descriptor-only OOB gate. Drop `!ctx.buffer_oob` from the 1D eligibility check so dynamic-outer-shape 1D copies that already satisfy `CheckBulkCopy1D` (contiguous innermost slice, full-extent trailing dim, element count match) keep the single-issue path. Repro: see issue #2180 - `T.tma_copy(A[var, 0:K], a_shared, barrier=mbar)` with `M=T.dynamic('M')`, `K=T.const('K')` on `A[M, K]` was lowering to 4 split `tl::tma_load` calls; with this patch it lowers to a single `tl::tma_load(smem, gmem, mbar, total_bytes)` and no `CUtensorMap` descriptor. Verified locally on A100 at the codegen-string level (end-to-end launch requires sm_90+ and is left to CI). Tests: 3 new regression cases in `testing/python/language/test_tilelang_language_tma_1d.py` covering fp32/K=1024, fp32/K=512, fp16/K=1024 with `T.dynamic` outer shape. All existing TMA tests pass without regression. --- src/backend/cuda/op/copy_analysis.cc | 9 ++++- .../language/test_tilelang_language_tma_1d.py | 37 +++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/backend/cuda/op/copy_analysis.cc b/src/backend/cuda/op/copy_analysis.cc index bbb672b187..52cd723bda 100644 --- a/src/backend/cuda/op/copy_analysis.cc +++ b/src/backend/cuda/op/copy_analysis.cc @@ -493,8 +493,13 @@ CopyFacts AnalyzeCopyFacts(const CopyNode &op, const CopyAnalysisContext &ctx) { const LayoutMap &layout_map = ctx.layout_map != nullptr ? *ctx.layout_map : empty_layout_map; bool is_cutedsl = TargetIsCuTeDSL(ctx.target); - facts.layout_dependent_tma_available = - facts.has_layout_map && !is_cutedsl && !ctx.buffer_oob; + // Issue #2180: only the descriptor-based 2D TMA path needs the OOB gate. + // The 1D bulk-copy path emits `cp.async.bulk`, which has the same OOB + // semantics as plain T.copy(); gating it on `buffer_oob` causes + // InferLayout to fall through to the 2D path for dynamic-outer-shape + // tensors and install a swizzle-shaped shared layout, which then forces + // Lower() into LowerBulk and triggers the 256-element splitting. + facts.layout_dependent_tma_available = facts.has_layout_map && !is_cutedsl; if (facts.layout_dependent_tma_available) { facts.can_bulk_load_1d = diff --git a/testing/python/language/test_tilelang_language_tma_1d.py b/testing/python/language/test_tilelang_language_tma_1d.py index 9cb79c10c6..fdb783a2d3 100644 --- a/testing/python/language/test_tilelang_language_tma_1d.py +++ b/testing/python/language/test_tilelang_language_tma_1d.py @@ -46,10 +46,47 @@ def run_elementwise_add(M, N): assert "tma_load" in code and "CUtensorMap" in code +def _lower_issue_2180_kernel(K, dtype): + M = T.dynamic("M") + + @T.prim_func + def gemm(A: T.Tensor([M, K], dtype)): + with T.Kernel(M, threads=256): + var = T.alloc_var(T.int32, init=0) + a_shared = T.alloc_shared(K, dtype=dtype) + mbar = T.alloc_barrier(256) + T.tma_copy(A[var, 0:K], a_shared, barrier=mbar) + + artifact = tilelang.lower(gemm, target={"kind": "cuda", "arch": "sm_90a"}) + return artifact.kernel_source + + +def _check_single_1d_tma(code): + n_tma_load = code.count("tl::tma_load(") + has_desc = "CUtensorMap" in code + assert n_tma_load == 1, f"Issue #2180: expected exactly 1 tl::tma_load, got {n_tma_load}.\nGenerated source:\n{code}" + assert not has_desc, f"Issue #2180: expected 1D bulk-copy without CUtensorMap descriptor.\nGenerated source:\n{code}" + + +def test_issue_2180_full_row_fp32_k1024(): + _check_single_1d_tma(_lower_issue_2180_kernel(K=1024, dtype=T.float32)) + + +def test_issue_2180_full_row_fp32_k512(): + _check_single_1d_tma(_lower_issue_2180_kernel(K=512, dtype=T.float32)) + + +def test_issue_2180_full_row_fp16_k1024(): + _check_single_1d_tma(_lower_issue_2180_kernel(K=1024, dtype=T.float16)) + + def main(): run_elementwise_add(128, 128) run_elementwise_add(256, 128) run_elementwise_add(256, 256) + test_issue_2180_full_row_fp32_k1024() + test_issue_2180_full_row_fp32_k512() + test_issue_2180_full_row_fp16_k1024() if __name__ == "__main__":