[TLE][MTHREADS] Support atomic operands

Kylin1207 · Kylin1207 · commit 4b4f2c2ed9ca · 2026-05-25T16:14:29.000+08:00
diff --git a/third_party/mthreads/include/triton/Dialect/Triton/IR/Dialect.h b/third_party/mthreads/include/triton/Dialect/Triton/IR/Dialect.h
@@ -27,6 +27,11 @@ namespace triton {
 struct GlobalMemory : public SideEffects::Resource::Base<GlobalMemory> {
   StringRef getName() final { return "<GlobalMemory>"; }
 };
+#ifdef __TLE__ // TLE builds only: a second side-effect resource declared next to GlobalMemory.
+struct SharedMemory : public SideEffects::Resource::Base<SharedMemory> {
+  StringRef getName() final { return "<SharedMemory>"; }
+};
+#endif // __TLE__
 
 class DialectInferLayoutInterface
     : public DialectInterface::Base<DialectInferLayoutInterface> {
diff --git a/third_party/mthreads/include/triton/Dialect/Triton/IR/TritonOps.td b/third_party/mthreads/include/triton/Dialect/Triton/IR/TritonOps.td
@@ -20,6 +20,9 @@ include "triton/Dialect/Triton/IR/TritonOpInterfaces.td"
 // Interfaces
 //
 def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;
+#ifdef __TLE__
+def SharedMemory : Resource<"::mlir::triton::SharedMemory">; // Names the C++ resource class used in MemRead/MemWrite<SharedMemory>.
+#endif // __TLE__
 
 //
 // Op Base
@@ -350,8 +353,13 @@ def TT_StoreOp : TT_Op<"store", [
 def TT_AtomicRMWOp : TT_Op<"atomic_rmw", [
   SameOperandsAndResultShape,
   SameOperandsAndResultEncoding,
+#ifdef __TLE__
+  TypesMatchWith<"value type matches ptr type", "ptr", "val",
+                 "getPointeeType($_self)">,
+#else
   TypesMatchWith<"ptr type matches value type", "val", "ptr",
                  "getPointerTypeSameShape($_self)">,
+#endif // __TLE__
   TypesMatchWith<"mask type matches value type",
                  "val", "mask", "getI1SameShape($_self)",
                  "($_op.getOperands().size() <= 2) || std::equal_to<>()">
@@ -366,7 +374,12 @@ def TT_AtomicRMWOp : TT_Op<"atomic_rmw", [
 
     let arguments = (ins
       TT_AtomicRMWAttr:$atomic_rmw_op,
+#ifdef __TLE__
+      Arg<TT_PtrLike, "", [MemRead<GlobalMemory>, MemWrite<GlobalMemory>,
+                           MemRead<SharedMemory>, MemWrite<SharedMemory>]>:$ptr,
+#else
       Arg<TT_PtrLike, "", [MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>:$ptr,
+#endif // __TLE__
       TT_Type:$val,
       Optional<TT_BoolLike>:$mask,
       TT_MemSemanticAttr:$sem,
@@ -386,10 +399,17 @@ def TT_AtomicRMWOp : TT_Op<"atomic_rmw", [
 def TT_AtomicCASOp : TT_Op<"atomic_cas", [
   SameOperandsAndResultShape,
   SameOperandsAndResultEncoding,
+#ifdef __TLE__
+  TypesMatchWith<"cmp type matches ptr type", "ptr", "cmp",
+                  "getPointeeType($_self)">,
+  TypesMatchWith<"value type matches ptr type", "ptr", "val",
+                  "getPointeeType($_self)">
+#else
   TypesMatchWith<"ptr type matches cmp type", "cmp", "ptr",
                   "getPointerTypeSameShape($_self)">,
   TypesMatchWith<"ptr type matches value type", "val", "ptr",
                   "getPointerTypeSameShape($_self)">
+#endif // __TLE__
 ]> {
     let summary = "atomic cas";
 
@@ -404,7 +424,12 @@ def TT_AtomicCASOp : TT_Op<"atomic_cas", [
     }];
 
     let arguments = (ins
+#ifdef __TLE__
+      Arg<TT_PtrLike, "", [MemRead<GlobalMemory>, MemWrite<GlobalMemory>,
+                           MemRead<SharedMemory>, MemWrite<SharedMemory>]>:$ptr,
+#else
       Arg<TT_PtrLike, "", [MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>:$ptr,
+#endif // __TLE__
       TT_Type:$cmp,
       TT_Type:$val,
       TT_MemSemanticAttr:$sem,
diff --git a/third_party/mthreads/python/test/unit/tle/test_tle_local_ptr.py b/third_party/mthreads/python/test/unit/tle/test_tle_local_ptr.py
@@ -5,7 +5,7 @@
 import triton.experimental.tle.language as tle
 from triton.compiler.errors import CompilationError
 
-from test_tle_utils import compile_musa, require_mthreads_libtriton
+from test_tle_utils import compile_musa, compile_to_ttir, require_mthreads_libtriton
 
 require_mthreads_libtriton()
 
@@ -40,6 +40,40 @@ def _local_ptr_full_view_kernel(out_ptr):
     tl.store(out_ptr + tl.arange(0, 16), loaded)
 
 
+@triton.jit
+def _local_ptr_atomic_add_kernel(out_ptr, BLOCK: tl.constexpr):
+    # Add lane + 1 to each zero-filled i32 slot of a tle.gpu.alloc buffer; store old values, then new ones.
+    lanes = tl.arange(0, BLOCK)
+    zeros = tl.full((BLOCK, ), 0, tl.int32)
+    buf = tle.gpu.alloc((BLOCK, ), dtype=tl.int32, init_value=zeros, nv_mma_shared_layout=False)
+    slot_ptrs = tle.gpu.local_ptr(buf, (lanes, ))
+    prev = tl.atomic_add(slot_ptrs, lanes.to(tl.int32) + 1, sem="relaxed", scope="cta")
+    reloaded = tl.load(slot_ptrs)
+    tl.store(out_ptr + lanes, prev)
+    tl.store(out_ptr + BLOCK + lanes, reloaded)
+
+
+@triton.jit
+def _local_ptr_atomic_cas_kernel(out_ptr):
+    # CAS a single i32 slot holding 3 (expect 3, write 9); store the returned value, then the reloaded slot.
+    cell = tle.gpu.alloc((1, ), dtype=tl.int32, init_value=tl.full((1, ), 3, tl.int32), nv_mma_shared_layout=False)
+    cell_ptr = tle.gpu.local_ptr(cell, (0, ))
+    prev = tl.atomic_cas(cell_ptr, 3, 9, sem="relaxed", scope="cta")
+    reloaded = tl.load(cell_ptr)
+    tl.store(out_ptr, prev)
+    tl.store(out_ptr + 1, reloaded)
+
+
+@triton.jit
+def _local_ptr_atomic_cas_update_kernel(out_ptr):
+    # CAS a single i32 slot holding 3 (expect 3, write 9), drop the result, and store only the reloaded slot.
+    cell = tle.gpu.alloc((1, ), dtype=tl.int32, init_value=tl.full((1, ), 3, tl.int32), nv_mma_shared_layout=False)
+    cell_ptr = tle.gpu.local_ptr(cell, (0, ))
+    tl.atomic_cas(cell_ptr, 3, 9, sem="relaxed", scope="cta")
+    reloaded = tl.load(cell_ptr)
+    tl.store(out_ptr, reloaded)
+
+
 @triton.jit
 def _local_ptr_non_integer_index_kernel(out_ptr):
     smem = tle.gpu.alloc((16, ), dtype=tl.float32, nv_mma_shared_layout=False)
@@ -110,6 +144,44 @@ def test_tle_local_ptr_full_view_store_load_rewrites_to_memdesc_ops():
     assert "musa_tle.local_pointers" not in llir, llir
 
 
+def test_tle_local_ptr_atomic_ops_accept_addrspace3_ttir():
+    """Both atomic kernels must compile to TTIR with `!tt.ptr<i32, 3>` pointer operands."""
+    rmw_ir = compile_to_ttir(_local_ptr_atomic_add_kernel, signature={"out_ptr": "*i32", "BLOCK": "constexpr"},
+                             constexprs={"BLOCK": 16})
+    cmpxchg_ir = compile_to_ttir(_local_ptr_atomic_cas_kernel, signature={"out_ptr": "*i32"})
+
+    # Expected operand/result type list printed for the tensor-shaped atomic add.
+    rmw_signature = "(tensor<16x!tt.ptr<i32, 3>>, tensor<16xi32>, tensor<16xi1>) -> tensor<16xi32>"
+    assert "tt.atomic_rmw add, relaxed, cta" in rmw_ir, rmw_ir
+    assert rmw_signature in rmw_ir, rmw_ir
+    assert "tt.atomic_cas relaxed, cta" in cmpxchg_ir, cmpxchg_ir
+    assert "(!tt.ptr<i32, 3>, i32, i32) -> i32" in cmpxchg_ir, cmpxchg_ir
+
+
+def test_tle_local_ptr_atomic_add_lowers_through_mthreads_llvm():
+    """The atomic-add kernel keeps tt.atomic_rmw in TTGIR and has no musa_tle.local_pointers text in LLIR."""
+    build_args = dict(signature={"out_ptr": "*i32", "BLOCK": "constexpr"}, constexprs={"BLOCK": 16})
+    asm = compile_musa(_local_ptr_atomic_add_kernel, **build_args).asm
+
+    gpu_ir, llvm_ir = asm["ttgir"], asm["llir"]
+
+    # TTGIR must still contain the atomic and the ptr<i32, 3> tensor type.
+    assert "tt.atomic_rmw" in gpu_ir, gpu_ir
+    assert "tensor<16x!tt.ptr<i32, 3>" in gpu_ir, gpu_ir
+    # The local_pointers op name must be absent from the LLVM IR text.
+    assert "musa_tle.local_pointers" not in llvm_ir, llvm_ir
+
+
+def test_tle_local_ptr_atomic_cas_lowers_through_mthreads_llvm():
+    """The CAS kernel keeps tt.atomic_cas in TTGIR and has no musa_tle.local_pointers text in LLIR."""
+    asm = compile_musa(_local_ptr_atomic_cas_kernel, signature={"out_ptr": "*i32"}).asm
+    gpu_ir, llvm_ir = asm["ttgir"], asm["llir"]
+
+    assert "tt.atomic_cas" in gpu_ir, gpu_ir
+    assert "-> !tt.ptr<i32, 3>" in gpu_ir, gpu_ir
+    assert "musa_tle.local_pointers" not in llvm_ir, llvm_ir
+
+
 def test_tle_local_ptr_rejects_non_integer_indices():
     with pytest.raises(CompilationError, match="local_ptr indices must use integer dtypes"):
         compile_musa(_local_ptr_non_integer_index_kernel, signature={"out_ptr": "*fp32"})
@@ -158,3 +230,25 @@ def test_tle_local_ptr_full_view_runtime_round_trip():
 
     ref = torch.arange(0, 16, dtype=torch.float32) + 7.0
     torch.testing.assert_close(out.cpu(), ref, rtol=0, atol=0)
+
+
+@pytest.mark.skipif(not torch.musa.is_available(), reason="MUSA device is not available")
+def test_tle_local_ptr_atomic_add_runtime_round_trip():
+    """On device: the first half of the output holds the pre-add values (0), the second half 1..n."""
+    n = 16
+    result = torch.empty((n * 2, ), device="musa", dtype=torch.int32)
+    _local_ptr_atomic_add_kernel[(1, )](result, BLOCK=n, num_warps=1)
+
+    expected_before = torch.zeros((n, ), dtype=torch.int32)
+    expected_after = torch.arange(1, n + 1, dtype=torch.int32)
+    torch.testing.assert_close(result[:n].cpu(), expected_before, rtol=0, atol=0)
+    torch.testing.assert_close(result[n:].cpu(), expected_after, rtol=0, atol=0)
+
+
+@pytest.mark.skipif(not torch.musa.is_available(), reason="MUSA device is not available")
+def test_tle_local_ptr_atomic_cas_runtime_round_trip():
+    """On device: after the kernel's CAS (expect 3, write 9) the stored value must be 9."""
+    result = torch.empty((1, ), device="musa", dtype=torch.int32)
+    _local_ptr_atomic_cas_update_kernel[(1, )](result, num_warps=1)
+    expected = torch.tensor([9], dtype=torch.int32)
+    torch.testing.assert_close(result.cpu(), expected, rtol=0, atol=0)