[ROCm] Add TDM (Tensor Data Mover) support for gfx1250

nurmukhametov · nurmukhametov · commit 0b1544258891 · 2026-05-08T13:54:25.000+01:00
gfx1250 introduces device-side tensor descriptors (TDM), constructed
on-device via tt.make_tensor_descriptor. Wire TDM through the XLA
Triton pipeline so that xtile.extract/insert lower to descriptor-based
loads and stores on TDM-capable hardware, with a pointer-based fallback
for TDM-incompatible tiles (e.g. unsupported shapes, strides or offsets).
diff --git a/xla/backends/gpu/codegen/triton/compilation_pipeline.cc b/xla/backends/gpu/codegen/triton/compilation_pipeline.cc
@@ -50,6 +50,9 @@ void CreateTritonXlaPipeline(
   auto* cuda_cc = gpu_cc.cuda_compute_capability();
   bool is_at_least_hopper = cuda_cc != nullptr && cuda_cc->IsAtLeastHopper();
 
+  auto* rocm_cc = gpu_cc.rocm_compute_capability();
+  bool rocm_supports_tdm = rocm_cc != nullptr && rocm_cc->has_tdm_support();
+
   if (rewrite_int4) {
     pm->addPass(mlir::triton::xla::CreateInt4ToPackedInt4RewritePass(
         /*enable_bf16x2=*/is_at_least_hopper));
@@ -59,7 +62,8 @@ void CreateTritonXlaPipeline(
     pm->addPass(CreateInsertPDLPass());
   }
   pm->addPass(mlir::triton::xla::CreateTritonXLAExtractInsertToTritonPass(
-      /*allow_tma=*/allow_tma && is_at_least_hopper, num_stages));
+      /*allow_tma=*/allow_tma && is_at_least_hopper,
+      /*allow_tdm=*/rocm_supports_tdm, num_stages));
   if (enable_pdl) {
     pm->addPass(emitters::CreateLowerPdlWaitPass());
   }
diff --git a/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc b/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
@@ -18,15 +18,16 @@ limitations under the License.
 #include <string>
 
 #include "absl/strings/str_cat.h"
-#include "third_party/amd/include/TritonAMDGPUToLLVM/Passes.h"
-#include "third_party/amd/include/TritonAMDGPUTransforms/Passes.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
 #include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
+#include "third_party/amd/include/TritonAMDGPUToLLVM/Passes.h"
+#include "third_party/amd/include/TritonAMDGPUTransforms/Passes.h"
 #include "xla/stream_executor/device_description.h"
+#include "xla/stream_executor/rocm/rocm_compute_capability.h"
 #include "triton/Conversion/TritonGPUToLLVM/Passes.h"
 #include "triton/Conversion/TritonToTritonGPU/Passes.h"
 #include "triton/Dialect/Triton/Transforms/Passes.h"
@@ -39,10 +40,12 @@ namespace mt = ::mlir::triton;
 
 // Based on make_ttir() in
 // @triton//:third_party/amd/backend/compiler.py
-static void MakeTTIR(mlir::OpPassManager* pm) {
+static void MakeTTIR(mlir::OpPassManager* pm,
+                     const stream_executor::RocmComputeCapability& rocm_cc) {
   pm->addPass(mlir::createInlinerPass());
-  // if not amd.supports_tdm(arch)
-  // pm->addPass(mt::createTritonRewriteTensorDescriptorToPointer());
+  if (!rocm_cc.has_tdm_support()) {
+    pm->addPass(mt::createTritonRewriteTensorDescriptorToPointer());
+  }
   pm->addPass(mlir::createCanonicalizerPass());
   pm->addPass(mt::createTritonCombineOps());
   pm->addPass(mt::createTritonReorderBroadcast());
@@ -99,6 +102,7 @@ static void MakeTTGIR(mlir::OpPassManager* pm,
   bool use_block_pingpong =
       is_pingpong_schedule_enabled(rocm_cc, use_async_copy);
 
+  pm->addPass(mlir::createTritonAMDGPUOptimizeDescriptorEncoding());
   pm->addPass(mlir::createTritonAMDGPUScheduleLoops({num_stages}));
   pm->addPass(
       mlir::createTritonAMDGPUPipeline({use_async_copy, use_block_pingpong}));
@@ -185,7 +189,7 @@ void CreateTritonRocmPipeline(
     mlir::OpPassManager* pm,
     const stream_executor::RocmComputeCapability& rocm_cc, int num_warps,
     int num_ctas, int num_stages) {
-  MakeTTIR(pm);
+  MakeTTIR(pm, rocm_cc);
   MakeTTGIR(pm, rocm_cc, num_warps, num_ctas, num_stages);
   MakeLLIR(pm, rocm_cc, num_stages);
 }
diff --git a/xla/backends/gpu/codegen/triton/transforms/passes.h b/xla/backends/gpu/codegen/triton/transforms/passes.h
@@ -33,6 +33,8 @@ namespace mlir::triton::xla {
 std::unique_ptr<mlir::Pass> CreateTritonXLAExtractInsertToTritonPass();
 std::unique_ptr<mlir::Pass> CreateTritonXLAExtractInsertToTritonPass(
     bool allow_tma, int num_stages);
+std::unique_ptr<mlir::Pass> CreateTritonXLAExtractInsertToTritonPass(
+    bool allow_tma, bool allow_tdm, int num_stages);
 std::unique_ptr<mlir::Pass> CreateTritonXLASqueezeDimsPass();
 std::unique_ptr<mlir::Pass> CreateTritonXLAFoldTransposePass();
 std::unique_ptr<mlir::Pass> CreateGeneralizeKernelSignaturePass();
diff --git a/xla/backends/gpu/codegen/triton/transforms/passes.td b/xla/backends/gpu/codegen/triton/transforms/passes.td
@@ -28,6 +28,8 @@ def TritonXLAExtractInsertToTritonPass : Pass<"triton-xla-extract-insert-to-trit
   let options = [
     Option<"allow_tma_", "allow_tma", "bool", "false",
            "Whether to permit lowering to TMA.">,
+    Option<"allow_tdm_", "allow_tdm", "bool", "false",
+           "Whether to permit lowering to TDM (device-side tensor descriptors).">,
     Option<"num_stages_", "num_stages", "int", "1",
            "Number of stages for pipelining.">,
   ];
diff --git a/xla/backends/gpu/codegen/triton/transforms/tests/triton_pipeline_tdm.mlir b/xla/backends/gpu/codegen/triton/transforms/tests/triton_pipeline_tdm.mlir
@@ -0,0 +1,28 @@
+// RUN: xla-opt %s --triton-xla-pipeline='target=gfx1250' \
+// RUN:   | FileCheck %s --check-prefix=CHECK-TDM
+//
+// RUN: xla-opt %s --triton-xla-pipeline='target=gfx950' \
+// RUN:   | FileCheck %s --check-prefix=CHECK-NOTDM
+
+// Verifies that the full Triton XLA + AMD lowering pipeline emits TDM
+// intrinsics on gfx1250 and pointer-arithmetic buffer ops on non-TDM arches.
+
+func.func @lower_extract_insert(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
+  %extracted_tensor = triton_xla.extract from %arg0
+      as memref<256x256xbf16, #xtile.layout<[1, 0]>>
+      [0, 0] [16, 64] [1, 1] : tensor<16x64xbf16>
+  triton_xla.insert %extracted_tensor into %arg1
+      as memref<256x256xbf16, #xtile.layout<[1, 0]>>
+      [0, 0] [16, 64] [1, 1] : tensor<16x64xbf16>
+  func.return
+}
+
+// CHECK-TDM-LABEL: llvm.func @lower_extract_insert
+// CHECK-TDM:       tensor.load.to.lds
+// CHECK-TDM:       s.wait.tensorcnt
+// CHECK-TDM:       tensor.store.from.lds
+
+// CHECK-NOTDM-LABEL: llvm.func @lower_extract_insert
+// CHECK-NOTDM-NOT:   tensor.load.to.lds
+// CHECK-NOTDM:       raw.ptr.buffer.load
+// CHECK-NOTDM-NOT:   tensor.store.from.lds
diff --git a/xla/backends/gpu/codegen/triton/transforms/tests/triton_xla_extract_insert_to_triton.mlir b/xla/backends/gpu/codegen/triton/transforms/tests/triton_xla_extract_insert_to_triton.mlir
@@ -6,6 +6,10 @@
 // RUN: -triton-xla-extract-insert-to-triton="allow_tma=1 num_stages=3" \
 // RUN: | FileCheck %s --check-prefix=CHECK-TMA
 
+// RUN: xla-opt %s -split-input-file \
+// RUN: -triton-xla-extract-insert-to-triton="allow_tdm=1" \
+// RUN: | FileCheck %s --check-prefix=CHECK-TDM
+
 func.func @lower_extract_insert(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
   %extracted_tensor = triton_xla.extract from %arg0
       as memref<512x8x128xbf16, #xtile.layout<[2, 1, 0]>>
@@ -30,6 +34,17 @@ func.func @lower_extract_insert(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
 // CHECK-TMA:         tt.descriptor_store %arg1[{{.*}}],
 // CHECK-TMA:         tt.return
 
+// Middle singleton dim is TDM-incompatible, so fall back to pointer loads/stores.
+// CHECK-TDM-LABEL: tt.func @lower_extract_insert(
+// CHECK-TDM-SAME:      %arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32},
+// CHECK-TDM-SAME:      %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}) {
+// CHECK-TDM-NOT:     tt.make_tensor_descriptor
+// CHECK-TDM-NOT:     tt.descriptor_load
+// CHECK-TDM-NOT:     tt.descriptor_store
+// CHECK-TDM:         %[[LOAD:.*]] = tt.load
+// CHECK-TDM:         tt.store {{.*}}, %[[LOAD]]
+// CHECK-TDM:         tt.return
+
 // -----
 
 func.func @non_perfect_tile_shape(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
@@ -46,6 +61,12 @@ func.func @non_perfect_tile_shape(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
 // CHECK:         %[[LOAD:.*]] = tt.load {{.*}}, %{{.*}}, %{{.*}} :
 // CHECK:         tt.store {{.*}}, %[[LOAD]], %{{.*}} :
 
+// CHECK-TDM-LABEL: tt.func @non_perfect_tile_shape
+// CHECK-TDM:         %[[DESC0:.*]] = tt.make_tensor_descriptor %arg0
+// CHECK-TDM:         tt.descriptor_load %[[DESC0]]
+// CHECK-TDM:         %[[DESC1:.*]] = tt.make_tensor_descriptor %arg1
+// CHECK-TDM:         tt.descriptor_store %[[DESC1]]
+
 // -----
 
 func.func @incompatible_tma_global_strides(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
@@ -62,6 +83,11 @@ func.func @incompatible_tma_global_strides(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<
 // CHECK-TMA:         tt.load
 // CHECK-TMA:         tt.store
 
+// CHECK-TDM-LABEL: tt.func @incompatible_tma_global_strides
+// CHECK-TDM-NOT:     tt.make_tensor_descriptor
+// CHECK-TDM:         tt.load
+// CHECK-TDM:         tt.store
+
 // -----
 
 #indexing_map = #xla.indexing_map<"(pid_0) -> (pid_0 * 32), domain: pid_0 in [0, 1]">
@@ -91,6 +117,11 @@ module {
 // CHECK:         tt.store {{.*}}, %{{.*}}, %{{.*}}
 // CHECK:         tt.store {{.*}}, %{{.*}}, %{{.*}}
 
+// CHECK-TDM-LABEL: tt.func @slice_with_tiling_that_needs_padding_has_boundary_checks
+// CHECK-TDM:       tt.descriptor_load
+// CHECK-TDM:       tt.descriptor_store
+// CHECK-TDM:       tt.descriptor_store
+
 // -----
 
 #indexing_map = #xla.indexing_map<"(pid_0) -> (pid_0 * 32), domain: pid_0 in [0, 1]">
@@ -120,6 +151,11 @@ module {
 // CHECK:         tt.store {{.*}}, %{{.*}}, %{{.*}}
 // CHECK:         tt.store {{.*}}, %{{.*}} :
 
+// CHECK-TDM-LABEL: tt.func @slice_with_extra_output_that_can_reuse_tile_due_to_padding
+// CHECK-TDM:       tt.descriptor_load
+// CHECK-TDM:       tt.descriptor_store
+// CHECK-TDM:       tt.descriptor_store
+
 // -----
 
 func.func @extract_with_non_unit_minor_dim_stride(%arg0: !tt.ptr<bf16>,
@@ -137,6 +173,10 @@ func.func @extract_with_non_unit_minor_dim_stride(%arg0: !tt.ptr<bf16>,
 // CHECK-TMA:   tt.load
 // CHECK-TMA:   tt.descriptor_store
 
+// CHECK-TDM-LABEL: tt.func @extract_with_non_unit_minor_dim_stride
+// CHECK-TDM:   tt.load
+// CHECK-TDM:   tt.descriptor_store
+
 // -----
 
 func.func @lower_extract_insert_1d(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
@@ -163,6 +203,15 @@ func.func @lower_extract_insert_1d(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
 // CHECK-TMA:         tt.descriptor_store %arg1[{{.*}}], %[[LOAD]]
 // CHECK-TMA:         tt.return
 
+// CHECK-TDM-LABEL: tt.func @lower_extract_insert_1d(
+// CHECK-TDM-SAME:      %arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32},
+// CHECK-TDM-SAME:      %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}) {
+// CHECK-TDM:         %[[DESC0:.*]] = tt.make_tensor_descriptor %arg0
+// CHECK-TDM:         %[[LOAD:.*]] = tt.descriptor_load %[[DESC0]]
+// CHECK-TDM:         %[[DESC1:.*]] = tt.make_tensor_descriptor %arg1
+// CHECK-TDM:         tt.descriptor_store %[[DESC1]][{{.*}}], %[[LOAD]]
+// CHECK-TDM:         tt.return
+
 // -----
 
 func.func @lower_extract_insert_5d(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
@@ -189,6 +238,15 @@ func.func @lower_extract_insert_5d(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
 // CHECK-TMA:         tt.descriptor_store %arg1[{{.*}}], %[[LOAD]]
 // CHECK-TMA:         tt.return
 
+// CHECK-TDM-LABEL: tt.func @lower_extract_insert_5d(
+// CHECK-TDM-SAME:      %arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32},
+// CHECK-TDM-SAME:      %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}) {
+// CHECK-TDM:         %[[DESC0:.*]] = tt.make_tensor_descriptor %arg0
+// CHECK-TDM:         %[[LOAD:.*]] = tt.descriptor_load %[[DESC0]]
+// CHECK-TDM:         %[[DESC1:.*]] = tt.make_tensor_descriptor %arg1
+// CHECK-TDM:         tt.descriptor_store %[[DESC1]][{{.*}}], %[[LOAD]]
+// CHECK-TDM:         tt.return
+
 // -----
 
 func.func @extract_insert_with_zero_stride(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<bf16>) {
@@ -205,6 +263,11 @@ func.func @extract_insert_with_zero_stride(%arg0: !tt.ptr<bf16>, %arg1: !tt.ptr<
 // CHECK-TMA-SAME:      %arg0: !tt.tensordesc<1x64xbf16>
 // CHECK-TMA-SAME:      %arg1: !tt.tensordesc<1x64xbf16>
 
+// CHECK-TDM-LABEL: tt.func @extract_insert_with_zero_stride
+// CHECK-TDM-NOT:     tt.make_tensor_descriptor
+// CHECK-TDM:         tt.load
+// CHECK-TDM:         tt.store
+
 // -----
 
 func.func @incompatible_tma_const_offset_not_divisible_by_16_bytes(
@@ -222,6 +285,11 @@ func.func @incompatible_tma_const_offset_not_divisible_by_16_bytes(
 // CHECK-TMA:         tt.load
 // CHECK-TMA:         tt.descriptor_store
 
+// CHECK-TDM-LABEL: tt.func @incompatible_tma_const_offset_not_divisible_by_16_bytes
+// CHECK-TDM-NOT:     tt.make_tensor_descriptor
+// CHECK-TDM:         tt.load
+// CHECK-TDM:         tt.store
+
 // -----
 
 #indexing_map = #xla.indexing_map<"(pid_0) -> ((pid_0 mod 9) * 16 + (pid_0 floordiv 9) * 130), domain: pid_0 in [0, 575]">
@@ -251,6 +319,10 @@ module {
 // CHECK-TMA:         tt.load
 // CHECK-TMA:         tt.descriptor_store
 
+// CHECK-TDM-LABEL: tt.func @incompatible_tma_dynamic_offset_not_divisible_by_16_bytes
+// CHECK-TDM:         tt.descriptor_load
+// CHECK-TDM:         tt.store
+
 // -----
 
 func.func @parameter_into_broadcast_with_3_or_more_stages_does_not_use_tma(
@@ -276,6 +348,11 @@ func.func @parameter_into_broadcast_with_3_or_more_stages_does_not_use_tma(
 // CHECK-TMA-NOT:         tt.descriptor_load %arg0
 // CHECK-TMA:             tt.descriptor_load %arg1
 
+// CHECK-TDM-LABEL: tt.func @parameter_into_broadcast_with_3_or_more_stages_does_not_use_tma
+// CHECK-TDM:         tt.descriptor_load
+// CHECK-TDM:         tt.descriptor_load
+// CHECK-TDM:         tt.descriptor_store
+
 // -----
 
 #indexing_map_unaligned = #xla.indexing_map<"(d0) -> (d0 * 2816), domain: d0 in [0, 2047]">
@@ -301,6 +378,10 @@ module {
 // CHECK: %[[MASK:.*]] = arith.cmpi slt
 // CHECK: tt.load {{.*}}, %[[MASK]], {{.*}}
 
+// CHECK-TDM-LABEL: tt.func @apply_mask_to_unaligned_offset_with_perfect_total_size
+// CHECK-TDM:         tt.descriptor_load
+// CHECK-TDM:         tt.descriptor_store
+
 // -----
 
 #indexing_map_aligned_with_oob_at_end = #xla.indexing_map<"(pid, d1) -> ((pid floordiv 64) * 384 + d1 * 32), domain: pid in [0, 1023], d1 in [0, 11]">
@@ -328,3 +409,7 @@ module {
 // CHECK-LABEL: tt.func @apply_mask_to_aligned_offset_with_out_of_bounds_reads_at_end
 // CHECK: %[[MASK:.*]] = arith.cmpi slt
 // CHECK: tt.load {{.*}}, %[[MASK]], {{.*}}
+
+// CHECK-TDM-LABEL: tt.func @apply_mask_to_aligned_offset_with_out_of_bounds_reads_at_end
+// CHECK-TDM:         tt.descriptor_load
+// CHECK-TDM:         tt.descriptor_store
diff --git a/xla/backends/gpu/codegen/triton/transforms/triton_xla_extract_insert_to_triton_pass.cc b/xla/backends/gpu/codegen/triton/transforms/triton_xla_extract_insert_to_triton_pass.cc