PR tensorflow#34806: [ROCm] fix the calling convention for AMD GPU

amd-songpiao · mmakevic-amd · commit 2e5d65ebfebe · 2025-12-15T14:15:26.000Z
Imported from GitHub PR openxla/xla#34806 Bugfix: PR tensorflow#34230 ("argument removal without building prototype") removed the call to **BuildKernelPrototypeFromUniqueName**, which internally called **AnnotateFunctionAsGpuKernel** to set the correct calling convention based on the target GPU. Without this, Triton's **PTX_Kernel** calling convention was copied directly, which doesn't work on AMD GPUs and led to "LLVM ERROR: unsupported calling convention". Fix: Added a call to **AnnotateFunctionAsGpuKernel** in **RemoveUnusedTritonAbiArguments** to properly set the calling convention: PTX_Kernel (71) for NVIDIA, AMDGPU_KERNEL (91) for AMD, and SPIR_KERNEL (76) for SPIR. @xla-rotation could you review my PR, please? Copybara import of the project: -- ebd6e1fa03033bc9f6913351323fce26e1a8e4d2 by Songlin Piao <Songlin.Piao@amd.com>: replace the manual calling convention fix with AnnotateFunctionAsGpuKernel -- 4f16d9579b11c2984c8ebe58041b0d2b9ea5ba3f by Songlin Piao <Songlin.Piao@amd.com>: added a filecheck test Merging this change closes tensorflow#34806 PiperOrigin-RevId: 842146580
diff --git a/third_party/xla/xla/backends/gpu/codegen/fusion_emitter.cc b/third_party/xla/xla/backends/gpu/codegen/fusion_emitter.cc
@@ -266,10 +266,15 @@ absl::StatusOr<llvm::Function*> RemoveUnusedTritonAbiArguments(
           .getCallee();
   llvm::Function* new_function = static_cast<llvm::Function*>(inserted);
 
-  new_function->setCallingConv(impl_fn->getCallingConv());
   new_function->copyMetadata(impl_fn, 0);
   new_function->setAttributes(impl_fn->getAttributes());
 
+  // Set the correct calling convention for the target GPU.
+  // Triton generates PTX_Kernel CC even for AMD, so we need to use
+  // AnnotateFunctionAsGpuKernel to set the correct CC based on target triple.
+  llvm::IRBuilder<> builder(llvm_module->getContext());
+  AnnotateFunctionAsGpuKernel(llvm_module, new_function, &builder);
+
   new_function->splice(new_function->begin(), impl_fn);
 
   for (const auto& [impl_fn_arg, kernel_arg] :
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -661,6 +661,7 @@ lit_test_suite_for_gpus(
             "slice_to_dynamic.hlo",
             "sorting.hlo",
             "sub_byte_collectives.hlo",
+            "triton_calling_convention.hlo",
             "triton_naming.hlo",
             "zero_clamp_abs_index.hlo",
         ],
@@ -673,10 +674,12 @@ lit_test_suite_for_gpus(
     disabled_on_gpus = {
         "v100": [
             "kernel_reuse.hlo",
+            "triton_calling_convention.hlo",
             "triton_naming.hlo",
         ],
         "p100": [
             "kernel_reuse.hlo",
+            "triton_calling_convention.hlo",
             "triton_naming.hlo",
         ],
         "mi200": [
diff --git a/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
@@ -0,0 +1,26 @@
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
+
+// Verify that Triton kernels have the correct calling convention:
+// - PTX_KERNEL (71) for NVIDIA targets
+// - AMDGPU_KERNEL (91) for AMD targets
+// CHECK-PTX: define ptx_kernel void @triton_
+// CHECK-GCN: define amdgpu_kernel void @triton_
+
+HloModule TritonCallingConvention, is_scheduled=true
+
+triton_softmax {
+  param_0 = f32[4,4]{1,0} parameter(0)
+  ROOT exp = f32[4,4]{1,0} exponential(param_0)
+}
+
+ENTRY main {
+  param_0 = f32[4,4]{1,0} parameter(0)
+  ROOT triton_softmax = f32[4,4]{1,0} fusion(param_0), kind=kCustom,
+    calls=triton_softmax,
+    backend_config={"fusion_backend_config":{
+      "kind":"__triton",
+      "block_level_fusion_config":{"output_tiles":[{"sizes":["4","4"]}],
+                                   "num_warps":"1",
+                                   "num_ctas":"1",
+                                   "num_stages":"1"}}}
+}