[AMDGPU][ASAN] Move allocas to entry block in amdgpu-sw-lower-lds pass (#190772)

skc7 · web-flow · commit fca80b4ff3fc · 2026-04-17T11:14:49.000+05:30
The `amdgpu-sw-lower-lds` pass inserts a workitem-0 check, malloc, and
barrier before the original entry block, creating a new entry block.
This pushes the original allocas into a non-entry block, causing LLVM to
treat them as dynamic allocas.

The AMDGPU backend generates incorrect flat addresses for addrspacecasts
of dynamic allocas at -O0, causing memory faults when ASan is enabled
with LDS.

This PR hoists constant-size allocas to the new entry block so they
remain static.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -782,7 +782,20 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
   // Create WIdBlock block which has instructions related to selection of
   // {0,0,0} indiex work item in the work group.
   auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
-  IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
+
+  // Move constant-size allocas from the original entry block to the new entry
+  // block (WIdBlock) so they remain static allocas. Splice the leading cluster
+  // in bulk, then move any stragglers that are interleaved with other
+  // instructions.
+  auto SplitIt = PrevEntryBlock->getFirstNonPHIOrDbgOrAlloca();
+  WIdBlock->splice(WIdBlock->end(), PrevEntryBlock, PrevEntryBlock->begin(),
+                   SplitIt);
+  for (Instruction &I : make_early_inc_range(*PrevEntryBlock))
+    if (auto *AI = dyn_cast<AllocaInst>(&I))
+      if (isa<ConstantInt>(AI->getArraySize()))
+        AI->moveBefore(*WIdBlock, WIdBlock->end());
+
+  IRB.SetInsertPoint(WIdBlock, WIdBlock->end());
   DebugLoc FirstDL =
       getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram());
   IRB.SetCurrentDebugLocation(FirstDL);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-alloca-placement.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-alloca-placement.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test that the sw-lower-lds pass moves constant-size allocas from the original
+; entry block to the new entry block (WId), so they remain static allocas.
+
+@lds = internal addrspace(3) global [64 x i32] poison, align 4
+
+; Allocas clustered at the top of the entry block (common case).
+define amdgpu_kernel void @kernel_allocas_at_top(i32 %n) sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_allocas_at_top(
+; CHECK-SAME: i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[WID:.*]]:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]]
+; CHECK:       [[BB18]]:
+; CHECK:         store i32 [[N]], ptr addrspace(5) [[A]], align 4
+; CHECK-NEXT:    store i32 [[N]], ptr addrspace(5) [[B]], align 4
+;
+  %a = alloca i32, align 4, addrspace(5)
+  %b = alloca i32, align 4, addrspace(5)
+  store i32 %n, ptr addrspace(5) %a, align 4
+  store i32 %n, ptr addrspace(5) %b, align 4
+  store i32 %n, ptr addrspace(3) @lds, align 4
+  ret void
+}
+
+; Allocas interleaved with non-alloca instructions.
+define amdgpu_kernel void @kernel_allocas_scattered(i32 %n) sanitize_address {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_allocas_scattered(
+; CHECK-SAME: i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[WID:.*]]:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]]
+; CHECK:       [[BB18]]:
+; CHECK:         store i32 [[N]], ptr addrspace(5) [[A]], align 4
+; CHECK-NEXT:    store i32 [[N]], ptr addrspace(5) [[B]], align 4
+;
+  %a = alloca i32, align 4, addrspace(5)
+  store i32 %n, ptr addrspace(5) %a, align 4
+  %b = alloca i32, align 4, addrspace(5)
+  store i32 %n, ptr addrspace(5) %b, align 4
+  store i32 %n, ptr addrspace(3) @lds, align 4
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}