Skip to content

Commit fca80b4

Browse files
authored
[AMDGPU][ASAN] Move allocas to entry block in amdgpu-sw-lower-lds pass (#190772)
The `amdgpu-sw-lower-lds` pass inserts a workitem-0 check, malloc, and barrier before the original entry block, creating a new entry block. This pushes the original allocas into a non-entry block, causing LLVM to treat them as dynamic allocas. The AMDGPU backend generates incorrect flat addresses for addrspacecasts of dynamic allocas at -O0, causing memory faults when ASan is enabled with LDS. This PR hoists constant-size allocas to the new entry block so they remain static.
1 parent ab94dbc commit fca80b4

2 files changed

Lines changed: 75 additions & 1 deletion

File tree

llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -782,7 +782,20 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
782782
// Create WIdBlock block which has instructions related to selection of
783783
// {0,0,0} index work item in the work group.
784784
auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
785-
IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
785+
786+
// Move constant-size allocas from the original entry block to the new entry
787+
// block (WIdBlock) so they remain static allocas. Splice the leading cluster
788+
// in bulk, then move any stragglers that are interleaved with other
789+
// instructions.
790+
auto SplitIt = PrevEntryBlock->getFirstNonPHIOrDbgOrAlloca();
791+
WIdBlock->splice(WIdBlock->end(), PrevEntryBlock, PrevEntryBlock->begin(),
792+
SplitIt);
793+
for (Instruction &I : make_early_inc_range(*PrevEntryBlock))
794+
if (auto *AI = dyn_cast<AllocaInst>(&I))
795+
if (isa<ConstantInt>(AI->getArraySize()))
796+
AI->moveBefore(*WIdBlock, WIdBlock->end());
797+
798+
IRB.SetInsertPoint(WIdBlock, WIdBlock->end());
786799
DebugLoc FirstDL =
787800
getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram());
788801
IRB.SetCurrentDebugLocation(FirstDL);
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
2+
3+
; Test that the sw-lower-lds pass moves constant-size allocas from the original
4+
; entry block to the new entry block (WId), so they remain static allocas.
5+
6+
@lds = internal addrspace(3) global [64 x i32] poison, align 4
7+
8+
; Allocas clustered at the top of the entry block (common case).
9+
define amdgpu_kernel void @kernel_allocas_at_top(i32 %n) sanitize_address {
10+
; CHECK-LABEL: define amdgpu_kernel void @kernel_allocas_at_top(
11+
; CHECK-SAME: i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
12+
; CHECK-NEXT: [[WID:.*]]:
13+
; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4, addrspace(5)
14+
; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4, addrspace(5)
15+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
16+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
17+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
18+
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
19+
; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
20+
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
21+
; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]]
22+
; CHECK: [[BB18]]:
23+
; CHECK: store i32 [[N]], ptr addrspace(5) [[A]], align 4
24+
; CHECK-NEXT: store i32 [[N]], ptr addrspace(5) [[B]], align 4
25+
;
26+
%a = alloca i32, align 4, addrspace(5)
27+
%b = alloca i32, align 4, addrspace(5)
28+
store i32 %n, ptr addrspace(5) %a, align 4
29+
store i32 %n, ptr addrspace(5) %b, align 4
30+
store i32 %n, ptr addrspace(3) @lds, align 4
31+
ret void
32+
}
33+
34+
; Allocas interleaved with non-alloca instructions.
35+
define amdgpu_kernel void @kernel_allocas_scattered(i32 %n) sanitize_address {
36+
; CHECK-LABEL: define amdgpu_kernel void @kernel_allocas_scattered(
37+
; CHECK-SAME: i32 [[N:%.*]]) #[[ATTR0]] {
38+
; CHECK-NEXT: [[WID:.*]]:
39+
; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4, addrspace(5)
40+
; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4, addrspace(5)
41+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
42+
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
43+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
44+
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
45+
; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
46+
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
47+
; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]]
48+
; CHECK: [[BB18]]:
49+
; CHECK: store i32 [[N]], ptr addrspace(5) [[A]], align 4
50+
; CHECK-NEXT: store i32 [[N]], ptr addrspace(5) [[B]], align 4
51+
;
52+
%a = alloca i32, align 4, addrspace(5)
53+
store i32 %n, ptr addrspace(5) %a, align 4
54+
%b = alloca i32, align 4, addrspace(5)
55+
store i32 %n, ptr addrspace(5) %b, align 4
56+
store i32 %n, ptr addrspace(3) @lds, align 4
57+
ret void
58+
}
59+
60+
!llvm.module.flags = !{!0}
61+
!0 = !{i32 4, !"nosanitize_address", i32 1}

0 commit comments

Comments
 (0)