Skip to content

Commit 5cda8d9

Browse files
authored
[AMDGPU] Emit the relocation symbol for LDS and named barrier when object linking is enabled (#192380)
1 parent 699b6bd commit 5cda8d9

4 files changed

Lines changed: 109 additions & 3 deletions

File tree

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "AMDGPUHSAMetadataStreamer.h"
2121
#include "AMDGPUMCResourceInfo.h"
2222
#include "AMDGPUResourceUsageAnalysis.h"
23+
#include "AMDGPUTargetMachine.h"
2324
#include "GCNSubtarget.h"
2425
#include "MCTargetDesc/AMDGPUInstPrinter.h"
2526
#include "MCTargetDesc/AMDGPUMCExpr.h"
@@ -330,10 +331,18 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
330331
return;
331332
}
332333

333-
// LDS variables aren't emitted in HSA or PAL yet.
334334
const Triple::OSType OS = TM.getTargetTriple().getOS();
335-
if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
336-
return;
335+
if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
336+
if (!AMDGPUTargetMachine::EnableObjectLinking)
337+
return;
338+
// With object linking, LDS definitions should have been externalized
339+
// by earlier passes (e.g. LDS lowering, named barrier lowering).
340+
// Only declarations reach here, emitted as SHN_AMDGPU_LDS symbols
341+
// so the linker can assign their offsets.
342+
assert(GV->isDeclaration() &&
343+
"LDS definitions should have been externalized when object "
344+
"linking is enabled");
345+
}
337346

338347
MCSymbol *GVSym = getSymbol(GV);
339348

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8348,6 +8348,18 @@ bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
83488348
if (!GV->hasExternalLinkage())
83498349
return true;
83508350

8351+
// With object linking, external LDS declarations need relocations so the
8352+
// linker can assign their offsets.
8353+
if (AMDGPUTargetMachine::EnableObjectLinking) {
8354+
if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
8355+
if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8356+
assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
8357+
"when object linking is enabled");
8358+
return false;
8359+
}
8360+
}
8361+
}
8362+
83518363
const auto OS = getTargetMachine().getTargetTriple().getOS();
83528364
return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
83538365
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-enable-object-linking < %s | FileCheck %s
2+
3+
; Verify object linking codegen for named barriers on GFX1250:
4+
; 1. Barrier instructions use M0-based forms with relocation references
5+
; 2. group_segment_fixed_size = 0 (linker patches it)
6+
; 3. Named barrier is emitted as an SHN_AMDGPU_LDS symbol (.amdgpu_lds)
7+
8+
@bar = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
9+
10+
; CHECK-LABEL: kernel:
11+
; CHECK: s_lshr_b32 s{{[0-9]+}}, __amdgpu_named_barrier.bar{{[^ @]*}}@abs32@lo, 4
12+
; CHECK: s_barrier_signal m0
13+
; CHECK: s_barrier_join m0
14+
; CHECK: s_barrier_wait 1
15+
16+
; Kernel descriptor: group_segment_fixed_size = 0 (the linker will patch it).
17+
; CHECK: .amdhsa_group_segment_fixed_size 0
18+
19+
; LDS symbol declaration
20+
; CHECK: .amdgpu_lds __amdgpu_named_barrier.bar{{[^ ,]*}}, 32, 4
21+
22+
define amdgpu_kernel void @kernel() {
23+
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3)
24+
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
25+
call void @llvm.amdgcn.s.barrier.wait(i16 1)
26+
call void @helper()
27+
ret void
28+
}
29+
30+
declare void @helper()
31+
declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #0
32+
declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #0
33+
declare void @llvm.amdgcn.s.barrier.wait(i16) #0
34+
35+
attributes #0 = { convergent nounwind }
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking < %s | FileCheck -check-prefixes=ASM %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=obj < %s | llvm-readobj -r --syms - | FileCheck -check-prefixes=ELF %s
3+
4+
; Test that with object linking enabled, external LDS declarations produce
5+
; @abs32@lo relocations, SHN_AMDGPU_LDS symbols, and .amdgpu_lds directives.
6+
; Covers multiple LDS variables with different sizes and alignments (including
7+
; zero-sized dynamic LDS), usage from both kernels and device functions, and
8+
; group_segment_fixed_size = 0 (the linker later sets the real value by binary patching).
9+
10+
@lds_large = external addrspace(3) global [256 x i8], align 16
11+
@lds_small = external addrspace(3) global [128 x i8], align 4
12+
@lds_dynamic = external addrspace(3) global [0 x i8], align 8
13+
14+
; --- Assembly checks ---
15+
; ASM-LABEL: {{^}}device_func:
16+
; ASM: v_add_u32_e32 v{{[0-9]+}}, lds_large@abs32@lo, v{{[0-9]+}}
17+
18+
; ASM-LABEL: {{^}}test_kernel:
19+
; ASM-DAG: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, lds_small@abs32@lo
20+
; ASM-DAG: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, lds_dynamic@abs32@lo
21+
22+
; ASM-DAG: .amdgpu_lds lds_large, 256, 16
23+
; ASM-DAG: .amdgpu_lds lds_small, 128, 4
24+
; ASM-DAG: .amdgpu_lds lds_dynamic, 0, 8
25+
26+
; ASM: .group_segment_fixed_size: 0
27+
28+
; --- ELF checks ---
29+
; ELF-DAG: R_AMDGPU_ABS32_LO lds_large
30+
; ELF-DAG: R_AMDGPU_ABS32_LO lds_small
31+
; ELF-DAG: R_AMDGPU_ABS32_LO lds_dynamic
32+
33+
; ELF-DAG: Name: lds_large
34+
; ELF-DAG: Name: lds_small
35+
; ELF-DAG: Name: lds_dynamic
36+
37+
define void @device_func(i32 %idx) {
38+
%gep = getelementptr [256 x i8], ptr addrspace(3) @lds_large, i32 0, i32 %idx
39+
store i8 1, ptr addrspace(3) %gep
40+
ret void
41+
}
42+
43+
define amdgpu_kernel void @test_kernel(i32 %idx) {
44+
%gep1 = getelementptr [128 x i8], ptr addrspace(3) @lds_small, i32 0, i32 %idx
45+
store i8 2, ptr addrspace(3) %gep1
46+
%gep2 = getelementptr [0 x i8], ptr addrspace(3) @lds_dynamic, i32 0, i32 %idx
47+
store i8 3, ptr addrspace(3) %gep2
48+
call void @device_func(i32 %idx)
49+
ret void
50+
}

0 commit comments

Comments
 (0)