Skip to content

Commit 63a26bc

Browse files
authored
Unrolled build for #146181
Rollup merge of #146181 - Flakebi:dynamic-shared-memory, r=ZuseZ4,Sa4dus,workingjubilee,RalfJung,nikic,kjetilkjeka,kulst Add intrinsic for launch-sized workgroup memory on GPUs Workgroup memory is a memory region that is shared between all threads in a workgroup on GPUs. Workgroup memory can be allocated statically or after compilation, when launching a gpu-kernel. The intrinsic added here returns the pointer to the memory that is allocated at launch-time. # Interface With this change, workgroup memory can be accessed in Rust by calling the new `gpu_launch_sized_workgroup_mem<T>() -> *mut T` intrinsic. It returns the pointer to workgroup memory guaranteeing that it is aligned to at least the alignment of `T`. The pointer is dereferenceable for the size specified when launching the current gpu-kernel (which may be the size of `T` but can also be larger or smaller or zero). All calls to this intrinsic return a pointer to the same address. See the intrinsic documentation for more details. ## Alternative Interfaces It was also considered to expose dynamic workgroup memory as extern static variables in Rust, like they are represented in LLVM IR. However, due to the pointer not being guaranteed to be dereferenceable (that depends on the allocated size at runtime), such a global must be zero-sized, which makes global variables a bad fit. # Implementation Details Workgroup memory in amdgpu and nvptx lives in address space 3. Workgroup memory from a launch is implemented by creating an external global variable in address space 3. The global is declared with size 0, as the actual size is only known at runtime. It is defined behavior in LLVM to access an external global outside the defined size. There is no similar way to get the allocated size of launch-sized workgroup memory on amdgpu and nvptx, so users have to pass this out-of-band or rely on target-specific ways for now. Tracking issue: #135516
2 parents 9838411 + 13ec3de commit 63a26bc

11 files changed

Lines changed: 193 additions & 9 deletions

File tree

compiler/rustc_abi/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1753,6 +1753,9 @@ pub struct AddressSpace(pub u32);
17531753
impl AddressSpace {
17541754
/// LLVM's `0` address space.
17551755
pub const ZERO: Self = AddressSpace(0);
1756+
/// The address space for workgroup memory on nvptx and amdgpu.
1757+
/// See e.g. the `gpu_launch_sized_workgroup_mem` intrinsic for details.
1758+
pub const GPU_WORKGROUP: Self = AddressSpace(3);
17561759
}
17571760

17581761
/// How many scalable vectors are in a `BackendRepr::ScalableVector`?

compiler/rustc_codegen_llvm/src/declare.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
use std::borrow::Borrow;
1515

1616
use itertools::Itertools;
17+
use rustc_abi::AddressSpace;
1718
use rustc_codegen_ssa::traits::{MiscCodegenMethods, TypeMembershipCodegenMethods};
1819
use rustc_data_structures::fx::FxIndexSet;
1920
use rustc_middle::ty::{Instance, Ty};
@@ -104,6 +105,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
104105
)
105106
}
106107
}
108+
109+
/// Declare a global value in a specific address space.
110+
///
111+
/// If there’s a value with the same name already declared, the function will
112+
/// return its Value instead.
113+
pub(crate) fn declare_global_in_addrspace(
114+
&self,
115+
name: &str,
116+
ty: &'ll Type,
117+
addr_space: AddressSpace,
118+
) -> &'ll Value {
119+
debug!("declare_global(name={name:?}, addrspace={addr_space:?})");
120+
unsafe {
121+
llvm::LLVMRustGetOrInsertGlobalInAddrspace(
122+
(**self).borrow().llmod,
123+
name.as_c_char_ptr(),
124+
name.len(),
125+
ty,
126+
addr_space.0,
127+
)
128+
}
129+
}
107130
}
108131

109132
impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {

compiler/rustc_codegen_llvm/src/intrinsic.rs

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ use std::ffi::c_uint;
33
use std::{assert_matches, iter, ptr};
44

55
use rustc_abi::{
6-
Align, BackendRepr, Float, HasDataLayout, Integer, NumScalableVectors, Primitive, Size,
7-
WrappingRange,
6+
AddressSpace, Align, BackendRepr, Float, HasDataLayout, Integer, NumScalableVectors, Primitive,
7+
Size, WrappingRange,
88
};
99
use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh};
1010
use rustc_codegen_ssa::common::{IntPredicate, TypeKind};
@@ -178,6 +178,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
178178
span: Span,
179179
) -> Result<(), ty::Instance<'tcx>> {
180180
let tcx = self.tcx;
181+
let llvm_version = crate::llvm_util::get_version();
181182

182183
let name = tcx.item_name(instance.def_id());
183184
let fn_args = instance.args;
@@ -194,7 +195,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
194195
| sym::maximum_number_nsz_f64
195196
| sym::maximum_number_nsz_f128
196197
// Need at least LLVM 22 for `min/maximumnum` to not crash LLVM.
197-
if crate::llvm_util::get_version() >= (22, 0, 0) =>
198+
if llvm_version >= (22, 0, 0) =>
198199
{
199200
let intrinsic_name = if name.as_str().starts_with("min") {
200201
"llvm.minimumnum"
@@ -420,7 +421,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
420421
}
421422

422423
// FIXME move into the branch below when LLVM 22 is the lowest version we support.
423-
sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => {
424+
sym::carryless_mul if llvm_version >= (22, 0, 0) => {
424425
let ty = args[0].layout.ty;
425426
if !ty.is_integral() {
426427
tcx.dcx().emit_err(InvalidMonomorphization::BasicIntegerType {
@@ -620,6 +621,46 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
620621
return Ok(());
621622
}
622623

624+
sym::gpu_launch_sized_workgroup_mem => {
625+
// Generate an anonymous global per call, with these properties:
626+
// 1. The global is in the address space for workgroup memory
627+
// 2. It is an `external` global
628+
// 3. It is correctly aligned for the pointee `T`
629+
// All instances of extern addrspace(gpu_workgroup) globals are merged in the LLVM backend.
630+
// The name is irrelevant.
631+
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared
632+
let name = if llvm_version < (23, 0, 0) && tcx.sess.target.arch == Arch::Nvptx64 {
633+
// The auto-assigned name for extern shared globals in the nvptx backend does
634+
// not compile in ptxas. Workaround this issue by assigning a name.
635+
// Fixed in LLVM 23.
636+
"gpu_launch_sized_workgroup_mem"
637+
} else {
638+
""
639+
};
640+
let global = self.declare_global_in_addrspace(
641+
name,
642+
self.type_array(self.type_i8(), 0),
643+
AddressSpace::GPU_WORKGROUP,
644+
);
645+
let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() };
646+
// The alignment of the global is used to specify the *minimum* alignment that
647+
// must be obeyed by the GPU runtime.
648+
// When multiple of these global variables are used by a kernel, the maximum alignment is taken.
649+
// See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821
650+
let alignment = self.align_of(*inner_ty).bytes() as u32;
651+
unsafe {
652+
// FIXME Workaround the above issue by taking maximum alignment if the global existed
653+
if tcx.sess.target.arch == Arch::Nvptx64 {
654+
if alignment > llvm::LLVMGetAlignment(global) {
655+
llvm::LLVMSetAlignment(global, alignment);
656+
}
657+
} else {
658+
llvm::LLVMSetAlignment(global, alignment);
659+
}
660+
}
661+
self.cx().const_pointercast(global, self.type_ptr())
662+
}
663+
623664
sym::amdgpu_dispatch_ptr => {
624665
let val = self.call_intrinsic("llvm.amdgcn.dispatch.ptr", &[], &[]);
625666
// Relying on `LLVMBuildPointerCast` to produce an addrspacecast

compiler/rustc_codegen_llvm/src/llvm/ffi.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2003,6 +2003,13 @@ unsafe extern "C" {
20032003
NameLen: size_t,
20042004
T: &'a Type,
20052005
) -> &'a Value;
2006+
pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>(
2007+
M: &'a Module,
2008+
Name: *const c_char,
2009+
NameLen: size_t,
2010+
T: &'a Type,
2011+
AddressSpace: c_uint,
2012+
) -> &'a Value;
20062013
pub(crate) fn LLVMRustGetNamedValue(
20072014
M: &Module,
20082015
Name: *const c_char,

compiler/rustc_codegen_ssa/src/mir/intrinsic.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
111111
sym::abort
112112
| sym::unreachable
113113
| sym::cold_path
114+
| sym::gpu_launch_sized_workgroup_mem
114115
| sym::breakpoint
115116
| sym::amdgpu_dispatch_ptr
116117
| sym::assert_zero_valid

compiler/rustc_hir_analysis/src/check/intrinsic.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
130130
| sym::forget
131131
| sym::frem_algebraic
132132
| sym::fsub_algebraic
133+
| sym::gpu_launch_sized_workgroup_mem
133134
| sym::is_val_statically_known
134135
| sym::log2f16
135136
| sym::log2f32
@@ -297,6 +298,7 @@ pub(crate) fn check_intrinsic_type(
297298
sym::field_offset => (1, 0, vec![], tcx.types.usize),
298299
sym::rustc_peek => (1, 0, vec![param(0)], param(0)),
299300
sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()),
301+
sym::gpu_launch_sized_workgroup_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))),
300302
sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => {
301303
(1, 0, vec![], tcx.types.unit)
302304
}

compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -299,10 +299,12 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M,
299299
.getCallee());
300300
}
301301

302-
extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
303-
const char *Name,
304-
size_t NameLen,
305-
LLVMTypeRef Ty) {
302+
// Get the global variable with the given name if it exists or create a new
303+
// external global.
304+
extern "C" LLVMValueRef
305+
LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name,
306+
size_t NameLen, LLVMTypeRef Ty,
307+
unsigned int AddressSpace) {
306308
Module *Mod = unwrap(M);
307309
auto NameRef = StringRef(Name, NameLen);
308310

@@ -313,10 +315,24 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
313315
GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true);
314316
if (!GV)
315317
GV = new GlobalVariable(*Mod, unwrap(Ty), false,
316-
GlobalValue::ExternalLinkage, nullptr, NameRef);
318+
GlobalValue::ExternalLinkage, nullptr, NameRef,
319+
nullptr, GlobalValue::NotThreadLocal, AddressSpace);
317320
return wrap(GV);
318321
}
319322

323+
// Get the global variable with the given name if it exists or create a new
324+
// external global.
325+
extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
326+
const char *Name,
327+
size_t NameLen,
328+
LLVMTypeRef Ty) {
329+
Module *Mod = unwrap(M);
330+
unsigned int AddressSpace =
331+
Mod->getDataLayout().getDefaultGlobalsAddressSpace();
332+
return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty,
333+
AddressSpace);
334+
}
335+
320336
// Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`.
321337
enum class LLVMRustAttributeKind {
322338
AlwaysInline = 0,

compiler/rustc_span/src/symbol.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1033,6 +1033,7 @@ symbols! {
10331033
global_asm,
10341034
global_registration,
10351035
globs,
1036+
gpu_launch_sized_workgroup_mem,
10361037
gt,
10371038
guard,
10381039
guard_patterns,

library/core/src/intrinsics/gpu.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,51 @@
55
66
#![unstable(feature = "gpu_intrinsics", issue = "none")]
77

8+
/// Returns the pointer to workgroup memory allocated at launch-time on GPUs.
9+
///
10+
/// Workgroup memory is a memory region that is shared between all threads in
11+
/// the same workgroup. It is faster to access than other memory but pointers do not
12+
/// work outside the workgroup where they were obtained.
13+
/// Workgroup memory can be allocated statically or after compilation, when
14+
/// launching a gpu-kernel. `gpu_launch_sized_workgroup_mem` returns the pointer to
15+
/// the memory that is allocated at launch-time.
16+
/// The size of this memory can differ between launches of a gpu-kernel, depending on
17+
/// what is specified at launch-time.
18+
/// However, the alignment is fixed by the kernel itself, at compile-time.
19+
///
20+
/// The returned pointer is the start of the workgroup memory region that is
21+
/// allocated at launch-time.
22+
/// All calls to `gpu_launch_sized_workgroup_mem` in a workgroup, independent of the
23+
/// generic type, return the same address, so alias the same memory.
24+
/// The returned pointer is aligned by at least the alignment of `T`.
25+
///
26+
/// If `gpu_launch_sized_workgroup_mem` is invoked multiple times with different
27+
/// types that have different alignment, then you may only rely on the resulting
28+
/// pointer having the alignment of `T` after a call to `gpu_launch_sized_workgroup_mem::<T>`
29+
/// has occurred in the current program execution.
30+
///
31+
/// # Safety
32+
///
33+
/// The pointer is safe to dereference from the start (the returned pointer) up to the
34+
/// size of workgroup memory that was specified when launching the current gpu-kernel.
35+
/// This allocated size is not related in any way to `T`.
36+
///
37+
/// The user must take care of synchronizing access to workgroup memory between
38+
/// threads in a workgroup. The usual data race requirements apply.
39+
///
40+
/// # Other APIs
41+
///
42+
/// CUDA and HIP call this dynamic shared memory, shared between threads in a block.
43+
/// OpenCL and SYCL call this local memory, shared between threads in a work-group.
44+
/// GLSL calls this shared memory, shared between invocations in a work group.
45+
/// DirectX calls this groupshared memory, shared between threads in a thread-group.
46+
#[must_use = "returns a pointer that does nothing unless used"]
47+
#[rustc_intrinsic]
48+
#[rustc_nounwind]
49+
#[unstable(feature = "gpu_launch_sized_workgroup_mem", issue = "135513")]
50+
#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))]
51+
pub fn gpu_launch_sized_workgroup_mem<T>() -> *mut T;
52+
853
/// Returns a pointer to the HSA kernel dispatch packet.
954
///
1055
/// A `gpu-kernel` on amdgpu is always launched through a kernel dispatch packet.

src/tools/tidy/src/style.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,10 @@ fn should_ignore(line: &str) -> bool {
222222
|| static_regex!(
223223
"\\s*//@ \\!?(count|files|has|has-dir|hasraw|matches|matchesraw|snapshot)\\s.*"
224224
).is_match(line)
225+
// Matching for FileCheck checks
226+
|| static_regex!(
227+
"\\s*// [a-zA-Z0-9-_]*:\\s.*"
228+
).is_match(line)
225229
}
226230

227231
/// Returns `true` if `line` is allowed to be longer than the normal limit.

0 commit comments

Comments
 (0)