diff --git a/.github/workflows/container_images.yml b/.github/workflows/container_images.yml
index 84fce27e..c056d144 100644
--- a/.github/workflows/container_images.yml
+++ b/.github/workflows/container_images.yml
@@ -7,6 +7,8 @@ on:
       - ".github/workflows/container_images.yml"
       - "container/**"
   push:
+    branches:
+      - main
     paths:
       - ".github/workflows/container_images.yml"
       - "container/**"
@@ -162,4 +164,4 @@ jobs:
           $(printf '${{ env.REGISTRY }}/${{ matrix.variance.image }}@sha256:%s ' *)
       - name: Inspect image
         run: |
-          docker buildx imagetools inspect ${{ env.REGISTRY }}/${{ matrix.variance.image }}:${{ steps.meta.outputs.version }}
\ No newline at end of file
+          docker buildx imagetools inspect ${{ env.REGISTRY }}/${{ matrix.variance.image }}:${{ steps.meta.outputs.version }}
diff --git a/.github/workflows/deploy_guide.yml b/.github/workflows/deploy_guide.yml
index 3d3ce319..cb7cf6bb 100644
--- a/.github/workflows/deploy_guide.yml
+++ b/.github/workflows/deploy_guide.yml
@@ -12,7 +12,6 @@ concurrency:
 jobs:
   deploy:
     # Only run on the main repository, not on forks
-    if: github.repository == 'rust-gpu/Rust-CUDA'
     runs-on: ubuntu-latest
     environment:
       name: github-pages
diff --git a/README.md b/README.md
index 9e183f68..d48d016f 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ The current line-up of libraries is the following:
 - `cuda_std` for GPU-side functions and utilities, such as thread index queries, memory allocation, warp intrinsics, etc.
   - _Not_ a low level library, provides many utility functions to make it easier to write cleaner and more reliable GPU kernels.
   - Closely tied to `rustc_codegen_nvvm` which exposes GPU features through it internally.
-- [`cudnn`](https://github.com/Rust-GPU/Rust-CUDA/tree/master/crates/cudnn) for a collection of GPU-accelerated primitives for deep neural networks.
+- [`cudnn`](https://github.com/Rust-GPU/rust-cuda/tree/master/crates/cudnn) for a collection of GPU-accelerated primitives for deep neural networks.
 - `cust` for CPU-side CUDA features such as launching GPU kernels, GPU memory allocation, device queries, etc.
   - High level with features such as RAII and Rust Results that make it easier and cleaner to manage the interface to the GPU.
   - A high level wrapper for the CUDA Driver API, the lower level version of the more common CUDA Runtime API used from C++.
@@ -92,7 +92,7 @@ Other projects related to using Rust on the GPU:
 cargo build
 ```

-## Use Rust-CUDA in Container Environments
+## Use Rust CUDA in Container Environments

 The distribution related Dockerfile are located in `container` folder. Taking ubuntu 24.04 as an
 example, run the following command in repository root:
diff --git a/container/rockylinux9-cuda12/Dockerfile b/container/rockylinux9-cuda12/Dockerfile
index a63b1c37..8e0e4444 100644
--- a/container/rockylinux9-cuda12/Dockerfile
+++ b/container/rockylinux9-cuda12/Dockerfile
@@ -72,8 +72,8 @@ RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"

 # Setup the workspace
-WORKDIR /data/Rust-CUDA
-RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
     rustup show

 # Add nvvm to LD_LIBRARY_PATH.
diff --git a/container/ubuntu22-cuda11/Dockerfile b/container/ubuntu22-cuda11/Dockerfile
index 84d08b6b..a4df7c66 100644
--- a/container/ubuntu22-cuda11/Dockerfile
+++ b/container/ubuntu22-cuda11/Dockerfile
@@ -71,8 +71,8 @@ RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"

 # Setup the workspace
-WORKDIR /data/Rust-CUDA
-RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
     rustup show

 # Add nvvm to LD_LIBRARY_PATH.
diff --git a/container/ubuntu22-cuda12/Dockerfile b/container/ubuntu22-cuda12/Dockerfile
index 7d66ff9c..df8fbb82 100644
--- a/container/ubuntu22-cuda12/Dockerfile
+++ b/container/ubuntu22-cuda12/Dockerfile
@@ -71,8 +71,8 @@ RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"

 # Setup the workspace
-WORKDIR /data/Rust-CUDA
-RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
     rustup show

 # Add nvvm to LD_LIBRARY_PATH.
diff --git a/container/ubuntu24-cuda12/Dockerfile b/container/ubuntu24-cuda12/Dockerfile
index 01365070..b4ba12f5 100644
--- a/container/ubuntu24-cuda12/Dockerfile
+++ b/container/ubuntu24-cuda12/Dockerfile
@@ -71,8 +71,8 @@ RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"

 # Setup the workspace
-WORKDIR /data/Rust-CUDA
-RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
     rustup show

 # Add nvvm to LD_LIBRARY_PATH.
diff --git a/crates/blastoff/Cargo.toml b/crates/blastoff/Cargo.toml
index a3e377cc..23340f1b 100644
--- a/crates/blastoff/Cargo.toml
+++ b/crates/blastoff/Cargo.toml
@@ -3,7 +3,7 @@ name = "blastoff"
 version = "0.1.0"
 edition = "2021"
 authors = ["Riccardo D'Ambrosio "]
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"

 [dependencies]
 bitflags = "2.8"
diff --git a/crates/cuda_builder/Cargo.toml b/crates/cuda_builder/Cargo.toml
index 8095b79e..d412ea79 100644
--- a/crates/cuda_builder/Cargo.toml
+++ b/crates/cuda_builder/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"
 authors = ["Riccardo D'Ambrosio ", "The rust-gpu Authors"]
 license = "MIT OR Apache-2.0"
 description = "Builder for easily building rustc_codegen_nvvm crates"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/cuda_std/Cargo.toml b/crates/cuda_std/Cargo.toml
index 209929ce..42c06ab2 100644
--- a/crates/cuda_std/Cargo.toml
+++ b/crates/cuda_std/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.2.2"
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "Standard library for CUDA with rustc_codegen_nvvm"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/cuda_std_macros/Cargo.toml b/crates/cuda_std_macros/Cargo.toml
index 45e92cdc..4557dcff 100644
--- a/crates/cuda_std_macros/Cargo.toml
+++ b/crates/cuda_std_macros/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.2.0"
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "Macros for cuda_std"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [lib]
diff --git a/crates/cudnn-sys/Cargo.toml b/crates/cudnn-sys/Cargo.toml
index 0c48edf6..dbedaecb 100644
--- a/crates/cudnn-sys/Cargo.toml
+++ b/crates/cudnn-sys/Cargo.toml
@@ -3,7 +3,7 @@ name = "cudnn-sys"
 version = "0.1.0"
 edition = "2024"
 license = "MIT OR Apache-2.0"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"
 links = "cudnn"
 build = "build/main.rs"
diff --git a/crates/cust/CHANGELOG.md b/crates/cust/CHANGELOG.md
index 904995ab..00951dbf 100644
--- a/crates/cust/CHANGELOG.md
+++ b/crates/cust/CHANGELOG.md
@@ -145,7 +145,7 @@ it much easier to write multigpu code. The CUDA API is fully thread-safe except
 - Added `MemoryAdvise::advise_read_mostly`.
 - Added `MemoryAdvise::preferred_location` and `MemoryAdvise::unset_preferred_location`.
   Note that advising APIs are only present on high end GPUs such as V100s.
-- `StreamFlags::NON_BLOCKING` has been temporarily disabled because of [soundness concerns](https://github.com/Rust-GPU/Rust-CUDA/issues/15).
+- `StreamFlags::NON_BLOCKING` has been temporarily disabled because of [soundness concerns](https://github.com/Rust-GPU/rust-cuda/issues/15).

 ## 0.2.0 - 11/26/21
diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml
index 8882dae3..370fe331 100644
--- a/crates/cust/Cargo.toml
+++ b/crates/cust/Cargo.toml
@@ -9,7 +9,7 @@ authors = [
 ]
 edition = "2021"
 license = "MIT OR Apache-2.0"
 description = "High level bindings to the CUDA Driver API"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs
index 284baa98..3ee5c5f1 100644
--- a/crates/cust/src/stream.rs
+++ b/crates/cust/src/stream.rs
@@ -30,7 +30,7 @@ bitflags::bitflags! {
         /// This stream does not synchronize with the NULL stream.
         ///
-        /// **Note: this flag has been temporarily disabled because of [soundness concerns](https://github.com/Rust-GPU/Rust-CUDA/issues/15)**
+        /// **Note: this flag has been temporarily disabled because of [soundness concerns](https://github.com/Rust-GPU/rust-cuda/issues/15)**
         ///
         /// Note that the name is chosen to correspond to CUDA documentation, but is nevertheless
         /// misleading. All work within a single stream is ordered and asynchronous regardless
@@ -93,7 +93,7 @@ impl Stream {
     /// # }
     /// ```
     pub fn new(mut flags: StreamFlags, priority: Option<i32>) -> CudaResult<Self> {
-        // NOTE(RDambrosio016): See https://github.com/Rust-GPU/Rust-CUDA/issues/15
+        // NOTE(RDambrosio016): See https://github.com/Rust-GPU/rust-cuda/issues/15
         flags.remove(StreamFlags::NON_BLOCKING);
         unsafe {
             let mut stream = Stream {
diff --git a/crates/cust_core/Cargo.toml b/crates/cust_core/Cargo.toml
index c4b9d2c7..60e2d17f 100644
--- a/crates/cust_core/Cargo.toml
+++ b/crates/cust_core/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.1"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 description = "Core library for cust that can be shared across CPU and GPU"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/cust_derive/Cargo.toml b/crates/cust_derive/Cargo.toml
index d4d908af..207f7207 100644
--- a/crates/cust_derive/Cargo.toml
+++ b/crates/cust_derive/Cargo.toml
@@ -5,7 +5,7 @@ authors = ["Brook Heisler ", "Riccardo D'Ambrosio "]
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "High level bindings to libnvvm"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/optix-sys/Cargo.toml b/crates/optix-sys/Cargo.toml
index 494b09ed..562690be 100644
--- a/crates/optix-sys/Cargo.toml
+++ b/crates/optix-sys/Cargo.toml
@@ -3,7 +3,7 @@ name = "optix-sys"
 version = "0.1.0"
 edition = "2024"
 license = "MIT OR Apache-2.0"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"
 links = "optix"
 build = "build/main.rs"
diff --git a/crates/optix/Cargo.toml b/crates/optix/Cargo.toml
index 600fdaf8..67b3c95d 100644
--- a/crates/optix/Cargo.toml
+++ b/crates/optix/Cargo.toml
@@ -3,7 +3,7 @@ name = "optix"
 version = "0.1.0"
 edition = "2021"
 license = "MIT OR Apache-2.0"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"
 authors = ["Anders Langlands ", "Riccardo D'Ambrosio "]
diff --git a/crates/optix_device_macros/Cargo.toml b/crates/optix_device_macros/Cargo.toml
index 0c78df26..769b8cf3 100644
--- a/crates/optix_device_macros/Cargo.toml
+++ b/crates/optix_device_macros/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "Macros for optix_device"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [lib]
diff --git a/crates/ptx/Cargo.toml b/crates/ptx/Cargo.toml
index 559badfa..eff4da3c 100644
--- a/crates/ptx/Cargo.toml
+++ b/crates/ptx/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "PTX parser and analyzer"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/ptx_compiler/Cargo.toml b/crates/ptx_compiler/Cargo.toml
index 0785596a..fc6c2e28 100644
--- a/crates/ptx_compiler/Cargo.toml
+++ b/crates/ptx_compiler/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.1"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 description = "High level bindings to CUDA's ptx compilation APIs"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/ptx_compiler/src/lib.rs b/crates/ptx_compiler/src/lib.rs
index a4c25273..8fae5da3 100644
--- a/crates/ptx_compiler/src/lib.rs
+++ b/crates/ptx_compiler/src/lib.rs
@@ -102,6 +102,7 @@ impl CompilerFailure {
             .to_result()?;
         let size = size.assume_init();
         let mut vec = Vec::with_capacity(size);
+        #[allow(clippy::unnecessary_cast)]
         nvptx_compiler_sys::nvPTXCompilerGetErrorLog(
             self.handle,
             vec.as_mut_ptr() as *mut c_char,
@@ -138,6 +139,7 @@ impl CompiledProgram {
             .to_result()?;
         let size = size.assume_init();
         let mut vec = Vec::with_capacity(size);
+        #[allow(clippy::unnecessary_cast)]
         nvptx_compiler_sys::nvPTXCompilerGetInfoLog(
             self.handle,
             vec.as_mut_ptr() as *mut c_char,
diff --git a/crates/rustc_codegen_nvvm/Cargo.toml b/crates/rustc_codegen_nvvm/Cargo.toml
index 825b32c2..f0c4dcfd 100644
--- a/crates/rustc_codegen_nvvm/Cargo.toml
+++ b/crates/rustc_codegen_nvvm/Cargo.toml
@@ -8,7 +8,7 @@ authors = [
 edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "A codegen backend for Rustc which targets the libnvvm CUDA library"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [lib]
diff --git a/crates/rustc_codegen_nvvm/src/builder.rs b/crates/rustc_codegen_nvvm/src/builder.rs
index 7d32c553..e011e14c 100644
--- a/crates/rustc_codegen_nvvm/src/builder.rs
+++ b/crates/rustc_codegen_nvvm/src/builder.rs
@@ -6,7 +6,7 @@ use libc::{c_char, c_uint};
 use rustc_abi as abi;
 use rustc_abi::{AddressSpace, Align, HasDataLayout, Size, TargetDataLayout, WrappingRange};
 use rustc_codegen_ssa::MemFlags;
-use rustc_codegen_ssa::common::{IntPredicate, RealPredicate, TypeKind};
+use rustc_codegen_ssa::common::{AtomicRmwBinOp, IntPredicate, RealPredicate, TypeKind};
 use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
 use rustc_codegen_ssa::mir::place::PlaceRef;
 use rustc_codegen_ssa::traits::*;
@@ -213,9 +213,9 @@ macro_rules! math_builder_methods {
             _ => {
                 self.cx.fatal(format!(
                     "Unimplemented 128-bit integer operation '{}' with {} arguments. \
-                     This operation is not yet supported in Rust-CUDA. \
+                     This operation is not yet supported in Rust CUDA. \
                      Consider using 64-bit integers or filing an issue at \
-                     https://github.com/Rust-GPU/Rust-CUDA/issues",
+                     https://github.com/Rust-GPU/rust-cuda/issues",
                     stringify!($name),
                     args_vec.len()
                 ));
@@ -546,30 +546,13 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     fn atomic_load(
         &mut self,
-        _ty: &'ll Type,
+        ty: &'ll Type,
         ptr: &'ll Value,
-        _order: AtomicOrdering,
+        order: AtomicOrdering,
         _size: Size,
     ) -> &'ll Value {
-        // core seems to think that nvptx has atomic loads, which is not true for NVVM IR,
-        // therefore our only option is to print that this is not supported then trap.
-        // i have heard of cursed things such as emulating this with __threadfence and volatile loads
-        // but that needs to be experimented with in terms of safety and behavior.
-        // NVVM has explicit intrinsics for adding and subtracting floats which we expose elsewhere
-
-        // TODO(RDambrosio016): is there a way we can just generate a panic with a message instead
-        // of doing this ourselves? since all panics will be aborts, it should be equivalent
-        // let message = "Atomic Loads are not supported in CUDA.\0";
-
-        // let vprintf = self.get_intrinsic("vprintf");
-        // let formatlist = self.const_str(Symbol::intern(message)).0;
-        // let valist = self.const_null(self.type_void());
-
-        // self.call(vprintf, &[formatlist, valist], None);
-
-        let (ty, f) = self.get_intrinsic("llvm.trap");
-        self.call(ty, None, None, f, &[], None, None);
-        unsafe { llvm::LLVMBuildLoad(self.llbuilder, ptr, unnamed()) }
+        // Since for any A, A | 0 = A, and performing atomics on constant memory is UB in Rust,
+        // we can abuse `or` to perform atomic reads.
+        self.atomic_rmw(AtomicRmwBinOp::AtomicOr, ptr, self.const_int(ty, 0), order)
     }

     fn load_operand(&mut self, place: PlaceRef<'tcx, &'ll Value>) -> OperandRef<'tcx, &'ll Value> {
@@ -796,24 +779,13 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     fn atomic_store(
         &mut self,
-        _val: &'ll Value,
+        val: &'ll Value,
         ptr: &'ll Value,
-        _order: AtomicOrdering,
+        order: AtomicOrdering,
         _size: Size,
     ) {
-        // see comment in atomic_load
-
-        // let message = "Atomic Stores are not supported in CUDA.\0";
-
-        // let vprintf = self.get_intrinsic("vprintf");
-        // let formatlist = self.const_str(Symbol::intern(message)).0;
-        // let valist = self.const_null(self.type_void());
-
-        // self.call(vprintf, &[formatlist, valist], None);
-        self.abort();
-        unsafe {
-            llvm::LLVMBuildLoad(self.llbuilder, ptr, UNNAMED);
-        }
+        // We can exchange *ptr with val, and then discard the result.
+        self.atomic_rmw(AtomicRmwBinOp::AtomicXchg, ptr, val, order);
     }

     fn gep(&mut self, ty: &'ll Type, ptr: &'ll Value, indices: &[&'ll Value]) -> &'ll Value {
@@ -1134,26 +1106,155 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     // Atomic Operations
     fn atomic_cmpxchg(
         &mut self,
-        _dst: &'ll Value,
-        _cmp: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
-        _failure_order: AtomicOrdering,
-        _weak: bool,
+        dst: &'ll Value,
+        cmp: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
+        failure_order: AtomicOrdering,
+        weak: bool,
     ) -> (&'ll Value, &'ll Value) {
-        // allowed but only for some things and with restrictions
-        // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#cmpxchg-instruction
-        self.fatal("atomic cmpxchg is not supported")
+        // The LLVM verifier rejects cases where the `failure_order` is stronger than `order`.
+        match (order, failure_order) {
+            // Failure orders `Release` & `AcqRel` are simply invalid.
+            (_, AtomicOrdering::Release | AtomicOrdering::AcqRel) => {
+                self.abort();
+                return (
+                    self.const_undef(self.val_ty(cmp)),
+                    self.const_undef(self.type_i1()),
+                );
+            }
+            // Success & failure orderings are the same - OK.
+            (AtomicOrdering::SeqCst, AtomicOrdering::SeqCst)
+            | (AtomicOrdering::Relaxed, AtomicOrdering::Relaxed)
+            | (AtomicOrdering::Acquire, AtomicOrdering::Acquire) => (),
+            // Failure is `SeqCst` (strongest) & success is anything else (weaker) - reject.
+            (_, AtomicOrdering::SeqCst) => {
+                self.abort();
+                return (
+                    self.const_undef(self.val_ty(cmp)),
+                    self.const_undef(self.type_i1()),
+                );
+            }
+            // Failure is `Relaxed` (weakest), and success is anything - OK.
+            (_, AtomicOrdering::Relaxed) => (),
+            // Failure is anything, and success is `SeqCst` (strongest) - OK.
+            (AtomicOrdering::SeqCst, _) => (),
+            // Failure is `Acquire`, and success is `Release` - OK.
+            (AtomicOrdering::Release, AtomicOrdering::Acquire) => (),
+            // Success is `AcqRel` & failure is `Acquire` - OK.
+            (AtomicOrdering::AcqRel, AtomicOrdering::Acquire) => (),
+            // Success is weaker than failure - reject.
+            (AtomicOrdering::Relaxed, AtomicOrdering::Acquire) => {
+                self.abort();
+                return (
+                    self.const_undef(self.val_ty(cmp)),
+                    self.const_undef(self.type_i1()),
+                );
+            }
+        };
+        let res = self.atomic_op(
+            dst,
+            |builder, dst| {
+                // We are in a supported address space - just use ordinary atomics
+                unsafe {
+                    llvm::LLVMRustBuildAtomicCmpXchg(
+                        builder.llbuilder,
+                        dst,
+                        cmp,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        crate::llvm::AtomicOrdering::from_generic(failure_order),
+                        weak as u32,
+                    )
+                }
+            },
+            |builder, dst| {
+                // Local space is only accessible to the current thread.
+                // So, there are no synchronization issues, and we can emulate it using a simple load / compare / store.
+                let load: &'ll Value =
+                    unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let compare = builder.icmp(IntPredicate::IntEQ, load, cmp);
+                // We can do something smart & branchless here:
+                // We select either the current value (if the comparison fails), or a new value.
+                // We then *unconditionally* write that back to local memory (which is very, very cheap).
+                // TODO: measure if this has a positive impact, or if we should just use more blocks, and conditional writes.
+                let value = builder.select(compare, src, load);
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, value, dst) };
+                let res_type =
+                    builder.type_struct(&[builder.val_ty(cmp), builder.type_ix(1)], false);
+                // We pack the result, to match the behaviour of proper atomics / emulated thread-local atomics.
+                let res = builder.const_undef(res_type);
+                let res = builder.insert_value(res, load, 0);
+                builder.insert_value(res, compare, 1)
+            },
+        );
+        // Unpack the result
+        let val = self.extract_value(res, 0);
+        let success = self.extract_value(res, 1);
+        (val, success)
     }

     fn atomic_rmw(
         &mut self,
-        _op: rustc_codegen_ssa::common::AtomicRmwBinOp,
-        _dst: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
+        op: AtomicRmwBinOp,
+        dst: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
     ) -> &'ll Value {
-        // see cmpxchg comment
-        self.fatal("atomic rmw is not supported")
+        if matches!(op, AtomicRmwBinOp::AtomicNand) {
+            self.fatal("Atomic NAND not supported yet!")
+        }
+        self.atomic_op(
+            dst,
+            |builder, dst| {
+                // We are in a supported address space - just use ordinary atomics
+                unsafe {
+                    llvm::LLVMBuildAtomicRMW(
+                        builder.llbuilder,
+                        op.into(),
+                        dst,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        0,
+                    )
+                }
+            },
+            |builder, dst| {
+                // Local space is only accessible to the current thread.
+                // So, there are no synchronization issues, and we can emulate it using a simple load / compare / store.
+                let load: &'ll Value =
+                    unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let next_val = match op {
+                    AtomicRmwBinOp::AtomicXchg => src,
+                    AtomicRmwBinOp::AtomicAdd => builder.add(load, src),
+                    AtomicRmwBinOp::AtomicSub => builder.sub(load, src),
+                    AtomicRmwBinOp::AtomicAnd => builder.and(load, src),
+                    AtomicRmwBinOp::AtomicNand => {
+                        let and = builder.and(load, src);
+                        builder.not(and)
+                    }
+                    AtomicRmwBinOp::AtomicOr => builder.or(load, src),
+                    AtomicRmwBinOp::AtomicXor => builder.xor(load, src),
+                    AtomicRmwBinOp::AtomicMax => {
+                        let is_src_bigger = builder.icmp(IntPredicate::IntSGT, src, load);
+                        builder.select(is_src_bigger, src, load)
+                    }
+                    AtomicRmwBinOp::AtomicMin => {
+                        let is_src_smaller = builder.icmp(IntPredicate::IntSLT, src, load);
+                        builder.select(is_src_smaller, src, load)
+                    }
+                    AtomicRmwBinOp::AtomicUMax => {
+                        let is_src_bigger = builder.icmp(IntPredicate::IntUGT, src, load);
+                        builder.select(is_src_bigger, src, load)
+                    }
+                    AtomicRmwBinOp::AtomicUMin => {
+                        let is_src_smaller = builder.icmp(IntPredicate::IntULT, src, load);
+                        builder.select(is_src_smaller, src, load)
+                    }
+                };
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, next_val, dst) };
+                load
+            },
+        )
     }

     fn atomic_fence(
@@ -1609,3 +1710,99 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
         }
     }
 }
+impl<'ll, 'tcx, 'a> Builder<'a, 'll, 'tcx> {
+    /// Implements a standard atomic, using LLVM intrinsics (in `atomic_supported`, if `dst` is in a supported address space)
+    /// or emulation (with `emulate_local`, if `dst` points to a thread-local address space).
+    /// FIXME(FractalFir): this code assumes all pointers are generic. Adjust it once we support address spaces.
+    fn atomic_op(
+        &mut self,
+        dst: &'ll Value,
+        atomic_supported: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+        emulate_local: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+    ) -> &'ll Value {
+        // (FractalFir) Atomics in CUDA have some limitations, and we have to work around them.
+        // For example, they are restricted in what address space they operate on.
+        // CUDA has 4 address spaces (and a generic one, which is a union of all of those).
+        // An atomic instruction can soundly operate on:
+        // 1. The global address space
+        // 2. The shared (cluster) address space.
+        // It can't operate on:
+        // 1. The const address space (atomics on consts are UB anyway)
+        // 2. The thread address space (which should be only accessible to 1 thread, anyway?)
+        // So, we do the following:
+        // 1. Check if the pointer is in one of the address spaces atomics support.
+        //    a) if so, we perform an atomic operation
+        // 2. Check if the pointer is in the thread-local address space. If it is, we use non-atomic ops here,
+        //    **ASSUMING** only the current thread can access thread-local memory. (FIXME: is this sound?)
+        // 3. If the pointer is not in a supported address space, and is not thread-local, then we bail, and trap.

+        // We check if the `dst` pointer is in the `global` address space.
+        let (isspacep_global_ty, isspacep_global_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.global");
+        let isspacep_global = self.call(
+            isspacep_global_ty,
+            None,
+            None,
+            isspacep_global_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // We check if the `dst` pointer is in the `shared` address space.
+        let (isspacep_shared_ty, isspacep_shared_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.shared");
+        let isspacep_shared = self.call(
+            isspacep_shared_ty,
+            None,
+            None,
+            isspacep_shared_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // Combine those to check if we are in a supported address space.
+        let atomic_supported_addrspace = self.or(isspacep_shared, isspacep_global);
+        // We create 2 blocks here: one we branch to if the atomic is in the right address space, and one we branch to otherwise.
+        let supported_bb = self.append_sibling_block("atomic_space_supported");
+        let unsupported_bb = self.append_sibling_block("atomic_space_unsupported");
+        self.cond_br(atomic_supported_addrspace, supported_bb, unsupported_bb);
+        // We also create a "merge" block we will jump to, after the atomic ops finish.
+        let merge_bb = self.append_sibling_block("atomic_op_done");
+        // Execute atomic op if supported, then jump to merge
+        self.switch_to_block(supported_bb);
+        let supported_res = atomic_supported(self, dst);
+        self.br(merge_bb);
+        // Check if the pointer is in the thread space. If so, we can emulate it.
+        self.switch_to_block(unsupported_bb);
+        let (isspacep_local_ty, isspacep_local_fn) = self.get_intrinsic("llvm.nvvm.isspacep.local");
+        let isspacep_local = self.call(
+            isspacep_local_ty,
+            None,
+            None,
+            isspacep_local_fn,
+            &[dst],
+            None,
+            None,
+        );
+        let local_bb = self.append_sibling_block("atomic_local_space");
+        let atomic_ub_bb = self.append_sibling_block("atomic_space_ub");
+        self.cond_br(isspacep_local, local_bb, atomic_ub_bb);
+        // The pointer is in the thread (local) space.
+        self.switch_to_block(local_bb);
+        let local_res = emulate_local(self, dst);
+        self.br(merge_bb);
+        // The pointer is neither in the supported address space, nor the local space.
+        // This is very likely UB. So, we trap here.
+        // TODO: should we print some kind of a message here? NVVM supports printf.
+        self.switch_to_block(atomic_ub_bb);
+        self.abort();
+        self.unreachable();
+        // The atomic impl has finished, and we can now switch to the merge_bb
+        self.switch_to_block(merge_bb);
+        self.phi(
+            self.val_ty(local_res),
+            &[supported_res, local_res],
+            &[supported_bb, local_bb],
+        )
+    }
+}
diff --git a/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs b/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs
index 170444b4..56e45053 100644
--- a/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs
+++ b/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs
@@ -449,5 +449,10 @@ impl<'ll> CodegenCx<'ll, '_> {
             "__nv_ynf",
             fn(t_i32, t_f32) -> t_f32
         );
+        // Address space checks
+        ifn!(map, "llvm.nvvm.isspacep.const", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.global", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.local", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.shared", fn(i8p) -> i1);
     }
 }
diff --git a/crates/rustc_codegen_nvvm/src/lib.rs b/crates/rustc_codegen_nvvm/src/lib.rs
index 825748ca..47c2065b 100644
--- a/crates/rustc_codegen_nvvm/src/lib.rs
+++ b/crates/rustc_codegen_nvvm/src/lib.rs
@@ -89,7 +89,7 @@ use std::ffi::CString;
 #[unsafe(no_mangle)]
 pub fn __rustc_codegen_backend() -> Box<dyn CodegenBackend> {
     rustc_driver::install_ice_hook(
-        "https://github.com/Rust-GPU/Rust-CUDA/issues/new",
+        "https://github.com/Rust-GPU/rust-cuda/issues/new",
         |handler| {
             handler.handle().note(concat!(
                 "`rust-cuda` version `",
diff --git a/crates/rustc_codegen_nvvm/src/llvm.rs b/crates/rustc_codegen_nvvm/src/llvm.rs
index a0243eed..37c14ee8 100644
--- a/crates/rustc_codegen_nvvm/src/llvm.rs
+++ b/crates/rustc_codegen_nvvm/src/llvm.rs
@@ -16,9 +16,9 @@
 // but likely will use in the future, so we ignore any unused functions
 // in case we need them in the future for things like debug info or LTO.
 #![allow(dead_code)]
-
 use libc::{c_char, c_uint, c_void, size_t};
 use libc::{c_int, c_ulonglong};
+use rustc_codegen_ssa::common::AtomicRmwBinOp;
 use std::ffi::{CStr, CString};
 use std::fmt;
 use std::hash::{Hash, Hasher};
@@ -1947,4 +1947,85 @@ unsafe extern "C" {
     pub(crate) fn LLVMRustAddDereferenceableOrNullAttr(Fn: &Value, index: c_uint, bytes: u64);

     pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
+    // Atomics
+    pub fn LLVMRustBuildAtomicCmpXchg<'a>(
+        B: &Builder<'a>,
+        LHS: &Value,
+        CMP: &Value,
+        RHS: &Value,
+        Order: AtomicOrdering,
+        FailureOrder: AtomicOrdering,
+        Weak: Bool,
+    ) -> &'a Value;
+
+    pub fn LLVMBuildAtomicRMW<'a>(
+        B: &Builder<'a>,
+        Op: LLVMAtomicRmwBinOp,
+        LHS: &Value,
+        RHS: &Value,
+        Order: AtomicOrdering,
+        SingleThreaded: Bool,
+    ) -> &'a Value;
+}
+/// LLVMAtomicOrdering
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub(crate) enum AtomicOrdering {
+    #[allow(dead_code)]
+    NotAtomic = 0,
+    #[allow(dead_code)]
+    Unordered = 1,
+    Monotonic = 2,
+    // Consume = 3, // Not specified yet.
+    Acquire = 4,
+    Release = 5,
+    AcquireRelease = 6,
+    SequentiallyConsistent = 7,
+}
+impl AtomicOrdering {
+    pub(crate) fn from_generic(ao: rustc_middle::ty::AtomicOrdering) -> Self {
+        use rustc_middle::ty::AtomicOrdering as Common;
+        match ao {
+            Common::Relaxed => Self::Monotonic,
+            Common::Acquire => Self::Acquire,
+            Common::Release => Self::Release,
+            Common::AcqRel => Self::AcquireRelease,
+            Common::SeqCst => Self::SequentiallyConsistent,
+        }
+    }
+}
+
+/// FFI-safe mirror of LLVMAtomicRMWBinOp from the LLVM C API.
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub enum LLVMAtomicRmwBinOp {
+    AtomicXchg = 0,
+    AtomicAdd = 1,
+    AtomicSub = 2,
+    AtomicAnd = 3,
+    AtomicNand = 4,
+    AtomicOr = 5,
+    AtomicXor = 6,
+    AtomicMax = 7,
+    AtomicMin = 8,
+    AtomicUMax = 9,
+    AtomicUMin = 10,
+}
+
+impl From<AtomicRmwBinOp> for LLVMAtomicRmwBinOp {
+    fn from(op: AtomicRmwBinOp) -> Self {
+        match op {
+            AtomicRmwBinOp::AtomicXchg => Self::AtomicXchg,
+            AtomicRmwBinOp::AtomicAdd => Self::AtomicAdd,
+            AtomicRmwBinOp::AtomicSub => Self::AtomicSub,
+            AtomicRmwBinOp::AtomicAnd => Self::AtomicAnd,
+            AtomicRmwBinOp::AtomicNand => Self::AtomicNand,
+            AtomicRmwBinOp::AtomicOr => Self::AtomicOr,
+            AtomicRmwBinOp::AtomicXor => Self::AtomicXor,
+            AtomicRmwBinOp::AtomicMax => Self::AtomicMax,
+            AtomicRmwBinOp::AtomicMin => Self::AtomicMin,
+            AtomicRmwBinOp::AtomicUMax => Self::AtomicUMax,
+            AtomicRmwBinOp::AtomicUMin => Self::AtomicUMin,
+        }
+    }
 }
diff --git a/crates/rustc_codegen_nvvm/src/nvvm.rs b/crates/rustc_codegen_nvvm/src/nvvm.rs
index 2bee80e3..2c1ae5b2 100644
--- a/crates/rustc_codegen_nvvm/src/nvvm.rs
+++ b/crates/rustc_codegen_nvvm/src/nvvm.rs
@@ -125,9 +125,10 @@ pub fn codegen_bitcode_modules(
     let res = match prog.compile(&args.nvvm_options) {
         Ok(b) => b,
         Err(error) => {
+            let log = prog.compiler_log().unwrap().unwrap_or_default();
             // this should never happen, if it does, something went really bad or its a bug on libnvvm's end
             panic!(
-                "libnvvm returned an error that was not previously caught by the verifier: {error:?}"
+                "libnvvm returned an error that was not previously caught by the verifier: {error:?} {log:?}"
             );
         }
     };
diff --git a/crates/rustc_codegen_nvvm_macros/Cargo.toml b/crates/rustc_codegen_nvvm_macros/Cargo.toml
index 65999434..9d78a324 100644
--- a/crates/rustc_codegen_nvvm_macros/Cargo.toml
+++ b/crates/rustc_codegen_nvvm_macros/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 description = "Macros for rustc_codegen_nvvm"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [lib]
diff --git a/examples/cuda/gemm/src/main.rs b/examples/cuda/gemm/src/main.rs
index 8e254176..73df03d5 100644
--- a/examples/cuda/gemm/src/main.rs
+++ b/examples/cuda/gemm/src/main.rs
@@ -1,4 +1,4 @@
-//! Example demonstrating GEMM (General Matrix Multiply) on CUDA using Rust-CUDA.
+//! Example demonstrating GEMM (General Matrix Multiply) on CUDA using Rust CUDA.
 //!
 //! This example benchmarks naive and tiled GEMM kernels as well as cuBLAS for various matrix sizes.
 //! It uses the `cust` crate for CUDA management and `ndarray` for host-side matrix operations.
diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md
index e61cff4f..083480e5 100644
--- a/guide/src/guide/getting_started.md
+++ b/guide/src/guide/getting_started.md
@@ -9,7 +9,7 @@ Before you can use the project to write GPU crates, you will need a couple of pr
 - [The CUDA SDK](https://developer.nvidia.com/cuda-downloads), version `11.2-11.8` (and the appropriate
   driver - [see cuda release notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html)).
   - We recently [added experimental support for the `12.x`
-    SDK](https://github.com/Rust-GPU/Rust-CUDA/issues/100), please file any issues you
+    SDK](https://github.com/Rust-GPU/rust-cuda/issues/100), please file any issues you
     see

 This is only for building GPU crates, to execute built PTX you only need CUDA `9+`.
@@ -234,7 +234,7 @@ components = ["rust-src", "rustc-dev", "llvm-tools-preview"]

 There is also a [Dockerfile](Dockerfile) prepared as a quickstart with all the necessary
 libraries for base cuda development.

-You can use it as follows (assuming your clone of Rust-CUDA is at the absolute path `RUST_CUDA`):
+You can use it as follows (assuming your clone of Rust CUDA is at the absolute path `RUST_CUDA`):

 - Ensure you have Docker setup to [use gpus](https://docs.docker.com/config/containers/resource_constraints/#gpu)
 - Build `docker build -t rust-cuda $RUST_CUDA`
diff --git a/tests/compiletests/README.md b/tests/compiletests/README.md
index f07f9db0..55f4bcd1 100644
--- a/tests/compiletests/README.md
+++ b/tests/compiletests/README.md
@@ -1,6 +1,6 @@
-# Compiletests for Rust-CUDA
+# Compiletests for Rust CUDA

-This directory contains compile tests for the Rust-CUDA project using the `compiletest` framework.
+This directory contains compile tests for the Rust CUDA project using the `compiletest` framework.
 The code in these tests is not executed. Tests check that the compiler compiles correctly.
 Tests in `dis/` verify correct PTX output.
diff --git a/tests/compiletests/src/main.rs b/tests/compiletests/src/main.rs
index 9fb2ed4f..a0c04589 100644
--- a/tests/compiletests/src/main.rs
+++ b/tests/compiletests/src/main.rs
@@ -65,7 +65,7 @@ fn main() {

     // HACK(eddyb) force `compiletest` to pass `ui/...` relative paths to `rustc`,
     // which should always end up being the same regardless of the path that the
-    // Rust-CUDA repo is checked out at (among other things, this avoids hardcoded
+    // Rust CUDA repo is checked out at (among other things, this avoids hardcoded
     // `compiletest` limits being hit by e.g. users with slightly longer paths).
     std::env::set_current_dir(tests_dir).unwrap();
     let tests_dir = PathBuf::from("");
diff --git a/tests/compiletests/ui/atomic/std_atomic_ops.rs b/tests/compiletests/ui/atomic/std_atomic_ops.rs
new file mode 100644
index 00000000..82d7f4f8
--- /dev/null
+++ b/tests/compiletests/ui/atomic/std_atomic_ops.rs
@@ -0,0 +1,49 @@
+// Test CUDA atomic operations compile correctly
+// build-pass
+// compile-flags: -Z verify-llvm-ir
+use core::sync::atomic::{AtomicUsize, Ordering};
+
+use cuda_std::atomic::{
+    AtomicF32, AtomicF64, BlockAtomicF32, BlockAtomicF64, SystemAtomicF32, SystemAtomicF64,
+};
+use cuda_std::kernel;
+static GLOBAL: AtomicUsize = AtomicUsize::new(0);
+#[kernel]
+pub unsafe fn test_cuda_atomic_floats() {
+    let local = AtomicUsize::new(0);
+    // `compare_exchange` should succeed
+    local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // `compare_exchange` should fail
+    local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // `compare_exchange` should succeed
+    GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // `compare_exchange` should fail
+    GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // Ops
+    local.swap(1, Ordering::Relaxed);
+    GLOBAL.swap(1, Ordering::Relaxed);
+    local.fetch_add(1, Ordering::Relaxed);
+    GLOBAL.fetch_add(1, Ordering::Relaxed);
+    local.fetch_sub(1, Ordering::Relaxed);
+    GLOBAL.fetch_sub(1, Ordering::Relaxed);
+    local.fetch_and(1, Ordering::Relaxed);
+    GLOBAL.fetch_and(1, Ordering::Relaxed);
+    local.fetch_and(1, Ordering::Relaxed);
+    GLOBAL.fetch_and(1, Ordering::Relaxed);
+    local.fetch_or(1, Ordering::Relaxed);
+    GLOBAL.fetch_or(1, Ordering::Relaxed);
+    local.fetch_xor(1, Ordering::Relaxed);
+    GLOBAL.fetch_xor(1, Ordering::Relaxed);
+    local.fetch_max(1, Ordering::Relaxed);
+    GLOBAL.fetch_max(1, Ordering::Relaxed);
+    local.fetch_min(1, Ordering::Relaxed);
+    GLOBAL.fetch_min(1, Ordering::Relaxed);
+    // Loads:
+    local.load(Ordering::Relaxed);
+    GLOBAL.load(Ordering::Relaxed);
+    local.store(1, Ordering::Relaxed);
+    GLOBAL.store(1, Ordering::Relaxed);
+    // Atomic NAND is not supported quite yet
+    //local.fetch_nand(1, Ordering::Relaxed);
+    //GLOBAL.fetch_nand(1, Ordering::Relaxed);
+}
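
// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch above): what the new atomic lowering
// enables in kernel code. This is a minimal, hypothetical example assuming
// only APIs the diff itself exercises (`cuda_std::kernel` and
// `core::sync::atomic`); the names `HITS` and `count_hits` are illustrative,
// not from the patch.
use core::sync::atomic::{AtomicUsize, Ordering};
use cuda_std::kernel;

// Lives in the global address space, so `atomic_op` takes the
// `atomic_space_supported` branch and emits a real `atomicrmw`.
static HITS: AtomicUsize = AtomicUsize::new(0);

#[kernel]
pub unsafe fn count_hits() {
    // A stack slot sits in the local address space: `atomic_op` detects this
    // via `llvm.nvvm.isspacep.local` and emulates the RMW with a plain
    // load / op / store, which the patch argues is sound because only the
    // current thread can access thread-local memory.
    let scratch = AtomicUsize::new(0);
    scratch.fetch_add(1, Ordering::Relaxed);

    // A global atomic is dispatched at runtime through
    // `llvm.nvvm.isspacep.global` / `llvm.nvvm.isspacep.shared`.
    HITS.fetch_add(scratch.load(Ordering::Relaxed), Ordering::Relaxed);
}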