diff --git a/.github/workflows/container_images.yml b/.github/workflows/container_images.yml
index 84fce27e..c056d144 100644
--- a/.github/workflows/container_images.yml
+++ b/.github/workflows/container_images.yml
@@ -7,6 +7,8 @@ on:
       - ".github/workflows/container_images.yml"
       - "container/**"
   push:
+    branches:
+      - main
     paths:
       - ".github/workflows/container_images.yml"
       - "container/**"
@@ -162,4 +164,4 @@ jobs:
           $(printf '${{ env.REGISTRY }}/${{ matrix.variance.image }}@sha256:%s ' *)
       - name: Inspect image
         run: |
-          docker buildx imagetools inspect ${{ env.REGISTRY }}/${{ matrix.variance.image }}:${{ steps.meta.outputs.version }}
\ No newline at end of file
+          docker buildx imagetools inspect ${{ env.REGISTRY }}/${{ matrix.variance.image }}:${{ steps.meta.outputs.version }}
diff --git a/.github/workflows/deploy_guide.yml b/.github/workflows/deploy_guide.yml
index 3d3ce319..cb7cf6bb 100644
--- a/.github/workflows/deploy_guide.yml
+++ b/.github/workflows/deploy_guide.yml
@@ -12,7 +12,6 @@ concurrency:
 jobs:
   deploy:
     # Only run on the main repository, not on forks
-    if: github.repository == 'rust-gpu/Rust-CUDA'
     runs-on: ubuntu-latest
     environment:
       name: github-pages
diff --git a/README.md b/README.md
index 9e183f68..d48d016f 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ The current line-up of libraries is the following:
 - `cuda_std` for GPU-side functions and utilities, such as thread index queries, memory allocation, warp intrinsics, etc.
   - _Not_ a low level library, provides many utility functions to make it easier to write cleaner and more reliable GPU kernels.
   - Closely tied to `rustc_codegen_nvvm` which exposes GPU features through it internally.
-- [`cudnn`](https://github.com/Rust-GPU/Rust-CUDA/tree/master/crates/cudnn) for a collection of GPU-accelerated primitives for deep neural networks.
+- [`cudnn`](https://github.com/Rust-GPU/rust-cuda/tree/master/crates/cudnn) for a collection of GPU-accelerated primitives for deep neural networks.
 - `cust` for CPU-side CUDA features such as launching GPU kernels, GPU memory allocation, device queries, etc.
   - High level with features such as RAII and Rust Results that make it easier and cleaner to manage the interface to the GPU.
   - A high level wrapper for the CUDA Driver API, the lower level version of the more common CUDA Runtime API used from C++.
@@ -92,7 +92,7 @@ Other projects related to using Rust on the GPU:
 cargo build
 ```

-## Use Rust-CUDA in Container Environments
+## Use Rust CUDA in Container Environments

 The distribution related Dockerfile are located in `container` folder. Taking ubuntu 24.04 as an
 example, run the following command in repository root:
diff --git a/container/rockylinux9-cuda12/Dockerfile b/container/rockylinux9-cuda12/Dockerfile
index a63b1c37..8e0e4444 100644
--- a/container/rockylinux9-cuda12/Dockerfile
+++ b/container/rockylinux9-cuda12/Dockerfile
@@ -72,8 +72,8 @@ RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"

 # Setup the workspace
-WORKDIR /data/Rust-CUDA
-RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
     rustup show

 # Add nvvm to LD_LIBRARY_PATH.
diff --git a/container/ubuntu22-cuda11/Dockerfile b/container/ubuntu22-cuda11/Dockerfile
index 84d08b6b..a4df7c66 100644
--- a/container/ubuntu22-cuda11/Dockerfile
+++ b/container/ubuntu22-cuda11/Dockerfile
@@ -71,8 +71,8 @@ RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"

 # Setup the workspace
-WORKDIR /data/Rust-CUDA
-RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
     rustup show

 # Add nvvm to LD_LIBRARY_PATH.
diff --git a/container/ubuntu22-cuda12/Dockerfile b/container/ubuntu22-cuda12/Dockerfile
index 7d66ff9c..df8fbb82 100644
--- a/container/ubuntu22-cuda12/Dockerfile
+++ b/container/ubuntu22-cuda12/Dockerfile
@@ -71,8 +71,8 @@ RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"

 # Setup the workspace
-WORKDIR /data/Rust-CUDA
-RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
     rustup show

 # Add nvvm to LD_LIBRARY_PATH.
diff --git a/container/ubuntu24-cuda12/Dockerfile b/container/ubuntu24-cuda12/Dockerfile
index 01365070..b4ba12f5 100644
--- a/container/ubuntu24-cuda12/Dockerfile
+++ b/container/ubuntu24-cuda12/Dockerfile
@@ -71,8 +71,8 @@ RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"

 # Setup the workspace
-WORKDIR /data/Rust-CUDA
-RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/Rust-CUDA/rust-toolchain.toml \
+WORKDIR /data/rust-cuda
+RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
     rustup show

 # Add nvvm to LD_LIBRARY_PATH.
diff --git a/crates/blastoff/Cargo.toml b/crates/blastoff/Cargo.toml
index a3e377cc..23340f1b 100644
--- a/crates/blastoff/Cargo.toml
+++ b/crates/blastoff/Cargo.toml
@@ -3,7 +3,7 @@ name = "blastoff"
 version = "0.1.0"
 edition = "2021"
 authors = ["Riccardo D'Ambrosio "]
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"

 [dependencies]
 bitflags = "2.8"
diff --git a/crates/cuda_builder/Cargo.toml b/crates/cuda_builder/Cargo.toml
index 8095b79e..d412ea79 100644
--- a/crates/cuda_builder/Cargo.toml
+++ b/crates/cuda_builder/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"
 authors = ["Riccardo D'Ambrosio ", "The rust-gpu Authors"]
 license = "MIT OR Apache-2.0"
 description = "Builder for easily building rustc_codegen_nvvm crates"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/cuda_std/Cargo.toml b/crates/cuda_std/Cargo.toml
index 209929ce..42c06ab2 100644
--- a/crates/cuda_std/Cargo.toml
+++ b/crates/cuda_std/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.2.2"
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "Standard library for CUDA with rustc_codegen_nvvm"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/cuda_std_macros/Cargo.toml b/crates/cuda_std_macros/Cargo.toml
index 45e92cdc..4557dcff 100644
--- a/crates/cuda_std_macros/Cargo.toml
+++ b/crates/cuda_std_macros/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.2.0"
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "Macros for cuda_std"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [lib]
diff --git a/crates/cudnn-sys/Cargo.toml b/crates/cudnn-sys/Cargo.toml
index 0c48edf6..dbedaecb 100644
--- a/crates/cudnn-sys/Cargo.toml
+++ b/crates/cudnn-sys/Cargo.toml
@@ -3,7 +3,7 @@ name = "cudnn-sys"
 version = "0.1.0"
 edition = "2024"
 license = "MIT OR Apache-2.0"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"
 links = "cudnn"
 build = "build/main.rs"
diff --git a/crates/cust/CHANGELOG.md b/crates/cust/CHANGELOG.md
index 904995ab..00951dbf 100644
--- a/crates/cust/CHANGELOG.md
+++ b/crates/cust/CHANGELOG.md
@@ -145,7 +145,7 @@ it much easier to write multigpu code. The CUDA API is fully thread-safe except
 - Added `MemoryAdvise::advise_read_mostly`.
 - Added `MemoryAdvise::preferred_location` and `MemoryAdvise::unset_preferred_location`.
   Note that advising APIs are only present on high end GPUs such as V100s.
-- `StreamFlags::NON_BLOCKING` has been temporarily disabled because of [soundness concerns](https://github.com/Rust-GPU/Rust-CUDA/issues/15).
+- `StreamFlags::NON_BLOCKING` has been temporarily disabled because of [soundness concerns](https://github.com/Rust-GPU/rust-cuda/issues/15).

 ## 0.2.0 - 11/26/21
diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml
index 8882dae3..370fe331 100644
--- a/crates/cust/Cargo.toml
+++ b/crates/cust/Cargo.toml
@@ -9,7 +9,7 @@ authors = [
 ]
 edition = "2021"
 license = "MIT OR Apache-2.0"
 description = "High level bindings to the CUDA Driver API"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs
index 284baa98..3ee5c5f1 100644
--- a/crates/cust/src/stream.rs
+++ b/crates/cust/src/stream.rs
@@ -30,7 +30,7 @@ bitflags::bitflags! {
         /// This stream does not synchronize with the NULL stream.
         ///
-        /// **Note: this flag has been temporarily disabled because of [soundness concerns](https://github.com/Rust-GPU/Rust-CUDA/issues/15)**
+        /// **Note: this flag has been temporarily disabled because of [soundness concerns](https://github.com/Rust-GPU/rust-cuda/issues/15)**
         ///
         /// Note that the name is chosen to correspond to CUDA documentation, but is nevertheless
         /// misleading. All work within a single stream is ordered and asynchronous regardless
@@ -93,7 +93,7 @@ impl Stream {
     /// # }
     /// ```
     pub fn new(mut flags: StreamFlags, priority: Option<i32>) -> CudaResult<Self> {
-        // NOTE(RDambrosio016): See https://github.com/Rust-GPU/Rust-CUDA/issues/15
+        // NOTE(RDambrosio016): See https://github.com/Rust-GPU/rust-cuda/issues/15
         flags.remove(StreamFlags::NON_BLOCKING);
         unsafe {
             let mut stream = Stream {
diff --git a/crates/cust_core/Cargo.toml b/crates/cust_core/Cargo.toml
index c4b9d2c7..60e2d17f 100644
--- a/crates/cust_core/Cargo.toml
+++ b/crates/cust_core/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.1"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 description = "Core library for cust that can be shared across CPU and GPU"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/cust_derive/Cargo.toml b/crates/cust_derive/Cargo.toml
index d4d908af..207f7207 100644
--- a/crates/cust_derive/Cargo.toml
+++ b/crates/cust_derive/Cargo.toml
@@ -5,7 +5,7 @@ authors = ["Brook Heisler ", "Riccardo D'Ambrosio "]
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "High level bindings to libnvvm"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/optix-sys/Cargo.toml b/crates/optix-sys/Cargo.toml
index 494b09ed..562690be 100644
--- a/crates/optix-sys/Cargo.toml
+++ b/crates/optix-sys/Cargo.toml
@@ -3,7 +3,7 @@ name = "optix-sys"
 version = "0.1.0"
 edition = "2024"
 license = "MIT OR Apache-2.0"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"
 links = "optix"
 build = "build/main.rs"
diff --git a/crates/optix/Cargo.toml b/crates/optix/Cargo.toml
index 600fdaf8..67b3c95d 100644
--- a/crates/optix/Cargo.toml
+++ b/crates/optix/Cargo.toml
@@ -3,7 +3,7 @@ name = "optix"
 version = "0.1.0"
 edition = "2021"
 license = "MIT OR Apache-2.0"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"
 authors = ["Anders Langlands ", "Riccardo D'Ambrosio "]
diff --git a/crates/optix_device_macros/Cargo.toml b/crates/optix_device_macros/Cargo.toml
index 0c78df26..769b8cf3 100644
--- a/crates/optix_device_macros/Cargo.toml
+++ b/crates/optix_device_macros/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "Macros for optix_device"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [lib]
diff --git a/crates/ptx/Cargo.toml b/crates/ptx/Cargo.toml
index 559badfa..eff4da3c 100644
--- a/crates/ptx/Cargo.toml
+++ b/crates/ptx/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "PTX parser and analyzer"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/ptx_compiler/Cargo.toml b/crates/ptx_compiler/Cargo.toml
index 0785596a..fc6c2e28 100644
--- a/crates/ptx_compiler/Cargo.toml
+++ b/crates/ptx_compiler/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.1"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 description = "High level bindings to CUDA's ptx compilation APIs"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [dependencies]
diff --git a/crates/ptx_compiler/src/lib.rs b/crates/ptx_compiler/src/lib.rs
index a4c25273..8fae5da3 100644
--- a/crates/ptx_compiler/src/lib.rs
+++ b/crates/ptx_compiler/src/lib.rs
@@ -102,6 +102,7 @@ impl CompilerFailure {
             .to_result()?;
         let size = size.assume_init();
         let mut vec = Vec::with_capacity(size);
+        #[allow(clippy::unnecessary_cast)]
         nvptx_compiler_sys::nvPTXCompilerGetErrorLog(
             self.handle,
             vec.as_mut_ptr() as *mut c_char,
@@ -138,6 +139,7 @@ impl CompiledProgram {
             .to_result()?;
         let size = size.assume_init();
         let mut vec = Vec::with_capacity(size);
+        #[allow(clippy::unnecessary_cast)]
         nvptx_compiler_sys::nvPTXCompilerGetInfoLog(
             self.handle,
             vec.as_mut_ptr() as *mut c_char,
diff --git a/crates/rustc_codegen_nvvm/Cargo.toml b/crates/rustc_codegen_nvvm/Cargo.toml
index 825b32c2..f0c4dcfd 100644
--- a/crates/rustc_codegen_nvvm/Cargo.toml
+++ b/crates/rustc_codegen_nvvm/Cargo.toml
@@ -8,7 +8,7 @@ authors = [
 edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "A codegen backend for Rustc which targets the libnvvm CUDA library"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [lib]
diff --git a/crates/rustc_codegen_nvvm/src/builder.rs b/crates/rustc_codegen_nvvm/src/builder.rs
index 7d32c553..e011e14c 100644
--- a/crates/rustc_codegen_nvvm/src/builder.rs
+++ b/crates/rustc_codegen_nvvm/src/builder.rs
@@ -6,7 +6,7 @@ use libc::{c_char, c_uint};
 use rustc_abi as abi;
 use rustc_abi::{AddressSpace, Align, HasDataLayout, Size, TargetDataLayout, WrappingRange};
 use rustc_codegen_ssa::MemFlags;
-use rustc_codegen_ssa::common::{IntPredicate, RealPredicate, TypeKind};
+use rustc_codegen_ssa::common::{AtomicRmwBinOp, IntPredicate, RealPredicate, TypeKind};
 use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
 use rustc_codegen_ssa::mir::place::PlaceRef;
 use rustc_codegen_ssa::traits::*;
@@ -213,9 +213,9 @@ macro_rules! math_builder_methods {
             _ => {
                 self.cx.fatal(format!(
                     "Unimplemented 128-bit integer operation '{}' with {} arguments. \
-                     This operation is not yet supported in Rust-CUDA. \
+                     This operation is not yet supported in Rust CUDA. \
                      Consider using 64-bit integers or filing an issue at \
-                     https://github.com/Rust-GPU/Rust-CUDA/issues",
+                     https://github.com/Rust-GPU/rust-cuda/issues",
                     stringify!($name),
                     args_vec.len()
                 ));
@@ -546,30 +546,13 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     fn atomic_load(
         &mut self,
-        _ty: &'ll Type,
+        ty: &'ll Type,
         ptr: &'ll Value,
-        _order: AtomicOrdering,
+        order: AtomicOrdering,
         _size: Size,
     ) -> &'ll Value {
-        // core seems to think that nvptx has atomic loads, which is not true for NVVM IR,
-        // therefore our only option is to print that this is not supported then trap.
-        // i have heard of cursed things such as emulating this with __threadfence and volatile loads
-        // but that needs to be experimented with in terms of safety and behavior.
-        // NVVM has explicit intrinsics for adding and subtracting floats which we expose elsewhere
-
-        // TODO(RDambrosio016): is there a way we can just generate a panic with a message instead
-        // of doing this ourselves? since all panics will be aborts, it should be equivalent
-        // let message = "Atomic Loads are not supported in CUDA.\0";
-
-        // let vprintf = self.get_intrinsic("vprintf");
-        // let formatlist = self.const_str(Symbol::intern(message)).0;
-        // let valist = self.const_null(self.type_void());
-
-        // self.call(vprintf, &[formatlist, valist], None);
-
-        let (ty, f) = self.get_intrinsic("llvm.trap");
-        self.call(ty, None, None, f, &[], None, None);
-        unsafe { llvm::LLVMBuildLoad(self.llbuilder, ptr, unnamed()) }
+        // Since for any A, A | 0 = A, and performing atomics on constant memory is UB in Rust,
+        // we can abuse `or` to perform atomic reads.
+        self.atomic_rmw(AtomicRmwBinOp::AtomicOr, ptr, self.const_int(ty, 0), order)
     }

     fn load_operand(&mut self, place: PlaceRef<'tcx, &'ll Value>) -> OperandRef<'tcx, &'ll Value> {
@@ -796,24 +779,13 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     fn atomic_store(
         &mut self,
-        _val: &'ll Value,
+        val: &'ll Value,
         ptr: &'ll Value,
-        _order: AtomicOrdering,
+        order: AtomicOrdering,
         _size: Size,
     ) {
-        // see comment in atomic_load
-
-        // let message = "Atomic Stores are not supported in CUDA.\0";
-
-        // let vprintf = self.get_intrinsic("vprintf");
-        // let formatlist = self.const_str(Symbol::intern(message)).0;
-        // let valist = self.const_null(self.type_void());
-
-        // self.call(vprintf, &[formatlist, valist], None);
-        self.abort();
-        unsafe {
-            llvm::LLVMBuildLoad(self.llbuilder, ptr, UNNAMED);
-        }
+        // We can exchange *ptr with val, and then discard the result.
+        self.atomic_rmw(AtomicRmwBinOp::AtomicXchg, ptr, val, order);
     }

     fn gep(&mut self, ty: &'ll Type, ptr: &'ll Value, indices: &[&'ll Value]) -> &'ll Value {
@@ -1134,26 +1106,155 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     // Atomic Operations
     fn atomic_cmpxchg(
         &mut self,
-        _dst: &'ll Value,
-        _cmp: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
-        _failure_order: AtomicOrdering,
-        _weak: bool,
+        dst: &'ll Value,
+        cmp: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
+        failure_order: AtomicOrdering,
+        weak: bool,
     ) -> (&'ll Value, &'ll Value) {
-        // allowed but only for some things and with restrictions
-        // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#cmpxchg-instruction
-        self.fatal("atomic cmpxchg is not supported")
+        // The LLVM verifier rejects cases where the `failure_order` is stronger than `order`.
+        match (order, failure_order) {
+            // Failure orders `Release` & `AcqRel` are simply invalid.
+            (_, AtomicOrdering::Release | AtomicOrdering::AcqRel) => {
+                self.abort();
+                return (
+                    self.const_undef(self.val_ty(cmp)),
+                    self.const_undef(self.type_i1()),
+                );
+            }
+            // Success & failure orderings are the same - OK.
+            (AtomicOrdering::SeqCst, AtomicOrdering::SeqCst)
+            | (AtomicOrdering::Relaxed, AtomicOrdering::Relaxed)
+            | (AtomicOrdering::Acquire, AtomicOrdering::Acquire) => (),
+            // Failure is `SeqCst` (strongest) & success is anything else (weaker) - reject.
+            (_, AtomicOrdering::SeqCst) => {
+                self.abort();
+                return (
+                    self.const_undef(self.val_ty(cmp)),
+                    self.const_undef(self.type_i1()),
+                );
+            }
+            // Failure is `Relaxed` (weakest), and success is anything - OK.
+            (_, AtomicOrdering::Relaxed) => (),
+            // Failure is anything, and success is `SeqCst` (strongest) - OK.
+            (AtomicOrdering::SeqCst, _) => (),
+            // Failure is `Acquire`, and success is `Release` - OK.
+            (AtomicOrdering::Release, AtomicOrdering::Acquire) => (),
+            // Success is `AcqRel` & failure is `Acquire` - OK.
+            (AtomicOrdering::AcqRel, AtomicOrdering::Acquire) => (),
+            // Success is weaker than failure - reject.
+            (AtomicOrdering::Relaxed, AtomicOrdering::Acquire) => {
+                self.abort();
+                return (
+                    self.const_undef(self.val_ty(cmp)),
+                    self.const_undef(self.type_i1()),
+                );
+            }
+        };
+        let res = self.atomic_op(
+            dst,
+            |builder, dst| {
+                // We are in a supported address space - just use ordinary atomics
+                unsafe {
+                    llvm::LLVMRustBuildAtomicCmpXchg(
+                        builder.llbuilder,
+                        dst,
+                        cmp,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        crate::llvm::AtomicOrdering::from_generic(failure_order),
+                        weak as u32,
+                    )
+                }
+            },
+            |builder, dst| {
+                // Local space is only accessible to the current thread.
+                // So, there are no synchronization issues, and we can emulate it using a simple load / compare / store.
+                let load: &'ll Value =
+                    unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let compare = builder.icmp(IntPredicate::IntEQ, load, cmp);
+                // We can do something smart & branchless here:
+                // We select either the current value (if the comparison fails), or a new value.
+                // We then *unconditionally* write that back to local memory (which is very, very cheap).
+                // TODO: measure if this has a positive impact, or if we should just use more blocks, and conditional writes.
+                let value = builder.select(compare, src, load);
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, value, dst) };
+                let res_type =
+                    builder.type_struct(&[builder.val_ty(cmp), builder.type_ix(1)], false);
+                // We pack the result, to match the behaviour of proper atomics / emulated thread-local atomics.
+                let res = builder.const_undef(res_type);
+                let res = builder.insert_value(res, load, 0);
+                builder.insert_value(res, compare, 1)
+            },
+        );
+        // Unpack the result
+        let val = self.extract_value(res, 0);
+        let success = self.extract_value(res, 1);
+        (val, success)
     }

     fn atomic_rmw(
         &mut self,
-        _op: rustc_codegen_ssa::common::AtomicRmwBinOp,
-        _dst: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
+        op: AtomicRmwBinOp,
+        dst: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
     ) -> &'ll Value {
-        // see cmpxchg comment
-        self.fatal("atomic rmw is not supported")
+        if matches!(op, AtomicRmwBinOp::AtomicNand) {
+            self.fatal("Atomic NAND not supported yet!")
+        }
+        self.atomic_op(
+            dst,
+            |builder, dst| {
+                // We are in a supported address space - just use ordinary atomics
+                unsafe {
+                    llvm::LLVMBuildAtomicRMW(
+                        builder.llbuilder,
+                        op.into(),
+                        dst,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        0,
+                    )
+                }
+            },
+            |builder, dst| {
+                // Local space is only accessible to the current thread.
+                // So, there are no synchronization issues, and we can emulate it using a simple load / compare / store.
+                let load: &'ll Value =
+                    unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let next_val = match op {
+                    AtomicRmwBinOp::AtomicXchg => src,
+                    AtomicRmwBinOp::AtomicAdd => builder.add(load, src),
+                    AtomicRmwBinOp::AtomicSub => builder.sub(load, src),
+                    AtomicRmwBinOp::AtomicAnd => builder.and(load, src),
+                    AtomicRmwBinOp::AtomicNand => {
+                        let and = builder.and(load, src);
+                        builder.not(and)
+                    }
+                    AtomicRmwBinOp::AtomicOr => builder.or(load, src),
+                    AtomicRmwBinOp::AtomicXor => builder.xor(load, src),
+                    AtomicRmwBinOp::AtomicMax => {
+                        let is_src_bigger = builder.icmp(IntPredicate::IntSGT, src, load);
+                        builder.select(is_src_bigger, src, load)
+                    }
+                    AtomicRmwBinOp::AtomicMin => {
+                        let is_src_smaller = builder.icmp(IntPredicate::IntSLT, src, load);
+                        builder.select(is_src_smaller, src, load)
+                    }
+                    AtomicRmwBinOp::AtomicUMax => {
+                        let is_src_bigger = builder.icmp(IntPredicate::IntUGT, src, load);
+                        builder.select(is_src_bigger, src, load)
+                    }
+                    AtomicRmwBinOp::AtomicUMin => {
+                        let is_src_smaller = builder.icmp(IntPredicate::IntULT, src, load);
+                        builder.select(is_src_smaller, src, load)
+                    }
+                };
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, next_val, dst) };
+                load
+            },
+        )
     }

     fn atomic_fence(
@@ -1609,3 +1710,99 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
         }
     }
 }
+impl<'ll, 'tcx, 'a> Builder<'a, 'll, 'tcx> {
+    /// Implements a standard atomic, using LLVM intrinsics (in `atomic_supported`, if `dst` is in a supported address space)
+    /// or emulation (with `emulate_local`, if `dst` points to a thread-local address space).
+    /// FIXME(FractalFir): this code assumes all pointers are generic. Adjust it once we support address spaces.
+    fn atomic_op(
+        &mut self,
+        dst: &'ll Value,
+        atomic_supported: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+        emulate_local: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+    ) -> &'ll Value {
+        // (FractalFir) Atomics in CUDA have some limitations, and we have to work around them.
+        // For example, they are restricted in what address space they operate on.
+        // CUDA has 4 address spaces (and a generic one, which is a union of all of those).
+        // An atomic instruction can soundly operate on:
+        // 1. The global address space
+        // 2. The shared (cluster) address space.
+        // It can't operate on:
+        // 1. The const address space (atomics on consts are UB anyway)
+        // 2. The thread address space (which should be only accessible to 1 thread, anyway?)
+        // So, we do the following:
+        // 1. Check if the pointer is in one of the address spaces atomics support.
+        //    a) if so, we perform an atomic operation
+        // 2. Check if the pointer is in the thread-local address space. If it is, we use non-atomic ops here,
+        //    **ASSUMING** only the current thread can access thread-local memory. (FIXME: is this sound?)
+        // 3. If the pointer is not in a supported address space, and is not thread-local, then we bail, and trap.

+        // We check if the `dst` pointer is in the `global` address space.
+        let (isspacep_global_ty, isspacep_global_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.global");
+        let isspacep_global = self.call(
+            isspacep_global_ty,
+            None,
+            None,
+            isspacep_global_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // We check if the `dst` pointer is in the `shared` address space.
+        let (isspacep_shared_ty, isspacep_shared_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.shared");
+        let isspacep_shared = self.call(
+            isspacep_shared_ty,
+            None,
+            None,
+            isspacep_shared_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // Combine those to check if we are in a supported address space.
+        let atomic_supported_addrspace = self.or(isspacep_shared, isspacep_global);
+        // We create 2 blocks here: one we branch to if the atomic is in the right address space, and one we branch to otherwise.
+        let supported_bb = self.append_sibling_block("atomic_space_supported");
+        let unsupported_bb = self.append_sibling_block("atomic_space_unsupported");
+        self.cond_br(atomic_supported_addrspace, supported_bb, unsupported_bb);
+        // We also create a "merge" block we will jump to, after the atomic ops finish.
+        let merge_bb = self.append_sibling_block("atomic_op_done");
+        // Execute atomic op if supported, then jump to merge
+        self.switch_to_block(supported_bb);
+        let supported_res = atomic_supported(self, dst);
+        self.br(merge_bb);
+        // Check if the pointer is in the thread space. If so, we can emulate it.
+        self.switch_to_block(unsupported_bb);
+        let (isspacep_local_ty, isspacep_local_fn) = self.get_intrinsic("llvm.nvvm.isspacep.local");
+        let isspacep_local = self.call(
+            isspacep_local_ty,
+            None,
+            None,
+            isspacep_local_fn,
+            &[dst],
+            None,
+            None,
+        );
+        let local_bb = self.append_sibling_block("atomic_local_space");
+        let atomic_ub_bb = self.append_sibling_block("atomic_space_ub");
+        self.cond_br(isspacep_local, local_bb, atomic_ub_bb);
+        // The pointer is in the thread (local) space.
+        self.switch_to_block(local_bb);
+        let local_res = emulate_local(self, dst);
+        self.br(merge_bb);
+        // The pointer is neither in the supported address space, nor the local space.
+        // This is very likely UB. So, we trap here.
+        // TODO: should we print some kind of a message here? NVVM supports printf.
+        self.switch_to_block(atomic_ub_bb);
+        self.abort();
+        self.unreachable();
+        // The atomic impl has finished, and we can now switch to the merge_bb
+        self.switch_to_block(merge_bb);
+        self.phi(
+            self.val_ty(local_res),
+            &[supported_res, local_res],
+            &[supported_bb, local_bb],
+        )
+    }
+}
diff --git a/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs b/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs
index 170444b4..56e45053 100644
--- a/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs
+++ b/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs
@@ -449,5 +449,10 @@ impl<'ll> CodegenCx<'ll, '_> {
             "__nv_ynf",
             fn(t_i32, t_f32) -> t_f32
         );
+        // Address space checks
+        ifn!(map, "llvm.nvvm.isspacep.const", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.global", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.local", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.shared", fn(i8p) -> i1);
     }
 }
diff --git a/crates/rustc_codegen_nvvm/src/lib.rs b/crates/rustc_codegen_nvvm/src/lib.rs
index 825748ca..47c2065b 100644
--- a/crates/rustc_codegen_nvvm/src/lib.rs
+++ b/crates/rustc_codegen_nvvm/src/lib.rs
@@ -89,7 +89,7 @@ use std::ffi::CString;
 #[unsafe(no_mangle)]
 pub fn __rustc_codegen_backend() -> Box<dyn CodegenBackend> {
     rustc_driver::install_ice_hook(
-        "https://github.com/Rust-GPU/Rust-CUDA/issues/new",
+        "https://github.com/Rust-GPU/rust-cuda/issues/new",
         |handler| {
             handler.handle().note(concat!(
                 "`rust-cuda` version `",
diff --git a/crates/rustc_codegen_nvvm/src/llvm.rs b/crates/rustc_codegen_nvvm/src/llvm.rs
index a0243eed..37c14ee8 100644
--- a/crates/rustc_codegen_nvvm/src/llvm.rs
+++ b/crates/rustc_codegen_nvvm/src/llvm.rs
@@ -16,9 +16,9 @@
 // but likely will use in the future, so we ignore any unused functions
 // in case we need them in the future for things like debug info or LTO.
 #![allow(dead_code)]
-
 use libc::{c_char, c_uint, c_void, size_t};
 use libc::{c_int, c_ulonglong};
+use rustc_codegen_ssa::common::AtomicRmwBinOp;
 use std::ffi::{CStr, CString};
 use std::fmt;
 use std::hash::{Hash, Hasher};
@@ -1947,4 +1947,85 @@ unsafe extern "C" {
     pub(crate) fn LLVMRustAddDereferenceableOrNullAttr(Fn: &Value, index: c_uint, bytes: u64);

     pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
+    // Atomics
+    pub fn LLVMRustBuildAtomicCmpXchg<'a>(
+        B: &Builder<'a>,
+        LHS: &Value,
+        CMP: &Value,
+        RHS: &Value,
+        Order: AtomicOrdering,
+        FailureOrder: AtomicOrdering,
+        Weak: Bool,
+    ) -> &'a Value;
+
+    pub fn LLVMBuildAtomicRMW<'a>(
+        B: &Builder<'a>,
+        Op: LLVMAtomicRmwBinOp,
+        LHS: &Value,
+        RHS: &Value,
+        Order: AtomicOrdering,
+        SingleThreaded: Bool,
+    ) -> &'a Value;
+}
+/// LLVMAtomicOrdering
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub(crate) enum AtomicOrdering {
+    #[allow(dead_code)]
+    NotAtomic = 0,
+    #[allow(dead_code)]
+    Unordered = 1,
+    Monotonic = 2,
+    // Consume = 3, // Not specified yet.
+    Acquire = 4,
+    Release = 5,
+    AcquireRelease = 6,
+    SequentiallyConsistent = 7,
+}
+impl AtomicOrdering {
+    pub(crate) fn from_generic(ao: rustc_middle::ty::AtomicOrdering) -> Self {
+        use rustc_middle::ty::AtomicOrdering as Common;
+        match ao {
+            Common::Relaxed => Self::Monotonic,
+            Common::Acquire => Self::Acquire,
+            Common::Release => Self::Release,
+            Common::AcqRel => Self::AcquireRelease,
+            Common::SeqCst => Self::SequentiallyConsistent,
+        }
+    }
+}
+
+/// FFI-safe mirror of LLVMAtomicRMWBinOp from the LLVM C API.
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub enum LLVMAtomicRmwBinOp {
+    AtomicXchg = 0,
+    AtomicAdd = 1,
+    AtomicSub = 2,
+    AtomicAnd = 3,
+    AtomicNand = 4,
+    AtomicOr = 5,
+    AtomicXor = 6,
+    AtomicMax = 7,
+    AtomicMin = 8,
+    AtomicUMax = 9,
+    AtomicUMin = 10,
+}
+
+impl From<AtomicRmwBinOp> for LLVMAtomicRmwBinOp {
+    fn from(op: AtomicRmwBinOp) -> Self {
+        match op {
+            AtomicRmwBinOp::AtomicXchg => Self::AtomicXchg,
+            AtomicRmwBinOp::AtomicAdd => Self::AtomicAdd,
+            AtomicRmwBinOp::AtomicSub => Self::AtomicSub,
+            AtomicRmwBinOp::AtomicAnd => Self::AtomicAnd,
+            AtomicRmwBinOp::AtomicNand => Self::AtomicNand,
+            AtomicRmwBinOp::AtomicOr => Self::AtomicOr,
+            AtomicRmwBinOp::AtomicXor => Self::AtomicXor,
+            AtomicRmwBinOp::AtomicMax => Self::AtomicMax,
+            AtomicRmwBinOp::AtomicMin => Self::AtomicMin,
+            AtomicRmwBinOp::AtomicUMax => Self::AtomicUMax,
+            AtomicRmwBinOp::AtomicUMin => Self::AtomicUMin,
+        }
+    }
 }
diff --git a/crates/rustc_codegen_nvvm/src/nvvm.rs b/crates/rustc_codegen_nvvm/src/nvvm.rs
index 2bee80e3..2c1ae5b2 100644
--- a/crates/rustc_codegen_nvvm/src/nvvm.rs
+++ b/crates/rustc_codegen_nvvm/src/nvvm.rs
@@ -125,9 +125,10 @@ pub fn codegen_bitcode_modules(
     let res = match prog.compile(&args.nvvm_options) {
         Ok(b) => b,
         Err(error) => {
+            let log = prog.compiler_log().unwrap().unwrap_or_default();
             // this should never happen, if it does, something went really bad or its a bug on libnvvm's end
             panic!(
-                "libnvvm returned an error that was not previously caught by the verifier: {error:?}"
+                "libnvvm returned an error that was not previously caught by the verifier: {error:?} {log:?}"
             );
         }
     };
diff --git a/crates/rustc_codegen_nvvm_macros/Cargo.toml b/crates/rustc_codegen_nvvm_macros/Cargo.toml
index 65999434..9d78a324 100644
--- a/crates/rustc_codegen_nvvm_macros/Cargo.toml
+++ b/crates/rustc_codegen_nvvm_macros/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 description = "Macros for rustc_codegen_nvvm"
-repository = "https://github.com/Rust-GPU/Rust-CUDA"
+repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"

 [lib]
diff --git a/examples/cuda/gemm/src/main.rs b/examples/cuda/gemm/src/main.rs
index 8e254176..73df03d5 100644
--- a/examples/cuda/gemm/src/main.rs
+++ b/examples/cuda/gemm/src/main.rs
@@ -1,4 +1,4 @@
-//! Example demonstrating GEMM (General Matrix Multiply) on CUDA using Rust-CUDA.
+//! Example demonstrating GEMM (General Matrix Multiply) on CUDA using Rust CUDA.
 //!
 //! This example benchmarks naive and tiled GEMM kernels as well as cuBLAS for various matrix sizes.
 //! It uses the `cust` crate for CUDA management and `ndarray` for host-side matrix operations.
diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md
index e61cff4f..083480e5 100644
--- a/guide/src/guide/getting_started.md
+++ b/guide/src/guide/getting_started.md
@@ -9,7 +9,7 @@ Before you can use the project to write GPU crates, you will need a couple of pr
 - [The CUDA SDK](https://developer.nvidia.com/cuda-downloads), version `11.2-11.8` (and the appropriate
   driver - [see cuda release notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html)).
   - We recently [added experimental support for the `12.x`
-    SDK](https://github.com/Rust-GPU/Rust-CUDA/issues/100), please file any issues you
+    SDK](https://github.com/Rust-GPU/rust-cuda/issues/100), please file any issues you
     see

 This is only for building GPU crates, to execute built PTX you only need CUDA `9+`.
@@ -234,7 +234,7 @@ components = ["rust-src", "rustc-dev", "llvm-tools-preview"]

 There is also a [Dockerfile](Dockerfile) prepared as a quickstart with all the necessary
 libraries for base cuda development.

-You can use it as follows (assuming your clone of Rust-CUDA is at the absolute path `RUST_CUDA`):
+You can use it as follows (assuming your clone of Rust CUDA is at the absolute path `RUST_CUDA`):

 - Ensure you have Docker setup to [use gpus](https://docs.docker.com/config/containers/resource_constraints/#gpu)
 - Build `docker build -t rust-cuda $RUST_CUDA`
diff --git a/tests/compiletests/README.md b/tests/compiletests/README.md
index f07f9db0..55f4bcd1 100644
--- a/tests/compiletests/README.md
+++ b/tests/compiletests/README.md
@@ -1,6 +1,6 @@
-# Compiletests for Rust-CUDA
+# Compiletests for Rust CUDA

-This directory contains compile tests for the Rust-CUDA project using the `compiletest` framework.
+This directory contains compile tests for the Rust CUDA project using the `compiletest` framework.
 The code in these tests is not executed. Tests check that the compiler compiles correctly.
 Tests in `dis/` verify correct PTX output.
diff --git a/tests/compiletests/src/main.rs b/tests/compiletests/src/main.rs
index 9fb2ed4f..a0c04589 100644
--- a/tests/compiletests/src/main.rs
+++ b/tests/compiletests/src/main.rs
@@ -65,7 +65,7 @@ fn main() {

     // HACK(eddyb) force `compiletest` to pass `ui/...` relative paths to `rustc`,
     // which should always end up being the same regardless of the path that the
-    // Rust-CUDA repo is checked out at (among other things, this avoids hardcoded
+    // Rust CUDA repo is checked out at (among other things, this avoids hardcoded
     // `compiletest` limits being hit by e.g. users with slightly longer paths).
     std::env::set_current_dir(tests_dir).unwrap();
     let tests_dir = PathBuf::from("");
diff --git a/tests/compiletests/ui/atomic/std_atomic_ops.rs b/tests/compiletests/ui/atomic/std_atomic_ops.rs
new file mode 100644
index 00000000..82d7f4f8
--- /dev/null
+++ b/tests/compiletests/ui/atomic/std_atomic_ops.rs
@@ -0,0 +1,49 @@
+// Test CUDA atomic operations compile correctly
+// build-pass
+// compile-flags: -Z verify-llvm-ir
+use core::sync::atomic::{AtomicUsize, Ordering};
+
+use cuda_std::atomic::{
+    AtomicF32, AtomicF64, BlockAtomicF32, BlockAtomicF64, SystemAtomicF32, SystemAtomicF64,
+};
+use cuda_std::kernel;
+static GLOBAL: AtomicUsize = AtomicUsize::new(0);
+#[kernel]
+pub unsafe fn test_cuda_atomic_floats() {
+    let local = AtomicUsize::new(0);
+    // `compare_exchange` should succeed
+    local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // `compare_exchange` should fail
+    local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // `compare_exchange` should succeed
+    GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // `compare_exchange` should fail
+    GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // Ops
+    local.swap(1, Ordering::Relaxed);
+    GLOBAL.swap(1, Ordering::Relaxed);
+    local.fetch_add(1, Ordering::Relaxed);
+    GLOBAL.fetch_add(1, Ordering::Relaxed);
+    local.fetch_sub(1, Ordering::Relaxed);
+    GLOBAL.fetch_sub(1, Ordering::Relaxed);
+    local.fetch_and(1, Ordering::Relaxed);
+    GLOBAL.fetch_and(1, Ordering::Relaxed);
+    local.fetch_and(1, Ordering::Relaxed);
+    GLOBAL.fetch_and(1, Ordering::Relaxed);
+    local.fetch_or(1, Ordering::Relaxed);
+    GLOBAL.fetch_or(1, Ordering::Relaxed);
+    local.fetch_xor(1, Ordering::Relaxed);
+    GLOBAL.fetch_xor(1, Ordering::Relaxed);
+    local.fetch_max(1, Ordering::Relaxed);
+    GLOBAL.fetch_max(1, Ordering::Relaxed);
+    local.fetch_min(1, Ordering::Relaxed);
+    GLOBAL.fetch_min(1, Ordering::Relaxed);
+    // Loads:
+    local.load(Ordering::Relaxed);
+    GLOBAL.load(Ordering::Relaxed);
+    local.store(1, Ordering::Relaxed);
+    GLOBAL.store(1, Ordering::Relaxed);
+    // Atomic NAND is not supported quite yet
+    //local.fetch_nand(1, Ordering::Relaxed);
+    //GLOBAL.fetch_nand(1, Ordering::Relaxed);
+}
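
// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch above): what the new atomic lowering
// enables in kernel code. This is a minimal, hypothetical example assuming
// only APIs the diff itself exercises (`cuda_std::kernel` and
// `core::sync::atomic`); the names `HITS` and `count_hits` are illustrative,
// not from the patch.
use core::sync::atomic::{AtomicUsize, Ordering};
use cuda_std::kernel;

// Lives in the global address space, so `atomic_op` takes the
// `atomic_space_supported` branch and emits a real `atomicrmw`.
static HITS: AtomicUsize = AtomicUsize::new(0);

#[kernel]
pub unsafe fn count_hits() {
    // A stack slot sits in the local address space: `atomic_op` detects this
    // via `llvm.nvvm.isspacep.local` and emulates the RMW with a plain
    // load / op / store, which the patch argues is sound because only the
    // current thread can access thread-local memory.
    let scratch = AtomicUsize::new(0);
    scratch.fetch_add(1, Ordering::Relaxed);

    // A global atomic is dispatched at runtime through
    // `llvm.nvvm.isspacep.global` / `llvm.nvvm.isspacep.shared`.
    HITS.fetch_add(scratch.load(Ordering::Relaxed), Ordering::Relaxed);
}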