Skip to content

Commit 3ff9729

Browse files
committed
added sample for atomics use and added bounded atomic inc/dec instruction
1 parent 946c91f commit 3ff9729

6 files changed

Lines changed: 266 additions & 0 deletions

File tree

crates/cuda_std/src/atomic/intrinsics.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,32 @@ atomic_fetch_op_2_reg! {
385385
volatile, dec, 64, i64, system, sys,
386386
}
387387

388+
#[gpu_only]
389+
#[allow(clippy::missing_safety_doc)]
390+
/// Performs a bounded increment like CUDA's atomicInc: if *ptr >= bound then 0 else *ptr+1, returns old value.
391+
pub unsafe fn atomic_inc_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 {
392+
loop {
393+
let old = atomic_load_relaxed_32_device(ptr);
394+
let new = if old >= bound { 0 } else { old + 1 };
395+
if atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old {
396+
return old;
397+
}
398+
}
399+
}
400+
401+
#[gpu_only]
402+
#[allow(clippy::missing_safety_doc)]
403+
/// Performs a bounded decrement like CUDA's atomicDec: if *ptr == 0 || *ptr > bound then bound else *ptr-1, returns old value.
404+
pub unsafe fn atomic_dec_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 {
405+
loop {
406+
let old = atomic_load_relaxed_32_device(ptr);
407+
let new = if old == 0 || old > bound { bound } else { old - 1 };
408+
if atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old {
409+
return old;
410+
}
411+
}
412+
}
413+
388414
macro_rules! atomic_fetch_logic_op_3_reg {
389415
($($ordering:ident, $op:ident, $width:literal, $type:ty, $scope:ident, $scope_asm:ident),* $(,)*) => {
390416
$(
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "simple-atomic-intrinsics"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cust = { path = "../../../crates/cust" }
8+
9+
[build-dependencies]
10+
cuda_builder = { workspace = true, default-features = false }
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
use std::env;
2+
use std::path;
3+
4+
use cuda_builder::CudaBuilder;
5+
6+
fn main() {
7+
println!("cargo::rerun-if-changed=build.rs");
8+
println!("cargo::rerun-if-changed=kernels");
9+
10+
let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
11+
let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
12+
13+
CudaBuilder::new(manifest_dir.join("kernels"))
14+
.copy_to(out_path.join("kernels.ptx"))
15+
.build()
16+
.unwrap();
17+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "simple-atomic-intrinsics-kernels"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cuda_std = { path = "../../../../crates/cuda_std" }
8+
9+
[lib]
10+
crate-type = ["cdylib", "rlib"]
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#![allow(
2+
improper_ctypes_definitions,
3+
clippy::missing_safety_doc,
4+
unsafe_op_in_unsafe_fn
5+
)]
6+
7+
use cuda_std::atomic::intrinsics;
8+
use cuda_std::prelude::*;
9+
10+
#[kernel]
11+
pub unsafe fn test_kernel(data: *mut i32) {
12+
let tid = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as i32;
13+
14+
// Arithmetic atomics
15+
16+
intrinsics::atomic_fetch_add_relaxed_i32_device(data.add(0), 10);
17+
intrinsics::atomic_fetch_sub_relaxed_i32_device(data.add(1), 10);
18+
intrinsics::atomic_fetch_exch_relaxed_i32_device(data.add(2), tid);
19+
intrinsics::atomic_fetch_max_relaxed_i32_device(data.add(3), tid);
20+
intrinsics::atomic_fetch_min_relaxed_i32_device(data.add(4), tid);
21+
22+
intrinsics::atomic_inc_bounded_relaxed_u32_device(data.add(5) as *mut u32, 17);
23+
24+
intrinsics::atomic_dec_bounded_relaxed_u32_device(data.add(6) as *mut u32, 137);
25+
26+
27+
intrinsics::atomic_fetch_cas_relaxed_i32_device(data.add(7), tid - 1, tid);
28+
29+
// Bitwise atomics
30+
31+
intrinsics::atomic_fetch_and_relaxed_i32_device(data.add(8), 2 * tid + 7);
32+
33+
// Match CUDA's `1 << tid` wrapping behaviour for tid >= 32 (PTX shl.b32 masks
34+
// the shift count to 5 bits, same as Rust's wrapping_shl).
35+
intrinsics::atomic_fetch_or_relaxed_i32_device(data.add(9), 1i32.wrapping_shl(tid as u32));
36+
37+
intrinsics::atomic_fetch_xor_relaxed_i32_device(data.add(10), tid);
38+
}
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
/* Demonstrates trivial use of global-memory atomic device functions, mirroring
2+
* NVIDIA's simpleAtomicIntrinsics CUDA sample.
3+
*
4+
* A 64×256 grid (16 384 threads) each performs eleven atomic operations on a
5+
* shared 11-element i32 array and the host verifies the results.
6+
*/
7+
8+
use cust::memory::{CopyDestination, DeviceBuffer};
9+
use cust::module::Module;
10+
use cust::stream::{Stream, StreamFlags};
11+
use std::error::Error;
12+
use std::time::Instant;
13+
14+
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
15+
16+
const NUM_BLOCKS: u32 = 64;
17+
const NUM_THREADS: u32 = 256;
18+
const NUM_DATA: usize = 11;
19+
20+
fn compute_gold(gpu_data: &[i32; NUM_DATA], total_threads: usize) -> bool {
21+
let len = total_threads;
22+
let mut ok = true;
23+
24+
// slot 0 – atomicAdd(+10): sum of len additions of 10
25+
let expected = 10 * len as i32;
26+
if gpu_data[0] != expected {
27+
println!("atomicAdd failed: expected {expected}, got {}", gpu_data[0]);
28+
ok = false;
29+
}
30+
31+
// slot 1 – atomicSub(-10)
32+
let expected = -(10 * len as i32);
33+
if gpu_data[1] != expected {
34+
println!("atomicSub failed: expected {expected}, got {}", gpu_data[1]);
35+
ok = false;
36+
}
37+
38+
// slot 2 – atomicExch: final value must be a valid tid in [0, len)
39+
if !(0..len as i32).contains(&gpu_data[2]) {
40+
println!("atomicExch failed: got {}", gpu_data[2]);
41+
ok = false;
42+
}
43+
44+
// slot 3 – atomicMax: sequential max of 0..len starting from -(1<<8)
45+
let expected = {
46+
let mut v = -(1i32 << 8);
47+
for i in 0..len { v = v.max(i as i32); }
48+
v
49+
};
50+
if gpu_data[3] != expected {
51+
println!("atomicMax failed: expected {expected}, got {}", gpu_data[3]);
52+
ok = false;
53+
}
54+
55+
// slot 4 – atomicMin
56+
let expected = {
57+
let mut v = 1i32 << 8;
58+
for i in 0..len { v = v.min(i as i32); }
59+
v
60+
};
61+
if gpu_data[4] != expected {
62+
println!("atomicMin failed: expected {expected}, got {}", gpu_data[4]);
63+
ok = false;
64+
}
65+
66+
// slot 5 – atomicInc(limit=17): each thread does bounded inc, final value in [0, 16]
67+
if !(0..=16).contains(&gpu_data[5]) {
68+
println!("atomicInc failed: expected [0, 16], got {}", gpu_data[5]);
69+
ok = false;
70+
}
71+
72+
// slot 6 – atomicDec(limit=137): each thread does bounded dec, final value in [0, 137]
73+
if !(0..=137).contains(&gpu_data[6]) {
74+
println!("atomicDec failed: expected [0, 137], got {}", gpu_data[6]);
75+
ok = false;
76+
}
77+
78+
// slot 7 – atomicCAS: final value must be a valid tid in [0, len)
79+
if !(0..len as i32).contains(&gpu_data[7]) {
80+
println!("atomicCAS failed: got {}", gpu_data[7]);
81+
ok = false;
82+
}
83+
84+
// slot 8 – atomicAnd(2*tid+7) starting from 0xff
85+
let expected = {
86+
let mut v = 0xffi32;
87+
for i in 0..len { v &= 2 * i as i32 + 7; }
88+
v
89+
};
90+
if gpu_data[8] != expected {
91+
println!("atomicAnd failed: expected {expected}, got {}", gpu_data[8]);
92+
ok = false;
93+
}
94+
95+
// slot 9 – atomicOr(1<<tid) starting from 0.
96+
// For tid ≥ 32 the PTX shl.b32 wraps (modulo 32), same as wrapping_shl.
97+
let expected = {
98+
let mut v = 0i32;
99+
for i in 0..len { v |= 1i32.wrapping_shl(i as u32); }
100+
v
101+
};
102+
if gpu_data[9] != expected {
103+
println!("atomicOr failed: expected {expected}, got {}", gpu_data[9]);
104+
ok = false;
105+
}
106+
107+
// slot 10 – atomicXor(tid) starting from 0xff
108+
let expected = {
109+
let mut v = 0xffi32;
110+
for i in 0..len { v ^= i as i32; }
111+
v
112+
};
113+
if gpu_data[10] != expected {
114+
println!("atomicXor failed: expected {expected}, got {}", gpu_data[10]);
115+
ok = false;
116+
}
117+
118+
ok
119+
}
120+
121+
fn main() -> Result<(), Box<dyn Error>> {
122+
println!("simpleAtomicIntrinsics starting...");
123+
124+
let _ctx = cust::quick_init()?;
125+
let module = Module::from_ptx(PTX, &[])?;
126+
let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
127+
128+
let mut h_data = [0i32; NUM_DATA];
129+
// AND and XOR tests start with 0xff in their slots
130+
h_data[8] = 0xff;
131+
h_data[10] = 0xff;
132+
133+
let d_data = DeviceBuffer::from_slice(&h_data)?;
134+
135+
let kernel = module.get_function("test_kernel")?;
136+
137+
let start = Instant::now();
138+
139+
unsafe {
140+
cust::launch!(
141+
kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_data.as_device_ptr())
142+
)?;
143+
}
144+
145+
stream.synchronize()?;
146+
147+
let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0;
148+
println!("Processing time: {elapsed_ms:.3} ms");
149+
150+
d_data.copy_to(&mut h_data)?;
151+
152+
let total_threads = (NUM_BLOCKS * NUM_THREADS) as usize;
153+
let passed = compute_gold(&h_data, total_threads);
154+
155+
println!(
156+
"simpleAtomicIntrinsics completed, returned {}",
157+
if passed { "OK" } else { "ERROR!" }
158+
);
159+
160+
if !passed {
161+
std::process::exit(1);
162+
}
163+
164+
Ok(())
165+
}

0 commit comments

Comments
 (0)