Skip to content

Commit bb3bbdd

Browse files
committed
atomicInc/atomicDec with bounds
1 parent 946c91f commit bb3bbdd

7 files changed

Lines changed: 284 additions & 0 deletions

File tree

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ members = [
5656
"samples/introduction/async_api/kernels",
5757
"samples/introduction/matmul",
5858
"samples/introduction/matmul/kernels",
59+
"samples/introduction/simple_atomic_intrinsics",
60+
"samples/introduction/simple_atomic_intrinsics/kernels",
5961

6062
"tests/compiletests",
6163
"tests/compiletests/deps-helper",

crates/cuda_std/src/atomic/mid.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,3 +310,33 @@ macro_rules! impl_cas {
310310
impl_cas! {
311311
u32, u64, i32, i64, f32, f64
312312
}
313+
314+
#[gpu_only]
315+
#[allow(clippy::missing_safety_doc)]
316+
/// Performs a bounded increment like CUDA's atomicInc: if *ptr >= bound then 0 else *ptr+1, returns old value.
317+
pub unsafe fn atomic_inc_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 {
318+
loop {
319+
let old = intrinsics::atomic_load_relaxed_32_device(ptr);
320+
let new = if old >= bound { 0 } else { old + 1 };
321+
if intrinsics::atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old {
322+
return old;
323+
}
324+
}
325+
}
326+
327+
#[gpu_only]
328+
#[allow(clippy::missing_safety_doc)]
329+
/// Performs a bounded decrement like CUDA's atomicDec: if *ptr == 0 || *ptr > bound then bound else *ptr-1, returns old value.
330+
pub unsafe fn atomic_dec_bounded_relaxed_u32_device(ptr: *mut u32, bound: u32) -> u32 {
331+
loop {
332+
let old = intrinsics::atomic_load_relaxed_32_device(ptr);
333+
let new = if old == 0 || old > bound {
334+
bound
335+
} else {
336+
old - 1
337+
};
338+
if intrinsics::atomic_fetch_cas_relaxed_u32_device(ptr, old, new) == old {
339+
return old;
340+
}
341+
}
342+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "simple-atomic-intrinsics"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cust = { path = "../../../crates/cust" }
8+
9+
[build-dependencies]
10+
cuda_builder = { workspace = true, default-features = false }
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
use std::env;
2+
use std::path;
3+
4+
use cuda_builder::CudaBuilder;
5+
6+
fn main() {
7+
println!("cargo::rerun-if-changed=build.rs");
8+
println!("cargo::rerun-if-changed=kernels");
9+
10+
let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
11+
let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
12+
13+
CudaBuilder::new(manifest_dir.join("kernels"))
14+
.copy_to(out_path.join("kernels.ptx"))
15+
.build()
16+
.unwrap();
17+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "simple-atomic-intrinsics-kernels"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cuda_std = { path = "../../../../crates/cuda_std" }
8+
9+
[lib]
10+
crate-type = ["cdylib", "rlib"]
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#![allow(
2+
improper_ctypes_definitions,
3+
clippy::missing_safety_doc,
4+
unsafe_op_in_unsafe_fn
5+
)]
6+
7+
use cuda_std::atomic::{intrinsics, mid};
8+
use cuda_std::prelude::*;
9+
10+
#[kernel]
11+
pub unsafe fn test_kernel(data: *mut i32) {
12+
let tid = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as i32;
13+
14+
// Arithmetic atomics
15+
16+
intrinsics::atomic_fetch_add_relaxed_i32_device(data.add(0), 10);
17+
intrinsics::atomic_fetch_sub_relaxed_i32_device(data.add(1), 10);
18+
intrinsics::atomic_fetch_exch_relaxed_i32_device(data.add(2), tid);
19+
intrinsics::atomic_fetch_max_relaxed_i32_device(data.add(3), tid);
20+
intrinsics::atomic_fetch_min_relaxed_i32_device(data.add(4), tid);
21+
22+
mid::atomic_inc_bounded_relaxed_u32_device(data.add(5) as *mut u32, 17);
23+
24+
mid::atomic_dec_bounded_relaxed_u32_device(data.add(6) as *mut u32, 137);
25+
26+
intrinsics::atomic_fetch_cas_relaxed_i32_device(data.add(7), tid - 1, tid);
27+
28+
// Bitwise atomics
29+
30+
intrinsics::atomic_fetch_and_relaxed_i32_device(data.add(8), 2 * tid + 7);
31+
32+
// Match CUDA's `1 << tid` wrapping behaviour for tid >= 32 (PTX shl.b32 masks
33+
// the shift count to 5 bits, same as Rust's wrapping_shl).
34+
intrinsics::atomic_fetch_or_relaxed_i32_device(data.add(9), 1i32.wrapping_shl(tid as u32));
35+
36+
intrinsics::atomic_fetch_xor_relaxed_i32_device(data.add(10), tid);
37+
}
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
/* Demonstrates trivial use of global-memory atomic device functions, mirroring
2+
* NVIDIA's simpleAtomicIntrinsics CUDA sample.
3+
*
4+
* A 64×256 grid (16 384 threads) each performs eleven atomic operations on a
5+
* shared 11-element i32 array and the host verifies the results.
6+
*/
7+
8+
use cust::memory::{CopyDestination, DeviceBuffer};
9+
use cust::module::Module;
10+
use cust::stream::{Stream, StreamFlags};
11+
use std::error::Error;
12+
use std::time::Instant;
13+
14+
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
15+
16+
const NUM_BLOCKS: u32 = 64;
17+
const NUM_THREADS: u32 = 256;
18+
const NUM_DATA: usize = 11;
19+
20+
fn compute_gold(gpu_data: &[i32; NUM_DATA], total_threads: usize) -> bool {
21+
let len = total_threads;
22+
let mut ok = true;
23+
24+
// slot 0 – atomicAdd(+10): sum of len additions of 10
25+
let expected = 10 * len as i32;
26+
if gpu_data[0] != expected {
27+
println!("atomicAdd failed: expected {expected}, got {}", gpu_data[0]);
28+
ok = false;
29+
}
30+
31+
// slot 1 – atomicSub(-10)
32+
let expected = -(10 * len as i32);
33+
if gpu_data[1] != expected {
34+
println!("atomicSub failed: expected {expected}, got {}", gpu_data[1]);
35+
ok = false;
36+
}
37+
38+
// slot 2 – atomicExch: final value must be a valid tid in [0, len)
39+
if !(0..len as i32).contains(&gpu_data[2]) {
40+
println!("atomicExch failed: got {}", gpu_data[2]);
41+
ok = false;
42+
}
43+
44+
// slot 3 – atomicMax: sequential max of 0..len starting from -(1<<8)
45+
let expected = {
46+
let mut v = -(1i32 << 8);
47+
for i in 0..len {
48+
v = v.max(i as i32);
49+
}
50+
v
51+
};
52+
if gpu_data[3] != expected {
53+
println!("atomicMax failed: expected {expected}, got {}", gpu_data[3]);
54+
ok = false;
55+
}
56+
57+
// slot 4 – atomicMin
58+
let expected = {
59+
let mut v = 1i32 << 8;
60+
for i in 0..len {
61+
v = v.min(i as i32);
62+
}
63+
v
64+
};
65+
if gpu_data[4] != expected {
66+
println!("atomicMin failed: expected {expected}, got {}", gpu_data[4]);
67+
ok = false;
68+
}
69+
70+
// slot 5 – atomicInc(limit=17): each thread does bounded inc, final value in [0, 16]
71+
if !(0..=16).contains(&gpu_data[5]) {
72+
println!("atomicInc failed: expected [0, 16], got {}", gpu_data[5]);
73+
ok = false;
74+
}
75+
76+
// slot 6 – atomicDec(limit=137): each thread does bounded dec, final value in [0, 137]
77+
if !(0..=137).contains(&gpu_data[6]) {
78+
println!("atomicDec failed: expected [0, 137], got {}", gpu_data[6]);
79+
ok = false;
80+
}
81+
82+
// slot 7 – atomicCAS: final value must be a valid tid in [0, len)
83+
if !(0..len as i32).contains(&gpu_data[7]) {
84+
println!("atomicCAS failed: got {}", gpu_data[7]);
85+
ok = false;
86+
}
87+
88+
// slot 8 – atomicAnd(2*tid+7) starting from 0xff
89+
let expected = {
90+
let mut v = 0xffi32;
91+
for i in 0..len {
92+
v &= 2 * i as i32 + 7;
93+
}
94+
v
95+
};
96+
if gpu_data[8] != expected {
97+
println!("atomicAnd failed: expected {expected}, got {}", gpu_data[8]);
98+
ok = false;
99+
}
100+
101+
// slot 9 – atomicOr(1<<tid) starting from 0.
102+
// For tid ≥ 32 the PTX shl.b32 wraps (modulo 32), same as wrapping_shl.
103+
let expected = {
104+
let mut v = 0i32;
105+
for i in 0..len {
106+
v |= 1i32.wrapping_shl(i as u32);
107+
}
108+
v
109+
};
110+
if gpu_data[9] != expected {
111+
println!("atomicOr failed: expected {expected}, got {}", gpu_data[9]);
112+
ok = false;
113+
}
114+
115+
// slot 10 – atomicXor(tid) starting from 0xff
116+
let expected = {
117+
let mut v = 0xffi32;
118+
for i in 0..len {
119+
v ^= i as i32;
120+
}
121+
v
122+
};
123+
if gpu_data[10] != expected {
124+
println!(
125+
"atomicXor failed: expected {expected}, got {}",
126+
gpu_data[10]
127+
);
128+
ok = false;
129+
}
130+
131+
ok
132+
}
133+
134+
fn main() -> Result<(), Box<dyn Error>> {
135+
println!("simpleAtomicIntrinsics starting...");
136+
137+
let _ctx = cust::quick_init()?;
138+
let module = Module::from_ptx(PTX, &[])?;
139+
let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
140+
141+
let mut h_data = [0i32; NUM_DATA];
142+
// AND and XOR tests start with 0xff in their slots
143+
h_data[8] = 0xff;
144+
h_data[10] = 0xff;
145+
146+
let d_data = DeviceBuffer::from_slice(&h_data)?;
147+
148+
let kernel = module.get_function("test_kernel")?;
149+
150+
let start = Instant::now();
151+
152+
unsafe {
153+
cust::launch!(
154+
kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_data.as_device_ptr())
155+
)?;
156+
}
157+
158+
stream.synchronize()?;
159+
160+
let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0;
161+
println!("Processing time: {elapsed_ms:.3} ms");
162+
163+
d_data.copy_to(&mut h_data)?;
164+
165+
let total_threads = (NUM_BLOCKS * NUM_THREADS) as usize;
166+
let passed = compute_gold(&h_data, total_threads);
167+
168+
println!(
169+
"simpleAtomicIntrinsics completed, returned {}",
170+
if passed { "OK" } else { "ERROR!" }
171+
);
172+
173+
if !passed {
174+
std::process::exit(1);
175+
}
176+
177+
Ok(())
178+
}

0 commit comments

Comments
 (0)