|
| 1 | +/* Demonstrates trivial use of global-memory atomic device functions, mirroring |
| 2 | + * NVIDIA's simpleAtomicIntrinsics CUDA sample. |
| 3 | + * |
| 4 | + * Each of the 16 384 threads in a 64-block × 256-thread launch performs eleven |
| 5 | + * atomic operations on a single 11-element i32 array in global memory; the host then verifies the results. |
| 6 | + */ |
| 7 | + |
| 8 | +use cust::memory::{CopyDestination, DeviceBuffer}; |
| 9 | +use cust::module::Module; |
| 10 | +use cust::stream::{Stream, StreamFlags}; |
| 11 | +use std::error::Error; |
| 12 | +use std::time::Instant; |
| 13 | + |
| 14 | +static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); |
| 15 | + |
| 16 | +const NUM_BLOCKS: u32 = 64; |
| 17 | +const NUM_THREADS: u32 = 256; |
| 18 | +const NUM_DATA: usize = 11; |
| 19 | + |
| 20 | +fn compute_gold(gpu_data: &[i32; NUM_DATA], total_threads: usize) -> bool { |
| 21 | + let len = total_threads; |
| 22 | + let mut ok = true; |
| 23 | + |
| 24 | + // slot 0 – atomicAdd(+10): sum of len additions of 10 |
| 25 | + let expected = 10 * len as i32; |
| 26 | + if gpu_data[0] != expected { |
| 27 | + println!("atomicAdd failed: expected {expected}, got {}", gpu_data[0]); |
| 28 | + ok = false; |
| 29 | + } |
| 30 | + |
| 31 | + // slot 1 – atomicSub(-10) |
| 32 | + let expected = -(10 * len as i32); |
| 33 | + if gpu_data[1] != expected { |
| 34 | + println!("atomicSub failed: expected {expected}, got {}", gpu_data[1]); |
| 35 | + ok = false; |
| 36 | + } |
| 37 | + |
| 38 | + // slot 2 – atomicExch: final value must be a valid tid in [0, len) |
| 39 | + if !(0..len as i32).contains(&gpu_data[2]) { |
| 40 | + println!("atomicExch failed: got {}", gpu_data[2]); |
| 41 | + ok = false; |
| 42 | + } |
| 43 | + |
| 44 | + // slot 3 – atomicMax: sequential max of 0..len starting from -(1<<8) |
| 45 | + let expected = { |
| 46 | + let mut v = -(1i32 << 8); |
| 47 | + for i in 0..len { v = v.max(i as i32); } |
| 48 | + v |
| 49 | + }; |
| 50 | + if gpu_data[3] != expected { |
| 51 | + println!("atomicMax failed: expected {expected}, got {}", gpu_data[3]); |
| 52 | + ok = false; |
| 53 | + } |
| 54 | + |
| 55 | + // slot 4 – atomicMin |
| 56 | + let expected = { |
| 57 | + let mut v = 1i32 << 8; |
| 58 | + for i in 0..len { v = v.min(i as i32); } |
| 59 | + v |
| 60 | + }; |
| 61 | + if gpu_data[4] != expected { |
| 62 | + println!("atomicMin failed: expected {expected}, got {}", gpu_data[4]); |
| 63 | + ok = false; |
| 64 | + } |
| 65 | + |
| 66 | + // slot 5 – atomicInc(limit=17): each thread does bounded inc, final value in [0, 16] |
| 67 | + if !(0..=16).contains(&gpu_data[5]) { |
| 68 | + println!("atomicInc failed: expected [0, 16], got {}", gpu_data[5]); |
| 69 | + ok = false; |
| 70 | + } |
| 71 | + |
| 72 | + // slot 6 – atomicDec(limit=137): each thread does bounded dec, final value in [0, 137] |
| 73 | + if !(0..=137).contains(&gpu_data[6]) { |
| 74 | + println!("atomicDec failed: expected [0, 137], got {}", gpu_data[6]); |
| 75 | + ok = false; |
| 76 | + } |
| 77 | + |
| 78 | + // slot 7 – atomicCAS: final value must be a valid tid in [0, len) |
| 79 | + if !(0..len as i32).contains(&gpu_data[7]) { |
| 80 | + println!("atomicCAS failed: got {}", gpu_data[7]); |
| 81 | + ok = false; |
| 82 | + } |
| 83 | + |
| 84 | + // slot 8 – atomicAnd(2*tid+7) starting from 0xff |
| 85 | + let expected = { |
| 86 | + let mut v = 0xffi32; |
| 87 | + for i in 0..len { v &= 2 * i as i32 + 7; } |
| 88 | + v |
| 89 | + }; |
| 90 | + if gpu_data[8] != expected { |
| 91 | + println!("atomicAnd failed: expected {expected}, got {}", gpu_data[8]); |
| 92 | + ok = false; |
| 93 | + } |
| 94 | + |
| 95 | + // slot 9 – atomicOr(1<<tid) starting from 0. |
| 96 | + // For tid ≥ 32 the PTX shl.b32 wraps (modulo 32), same as wrapping_shl. |
| 97 | + let expected = { |
| 98 | + let mut v = 0i32; |
| 99 | + for i in 0..len { v |= 1i32.wrapping_shl(i as u32); } |
| 100 | + v |
| 101 | + }; |
| 102 | + if gpu_data[9] != expected { |
| 103 | + println!("atomicOr failed: expected {expected}, got {}", gpu_data[9]); |
| 104 | + ok = false; |
| 105 | + } |
| 106 | + |
| 107 | + // slot 10 – atomicXor(tid) starting from 0xff |
| 108 | + let expected = { |
| 109 | + let mut v = 0xffi32; |
| 110 | + for i in 0..len { v ^= i as i32; } |
| 111 | + v |
| 112 | + }; |
| 113 | + if gpu_data[10] != expected { |
| 114 | + println!("atomicXor failed: expected {expected}, got {}", gpu_data[10]); |
| 115 | + ok = false; |
| 116 | + } |
| 117 | + |
| 118 | + ok |
| 119 | +} |
| 120 | + |
| 121 | +fn main() -> Result<(), Box<dyn Error>> { |
| 122 | + println!("simpleAtomicIntrinsics starting..."); |
| 123 | + |
| 124 | + let _ctx = cust::quick_init()?; |
| 125 | + let module = Module::from_ptx(PTX, &[])?; |
| 126 | + let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?; |
| 127 | + |
| 128 | + let mut h_data = [0i32; NUM_DATA]; |
| 129 | + // AND and XOR tests start with 0xff in their slots |
| 130 | + h_data[8] = 0xff; |
| 131 | + h_data[10] = 0xff; |
| 132 | + |
| 133 | + let d_data = DeviceBuffer::from_slice(&h_data)?; |
| 134 | + |
| 135 | + let kernel = module.get_function("test_kernel")?; |
| 136 | + |
| 137 | + let start = Instant::now(); |
| 138 | + |
| 139 | + unsafe { |
| 140 | + cust::launch!( |
| 141 | + kernel<<<NUM_BLOCKS, NUM_THREADS, 0, stream>>>(d_data.as_device_ptr()) |
| 142 | + )?; |
| 143 | + } |
| 144 | + |
| 145 | + stream.synchronize()?; |
| 146 | + |
| 147 | + let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0; |
| 148 | + println!("Processing time: {elapsed_ms:.3} ms"); |
| 149 | + |
| 150 | + d_data.copy_to(&mut h_data)?; |
| 151 | + |
| 152 | + let total_threads = (NUM_BLOCKS * NUM_THREADS) as usize; |
| 153 | + let passed = compute_gold(&h_data, total_threads); |
| 154 | + |
| 155 | + println!( |
| 156 | + "simpleAtomicIntrinsics completed, returned {}", |
| 157 | + if passed { "OK" } else { "ERROR!" } |
| 158 | + ); |
| 159 | + |
| 160 | + if !passed { |
| 161 | + std::process::exit(1); |
| 162 | + } |
| 163 | + |
| 164 | + Ok(()) |
| 165 | +} |
0 commit comments