-
Notifications
You must be signed in to change notification settings - Fork 230
Expand file tree
/
Copy pathmain.rs
More file actions
129 lines (106 loc) · 4.88 KB
/
main.rs
File metadata and controls
129 lines (106 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/* This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.
*
* 1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
* 2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device).
* 3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
*/
use cust::device::Device;
use cust::event::{Event, EventFlags};
use cust::function::{BlockSize, GridSize};
use cust::launch;
use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
use cust::module::Module;
use cust::prelude::EventStatus;
use cust::stream::{Stream, StreamFlags};
use std::time::Instant;
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
/// Verifies that every element of `data` equals `x`.
///
/// Prints the first mismatching element (index, value, and expected value)
/// when one exists, or a success message when all elements match.
/// Returns `true` iff no mismatch was found.
fn correct_output(data: &[u32], x: u32) -> bool {
    for (index, elem) in data.iter().enumerate() {
        if *elem != x {
            println!("Error! data[{index}] = {elem}, ref = {x}");
            return false;
        }
    }
    println!("All elements of the array match the value!");
    true
}
fn main() -> Result<(), cust::error::CudaError> {
    // Set up the context, load the module, and create a stream to run kernels in.
    //
    // BUGFIX: the Result of quick_init() was previously bound to `_ctx` without
    // being checked, so a failed CUDA initialization was silently ignored and
    // only surfaced later as confusing downstream errors. Propagate it with `?`.
    let _ctx = cust::quick_init()?;
    let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!");
    println!("Device Name: {}", device.name().unwrap());
    let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
    let increment = module
        .get_function("increment")
        .expect("Kernel function not found!");
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");

    // Problem size: 16 Mi `u32` elements, each to be incremented to `value`.
    const N: usize = 16 * 1024 * 1024;
    let value = 26;
    let blocks = BlockSize::xy(512, 1);
    // N is an exact multiple of 512, so grid * block covers every element
    // with no remainder and no out-of-bounds threads.
    let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1);

    // Events that bracket the GPU work so its duration can be measured.
    let start_event = Event::new(EventFlags::DEFAULT)?;
    let stop_event = Event::new(EventFlags::DEFAULT)?;

    // Create buffers for data on host-side.
    // Page-locked (pinned) host memory is used so async copies are efficient.
    let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!");
    let mut device_a =
        DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!");

    start_event
        .record(&stream)
        .expect("Failed to record start_event in the CUDA stream!");
    let start = Instant::now();

    // # Safety
    //
    // Until the stop_event is triggered:
    // 1. `host_a` is not being modified
    // 2. Both `device_a` and `host_a` are not deallocated
    // 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other
    //    operation other than those of the operations in the stream.
    unsafe {
        device_a
            .async_copy_from(&host_a, &stream)
            .expect("Could not copy from host to device!");
    }

    // # Safety
    //
    // Number of threads * number of blocks = total number of elements.
    // Hence there will not be any out-of-bounds issues.
    unsafe {
        let result = launch!(increment<<<grids, blocks, 0, stream>>>(
            device_a.as_device_ptr(),
            value
        ));
        result.expect("Result of `increment` kernel did not process!");
    }

    // # Safety
    //
    // Until the stop_event is triggered:
    // 1. `device_a` is not being modified
    // 2. Both `device_a` and `host_a` are not deallocated
    // 3. At this point, until `stop_query` yields `EventStatus::Ready`,
    //    `host_a` is not involved in any other operation.
    unsafe {
        device_a
            .async_copy_to(&mut host_a, &stream)
            .expect("Could not copy from device to host!");
    }

    stop_event
        .record(&stream)
        .expect("Failed to record stop_event in the CUDA stream!");

    // All stream operations above are asynchronous, so this measures only the
    // CPU time spent *issuing* the CUDA calls, not the GPU execution time.
    let cpu_time: u128 = start.elapsed().as_micros();

    // Busy-poll the stop event to demonstrate that the CPU is free to do other
    // work (here: counting iterations) while the GPU pipeline drains.
    //
    // BUGFIX: the loop previously compared against `Ok(EventStatus::Ready)`,
    // which would spin forever if `query()` kept returning an error. Propagate
    // query errors with `?` instead of looping on them.
    let mut counter: u64 = 0;
    while stop_event.query()? != EventStatus::Ready {
        counter += 1
    }

    // GPU-side duration between the two recorded events.
    let gpu_time: u128 = stop_event
        .elapsed(&start_event)
        .expect("Failed to calculate duration of GPU operations!")
        .as_micros();

    println!("Time spent executing by the GPU: {gpu_time} microseconds");
    println!("Time spent by CPU in CUDA calls: {cpu_time} microseconds");
    println!("CPU executed {counter} iterations while waiting for GPU to finish.");
    assert!(correct_output(host_a.as_slice(), value));

    // Stream is synchronized as a safety measure
    stream.synchronize().expect("Stream couldn't synchronize!");
    println!("test PASSED");
    Ok(())
    // The events and the memory buffers are automatically dropped here.
}