-
Notifications
You must be signed in to change notification settings - Fork 230
Expand file tree
/
Copy pathmain.rs
More file actions
129 lines (106 loc) · 4.88 KB
/
main.rs
File metadata and controls
129 lines (106 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/* This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.
*
* 1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
* 2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device).
* 3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
*/
use cust::device::Device;
use cust::event::{Event, EventFlags};
use cust::function::{BlockSize, GridSize};
use cust::launch;
use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
use cust::module::Module;
use cust::prelude::EventStatus;
use cust::stream::{Stream, StreamFlags};
use std::time::Instant;
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
/// Verifies that every element of `data` equals `x`.
///
/// Prints the first mismatching element (index, value, and expected value)
/// when one exists, or a success message when all elements match.
/// Returns `true` iff no mismatch was found.
fn correct_output(data: &[u32], x: u32) -> bool {
    for (index, elem) in data.iter().enumerate() {
        if *elem != x {
            println!("Error! data[{index}] = {elem}, ref = {x}");
            return false;
        }
    }
    println!("All elements of the array match the value!");
    true
}
fn main() -> Result<(), cust::error::CudaError> {
    // Set up the context, load the module, and create a stream to run kernels in.
    //
    // BUGFIX: the Result of quick_init() was previously bound to `_ctx` without
    // being checked, so a failed CUDA initialization was silently ignored and
    // only surfaced later as confusing downstream errors. Propagate it with `?`.
    let _ctx = cust::quick_init()?;
    let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!");
    println!("Device Name: {}", device.name().unwrap());
    let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
    let increment = module
        .get_function("increment")
        .expect("Kernel function not found!");
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");

    // Problem size: 16 Mi `u32` elements, each to be incremented to `value`.
    const N: usize = 16 * 1024 * 1024;
    let value = 26;
    let blocks = BlockSize::xy(512, 1);
    // N is an exact multiple of 512, so grid * block covers every element
    // with no remainder and no out-of-bounds threads.
    let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1);

    // Events that bracket the GPU work so its duration can be measured.
    let start_event = Event::new(EventFlags::DEFAULT)?;
    let stop_event = Event::new(EventFlags::DEFAULT)?;

    // Create buffers for data on host-side.
    // Page-locked (pinned) host memory is used so async copies are efficient.
    let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!");
    let mut device_a =
        DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!");

    start_event
        .record(&stream)
        .expect("Failed to record start_event in the CUDA stream!");
    let start = Instant::now();

    // # Safety
    //
    // Until the stop_event is triggered:
    // 1. `host_a` is not being modified
    // 2. Both `device_a` and `host_a` are not deallocated
    // 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other
    //    operation other than those of the operations in the stream.
    unsafe {
        device_a
            .async_copy_from(&host_a, &stream)
            .expect("Could not copy from host to device!");
    }

    // # Safety
    //
    // Number of threads * number of blocks = total number of elements.
    // Hence there will not be any out-of-bounds issues.
    unsafe {
        let result = launch!(increment<<<grids, blocks, 0, stream>>>(
            device_a.as_device_ptr(),
            value
        ));
        result.expect("Result of `increment` kernel did not process!");
    }

    // # Safety
    //
    // Until the stop_event is triggered:
    // 1. `device_a` is not being modified
    // 2. Both `device_a` and `host_a` are not deallocated
    // 3. At this point, until `stop_query` yields `EventStatus::Ready`,
    //    `host_a` is not involved in any other operation.
    unsafe {
        device_a
            .async_copy_to(&mut host_a, &stream)
            .expect("Could not copy from device to host!");
    }

    stop_event
        .record(&stream)
        .expect("Failed to record stop_event in the CUDA stream!");

    // All stream operations above are asynchronous, so this measures only the
    // CPU time spent *issuing* the CUDA calls, not the GPU execution time.
    let cpu_time: u128 = start.elapsed().as_micros();

    // Busy-poll the stop event to demonstrate that the CPU is free to do other
    // work (here: counting iterations) while the GPU pipeline drains.
    //
    // BUGFIX: the loop previously compared against `Ok(EventStatus::Ready)`,
    // which would spin forever if `query()` kept returning an error. Propagate
    // query errors with `?` instead of looping on them.
    let mut counter: u64 = 0;
    while stop_event.query()? != EventStatus::Ready {
        counter += 1
    }

    // GPU-side duration between the two recorded events.
    let gpu_time: u128 = stop_event
        .elapsed(&start_event)
        .expect("Failed to calculate duration of GPU operations!")
        .as_micros();

    println!("Time spent executing by the GPU: {gpu_time} microseconds");
    println!("Time spent by CPU in CUDA calls: {cpu_time} microseconds");
    println!("CPU executed {counter} iterations while waiting for GPU to finish.");
    assert!(correct_output(host_a.as_slice(), value));

    // Stream is synchronized as a safety measure
    stream.synchronize().expect("Stream couldn't synchronize!");
    println!("test PASSED");
    Ok(())
    // The events and the memory buffers are automatically dropped here.
}