# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This example demonstrates a templated CUDA kernel (SAXPY) compiled and
# launched with cuda.core, using CuPy arrays. The kernel is instantiated
# for both float and double.
#
# ################################################################################
# /// script
# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc", "cupy-cuda13x"]
# ///
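# The `# /// script` block above is PEP 723 inline script metadata: a runner
# such as `uv run saxpy.py` can use it to resolve the listed dependencies
# automatically (assuming such a runner is available).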
import sys

from cuda import pathfinder

# Preload NVRTC (provided by the nvidia-cuda-nvrtc wheel listed above) before
# cuda.core needs it; the handle of the loaded library is printed for visibility.
print(pathfinder.load_nvidia_dynamic_lib("nvrtc"))

import cupy as cp

from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch

# compute out = a * x + y
code = """
template<typename T>
__global__ void saxpy(const T a,
                      const T* x,
                      const T* y,
                      T* out,
                      size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
        out[i] = a * x[i] + y[i];
    }
}
"""


def main():
    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    buf = None

    try:
        # prepare program
        program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
        prog = Program(code, code_type="c++", options=program_options)
        # Note the use of the `name_expressions` argument to specify the template
        # instantiations of the kernel that we will use. For non-templated kernels,
        # `name_expressions` would simply contain the kernel names.
        mod = prog.compile(
            "cubin",
            logs=sys.stdout,
            name_expressions=("saxpy<float>", "saxpy<double>"),
        )
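        # For a plain (non-templated) kernel named, say, "saxpy", the call would
        # look like this sketch:
        #
        #     mod = prog.compile("cubin", name_expressions=("saxpy",))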

        # run in single precision
        kernel = mod.get_kernel("saxpy<float>")
        dtype = cp.float32

        # prepare input/output
        size = cp.uint64(64)
        a = dtype(10)
        rng = cp.random.default_rng()
        x = rng.random(size, dtype=dtype)
        y = rng.random(size, dtype=dtype)
        out = cp.empty_like(x)
        dev.sync()  # CuPy runs on its own stream, not `stream`, so sync before accessing

        # prepare launch
        block = 32
        grid = int((size + block - 1) // block)
        config = LaunchConfig(grid=grid, block=block)
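        # e.g. with size = 64 and block = 32: grid = (64 + 31) // 32 = 2, and the
        # ceiling division guarantees grid * block >= size even when size is not
        # a multiple of block; the kernel's bounds check skips any excess threads.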
        kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)

        # launch kernel on stream (launches are asynchronous, so sync before checking)
        launch(stream, config, kernel, *kernel_args)
        stream.sync()

        # check result
        assert cp.allclose(out, a * x + y)

        # Let's repeat, this time allocating our own output buffer instead of
        # using CuPy's.
        # run in double precision
        kernel = mod.get_kernel("saxpy<double>")
        dtype = cp.float64

        # prepare input
        size = cp.uint64(128)
        a = dtype(42)
        x = rng.random(size, dtype=dtype)
        y = rng.random(size, dtype=dtype)
        dev.sync()

        # prepare output
        buf = dev.allocate(
            size * 8,  # = dtype.itemsize
            stream=stream,
        )
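        # `buf` is a cuda.core buffer object: unlike the CuPy arrays above it is
        # passed to the kernel directly (see `kernel_args` below) rather than via
        # a `.data.ptr`, and we must close it ourselves (done in `finally`).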

        # prepare launch
        block = 64
        grid = int((size + block - 1) // block)
        config = LaunchConfig(grid=grid, block=block)
        kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)

        # launch kernel on stream
        launch(stream, config, kernel, *kernel_args)
        stream.sync()

        # check result
        # We wrap the output buffer as a CuPy array for simplicity.
        # `UnownedMemory` wraps the raw device pointer without taking ownership
        # (zero-copy); passing `buf` as the owner ties the view's lifetime to it.
        out = cp.ndarray(
            size,
            dtype=dtype,
            memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0),
        )
        assert cp.allclose(out, a * x + y)
    finally:
        # Close what we allocated ourselves; CuPy cleans up the rest automatically.
        if buf is not None:
            buf.close(stream)
        stream.close()


if __name__ == "__main__":
    main()