# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This example demonstrates a templated CUDA kernel (SAXPY) compiled and
# launched with cuda.core, using CuPy arrays. The kernel is instantiated
# for both float and double.
#
# ################################################################################
# /// script
# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc", "cupy-cuda13x"]
# ///
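# The `# /// script` block above is PEP 723 inline script metadata: a runner
# such as `uv run saxpy.py` can use it to resolve the listed dependencies
# automatically (assuming such a runner is available).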
import sys

from cuda import pathfinder

# Preload NVRTC (provided by the nvidia-cuda-nvrtc wheel listed above) before
# cuda.core needs it; the handle of the loaded library is printed for visibility.
print(pathfinder.load_nvidia_dynamic_lib("nvrtc"))

import cupy as cp

from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch

# compute out = a * x + y
code = """
template<typename T>
__global__ void saxpy(const T a,
                      const T* x,
                      const T* y,
                      T* out,
                      size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
        out[i] = a * x[i] + y[i];
    }
}
"""


def main():
    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    buf = None

    try:
        # prepare program
        program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
        prog = Program(code, code_type="c++", options=program_options)
        # Note the use of the `name_expressions` argument to specify the template
        # instantiations of the kernel that we will use. For non-templated kernels,
        # `name_expressions` would simply contain the kernel names.
        mod = prog.compile(
            "cubin",
            logs=sys.stdout,
            name_expressions=("saxpy<float>", "saxpy<double>"),
        )
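        # For a plain (non-templated) kernel named, say, "saxpy", the call would
        # look like this sketch:
        #
        #     mod = prog.compile("cubin", name_expressions=("saxpy",))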

        # run in single precision
        kernel = mod.get_kernel("saxpy<float>")
        dtype = cp.float32

        # prepare input/output
        size = cp.uint64(64)
        a = dtype(10)
        rng = cp.random.default_rng()
        x = rng.random(size, dtype=dtype)
        y = rng.random(size, dtype=dtype)
        out = cp.empty_like(x)
        dev.sync()  # CuPy runs on its own stream, not `stream`, so sync before accessing

        # prepare launch
        block = 32
        grid = int((size + block - 1) // block)
        config = LaunchConfig(grid=grid, block=block)
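        # e.g. with size = 64 and block = 32: grid = (64 + 31) // 32 = 2, and the
        # ceiling division guarantees grid * block >= size even when size is not
        # a multiple of block; the kernel's bounds check skips any excess threads.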
        kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)

        # launch kernel on stream (launches are asynchronous, so sync before checking)
        launch(stream, config, kernel, *kernel_args)
        stream.sync()

        # check result
        assert cp.allclose(out, a * x + y)

        # Let's repeat, this time allocating our own output buffer instead of
        # using CuPy's.
        # run in double precision
        kernel = mod.get_kernel("saxpy<double>")
        dtype = cp.float64

        # prepare input
        size = cp.uint64(128)
        a = dtype(42)
        x = rng.random(size, dtype=dtype)
        y = rng.random(size, dtype=dtype)
        dev.sync()

        # prepare output
        buf = dev.allocate(
            size * 8,  # = dtype.itemsize
            stream=stream,
        )
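        # `buf` is a cuda.core buffer object: unlike the CuPy arrays above it is
        # passed to the kernel directly (see `kernel_args` below) rather than via
        # a `.data.ptr`, and we must close it ourselves (done in `finally`).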

        # prepare launch
        block = 64
        grid = int((size + block - 1) // block)
        config = LaunchConfig(grid=grid, block=block)
        kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)

        # launch kernel on stream
        launch(stream, config, kernel, *kernel_args)
        stream.sync()

        # check result
        # We wrap the output buffer as a CuPy array for simplicity.
        # `UnownedMemory` wraps the raw device pointer without taking ownership
        # (zero-copy); passing `buf` as the owner ties the view's lifetime to it.
        out = cp.ndarray(
            size,
            dtype=dtype,
            memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0),
        )
        assert cp.allclose(out, a * x + y)
    finally:
        # Close what we allocated ourselves; CuPy cleans up the rest automatically.
        if buf is not None:
            buf.close(stream)
        stream.close()


if __name__ == "__main__":
    main()