forked from NVIDIA/cuda-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvector_add.py
More file actions
64 lines (51 loc) · 1.74 KB
/
vector_add.py
File metadata and controls
64 lines (51 loc) · 1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This demo illustrates how to use `cuda.core` to compile and launch a simple
# vector addition kernel.
#
# ################################################################################
import cupy as cp
from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch
# CUDA C++ source of the kernel: compute c = a + b element-wise.
# The kernel is a template, so the concrete instantiation that should be
# emitted is requested by name when compiling (see `name_expressions` below).
# Each thread starts at its global index and advances by the total number of
# launched threads (gridDim.x * blockDim.x) until it reaches N.
code = """
template<typename T>
__global__ void vector_add(const T* A,
const T* B,
T* C,
size_t N) {
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
C[i] = A[i] + B[i];
}
}
"""

# select the device and create the stream the kernel will be launched on
dev = Device()
dev.set_current()
stream = dev.create_stream()

try:
    # prepare program: compile for the architecture of the current device
    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
    prog = Program(code, code_type="c++", options=program_options)
    mod = prog.compile("cubin", name_expressions=("vector_add<float>",))

    # run in single precision; the kernel name must match the requested
    # template instantiation, and the array dtype must match its T (float)
    kernel = mod.get_kernel("vector_add<float>")
    dtype = cp.float32

    # prepare input/output
    size = 50000
    rng = cp.random.default_rng()
    a = rng.random(size, dtype=dtype)
    b = rng.random(size, dtype=dtype)
    c = cp.empty_like(a)

    # cupy does its work on a different stream than `stream`, so synchronize
    # the device before the kernel accesses the arrays
    dev.sync()

    # prepare launch: round the grid size up so that grid * block >= size
    block = 256
    grid = (size + block - 1) // block
    config = LaunchConfig(grid=grid, block=block)

    # launch kernel on `stream`; the element count is passed as a 64-bit
    # unsigned integer for the kernel's `size_t N` parameter
    launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
    # wait for the kernel to finish before reading `c`
    stream.sync()

    # check result with an explicit test instead of a bare `assert`, which is
    # removed when Python runs with -O and would then verify nothing
    if not cp.allclose(c, a + b):
        raise AssertionError("vector_add result does not match a + b")
finally:
    # release the stream even if compilation, launch, or the check failed
    stream.close()