-
Notifications
You must be signed in to change notification settings - Fork 270
Expand file tree
/
Copy pathvector_add_drv.py
More file actions
121 lines (96 loc) · 3.48 KB
/
vector_add_drv.py
File metadata and controls
121 lines (96 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Copyright 2021-2025 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
# ################################################################################
#
# This example demonstrates vector addition using the CUDA Driver API with
# unified virtual addressing.
#
# ################################################################################
# /// script
# dependencies = ["cuda_bindings>13.2.1", "numpy"]
# ///
import ctypes
import math
import sys
import numpy as np
from cuda.bindings import driver as cuda
from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device_drv, requirement_not_met
vector_add_drv = """\
/* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 3
* of the programming guide with some additions like error checking.
*
*/
// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
"""
def main():
n = 50000
nbytes = n * np.dtype(np.float32).itemsize
# Initialize
check_cuda_errors(cuda.cuInit(0))
cu_device = find_cuda_device_drv()
# Create context
cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device))
uva_supported = check_cuda_errors(
cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cu_device)
)
if not uva_supported:
requirement_not_met("Accessing pageable memory directly requires UVA")
kernel_helper = KernelHelper(vector_add_drv, int(cu_device))
_vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel")
# Allocate input vectors h_A and h_B in host memory
h_a = np.random.rand(n).astype(dtype=np.float32)
h_b = np.random.rand(n).astype(dtype=np.float32)
h_c = np.random.rand(n).astype(dtype=np.float32)
# Allocate vectors in device memory
d_a = check_cuda_errors(cuda.cuMemAlloc(nbytes))
d_b = check_cuda_errors(cuda.cuMemAlloc(nbytes))
d_c = check_cuda_errors(cuda.cuMemAlloc(nbytes))
# Copy vectors from host memory to device memory
check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, nbytes))
check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, nbytes))
# Grid/Block configuration
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) / threads_per_block
kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int))
# Launch the CUDA kernel
check_cuda_errors(
cuda.cuLaunchKernel(
_vec_add_kernel,
blocks_per_grid,
1,
1,
threads_per_block,
1,
1,
0,
0,
kernel_args,
0,
)
)
# Copy result from device memory to host memory
# h_C contains the result in host memory
check_cuda_errors(cuda.cuMemcpyDtoH(h_c, d_c, nbytes))
for i in range(n):
sum_all = h_a[i] + h_b[i]
if math.fabs(h_c[i] - sum_all) > 1e-7:
break
# Free device memory
check_cuda_errors(cuda.cuMemFree(d_a))
check_cuda_errors(cuda.cuMemFree(d_b))
check_cuda_errors(cuda.cuMemFree(d_c))
check_cuda_errors(cuda.cuCtxDestroy(cu_context))
if i + 1 != n:
print("Result = FAIL", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()