|
26 | 26 | except ImportError: |
27 | 27 | print("cffi is not installed, the CPU example will be skipped", file=sys.stderr) |
28 | 28 | FFI = None |
29 | | -try: |
30 | | - import cupy as cp |
31 | | -except ImportError: |
32 | | - print("cupy is not installed, the GPU example will be skipped", file=sys.stderr) |
33 | | - cp = None |
34 | 29 | import numpy as np |
35 | 30 |
|
36 | | -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch |
37 | 31 | from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory |
38 | 32 |
|
39 | 33 | # ################################################################################ |
|
# Symbol name and C-level signature of the in-place kernel. The signature is
# interpolated into the generated source (see run()) and the name is used to
# look the compiled function back up after import.
func_name = "inplace_plus_arange_N"
func_sig = "void {}(int* data, size_t N)".format(func_name)
56 | 50 |
|
57 | | -# Here is a concrete (very naive!) implementation on CPU: |
58 | | -if FFI: |
| 51 | + |
| 52 | +# Now we are prepared to run the code from the user's perspective! |
| 53 | +# |
| 54 | +# ################################################################################ |
| 55 | + |
| 56 | + |
| 57 | +# Below, as a user we want to perform the said in-place operation on a CPU |
| 58 | +# or GPU, by calling the corresponding function implemented "elsewhere" |
# (in the body of the run function).
| 60 | + |
| 61 | + |
| 62 | +# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both |
| 63 | +# of which are supported by StridedMemoryView). |
@args_viewable_as_strided_memory((0,))
def my_func(arr):
    # NOTE(review): cpu_func and cpu_prog are expected to be bound at module
    # scope by run() before this is called — run() visibly declares only
    # `global my_func`, so confirm these assignments actually land in globals.
    global cpu_func
    global cpu_prog
    # View arr as a strided memory view; -1 is passed as the stream because
    # this example only touches host-resident data (enforced by the
    # is_device_accessible assert below).
    mv = arr.view(-1)
    assert isinstance(mv, StridedMemoryView)
    assert len(mv.shape) == 1
    assert mv.dtype == np.int32
    assert not mv.is_device_accessible

    # Hand the raw pointer and element count to the compiled CPU routine
    # through its cffi wrapper.
    n_elems = mv.shape[0]
    cpu_func(cpu_prog.cast("int*", mv.ptr), n_elems)
| 81 | + |
| 82 | + |
| 83 | +def run(): |
| 84 | + global my_func |
| 85 | + if not FFI: |
| 86 | + return |
| 87 | + # Here is a concrete (very naive!) implementation on CPU: |
59 | 88 | cpu_code = string.Template(r""" |
60 | 89 | extern "C" |
61 | 90 | $func_sig { |
|
76 | 105 | extra_compile_args=["-std=c++11"], |
77 | 106 | ) |
78 | 107 | temp_dir = tempfile.mkdtemp() |
| 108 | + saved_sys_path = sys.path.copy() |
79 | 109 | try: |
80 | 110 | cpu_prog.compile(tmpdir=temp_dir) |
81 | | - finally: |
82 | | - shutil.rmtree(temp_dir) |
83 | | - saved_sys_path = sys.path |
84 | | - try: |
| 111 | + |
85 | 112 | sys.path.append(temp_dir) |
86 | 113 | cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name) |
87 | | - finally: |
88 | | - sys.path = saved_sys_path |
89 | | - shutil.rmtree(temp_dir) |
90 | | - |
91 | | -# Here is a concrete (again, very naive!) implementation on GPU: |
92 | | -if cp: |
93 | | - gpu_code = string.Template(r""" |
94 | | - extern "C" |
95 | | - __global__ $func_sig { |
96 | | - const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; |
97 | | - const size_t stride_size = gridDim.x * blockDim.x; |
98 | | - for (size_t i = tid; i < N; i += stride_size) { |
99 | | - data[i] += i; |
100 | | - } |
101 | | - } |
102 | | - """).substitute(func_sig=func_sig) |
103 | | - |
104 | | - # To know the GPU's compute capability, we need to identify which GPU to use. |
105 | | - dev = Device(0) |
106 | | - dev.set_current() |
107 | | - arch = "".join(f"{i}" for i in dev.compute_capability) |
108 | | - gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11")) |
109 | | - mod = gpu_prog.compile(target_type="cubin") |
110 | | - gpu_ker = mod.get_kernel(func_name) |
111 | | - |
112 | | -# Now we are prepared to run the code from the user's perspective! |
113 | | -# |
114 | | -# ################################################################################ |
115 | | - |
116 | 114 |
|
117 | | -# Below, as a user we want to perform the said in-place operation on either CPU |
118 | | -# or GPU, by calling the corresponding function implemented "elsewhere" (done above). |
119 | | - |
120 | | - |
121 | | -# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both |
122 | | -# of which are supported by StridedMemoryView). |
123 | | -@args_viewable_as_strided_memory((0,)) |
124 | | -def my_func(arr, work_stream): |
125 | | - # Create a memory view over arr (assumed to be a 1D array of int32). The stream |
126 | | - # ordering is taken care of, so that arr can be safely accessed on our work |
127 | | - # stream (ordered after a data stream on which arr is potentially prepared). |
128 | | - view = arr.view(work_stream.handle if work_stream else -1) |
129 | | - assert isinstance(view, StridedMemoryView) |
130 | | - assert len(view.shape) == 1 |
131 | | - assert view.dtype == np.int32 |
132 | | - |
133 | | - size = view.shape[0] |
134 | | - # DLPack also supports host arrays. We want to know if the array data is |
135 | | - # accessible from the GPU, and dispatch to the right routine accordingly. |
136 | | - if view.is_device_accessible: |
137 | | - block = 256 |
138 | | - grid = (size + block - 1) // block |
139 | | - config = LaunchConfig(grid=grid, block=block) |
140 | | - launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size)) |
141 | | - # Here we're being conservative and synchronize over our work stream, |
142 | | - # assuming we do not know the data stream; if we know then we could |
143 | | - # just order the data stream after the work stream here, e.g. |
144 | | - # |
145 | | - # data_stream.wait(work_stream) |
146 | | - # |
147 | | - # without an expensive synchronization (with respect to the host). |
148 | | - work_stream.sync() |
149 | | - else: |
150 | | - cpu_func(cpu_prog.cast("int*", view.ptr), size) |
151 | | - |
152 | | - |
153 | | -# This takes the GPU path |
154 | | -if cp: |
155 | | - s = dev.create_stream() |
156 | | - try: |
157 | | - # Create input array on GPU |
158 | | - arr_gpu = cp.ones(1024, dtype=cp.int32) |
159 | | - print(f"before: {arr_gpu[:10]=}") |
160 | | - |
161 | | - # Run the workload |
162 | | - my_func(arr_gpu, s) |
163 | | - |
164 | | - # Check the result |
165 | | - print(f"after: {arr_gpu[:10]=}") |
166 | | - assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32)) |
167 | | - finally: |
168 | | - s.close() |
169 | | - |
170 | | -# This takes the CPU path |
171 | | -if FFI: |
172 | | - try: |
173 | 115 | # Create input array on CPU |
174 | 116 | arr_cpu = np.zeros(1024, dtype=np.int32) |
175 | 117 | print(f"before: {arr_cpu[:10]=}") |
176 | 118 |
|
177 | 119 | # Run the workload |
178 | | - my_func(arr_cpu, None) |
| 120 | + my_func(arr_cpu) |
179 | 121 |
|
180 | 122 | # Check the result |
181 | 123 | print(f"after: {arr_cpu[:10]=}") |
182 | 124 | assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32)) |
183 | 125 | finally: |
| 126 | + sys.path = saved_sys_path |
        # to allow the FFI module to unload, we delete references
        # to cpu_func
186 | 129 | del cpu_func, my_func |
187 | 130 | # clean up temp directory |
188 | 131 | shutil.rmtree(temp_dir) |
| 132 | + |
| 133 | + |
| 134 | +if __name__ == "__main__": |
| 135 | + run() |
0 commit comments