|
25 | 25 | launch, |
26 | 26 | ) |
27 | 27 |
|
def main():
    """Demonstrate cuda.core memory management with device and pinned memory.

    Allocates a device buffer and a legacy pinned (page-locked) host buffer,
    launches a kernel that updates both, verifies the results, copies data
    between the two memory types, and finally releases all resources.

    Exits early (status 0) when the installed NumPy is older than 2.1.0,
    which this example requires (for ``np.from_dlpack`` on pinned buffers).
    """
    # Use NumpyVersion for a semantic version comparison: a plain string
    # comparison would mis-order versions (e.g. "10.0.0" < "2.1.0" is True
    # lexicographically).
    if np.lib.NumpyVersion(np.__version__) < "2.1.0":
        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
        sys.exit(0)

    # Kernel for memory operations: touches both a device pointer and a
    # pinned host pointer (the latter accessed zero-copy from the GPU).
    code = """
    extern "C"
    __global__ void memory_ops(float* device_data, float* pinned_data, size_t N) {
        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < N) {
            // Access device memory
            device_data[tid] = device_data[tid] + 1.0f;

            // Access pinned memory (zero-copy from GPU)
            pinned_data[tid] = pinned_data[tid] * 3.0f;
        }
    }
    """

    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    # Tell CuPy to use our stream as the current stream so CuPy-issued work
    # (initialization below) is ordered with our kernel and copies.
    cp.cuda.ExternalStream(int(stream.handle)).use()

    # Compile the kernel for this device's architecture.
    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
    prog = Program(code, code_type="c++", options=program_options)
    mod = prog.compile("cubin")
    kernel = mod.get_kernel("memory_ops")

    # Create the two memory resources being compared.
    device_mr = dev.memory_resource
    pinned_mr = LegacyPinnedMemoryResource()

    # Allocation sizing: `size` float32 elements.
    size = 1024
    dtype = cp.float32
    element_size = dtype().itemsize
    total_size = size * element_size

    # 1. Device memory (GPU-only), viewed as a CuPy array.
    device_buffer = device_mr.allocate(total_size, stream=stream)
    device_array = cp.from_dlpack(device_buffer).view(dtype=dtype)

    # 2. Pinned memory (CPU memory, GPU accessible), viewed as a NumPy array.
    pinned_buffer = pinned_mr.allocate(total_size, stream=stream)
    pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype)

    # Initialize both buffers with random data (pinned data is generated on
    # the GPU, then transferred to host with .get()).
    rng = cp.random.default_rng()
    device_array[:] = rng.random(size, dtype=dtype)
    pinned_array[:] = rng.random(size, dtype=dtype).get()

    # Store original values so the kernel's effect can be verified.
    device_original = device_array.copy()
    pinned_original = pinned_array.copy()

    # Ensure initialization is complete before the kernel reads the buffers.
    stream.sync()

    # Launch with enough blocks to cover all `size` elements.
    block = 256
    grid = (size + block - 1) // block
    config = LaunchConfig(grid=grid, block=block)

    launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size))
    stream.sync()

    # Verify kernel operations: +1 on device data, *3 on pinned data.
    assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed"
    assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"

    # Copy data between different memory types.
    print("\nCopying data between memory types...")

    # Copy from device to pinned memory.
    device_buffer.copy_to(pinned_buffer, stream=stream)
    stream.sync()

    # Verify the device -> pinned copy.
    assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed"

    # Create a new device buffer and copy from pinned memory back to it.
    new_device_buffer = device_mr.allocate(total_size, stream=stream)
    new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype)

    pinned_buffer.copy_to(new_device_buffer, stream=stream)
    stream.sync()

    # Verify the pinned -> device copy.
    assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed"

    # Clean up: release buffers on the stream, then the stream itself.
    device_buffer.close(stream)
    pinned_buffer.close(stream)
    new_device_buffer.close(stream)
    stream.close()
    cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream

    # Verify buffers are properly closed (handle drops to 0 after close).
    assert device_buffer.handle == 0, "Device buffer should be closed"
    assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
    assert new_device_buffer.handle == 0, "New device buffer should be closed"

    print("Memory management example completed!")


if __name__ == "__main__":
    main()
0 commit comments