-
Notifications
You must be signed in to change notification settings - Fork 266
Expand file tree
/
Copy pathmemory_ops.py
More file actions
139 lines (109 loc) · 4.37 KB
/
memory_ops.py
File metadata and controls
139 lines (109 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This demo illustrates:
#
# 1. How to use different memory resources to allocate and manage memory
# 2. How to copy data between different memory types
# 3. How to use DLPack to interoperate with other libraries
#
# ################################################################################
import sys
import cupy as cp
import numpy as np
from cuda.core import (
Device,
LaunchConfig,
LegacyPinnedMemoryResource,
Program,
ProgramOptions,
launch,
)
def main():
    """Demonstrate device vs. pinned host memory with cuda.core.

    Walks through: allocating from a device memory resource and a legacy
    pinned (page-locked host) memory resource, viewing both buffers as
    arrays via DLPack (CuPy for device, NumPy for pinned), mutating both
    from a single CUDA kernel, copying between the two memory types, and
    closing everything deterministically.

    Exits with status 0 (skip, not failure) when NumPy is older than
    2.1.0, since ``np.from_dlpack`` on a pinned buffer needs that version.
    """
    # String "2.1.0" must be compared as a *version*, not lexicographically:
    # a plain str comparison misorders e.g. "2.10.0" vs "2.2.0".
    if np.lib.NumpyVersion(np.__version__) < np.lib.NumpyVersion("2.1.0"):
        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
        sys.exit(0)

    # Kernel for memory operations: bumps device memory by 1 and triples
    # the pinned memory in place (the GPU reads/writes pinned memory
    # directly, zero-copy).
    code = """
    extern "C"
    __global__ void memory_ops(float* device_data, float* pinned_data, size_t N) {
        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < N) {
            // Access device memory
            device_data[tid] = device_data[tid] + 1.0f;
            // Access pinned memory (zero-copy from GPU)
            pinned_data[tid] = pinned_data[tid] * 3.0f;
        }
    }
    """

    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    # Tell CuPy to use our stream as the current stream, so CuPy-issued
    # work (rng fills below) is ordered with our launches/copies.
    cp.cuda.ExternalStream(int(stream.handle)).use()

    # Compile the kernel for this device's architecture.
    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
    prog = Program(code, code_type="c++", options=program_options)
    mod = prog.compile("cubin")
    kernel = mod.get_kernel("memory_ops")

    # Create the two memory resources under comparison.
    device_mr = dev.memory_resource
    pinned_mr = LegacyPinnedMemoryResource()

    # Common allocation size: 1024 float32 elements.
    size = 1024
    dtype = cp.float32
    element_size = dtype().itemsize
    total_size = size * element_size

    # 1. Device memory (GPU-only) — viewed through CuPy via DLPack.
    device_buffer = device_mr.allocate(total_size, stream=stream)
    device_array = cp.from_dlpack(device_buffer).view(dtype=dtype)

    # 2. Pinned memory (CPU memory, GPU accessible) — viewed through NumPy.
    pinned_buffer = pinned_mr.allocate(total_size, stream=stream)
    pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype)

    # Initialize both buffers with random data (.get() brings the CuPy
    # result to host for the pinned/NumPy view).
    rng = cp.random.default_rng()
    device_array[:] = rng.random(size, dtype=dtype)
    pinned_array[:] = rng.random(size, dtype=dtype).get()

    # Snapshot original values so the kernel's effect can be verified.
    device_original = device_array.copy()
    pinned_original = pinned_array.copy()

    # Make sure initialization has completed before launching.
    stream.sync()

    # Launch the kernel over all `size` elements.
    block = 256
    grid = (size + block - 1) // block
    config = LaunchConfig(grid=grid, block=block)
    launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size))
    stream.sync()

    # Verify kernel operations on both memory types.
    assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed"
    assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"

    # Copy data between different memory types
    print("\nCopying data between memory types...")

    # Device -> pinned.
    device_buffer.copy_to(pinned_buffer, stream=stream)
    stream.sync()
    assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed"

    # Pinned -> a freshly allocated device buffer.
    new_device_buffer = device_mr.allocate(total_size, stream=stream)
    new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype)
    pinned_buffer.copy_to(new_device_buffer, stream=stream)
    stream.sync()
    assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed"

    # Clean up: release buffers on the stream, then the stream itself.
    device_buffer.close(stream)
    pinned_buffer.close(stream)
    new_device_buffer.close(stream)
    stream.close()
    cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream

    # A closed buffer reports a null (0) handle.
    assert device_buffer.handle == 0, "Device buffer should be closed"
    assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
    assert new_device_buffer.handle == 0, "New device buffer should be closed"

    print("Memory management example completed!")
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()