-
Notifications
You must be signed in to change notification settings - Fork 266
Expand file tree
/
Copy pathmemory_ops.py
More file actions
139 lines (109 loc) · 4.37 KB
/
memory_ops.py
File metadata and controls
139 lines (109 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This demo illustrates:
#
# 1. How to use different memory resources to allocate and manage memory
# 2. How to copy data between different memory types
# 3. How to use DLPack to interoperate with other libraries
#
# ################################################################################
import sys
import cupy as cp
import numpy as np
from cuda.core import (
Device,
LaunchConfig,
LegacyPinnedMemoryResource,
Program,
ProgramOptions,
launch,
)
def main():
    """Demonstrate device vs. pinned host memory with cuda.core.

    Walks through: allocating from a device memory resource and a legacy
    pinned (page-locked host) memory resource, viewing both buffers as
    arrays via DLPack (CuPy for device, NumPy for pinned), mutating both
    from a single CUDA kernel, copying between the two memory types, and
    closing everything deterministically.

    Exits with status 0 (skip, not failure) when NumPy is older than
    2.1.0, since ``np.from_dlpack`` on a pinned buffer needs that version.
    """
    # String "2.1.0" must be compared as a *version*, not lexicographically:
    # a plain str comparison misorders e.g. "2.10.0" vs "2.2.0".
    if np.lib.NumpyVersion(np.__version__) < np.lib.NumpyVersion("2.1.0"):
        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
        sys.exit(0)

    # Kernel for memory operations: bumps device memory by 1 and triples
    # the pinned memory in place (the GPU reads/writes pinned memory
    # directly, zero-copy).
    code = """
    extern "C"
    __global__ void memory_ops(float* device_data, float* pinned_data, size_t N) {
        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
        if (tid < N) {
            // Access device memory
            device_data[tid] = device_data[tid] + 1.0f;
            // Access pinned memory (zero-copy from GPU)
            pinned_data[tid] = pinned_data[tid] * 3.0f;
        }
    }
    """

    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    # Tell CuPy to use our stream as the current stream, so CuPy-issued
    # work (rng fills below) is ordered with our launches/copies.
    cp.cuda.ExternalStream(int(stream.handle)).use()

    # Compile the kernel for this device's architecture.
    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
    prog = Program(code, code_type="c++", options=program_options)
    mod = prog.compile("cubin")
    kernel = mod.get_kernel("memory_ops")

    # Create the two memory resources under comparison.
    device_mr = dev.memory_resource
    pinned_mr = LegacyPinnedMemoryResource()

    # Common allocation size: 1024 float32 elements.
    size = 1024
    dtype = cp.float32
    element_size = dtype().itemsize
    total_size = size * element_size

    # 1. Device memory (GPU-only) — viewed through CuPy via DLPack.
    device_buffer = device_mr.allocate(total_size, stream=stream)
    device_array = cp.from_dlpack(device_buffer).view(dtype=dtype)

    # 2. Pinned memory (CPU memory, GPU accessible) — viewed through NumPy.
    pinned_buffer = pinned_mr.allocate(total_size, stream=stream)
    pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype)

    # Initialize both buffers with random data (.get() brings the CuPy
    # result to host for the pinned/NumPy view).
    rng = cp.random.default_rng()
    device_array[:] = rng.random(size, dtype=dtype)
    pinned_array[:] = rng.random(size, dtype=dtype).get()

    # Snapshot original values so the kernel's effect can be verified.
    device_original = device_array.copy()
    pinned_original = pinned_array.copy()

    # Make sure initialization has completed before launching.
    stream.sync()

    # Launch the kernel over all `size` elements.
    block = 256
    grid = (size + block - 1) // block
    config = LaunchConfig(grid=grid, block=block)
    launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size))
    stream.sync()

    # Verify kernel operations on both memory types.
    assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed"
    assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"

    # Copy data between different memory types
    print("\nCopying data between memory types...")

    # Device -> pinned.
    device_buffer.copy_to(pinned_buffer, stream=stream)
    stream.sync()
    assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed"

    # Pinned -> a freshly allocated device buffer.
    new_device_buffer = device_mr.allocate(total_size, stream=stream)
    new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype)
    pinned_buffer.copy_to(new_device_buffer, stream=stream)
    stream.sync()
    assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed"

    # Clean up: release buffers on the stream, then the stream itself.
    device_buffer.close(stream)
    pinned_buffer.close(stream)
    new_device_buffer.close(stream)
    stream.close()
    cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream

    # A closed buffer reports a null (0) handle.
    assert device_buffer.handle == 0, "Device buffer should be closed"
    assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
    assert new_device_buffer.handle == 0, "New device buffer should be closed"

    print("Memory management example completed!")
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()