Commit 974ed1d
Add recent cuda.core examples and docs pixi workflow (#1865)
Cover newer graph, memory resource, and StridedMemoryView APIs with runnable examples, and make the Sphinx docs environment reproducible so examples and docs are easier to iterate locally. Made-with: Cursor
1 parent a56ff12 commit 974ed1d

File tree

9 files changed: +3943 additions, -109 deletions

cuda_core/docs/README.md

Lines changed: 7 additions & 0 deletions

@@ -5,6 +5,13 @@
 3. Build the docs with `./build_docs.sh`.
 4. The html artifacts should be available under both `./build/html/latest` and `./build/html/<version>`.
 
+For local development, `cuda_core/pixi.toml` now includes a dedicated `docs`
+environment that mirrors the CI Sphinx dependencies:
+
+- From `cuda_core/`, run `pixi run docs-build` to build the full versioned docs output.
+- Run `pixi run docs-build-latest` to iterate on just the `latest` docs.
+- Run `pixi run docs-debug` for a serial, verbose Sphinx build that is easier to debug.
+
 Alternatively, we can build all the docs at once by running [`cuda_python/docs/build_all_docs.sh`](../../cuda_python/docs/build_all_docs.sh).
 
 To publish the docs with the built version, it is important to note that the html files of older versions

cuda_core/docs/build_docs.sh

Lines changed: 8 additions & 7 deletions

@@ -5,6 +5,9 @@
 
 set -ex
 
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+cd "${SCRIPT_DIR}"
+
 if [[ "$#" == "0" ]]; then
     LATEST_ONLY="0"
 elif [[ "$#" == "1" && "$1" == "latest-only" ]]; then
@@ -21,13 +24,11 @@ if [[ -z "${SPHINX_CUDA_CORE_VER}" ]]; then
         | awk -F'+' '{print $1}')
 fi
 
-# build the docs (in parallel)
-SPHINXOPTS="-j 4 -d build/.doctrees" make html
-
-# for debugging/developing (conf.py), please comment out the above line and
-# use the line below instead, as we must build in serial to avoid getting
-# obsecure Sphinx errors
-#SPHINXOPTS="-v" make html
+# build the docs. Allow callers to override SPHINXOPTS for serial/debug runs.
+if [[ -z "${SPHINXOPTS:-}" ]]; then
+    SPHINXOPTS="-j 4 -d build/.doctrees"
+fi
+make html
 
 # to support version dropdown menu
 cp ./versions.json build/html
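The default-if-unset pattern the patch introduces can be exercised on its own. A minimal standalone sketch (not the actual build script; the `SPHINXOPTS` default value is the one the script uses):

```shell
# Callers may pre-set SPHINXOPTS (e.g. SPHINXOPTS="-v" for a serial, verbose
# debug run); otherwise the parallel default kicks in.
unset SPHINXOPTS
if [[ -z "${SPHINXOPTS:-}" ]]; then
  SPHINXOPTS="-j 4 -d build/.doctrees"   # parallel default
fi
DEFAULT_OPTS="${SPHINXOPTS}"
echo "default: ${DEFAULT_OPTS}"

SPHINXOPTS="-v"                          # simulate a caller override
if [[ -z "${SPHINXOPTS:-}" ]]; then
  SPHINXOPTS="-j 4 -d build/.doctrees"
fi
echo "override: ${SPHINXOPTS}"
```

With this pattern, something like `SPHINXOPTS="-v" ./build_docs.sh latest-only` reproduces the serial debug build that previously required editing the script by hand.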

cuda_core/docs/source/interoperability.rst

Lines changed: 10 additions & 7 deletions

@@ -68,13 +68,16 @@ a few iterations to ensure correctness.
 
 ``cuda.core`` offers a :func:`~utils.args_viewable_as_strided_memory` decorator for
 extracting the metadata (such as pointer address, shape, strides, and dtype) from any
-Python objects supporting either CAI or DLPack and returning a :class:`~utils.StridedMemoryView` object, see the
-`strided_memory_view.py <https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/strided_memory_view.py>`_
-example. Alternatively, a :class:`~utils.StridedMemoryView` object can be explicitly
-constructed without using the decorator. This provides a *concrete implementation* to both
-protocols that is **array-library-agnostic**, so that all Python projects can just rely on this
-without either re-implementing (the consumer-side of) the protocols or tying to any particular
-array libraries.
+Python objects supporting either CAI or DLPack and returning a :class:`~utils.StridedMemoryView`
+object. See the
+`strided_memory_view_constructors.py <https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/strided_memory_view_constructors.py>`_
+example for the explicit constructors, or
+`strided_memory_view_cpu.py <https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/strided_memory_view_cpu.py>`_
+and
+`strided_memory_view_gpu.py <https://github.com/NVIDIA/cuda-python/blob/main/cuda_core/examples/strided_memory_view_gpu.py>`_
+for decorator-based workflows. This provides a *concrete implementation* to both protocols that is
+**array-library-agnostic**, so that all Python projects can just rely on this without either
+re-implementing (the consumer-side of) the protocols or tying to any particular array libraries.
 
 The :attr:`~utils.StridedMemoryView.is_device_accessible` attribute can be used to check
 whether or not the underlying buffer can be accessed on GPU.
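For readers unfamiliar with what the consumer side of these protocols actually reads, here is a small pure-NumPy sketch (no ``cuda.core`` required; ``__array_interface__`` is the host-side analogue of CAI) of the metadata that a view type like ``StridedMemoryView`` extracts: pointer address, shape, strides, and dtype.

```python
import numpy as np

# A host array advertising the (CPU) array interface; CAI is the CUDA analogue.
a = np.arange(12, dtype=np.int16).reshape(3, 4)
iface = a.__array_interface__

ptr, readonly = iface["data"]        # raw pointer address and a read-only flag
print("shape:  ", iface["shape"])    # (3, 4)
print("typestr:", iface["typestr"])  # e.g. '<i2' on little-endian platforms
print("strides:", a.strides)         # byte strides; (8, 2) for C-contiguous int16
print("pointer:", hex(ptr))
```

Note that ``iface["strides"]`` is ``None`` for C-contiguous arrays, so the sketch reads ``a.strides`` instead; a real consumer must handle both cases.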

cuda_core/examples/graph_update.py

Lines changed: 100 additions & 0 deletions (new file)

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ################################################################################
#
# This example demonstrates Graph.update() by reusing the same executable graph
# with a new capture that has the same topology but different kernel arguments.
#
# ################################################################################

# /// script
# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc", "numpy>=2.1"]
# ///

import sys

import numpy as np

from cuda.core import (
    Device,
    LaunchConfig,
    LegacyPinnedMemoryResource,
    Program,
    ProgramOptions,
    launch,
)

code = """
extern "C" __global__ void add_one(int* value) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        *value += 1;
    }
}
"""


def build_increment_graph(device, kernel, target_ptr):
    builder = device.create_graph_builder().begin_building()
    config = LaunchConfig(grid=1, block=1)
    launch(builder, config, kernel, target_ptr)
    launch(builder, config, kernel, target_ptr)
    return builder.end_building()


def main():
    if np.lib.NumpyVersion(np.__version__) < "2.1.0":
        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
        sys.exit(1)

    device = Device()
    device.set_current()
    stream = device.create_stream()
    pinned_mr = LegacyPinnedMemoryResource()
    buffer = None
    initial_capture = None
    update_capture = None
    graph = None

    try:
        options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        program = Program(code, code_type="c++", options=options)
        module = program.compile("cubin")
        kernel = module.get_kernel("add_one")

        buffer = pinned_mr.allocate(2 * np.dtype(np.int32).itemsize)
        values = np.from_dlpack(buffer).view(np.int32)
        values[:] = 0

        initial_capture = build_increment_graph(device, kernel, values[0:].ctypes.data)
        update_capture = build_increment_graph(device, kernel, values[1:].ctypes.data)
        graph = initial_capture.complete()

        graph.upload(stream)
        graph.launch(stream)
        stream.sync()
        assert tuple(values) == (2, 0)

        graph.update(update_capture)
        graph.upload(stream)
        graph.launch(stream)
        stream.sync()
        assert tuple(values) == (2, 2)

        print("Graph.update() reused the executable graph with a new target pointer.")
        print(f"Final host values: {tuple(values)}")
    finally:
        if graph is not None:
            graph.close()
        if update_capture is not None:
            update_capture.close()
        if initial_capture is not None:
            initial_capture.close()
        if buffer is not None:
            buffer.close()
        stream.close()


if __name__ == "__main__":
    main()
Lines changed: 141 additions & 0 deletions (new file)

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ################################################################################
#
# This example demonstrates the newer memory-pool APIs by combining
# PinnedMemoryResource, ManagedMemoryResource, and GraphMemoryResource in one
# workflow.
#
# ################################################################################

# /// script
# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc", "numpy>=2.1"]
# ///

import sys

import numpy as np

from cuda.core import (
    Device,
    GraphMemoryResource,
    LaunchConfig,
    ManagedMemoryResource,
    ManagedMemoryResourceOptions,
    PinnedMemoryResource,
    PinnedMemoryResourceOptions,
    Program,
    ProgramOptions,
    launch,
)

code = """
extern "C" __global__ void scale_and_bias(float* data, size_t size, float scale, float bias) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int stride = blockDim.x * gridDim.x;
    for (size_t i = tid; i < size; i += stride) {
        data[i] = data[i] * scale + bias;
    }
}
"""


def main():
    if np.lib.NumpyVersion(np.__version__) < "2.1.0":
        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
        sys.exit(1)

    device = Device()
    device.set_current()
    stream = device.create_stream()

    managed_mr = None
    pinned_mr = None
    graph_mr = None
    managed_buffer = None
    pinned_buffer = None
    graph_capture = None
    graph = None

    try:
        options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
        program = Program(code, code_type="c++", options=options)
        module = program.compile("cubin")
        kernel = module.get_kernel("scale_and_bias")

        size = 256
        dtype = np.float32
        nbytes = size * dtype().itemsize
        config = LaunchConfig(grid=(size + 127) // 128, block=128)

        managed_options = ManagedMemoryResourceOptions(
            preferred_location=device.device_id,
            preferred_location_type="device",
        )
        managed_mr = ManagedMemoryResource(options=managed_options)

        pinned_options = {"ipc_enabled": False}
        host_numa_id = getattr(device.properties, "host_numa_id", -1)
        if host_numa_id >= 0:
            pinned_options["numa_id"] = host_numa_id
        pinned_mr = PinnedMemoryResource(options=PinnedMemoryResourceOptions(**pinned_options))

        graph_mr = GraphMemoryResource(device)

        managed_buffer = managed_mr.allocate(nbytes, stream=stream)
        pinned_buffer = pinned_mr.allocate(nbytes, stream=stream)

        managed_array = np.from_dlpack(managed_buffer).view(np.float32)
        pinned_array = np.from_dlpack(pinned_buffer).view(np.float32)

        managed_array[:] = np.arange(size, dtype=dtype)
        managed_original = managed_array.copy()
        stream.sync()

        managed_buffer.copy_to(pinned_buffer, stream=stream)
        stream.sync()
        assert np.array_equal(pinned_array, managed_original)

        graph_builder = device.create_graph_builder().begin_building("relaxed")
        scratch_buffer = graph_mr.allocate(nbytes, stream=graph_builder)
        scratch_buffer.copy_from(managed_buffer, stream=graph_builder)
        launch(graph_builder, config, kernel, scratch_buffer, np.uint64(size), np.float32(2.0), np.float32(1.0))
        managed_buffer.copy_from(scratch_buffer, stream=graph_builder)
        scratch_buffer.close()

        graph_capture = graph_builder.end_building()
        graph = graph_capture.complete()
        graph.upload(stream)
        graph.launch(stream)
        stream.sync()

        np.testing.assert_allclose(managed_array, managed_original * 2 + 1)
        managed_buffer.copy_to(pinned_buffer, stream=stream)
        stream.sync()
        np.testing.assert_allclose(pinned_array, managed_original * 2 + 1)

        print(f"PinnedMemoryResource numa_id: {pinned_mr.numa_id}")
        print(f"ManagedMemoryResource preferred_location: {managed_mr.preferred_location}")
        print(f"GraphMemoryResource reserved high watermark: {graph_mr.attributes.reserved_mem_high}")
    finally:
        if graph is not None:
            graph.close()
        if graph_capture is not None:
            graph_capture.close()
        if pinned_buffer is not None:
            pinned_buffer.close(stream)
        if managed_buffer is not None:
            managed_buffer.close(stream)
        if graph_mr is not None:
            graph_mr.close()
        if pinned_mr is not None:
            pinned_mr.close()
        if managed_mr is not None:
            managed_mr.close()
        stream.close()


if __name__ == "__main__":
    main()
Lines changed: 84 additions & 0 deletions (new file)

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ################################################################################
#
# This example demonstrates the explicit StridedMemoryView constructors for
# __array_interface__, DLPack, __cuda_array_interface__, and Buffer objects.
#
# ################################################################################

# /// script
# dependencies = ["cuda_bindings", "cuda_core", "cupy-cuda13x", "numpy>=2.1"]
# ///

import sys

import cupy as cp
import numpy as np

from cuda.core import Device
from cuda.core.utils import StridedMemoryView


def dense_c_strides(shape):
    if not shape:
        return ()

    strides = [1] * len(shape)
    for index in range(len(shape) - 2, -1, -1):
        strides[index] = strides[index + 1] * shape[index + 1]
    return tuple(strides)


def main():
    if np.lib.NumpyVersion(np.__version__) < "2.1.0":
        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
        sys.exit(1)

    device = Device()
    device.set_current()
    stream = device.create_stream()
    buffer = None

    try:
        host_array = np.arange(12, dtype=np.int16).reshape(3, 4)
        host_view = StridedMemoryView.from_array_interface(host_array)
        host_dlpack_view = StridedMemoryView.from_dlpack(host_array, stream_ptr=-1)

        assert host_view.shape == host_array.shape
        assert host_view.size == host_array.size
        assert not host_view.is_device_accessible
        assert np.array_equal(np.from_dlpack(host_view), host_array)
        assert np.array_equal(np.from_dlpack(host_dlpack_view), host_array)

        gpu_array = cp.arange(12, dtype=cp.float32).reshape(3, 4)
        dlpack_view = StridedMemoryView.from_dlpack(gpu_array, stream_ptr=stream.handle)
        cai_view = StridedMemoryView.from_cuda_array_interface(gpu_array, stream_ptr=stream.handle)

        cp.testing.assert_array_equal(cp.from_dlpack(dlpack_view), gpu_array)
        cp.testing.assert_array_equal(cp.from_dlpack(cai_view), gpu_array)

        buffer = device.memory_resource.allocate(gpu_array.nbytes, stream=stream)
        buffer_array = cp.from_dlpack(buffer).view(dtype=cp.float32).reshape(gpu_array.shape)
        buffer_array[...] = gpu_array
        device.sync()

        buffer_view = StridedMemoryView.from_buffer(
            buffer,
            shape=gpu_array.shape,
            strides=dense_c_strides(gpu_array.shape),
            dtype=np.dtype(np.float32),
        )
        cp.testing.assert_array_equal(cp.from_dlpack(buffer_view), gpu_array)

        print("Constructed StridedMemoryView objects from array, DLPack, CAI, and Buffer inputs.")
    finally:
        if buffer is not None:
            buffer.close(stream)
        stream.close()


if __name__ == "__main__":
    main()
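The `dense_c_strides` helper in this file returns element strides, not byte strides. A quick standalone check of the same logic against NumPy (which reports byte strides, so we divide by the itemsize to compare):

```python
import numpy as np


def dense_c_strides(shape):
    # Same logic as the helper in the example: C-contiguous strides in elements,
    # computed right-to-left (innermost dimension has stride 1).
    if not shape:
        return ()
    strides = [1] * len(shape)
    for index in range(len(shape) - 2, -1, -1):
        strides[index] = strides[index + 1] * shape[index + 1]
    return tuple(strides)


a = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
# NumPy reports byte strides; divide by itemsize to get element strides.
assert dense_c_strides(a.shape) == tuple(s // a.itemsize for s in a.strides)
print(dense_c_strides((2, 3, 4)))  # (12, 4, 1)
```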
