Skip to content

Commit 327012c

Browse files
[doc-only] cuda.core v0.6.0 release notes (#1651)
* [doc-only] cuda.core v0.6.0 release notes Merge 0.6.x-notes.rst into 0.6.0-notes.rst and add comprehensive release notes covering all changes since v0.5.1. Co-authored-by: Cursor <cursoragent@cursor.com> * [doc-only] Add missing docstrings for new APIs - Add ProgramOptions.extra_sources and use_libdevice to class docstring - Add docstrings to StridedMemoryView.from_cuda_array_interface, from_array_interface, and from_any_interface - Add LEGACY_DEFAULT_STREAM and PER_THREAD_DEFAULT_STREAM to api.rst Co-authored-by: Cursor <cursoragent@cursor.com> * [doc-only] Address review feedback on v0.6.0 release notes Co-authored-by: Cursor <cursoragent@cursor.com> --------- Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 6f90edb commit 327012c

File tree

5 files changed

+125
-55
lines changed

5 files changed

+125
-55
lines changed

cuda_core/cuda/core/_memoryview.pyx

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,24 +152,63 @@ cdef class StridedMemoryView:
152152

153153
@classmethod
154154
def from_dlpack(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
155+
"""Create a view from an object supporting the `DLPack <https://dmlc.github.io/dlpack/latest/>`_ protocol.
156+
157+
Parameters
158+
----------
159+
obj : object
160+
An object implementing the `DLPack <https://dmlc.github.io/dlpack/latest/>`_ protocol
161+
(via ``__dlpack__``).
162+
stream_ptr : int, optional
163+
Stream pointer for synchronization. If ``None``, no synchronization is performed.
164+
"""
155165
cdef StridedMemoryView buf = StridedMemoryView.__new__(cls)
156166
view_as_dlpack(obj, stream_ptr, buf)
157167
return buf
158168

159169
@classmethod
160170
def from_cuda_array_interface(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
171+
"""Create a view from an object supporting the `__cuda_array_interface__ <https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html>`_ protocol.
172+
173+
Parameters
174+
----------
175+
obj : object
176+
An object implementing the `__cuda_array_interface__ <https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html>`_ protocol.
177+
stream_ptr : int, optional
178+
Stream pointer for synchronization. If ``None``, no synchronization is performed.
179+
"""
161180
cdef StridedMemoryView buf = StridedMemoryView.__new__(cls)
162181
view_as_cai(obj, stream_ptr, buf)
163182
return buf
164183

165184
@classmethod
166185
def from_array_interface(cls, obj: object) -> StridedMemoryView:
186+
"""Create a view from an object supporting the `__array_interface__ <https://numpy.org/doc/stable/reference/arrays.interface.html>`_ protocol.
187+
188+
Parameters
189+
----------
190+
obj : object
191+
An object implementing the `__array_interface__ <https://numpy.org/doc/stable/reference/arrays.interface.html>`_ protocol (e.g., a NumPy array).
192+
"""
167193
cdef StridedMemoryView buf = StridedMemoryView.__new__(cls)
168194
view_as_array_interface(obj, buf)
169195
return buf
170196

171197
@classmethod
172198
def from_any_interface(cls, obj: object, stream_ptr: int | None = None) -> StridedMemoryView:
199+
"""Create a view by automatically selecting the best available protocol.
200+
201+
Tries `DLPack <https://dmlc.github.io/dlpack/latest/>`_ first, then falls back to
202+
`__cuda_array_interface__ <https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html>`_.
203+
204+
Parameters
205+
----------
206+
obj : object
207+
An object implementing `DLPack <https://dmlc.github.io/dlpack/latest/>`_ or
208+
`__cuda_array_interface__ <https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html>`_.
209+
stream_ptr : int, optional
210+
Stream pointer for synchronization. If ``None``, no synchronization is performed.
211+
"""
173212
if check_has_dlpack(obj):
174213
return cls.from_dlpack(obj, stream_ptr)
175214
return cls.from_cuda_array_interface(obj, stream_ptr)

cuda_core/cuda/core/_program.pyx

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,16 @@ class ProgramOptions:
304304
instantiate_templates_in_pch : bool, optional
305305
Control template instantiation in PCH (NVRTC only, CUDA 12.8+).
306306
Default: False
307+
extra_sources : list of 2-tuples or tuple of 2-tuples, optional
308+
Additional NVVM IR modules to compile together with the main program, specified as
309+
``((name1, source1), (name2, source2), ...)``. Each name is a string identifier used
310+
in diagnostic messages. Each source can be a string (textual LLVM IR) or bytes/bytearray
311+
(LLVM bitcode). Only supported for the NVVM backend.
312+
Default: None
313+
use_libdevice : bool, optional
314+
Load NVIDIA's `libdevice <https://docs.nvidia.com/cuda/libdevice-users-guide/>`_
315+
math builtins library. Only supported for the NVVM backend.
316+
Default: False
307317
"""
308318

309319
name: str | None = "default_program"

cuda_core/docs/source/api.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,18 @@ CUDA runtime
4949
LaunchConfig
5050
VirtualMemoryResourceOptions
5151

52+
.. data:: LEGACY_DEFAULT_STREAM
53+
54+
The legacy default CUDA stream. It implicitly synchronizes with all other
55+
blocking streams in the same CUDA context, so work launched on it does not
56+
run concurrently with work on those streams.
57+
58+
.. data:: PER_THREAD_DEFAULT_STREAM
59+
60+
The per-thread default CUDA stream. Each host thread has its own per-thread
61+
default stream, and work launched on it can execute concurrently with work
62+
on other streams, except the legacy default stream, with which it synchronizes.
63+
5264

5365
CUDA compilation toolchain
5466
--------------------------
Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,84 @@
1-
.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
.. SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
.. SPDX-License-Identifier: Apache-2.0
33
44
.. currentmodule:: cuda.core
55

66
``cuda.core`` 0.6.0 Release Notes
77
==================================
88

9+
10+
Highlights
11+
----------
12+
13+
- Added the ``cuda.core.system`` module for NVML-based system and device queries.
14+
- Several :class:`~utils.StridedMemoryView` improvements, including bfloat16 DLPack support
15+
and NumPy array interoperability.
16+
- Improved support for Python object protocols across core API classes.
17+
- Performance improvements through Cythonization and reduced Python overhead.
18+
19+
20+
Breaking Changes
21+
----------------
22+
23+
- Building ``cuda.core`` from source now requires ``cuda-bindings`` >= 12.9.0, due to Cython-level
24+
dependencies on the NVVM bindings (``cynvvm``). Pre-built wheels are unaffected. The previous
25+
minimum was 12.8.0.
26+
27+
928
New features
1029
------------
1130

12-
- Added public access to default CUDA streams via module-level constants ``LEGACY_DEFAULT_STREAM`` and ``PER_THREAD_DEFAULT_STREAM``
31+
- Added the ``cuda.core.system`` module for NVML-based system and device queries, including
32+
device attributes, clocks, temperatures, fans, events, and PCI information.
1333

14-
Users can now access default streams directly from the ``cuda.core`` namespace:
34+
- :class:`~utils.StridedMemoryView` improvements:
1535

16-
.. code-block:: python
36+
- Added ``from_array_interface`` constructor for creating views from NumPy arrays.
37+
- Improved structured dtype array support.
38+
- Added bfloat16 DLPack support when the optional ``ml_dtypes`` package is installed.
1739

18-
from cuda.core import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM
40+
- Added public access to default CUDA streams via module-level constants
41+
``LEGACY_DEFAULT_STREAM`` and ``PER_THREAD_DEFAULT_STREAM``, replacing the previous
42+
workaround of using ``Stream.from_handle(0)``.
1943

20-
# Use legacy default stream (synchronizes with all blocking streams)
21-
LEGACY_DEFAULT_STREAM.sync()
44+
- Added :meth:`Kernel.from_handle` for wrapping an existing ``CUfunction`` handle into a
45+
:class:`Kernel` object, enabling interoperability with foreign CUDA handles.
2246

23-
# Use per-thread default stream (non-blocking, thread-local)
24-
PER_THREAD_DEFAULT_STREAM.sync()
47+
- Added ``__eq__``, ``__hash__``, ``__weakref__``, and ``__repr__`` support for core API classes
48+
including :class:`Buffer`, :class:`LaunchConfig`, :class:`Kernel`, :class:`ObjectCode`,
49+
:class:`Stream`, and :class:`Event`.
2550

26-
The legacy default stream synchronizes with all blocking streams in the same CUDA context, ensuring strict ordering but potentially limiting concurrency. The per-thread default stream is local to the calling thread and does not synchronize with other streams, enabling concurrent execution in multi-threaded applications.
51+
- Added NVVM ``extra_sources`` and ``use_libdevice`` options to :class:`ProgramOptions` for
52+
multi-module NVVM compilation and automatic libdevice loading.
53+
54+
- Added a CUDA version compatibility check at import time to detect mismatches between
55+
``cuda.core`` and the installed ``cuda-bindings`` version.
2756

28-
This replaces the previous undocumented workaround of using ``Stream.from_handle(0)`` to access the legacy default stream.
2957

3058
Fixes and enhancements
31-
-----------------------
59+
----------------------
60+
61+
- Eliminated spurious CUDA driver errors during interpreter shutdown by ensuring
62+
resources are destroyed in the correct order.
63+
64+
- Fixed a bug preventing weak references to core API objects.
65+
66+
- Fixed zero-sized allocations in legacy memory resources, which previously failed on
67+
certain platforms.
68+
69+
- Improved performance by Cythonizing :class:`Program` and :class:`ObjectCode` internals.
70+
71+
- Reduced :class:`~utils.StridedMemoryView` construction overhead.
72+
73+
- ``__hash__`` and ``__eq__`` on core API classes no longer require a CUDA context.
74+
75+
- Device attribute queries now gracefully handle unsupported attributes on older CUDA
76+
drivers, returning sensible defaults instead of raising errors.
77+
78+
- Added a warning when :class:`ManagedMemoryResource` is created on platforms without
79+
concurrent managed access support.
80+
81+
- Reduced wheel and installed package sizes by excluding Cython source files and build
82+
artifacts from distribution packages.
3283

33-
None.
84+
- Slightly improved typing support.

cuda_core/docs/source/release/0.6.x-notes.rst

Lines changed: 0 additions & 42 deletions
This file was deleted.

0 commit comments

Comments
 (0)