[doc-only] cuda.core v0.6.0 release notes (#1651)

Andy-Jost · cursoragent · web-flow · commit 327012cab863 · 2026-02-18T16:27:15.000-08:00
* [doc-only] cuda.core v0.6.0 release notes

Merge 0.6.x-notes.rst into 0.6.0-notes.rst and add comprehensive
release notes covering all changes since v0.5.1.

Co-authored-by: Cursor <cursoragent@cursor.com>

* [doc-only] Add missing docstrings for new APIs

- Add ProgramOptions.extra_sources and use_libdevice to class docstring
- Add docstrings to StridedMemoryView.from_cuda_array_interface,
  from_array_interface, and from_any_interface
- Add LEGACY_DEFAULT_STREAM and PER_THREAD_DEFAULT_STREAM to api.rst

Co-authored-by: Cursor <cursoragent@cursor.com>

* [doc-only] Address review feedback on v0.6.0 release notes

Co-authored-by: Cursor <cursoragent@cursor.com>

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx
@@ -152,24 +152,63 @@ cdef class StridedMemoryView:
 
     @classmethod
     def from_dlpack(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
+        """Create a view from an object supporting the `DLPack <https://dmlc.github.io/dlpack/latest/>`_ protocol.
+
+        Parameters
+        ----------
+        obj : object
+            An object implementing the `DLPack <https://dmlc.github.io/dlpack/latest/>`_ protocol
+            (via ``__dlpack__``).
+        stream_ptr : int, optional
+            Stream pointer for synchronization. If ``None``, no synchronization is performed.
+        """
         cdef StridedMemoryView buf = StridedMemoryView.__new__(cls)
         view_as_dlpack(obj, stream_ptr, buf)
         return buf
 
     @classmethod
     def from_cuda_array_interface(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView:
+        """Create a view from an object supporting the `__cuda_array_interface__ <https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html>`_ protocol.
+
+        Parameters
+        ----------
+        obj : object
+            An object implementing the `__cuda_array_interface__ <https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html>`_ protocol.
+        stream_ptr : int, optional
+            Stream pointer for synchronization. If ``None``, no synchronization is performed.
+        """
         cdef StridedMemoryView buf = StridedMemoryView.__new__(cls)
         view_as_cai(obj, stream_ptr, buf)
         return buf
 
     @classmethod
     def from_array_interface(cls, obj: object) -> StridedMemoryView:
+        """Create a view from an object supporting the `__array_interface__ <https://numpy.org/doc/stable/reference/arrays.interface.html>`_ protocol.
+
+        Parameters
+        ----------
+        obj : object
+            An object implementing the `__array_interface__ <https://numpy.org/doc/stable/reference/arrays.interface.html>`_ protocol (e.g., a numpy array).
+        """
         cdef StridedMemoryView buf = StridedMemoryView.__new__(cls)
         view_as_array_interface(obj, buf)
         return buf
 
     @classmethod
     def from_any_interface(cls, obj: object, stream_ptr: int | None = None) -> StridedMemoryView:
+        """Create a view by automatically selecting the best available protocol.
+
+        Tries `DLPack <https://dmlc.github.io/dlpack/latest/>`_ first, then falls back to
+        `__cuda_array_interface__ <https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html>`_.
+
+        Parameters
+        ----------
+        obj : object
+            An object implementing `DLPack <https://dmlc.github.io/dlpack/latest/>`_ or
+            `__cuda_array_interface__ <https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html>`_.
+        stream_ptr : int, optional
+            Stream pointer for synchronization. If ``None``, no synchronization is performed.
+        """
         if check_has_dlpack(obj):
             return cls.from_dlpack(obj, stream_ptr)
         return cls.from_cuda_array_interface(obj, stream_ptr)
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
@@ -304,6 +304,16 @@ class ProgramOptions:
     instantiate_templates_in_pch : bool, optional
         Control template instantiation in PCH (NVRTC only, CUDA 12.8+).
         Default: False
+    extra_sources : list of 2-tuples or tuple of 2-tuples, optional
+        Additional NVVM IR modules to compile together with the main program, specified as
+        ``((name1, source1), (name2, source2), ...)``. Each name is a string identifier used
+        in diagnostic messages. Each source can be a string (textual LLVM IR) or bytes/bytearray
+        (LLVM bitcode). Only supported for the NVVM backend.
+        Default: None
+    use_libdevice : bool, optional
+        Load NVIDIA's `libdevice <https://docs.nvidia.com/cuda/libdevice-users-guide/>`_
+        math builtins library. Only supported for the NVVM backend.
+        Default: False
     """
 
     name: str | None = "default_program"
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
@@ -49,6 +49,18 @@ CUDA runtime
    LaunchConfig
    VirtualMemoryResourceOptions
 
+.. data:: LEGACY_DEFAULT_STREAM
+
+   The legacy default CUDA stream. All devices share the same legacy default
+   stream, and work launched on it is not concurrent with work on any other
+   blocking stream in the same CUDA context.
+
+.. data:: PER_THREAD_DEFAULT_STREAM
+
+   The per-thread default CUDA stream. Each host thread has its own per-thread
+   default stream, and work launched on it can execute concurrently with work
+   on other streams, except the legacy default stream.
+
 
 CUDA compilation toolchain
 --------------------------
diff --git a/cuda_core/docs/source/release/0.6.0-notes.rst b/cuda_core/docs/source/release/0.6.0-notes.rst
@@ -1,33 +1,84 @@
-.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 .. SPDX-License-Identifier: Apache-2.0
 
 .. currentmodule:: cuda.core
 
 ``cuda.core`` 0.6.0 Release Notes
 ==================================
 
+
+Highlights
+----------
+
+- Added the ``cuda.core.system`` module for NVML-based system and device queries.
+- Several :class:`~utils.StridedMemoryView` improvements, including bfloat16 DLPack support
+  and NumPy array interoperability.
+- Improved support for Python object protocols across core API classes.
+- Performance improvements through Cythonization and reduced Python overhead.
+
+
+Breaking changes
+----------------
+
+- Building ``cuda.core`` from source now requires ``cuda-bindings`` >= 12.9.0, due to Cython-level
+  dependencies on the NVVM bindings (``cynvvm``). Pre-built wheels are unaffected. The previous
+  minimum was 12.8.0.
+
+
 New features
 ------------
 
-- Added public access to default CUDA streams via module-level constants ``LEGACY_DEFAULT_STREAM`` and ``PER_THREAD_DEFAULT_STREAM``
+- Added the ``cuda.core.system`` module for NVML-based system and device queries, including
+  device attributes, clocks, temperatures, fans, events, and PCI information.
 
-  Users can now access default streams directly from the ``cuda.core`` namespace:
+- :class:`~utils.StridedMemoryView` improvements:
 
-  .. code-block:: python
+  - Added ``from_array_interface`` constructor for creating views from NumPy arrays.
+  - Improved structured dtype array support.
+  - Added bfloat16 DLPack support when the optional ``ml_dtypes`` package is installed.
 
-      from cuda.core import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM
+- Added public access to default CUDA streams via module-level constants
+  ``LEGACY_DEFAULT_STREAM`` and ``PER_THREAD_DEFAULT_STREAM``, replacing the previous
+  workaround of using ``Stream.from_handle(0)``.
 
-      # Use legacy default stream (synchronizes with all blocking streams)
-      LEGACY_DEFAULT_STREAM.sync()
+- Added :meth:`Kernel.from_handle` for wrapping an existing ``CUfunction`` handle into a
+  :class:`Kernel` object, enabling interoperability with foreign CUDA handles.
 
-      # Use per-thread default stream (non-blocking, thread-local)
-      PER_THREAD_DEFAULT_STREAM.sync()
+- Added ``__eq__``, ``__hash__``, ``__weakref__``, and ``__repr__`` support for core API classes
+  including :class:`Buffer`, :class:`LaunchConfig`, :class:`Kernel`, :class:`ObjectCode`,
+  :class:`Stream`, and :class:`Event`.
 
-  The legacy default stream synchronizes with all blocking streams in the same CUDA context, ensuring strict ordering but potentially limiting concurrency. The per-thread default stream is local to the calling thread and does not synchronize with other streams, enabling concurrent execution in multi-threaded applications.
+- Added NVVM ``extra_sources`` and ``use_libdevice`` options to :class:`ProgramOptions` for
+  multi-module NVVM compilation and automatic libdevice loading.
+
+- Added CUDA version compatibility check at import time to detect mismatches between
+  ``cuda.core`` and the installed ``cuda-bindings`` version.
 
-  This replaces the previous undocumented workaround of using ``Stream.from_handle(0)`` to access the legacy default stream.
 
 Fixes and enhancements
------------------------
+----------------------
+
+- Eliminated spurious CUDA driver errors during interpreter shutdown by ensuring
+  resources are destroyed in the correct order.
+
+- Fixed a bug preventing weak references to core API objects.
+
+- Fixed zero-sized allocations in legacy memory resources, which previously failed on
+  certain platforms.
+
+- Improved performance by Cythonizing :class:`Program` and :class:`ObjectCode` internals.
+
+- Reduced :class:`~utils.StridedMemoryView` construction overhead.
+
+- ``__hash__`` and ``__eq__`` on core API classes no longer require a CUDA context.
+
+- Device attribute queries now gracefully handle unsupported attributes on older CUDA
+  drivers, returning sensible defaults instead of raising errors.
+
+- Added a warning when :class:`ManagedMemoryResource` is created on platforms without
+  concurrent managed access support.
+
+- Reduced wheel and installed package sizes by excluding Cython source files and build
+  artifacts from distribution packages.
 
-None.
+- Slightly improved typing support.
diff --git a/cuda_core/docs/source/release/0.6.x-notes.rst b/cuda_core/docs/source/release/0.6.x-notes.rst