Skip to content

Commit 9943b1e

Browse files
cuda.core: require explicit stream for stream-scheduling APIs (#2001)
Removes the implicit fallback to default_stream() (or NULL) on APIs that schedule work on a stream. `stream` is now a required keyword-only argument; `Stream_accept(None)` raises TypeError. Affected APIs: - MemoryResource.allocate / deallocate and overrides on DeviceMemoryResource, PinnedMemoryResource, ManagedMemoryResource, LegacyPinnedMemoryResource, GraphMemoryResource. - Device.allocate. - GraphicsResource.map. - KernelOccupancy.max_potential_cluster_size / max_active_clusters. - Graph.launch (stream was previously positional). Stream_accept is promoted to cpdef so the pure-Python legacy/sync resources can call it. Also fixes a latent bug uncovered while doing this: the C++ MR deallocation callback in Buffer's GC path was calling `mr.deallocate(ptr, size, stream)` positionally, which would fail with the new keyword-only signature for every garbage-collected DeviceMemoryResource/GraphMemoryResource buffer. Switched to `stream=stream`. VirtualMemoryResource is exempt because cuMemCreate / cuMemMap are synchronous and not stream-ordered; it now accepts (and validates) an optional stream instead of rejecting any non-None value. Buffer.from_ipc_descriptor is also exempt: stream there only seeds the deallocation stream stored in the handle (no work is scheduled), the same shape as Buffer.close(stream=None). Tests, examples, and the v1.0.0 release note are updated accordingly. Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 64e2e6a commit 9943b1e

46 files changed

Lines changed: 336 additions & 286 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

cuda_core/cuda/core/_device.pyx

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1321,14 +1321,12 @@ class Device:
13211321
cdef Context ctx = self._context
13221322
return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True)
13231323

1324-
def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
1324+
def allocate(self, size, *, stream: Stream | GraphBuilder) -> Buffer:
13251325
"""Allocate device memory from a specified stream.
13261326

13271327
Allocates device memory of `size` bytes on the specified `stream`
13281328
using the memory resource currently associated with this Device.
13291329

1330-
Parameter `stream` is optional, using a default stream by default.
1331-
13321330
Note
13331331
----
13341332
Device must be initialized.
@@ -1337,9 +1335,10 @@ class Device:
13371335
----------
13381336
size : int
13391337
Number of bytes to allocate.
1340-
stream : :obj:`~_stream.Stream`, optional
1341-
The stream establishing the stream ordering semantic.
1342-
Default value of `None` uses default stream.
1338+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
1339+
Keyword-only. The stream establishing the stream ordering semantic.
1340+
Must be passed explicitly; pass ``self.default_stream`` to use
1341+
the default stream.
13431342

13441343
Returns
13451344
-------
@@ -1348,7 +1347,7 @@ class Device:
13481347

13491348
"""
13501349
self._check_context_initialized()
1351-
return self.memory_resource.allocate(size, stream)
1350+
return self.memory_resource.allocate(size, stream=stream)
13521351

13531352
def sync(self):
13541353
"""Synchronize the device.

cuda_core/cuda/core/_graphics.pyx

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ from cuda.core._resource_handles cimport (
1212
as_intptr,
1313
)
1414
from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle
15-
from cuda.core._stream cimport Stream, Stream_accept, default_stream
15+
from cuda.core._stream cimport Stream, Stream_accept
1616
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
1717

1818
__all__ = ['GraphicsResource']
@@ -206,7 +206,7 @@ cdef class GraphicsResource:
206206
return None
207207
return self._mapped_buffer
208208

209-
def map(self, *, stream: Stream | None = None) -> Buffer:
209+
def map(self, *, stream: Stream) -> Buffer:
210210
"""Map this graphics resource for CUDA access.
211211

212212
After mapping, a CUDA device pointer into the underlying graphics
@@ -220,9 +220,10 @@ cdef class GraphicsResource:
220220

221221
Parameters
222222
----------
223-
stream : :class:`~cuda.core.Stream`, optional
224-
The CUDA stream on which to perform the mapping. If ``None``,
225-
the current default stream is used.
223+
stream : :class:`~cuda.core.Stream`
224+
Keyword-only. The CUDA stream on which to perform the mapping.
225+
Must be passed explicitly; pass ``device.default_stream`` to use
226+
the default stream.
226227

227228
Returns
228229
-------
@@ -248,7 +249,7 @@ cdef class GraphicsResource:
248249
if self._get_mapped_buffer() is not None:
249250
raise RuntimeError("GraphicsResource is already mapped")
250251

251-
s_obj = default_stream() if stream is None else Stream_accept(stream)
252+
s_obj = Stream_accept(stream)
252253
raw = as_cu(self._handle)
253254
cy_stream = as_cu(s_obj._h_stream)
254255
with nogil:

cuda_core/cuda/core/_layout.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ cdef class _StridedLayout:
460460
required_size = layout.required_size_in_bytes()
461461
# allocate the memory on the device
462462
device.set_current()
463-
mem = device.allocate(required_size)
463+
mem = device.allocate(required_size, stream=device.default_stream)
464464
# create a view on the newly allocated device memory
465465
b_view = StridedMemoryView.from_buffer(mem, layout, a_view.dtype)
466466
return b_view

cuda_core/cuda/core/_memory/_buffer.pyx

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ cdef void _mr_dealloc_callback(
5454
stream = None
5555
if h_stream:
5656
stream = Stream._from_handle(Stream, h_stream)
57-
mr.deallocate(int(ptr), size, stream)
57+
mr.deallocate(int(ptr), size, stream=stream)
5858
except Exception as exc:
5959
print(f"Warning: mr.deallocate() failed during Buffer destruction: {exc}",
6060
file=sys.stderr)
@@ -220,7 +220,7 @@ cdef class Buffer:
220220
if self._memory_resource is None:
221221
raise ValueError("a destination buffer must be provided (this "
222222
"buffer does not have a memory_resource)")
223-
dst = self._memory_resource.allocate(src_size, s)
223+
dst = self._memory_resource.allocate(src_size, stream=s)
224224

225225
cdef size_t dst_size = dst._size
226226
if dst_size != src_size:
@@ -495,17 +495,17 @@ cdef class MemoryResource:
495495
resource's respective property.)
496496
"""
497497

498-
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
498+
def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
499499
"""Allocate a buffer of the requested size.
500500

501501
Parameters
502502
----------
503503
size : int
504504
The size of the buffer to allocate, in bytes.
505-
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
506-
The stream on which to perform the allocation asynchronously.
507-
If None, it is up to each memory resource implementation to decide
508-
and document the behavior.
505+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
506+
Keyword-only. The stream on which to perform the allocation
507+
asynchronously. Must be passed explicitly; pass
508+
``device.default_stream`` to use the default stream.
509509

510510
Returns
511511
-------
@@ -515,7 +515,7 @@ cdef class MemoryResource:
515515
"""
516516
raise TypeError("MemoryResource.allocate must be implemented by subclasses.")
517517

518-
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
518+
def deallocate(self, ptr: DevicePointerT, size_t size, *, stream: Stream | GraphBuilder):
519519
"""Deallocate a buffer previously allocated by this resource.
520520
521521
Parameters
@@ -524,10 +524,10 @@ cdef class MemoryResource:
524524
The pointer or handle to the buffer to deallocate.
525525
size : int
526526
The size of the buffer to deallocate, in bytes.
527-
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
528-
The stream on which to perform the deallocation asynchronously.
529-
If None, it is up to each memory resource implementation to decide
530-
and document the behavior.
527+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
528+
Keyword-only. The stream on which to perform the deallocation
529+
asynchronously. Must be passed explicitly; pass
530+
``device.default_stream`` to use the default stream.
531531
"""
532532
raise TypeError("MemoryResource.deallocate must be implemented by subclasses.")
533533

cuda_core/cuda/core/_memory/_graph_memory_resource.pyx

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ from cuda.core._resource_handles cimport (
1414
as_cu,
1515
)
1616

17-
from cuda.core._stream cimport default_stream, Stream_accept, Stream
17+
from cuda.core._stream cimport Stream_accept, Stream
1818
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
1919

2020
from functools import cache
@@ -104,19 +104,19 @@ cdef class cyGraphMemoryResource(MemoryResource):
104104
def __cinit__(self, int device_id):
105105
self._device_id = device_id
106106

107-
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
107+
def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
108108
"""
109109
Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
110110
"""
111-
stream = Stream_accept(stream) if stream is not None else default_stream()
112-
return GMR_allocate(self, size, <Stream> stream)
111+
cdef Stream s = Stream_accept(stream)
112+
return GMR_allocate(self, size, s)
113113

114-
def deallocate(self, ptr: "DevicePointerT", size_t size, stream: Stream | GraphBuilder | None = None):
114+
def deallocate(self, ptr: "DevicePointerT", size_t size, *, stream: Stream | GraphBuilder):
115115
"""
116116
Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
117117
"""
118-
stream = Stream_accept(stream) if stream is not None else default_stream()
119-
return GMR_deallocate(ptr, size, <Stream> stream)
118+
cdef Stream s = Stream_accept(stream)
119+
return GMR_deallocate(ptr, size, s)
120120

121121
def close(self):
122122
"""No operation (provided for compatibility)."""

cuda_core/cuda/core/_memory/_ipc.pyx

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,12 @@ cdef Buffer Buffer_from_ipc_descriptor(
172172
if not mr.is_ipc_enabled:
173173
raise RuntimeError("Memory resource is not IPC-enabled")
174174
if stream is None:
175-
# Note: match this behavior to _MemPool.allocate()
175+
# Buffer.from_ipc_descriptor's stream is not used to schedule work;
176+
# it only seeds the deallocation stream stored in the handle. Like
177+
# Buffer.close(stream=None) and GraphicsResource.unmap(stream=None),
178+
# this is a legitimate exception to the "explicit stream" rule from
179+
# issue #2001: None means "fall back to the default stream when the
180+
# buffer is later released".
176181
stream = default_stream()
177182
cdef Stream s = <Stream>stream
178183
cdef DevicePtrHandle h_ptr = deviceptr_import_ipc(

cuda_core/cuda/core/_memory/_legacy.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,25 +27,25 @@ class LegacyPinnedMemoryResource(MemoryResource):
2727

2828
# TODO: support creating this MR with flags that are later passed to cuMemHostAlloc?
2929

30-
def allocate(self, size, stream=None) -> Buffer:
30+
def allocate(self, size, *, stream) -> Buffer:
3131
"""Allocate a buffer of the requested size.
3232
3333
Parameters
3434
----------
3535
size : int
3636
The size of the buffer to allocate, in bytes.
37-
stream : Stream, optional
38-
Currently ignored
37+
stream : Stream
38+
Keyword-only. Currently ignored, but must be passed explicitly;
39+
pass ``device.default_stream`` to use the default stream.
3940
4041
Returns
4142
-------
4243
Buffer
4344
The allocated buffer object, which is accessible on both host and device.
4445
"""
45-
if stream is None:
46-
from cuda.core._stream import default_stream
46+
from cuda.core._stream import Stream_accept
4747

48-
stream = default_stream()
48+
Stream_accept(stream)
4949
if size:
5050
err, ptr = driver.cuMemAllocHost(size)
5151
raise_if_driver_error(err)
@@ -96,11 +96,10 @@ def __init__(self, device_id):
9696

9797
self._device_id = Device(device_id).device_id
9898

99-
def allocate(self, size, stream=None) -> Buffer:
100-
if stream is None:
101-
from cuda.core._stream import default_stream
99+
def allocate(self, size, *, stream) -> Buffer:
100+
from cuda.core._stream import Stream_accept
102101

103-
stream = default_stream()
102+
Stream_accept(stream)
104103
if size:
105104
err, ptr = driver.cuMemAlloc(size)
106105
raise_if_driver_error(err)

cuda_core/cuda/core/_memory/_memory_pool.pyx

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ from libc.string cimport memset
1111
from cuda.bindings cimport cydriver
1212
from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource
1313
from cuda.core._memory cimport _ipc
14-
from cuda.core._stream cimport default_stream, Stream_accept, Stream
14+
from cuda.core._stream cimport Stream_accept, Stream
1515
from cuda.core._resource_handles cimport (
1616
MemoryPoolHandle,
1717
DevicePtrHandle,
@@ -122,16 +122,17 @@ cdef class _MemPool(MemoryResource):
122122
"""
123123
_MP_close(self)
124124

125-
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
125+
def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
126126
"""Allocate a buffer of the requested size.
127127

128128
Parameters
129129
----------
130130
size : int
131131
The size of the buffer to allocate, in bytes.
132-
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
133-
The stream on which to perform the allocation asynchronously.
134-
If None, an internal stream is used.
132+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
133+
Keyword-only. The stream on which to perform the allocation
134+
asynchronously. Must be passed explicitly; pass
135+
``device.default_stream`` to use the default stream.
135136

136137
Returns
137138
-------
@@ -141,10 +142,10 @@ cdef class _MemPool(MemoryResource):
141142
"""
142143
if self.is_mapped:
143144
raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource")
144-
stream = Stream_accept(stream) if stream is not None else default_stream()
145-
return _MP_allocate(self, size, <Stream> stream)
145+
cdef Stream s = Stream_accept(stream)
146+
return _MP_allocate(self, size, s)
146147

147-
def deallocate(self, ptr: "DevicePointerT", size_t size, stream: Stream | GraphBuilder | None = None):
148+
def deallocate(self, ptr: "DevicePointerT", size_t size, *, stream: Stream | GraphBuilder):
148149
"""Deallocate a buffer previously allocated by this resource.
149150
150151
Parameters
@@ -153,13 +154,13 @@ cdef class _MemPool(MemoryResource):
153154
The pointer or handle to the buffer to deallocate.
154155
size : int
155156
The size of the buffer to deallocate, in bytes.
156-
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
157-
The stream on which to perform the deallocation asynchronously.
158-
If the buffer is deallocated without an explicit stream, the allocation stream
159-
is used.
157+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
158+
Keyword-only. The stream on which to perform the deallocation
159+
asynchronously. Must be passed explicitly; pass
160+
``device.default_stream`` to use the default stream.
160161
"""
161-
stream = Stream_accept(stream) if stream is not None else default_stream()
162-
_MP_deallocate(self, <uintptr_t>ptr, size, <Stream> stream)
162+
cdef Stream s = Stream_accept(stream)
163+
_MP_deallocate(self, <uintptr_t>ptr, size, s)
163164

164165
@property
165166
def attributes(self) -> _MemPoolAttributes:

cuda_core/cuda/core/_memory/_virtual_memory_resource.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,7 @@ def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list:
470470

471471
return descs
472472

473-
def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
473+
def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer:
474474
"""
475475
Allocate a buffer of the given size using CUDA virtual memory.
476476
@@ -479,7 +479,9 @@ def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
479479
size : int
480480
The size in bytes of the buffer to allocate.
481481
stream : Stream, optional
482-
CUDA stream to associate with the allocation (not currently supported).
482+
Keyword-only. VMR uses ``cuMemCreate`` / ``cuMemMap`` which are
483+
synchronous and not stream-ordered, so a stream is not needed.
484+
If one is provided, it is validated and otherwise unused.
483485
484486
Returns
485487
-------
@@ -488,8 +490,6 @@ def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
488490
489491
Raises
490492
------
491-
NotImplementedError
492-
If a stream is provided or if the location type is not device memory.
493493
CUDAError
494494
If any CUDA driver API call fails during allocation.
495495
@@ -501,7 +501,9 @@ def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
501501
specified in the resource's configuration.
502502
"""
503503
if stream is not None:
504-
raise NotImplementedError("Stream is not supported with VirtualMemoryResource")
504+
from cuda.core._stream import Stream_accept
505+
506+
Stream_accept(stream)
505507

506508
config = self.config
507509
# ---- Build allocation properties ----
@@ -554,10 +556,18 @@ def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
554556
buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self)
555557
return buf
556558

557-
def deallocate(self, ptr: int, size: int, stream: Stream | None = None) -> None: # noqa: ARG002
559+
def deallocate(self, ptr: int, size: int, stream: Stream | None = None) -> None:
558560
"""
559561
Deallocate memory on the device using CUDA VMM APIs.
562+
563+
``stream`` is unused (VMR is synchronous) but is validated when
564+
provided; ``None`` is accepted because the C++ GC callback passes it
565+
when no allocation stream was recorded.
560566
"""
567+
if stream is not None:
568+
from cuda.core._stream import Stream_accept
569+
570+
Stream_accept(stream)
561571
result, handle = driver.cuMemRetainAllocationHandle(ptr)
562572
raise_if_driver_error(result)
563573
(result,) = driver.cuMemUnmap(ptr, size)

0 commit comments

Comments (0)