@@ -24,7 +24,7 @@ from cuda.core._resource_handles cimport (
2424)
2525from cuda.core.typing import DevicePointerType
2626
27- from cuda.core._stream cimport Stream, Stream_accept
27+ from cuda.core._stream cimport Stream, Stream_accept, default_stream
2828from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value
2929
3030import sys
@@ -49,12 +49,24 @@ cdef void _mr_dealloc_callback(
4949 size_t size,
5050 const StreamHandle& h_stream,
5151) noexcept:
52- """ Called by the C++ deleter to deallocate via MemoryResource.deallocate."""
52+ """ Called by the C++ deleter to deallocate via MemoryResource.deallocate.
53+
54+ This is the C++ teardown path: there is no Python caller frame from
55+ which to obtain a stream. If the device-pointer handle was created
56+ without ``set_deallocation_stream`` being called (e.g. buffers minted
57+ via ``Buffer.from_handle(ptr, size, mr=mr)`` from DLPack import,
58+ third-party adapters, or other foreign sources), ``h_stream`` is
59+ empty here. Stream-ordered MR ``deallocate`` overrides reject
60+ ``stream=None`` (issue #2001), so without a fallback the destructor
61+ would print a warning and leak the allocation. Fall back to the
62+ legacy/per-thread default stream so the free still happens; this is
63+ the sole exception to the "no implicit default-stream fallback"
64+ policy because the teardown has no other source of truth.
65+ """
66+ cdef Stream stream
5367 try :
54- stream = None
55- if h_stream:
56- stream = Stream._from_handle(Stream, h_stream)
57- mr.deallocate(int (ptr), size, stream)
68+ stream = Stream._from_handle(Stream, h_stream) if h_stream else default_stream()
69+ mr.deallocate(int (ptr), size, stream = stream)
5870 except Exception as exc:
5971 print (f" Warning: mr.deallocate() failed during Buffer destruction: {exc}" ,
6072 file = sys.stderr)
@@ -119,7 +131,11 @@ cdef class Buffer:
119131
120132 @staticmethod
121133 def _reduce_helper (mr , ipc_descriptor ):
122- return Buffer.from_ipc_descriptor(mr, ipc_descriptor)
134+ # The parent process's stream is not portable across processes, so the
135+ # pickle path cannot thread an explicit stream through. Seed the
136+ # imported buffer's deallocation with the current context's default
137+ # stream; the receiver can override via buffer.close(stream).
138+ return Buffer.from_ipc_descriptor(mr, ipc_descriptor, stream = default_stream())
123139
124140 def __reduce__ (self ):
125141 # Must not serialize the parent's stream!
@@ -158,9 +174,20 @@ cdef class Buffer:
158174 @classmethod
159175 def from_ipc_descriptor(
160176 cls , mr: DeviceMemoryResource | PinnedMemoryResource , ipc_descriptor: IPCBufferDescriptor ,
161- stream: Stream = None
177+ *, stream: Stream
162178 ) -> Buffer:
163- """Import a buffer that was exported from another process."""
179+ """Import a buffer that was exported from another process.
180+
181+ Parameters
182+ ----------
183+ mr : :obj:`~_memory.DeviceMemoryResource` | :obj:`~_memory.PinnedMemoryResource`
184+ The IPC-enabled memory resource matching the exporting process.
185+ ipc_descriptor : :obj:`~_memory.IPCBufferDescriptor`
186+ The descriptor exported from another process.
187+ stream : :obj:`~_stream.Stream`
188+ Keyword-only. The stream used for asynchronous deallocation when
189+ the buffer is closed or garbage collected.
190+ """
164191 return _ipc.Buffer_from_ipc_descriptor(cls , mr , ipc_descriptor , stream )
165192
166193 @property
@@ -215,7 +242,7 @@ cdef class Buffer:
215242 if self._memory_resource is None:
216243 raise ValueError("a destination buffer must be provided (this "
217244 "buffer does not have a memory_resource )")
218- dst = self ._memory_resource.allocate(src_size, s)
245+ dst = self ._memory_resource.allocate(src_size, stream = s)
219246
220247 cdef size_t dst_size = dst._size
221248 if dst_size != src_size:
@@ -490,17 +517,17 @@ cdef class MemoryResource:
490517 resource's respective property.)
491518 """
492519
493- def allocate (self , size_t size , stream: Stream | GraphBuilder | None = None ) -> Buffer:
520+ def allocate (self , size_t size , *, stream: Stream | GraphBuilder ) -> Buffer:
494521 """Allocate a buffer of the requested size.
495522
496523 Parameters
497524 ----------
498525 size : int
499526 The size of the buffer to allocate, in bytes.
500- stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
501- The stream on which to perform the allocation asynchronously.
502- If None , it is up to each memory resource implementation to decide
503- and document the behavior .
527+ stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
528+ Keyword-only. The stream on which to perform the allocation
529+ asynchronously. Must be passed explicitly; pass
530+ ``device.default_stream`` to use the default stream.
504531
505532 Returns
506533 -------
@@ -510,7 +537,7 @@ cdef class MemoryResource:
510537 """
511538 raise TypeError("MemoryResource.allocate must be implemented by subclasses.")
512539
513- def deallocate(self , ptr: DevicePointerType , size_t size , stream: Stream | GraphBuilder | None = None ):
540+ def deallocate(self , ptr: DevicePointerType , size_t size , *, stream: Stream | GraphBuilder ):
514541 """ Deallocate a buffer previously allocated by this resource.
515542
516543 Parameters
@@ -519,10 +546,10 @@ cdef class MemoryResource:
519546 The pointer or handle to the buffer to deallocate.
520547 size : int
521548 The size of the buffer to deallocate, in bytes.
522- stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
523- The stream on which to perform the deallocation asynchronously.
524- If None, it is up to each memory resource implementation to decide
525- and document the behavior .
549+ stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
550+ Keyword-only. The stream on which to perform the deallocation
551+ asynchronously. Must be passed explicitly; pass
552+ ``device.default_stream`` to use the default stream.
526553 """
527554 raise TypeError (" MemoryResource.deallocate must be implemented by subclasses." )
528555
0 commit comments