Skip to content

Commit 9943b1e

Browse files
cuda.core: require explicit stream for stream-scheduling APIs (#2001)
Removes the implicit fallback to default_stream() (or NULL) on APIs that schedule work on a stream. `stream` is now a required keyword-only argument; `Stream_accept(None)` raises TypeError. Affected APIs: - MemoryResource.allocate / deallocate and overrides on DeviceMemoryResource, PinnedMemoryResource, ManagedMemoryResource, LegacyPinnedMemoryResource, GraphMemoryResource. - Device.allocate. - GraphicsResource.map. - KernelOccupancy.max_potential_cluster_size / max_active_clusters. - Graph.launch (stream was previously positional). Stream_accept is promoted to cpdef so the pure-Python legacy/sync resources can call it. Also fixes a latent bug uncovered while doing this: the C++ MR deallocation callback in Buffer's GC path was calling `mr.deallocate(ptr, size, stream)` positionally, which would fail with the new keyword-only signature for every garbage-collected DeviceMemoryResource/GraphMemoryResource buffer. Switched to `stream=stream`. VirtualMemoryResource is exempt because cuMemCreate / cuMemMap are synchronous and not stream-ordered; it now accepts (and validates) an optional stream instead of rejecting any non-None value. Buffer.from_ipc_descriptor is also exempt: stream there only seeds the deallocation stream stored in the handle (no work is scheduled), the same shape as Buffer.close(stream=None). Tests, examples, and the v1.0.0 release note are updated accordingly. Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 64e2e6a commit 9943b1e

46 files changed

Lines changed: 336 additions & 286 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

cuda_core/cuda/core/_device.pyx

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1321,14 +1321,12 @@ class Device:
13211321
cdef Context ctx = self._context
13221322
return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True)
13231323

1324-
def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
1324+
def allocate(self, size, *, stream: Stream | GraphBuilder) -> Buffer:
13251325
"""Allocate device memory from a specified stream.
13261326

13271327
Allocates device memory of `size` bytes on the specified `stream`
13281328
using the memory resource currently associated with this Device.
13291329

1330-
Parameter `stream` is optional, using a default stream by default.
1331-
13321330
Note
13331331
----
13341332
Device must be initialized.
@@ -1337,9 +1335,10 @@ class Device:
13371335
----------
13381336
size : int
13391337
Number of bytes to allocate.
1340-
stream : :obj:`~_stream.Stream`, optional
1341-
The stream establishing the stream ordering semantic.
1342-
Default value of `None` uses default stream.
1338+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
1339+
Keyword-only. The stream establishing the stream ordering semantic.
1340+
Must be passed explicitly; pass ``self.default_stream`` to use
1341+
the default stream.
13431342

13441343
Returns
13451344
-------
@@ -1348,7 +1347,7 @@ class Device:
13481347

13491348
"""
13501349
self._check_context_initialized()
1351-
return self.memory_resource.allocate(size, stream)
1350+
return self.memory_resource.allocate(size, stream=stream)
13521351

13531352
def sync(self):
13541353
"""Synchronize the device.

cuda_core/cuda/core/_graphics.pyx

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ from cuda.core._resource_handles cimport (
1212
as_intptr,
1313
)
1414
from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle
15-
from cuda.core._stream cimport Stream, Stream_accept, default_stream
15+
from cuda.core._stream cimport Stream, Stream_accept
1616
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
1717

1818
__all__ = ['GraphicsResource']
@@ -206,7 +206,7 @@ cdef class GraphicsResource:
206206
return None
207207
return self._mapped_buffer
208208

209-
def map(self, *, stream: Stream | None = None) -> Buffer:
209+
def map(self, *, stream: Stream) -> Buffer:
210210
"""Map this graphics resource for CUDA access.
211211

212212
After mapping, a CUDA device pointer into the underlying graphics
@@ -220,9 +220,10 @@ cdef class GraphicsResource:
220220

221221
Parameters
222222
----------
223-
stream : :class:`~cuda.core.Stream`, optional
224-
The CUDA stream on which to perform the mapping. If ``None``,
225-
the current default stream is used.
223+
stream : :class:`~cuda.core.Stream`
224+
Keyword-only. The CUDA stream on which to perform the mapping.
225+
Must be passed explicitly; pass ``device.default_stream`` to use
226+
the default stream.
226227

227228
Returns
228229
-------
@@ -248,7 +249,7 @@ cdef class GraphicsResource:
248249
if self._get_mapped_buffer() is not None:
249250
raise RuntimeError("GraphicsResource is already mapped")
250251

251-
s_obj = default_stream() if stream is None else Stream_accept(stream)
252+
s_obj = Stream_accept(stream)
252253
raw = as_cu(self._handle)
253254
cy_stream = as_cu(s_obj._h_stream)
254255
with nogil:

cuda_core/cuda/core/_layout.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ cdef class _StridedLayout:
460460
required_size = layout.required_size_in_bytes()
461461
# allocate the memory on the device
462462
device.set_current()
463-
mem = device.allocate(required_size)
463+
mem = device.allocate(required_size, stream=device.default_stream)
464464
# create a view on the newly allocated device memory
465465
b_view = StridedMemoryView.from_buffer(mem, layout, a_view.dtype)
466466
return b_view

cuda_core/cuda/core/_memory/_buffer.pyx

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ cdef void _mr_dealloc_callback(
5454
stream = None
5555
if h_stream:
5656
stream = Stream._from_handle(Stream, h_stream)
57-
mr.deallocate(int(ptr), size, stream)
57+
mr.deallocate(int(ptr), size, stream=stream)
5858
except Exception as exc:
5959
print(f"Warning: mr.deallocate() failed during Buffer destruction: {exc}",
6060
file=sys.stderr)
@@ -220,7 +220,7 @@ cdef class Buffer:
220220
if self._memory_resource is None:
221221
raise ValueError("a destination buffer must be provided (this "
222222
"buffer does not have a memory_resource)")
223-
dst = self._memory_resource.allocate(src_size, s)
223+
dst = self._memory_resource.allocate(src_size, stream=s)
224224

225225
cdef size_t dst_size = dst._size
226226
if dst_size != src_size:
@@ -495,17 +495,17 @@ cdef class MemoryResource:
495495
resource's respective property.)
496496
"""
497497

498-
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
498+
def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
499499
"""Allocate a buffer of the requested size.
500500

501501
Parameters
502502
----------
503503
size : int
504504
The size of the buffer to allocate, in bytes.
505-
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
506-
The stream on which to perform the allocation asynchronously.
507-
If None, it is up to each memory resource implementation to decide
508-
and document the behavior.
505+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
506+
Keyword-only. The stream on which to perform the allocation
507+
asynchronously. Must be passed explicitly; pass
508+
``device.default_stream`` to use the default stream.
509509

510510
Returns
511511
-------
@@ -515,7 +515,7 @@ cdef class MemoryResource:
515515
"""
516516
raise TypeError("MemoryResource.allocate must be implemented by subclasses.")
517517

518-
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
518+
def deallocate(self, ptr: DevicePointerT, size_t size, *, stream: Stream | GraphBuilder):
519519
"""Deallocate a buffer previously allocated by this resource.
520520
521521
Parameters
@@ -524,10 +524,10 @@ cdef class MemoryResource:
524524
The pointer or handle to the buffer to deallocate.
525525
size : int
526526
The size of the buffer to deallocate, in bytes.
527-
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
528-
The stream on which to perform the deallocation asynchronously.
529-
If None, it is up to each memory resource implementation to decide
530-
and document the behavior.
527+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
528+
Keyword-only. The stream on which to perform the deallocation
529+
asynchronously. Must be passed explicitly; pass
530+
``device.default_stream`` to use the default stream.
531531
"""
532532
raise TypeError("MemoryResource.deallocate must be implemented by subclasses.")
533533

cuda_core/cuda/core/_memory/_graph_memory_resource.pyx

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ from cuda.core._resource_handles cimport (
1414
as_cu,
1515
)
1616

17-
from cuda.core._stream cimport default_stream, Stream_accept, Stream
17+
from cuda.core._stream cimport Stream_accept, Stream
1818
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
1919

2020
from functools import cache
@@ -104,19 +104,19 @@ cdef class cyGraphMemoryResource(MemoryResource):
104104
def __cinit__(self, int device_id):
105105
self._device_id = device_id
106106

107-
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
107+
def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
108108
"""
109109
Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
110110
"""
111-
stream = Stream_accept(stream) if stream is not None else default_stream()
112-
return GMR_allocate(self, size, <Stream> stream)
111+
cdef Stream s = Stream_accept(stream)
112+
return GMR_allocate(self, size, s)
113113

114-
def deallocate(self, ptr: "DevicePointerT", size_t size, stream: Stream | GraphBuilder | None = None):
114+
def deallocate(self, ptr: "DevicePointerT", size_t size, *, stream: Stream | GraphBuilder):
115115
"""
116116
Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
117117
"""
118-
stream = Stream_accept(stream) if stream is not None else default_stream()
119-
return GMR_deallocate(ptr, size, <Stream> stream)
118+
cdef Stream s = Stream_accept(stream)
119+
return GMR_deallocate(ptr, size, s)
120120

121121
def close(self):
122122
"""No operation (provided for compatibility)."""

cuda_core/cuda/core/_memory/_ipc.pyx

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,12 @@ cdef Buffer Buffer_from_ipc_descriptor(
172172
if not mr.is_ipc_enabled:
173173
raise RuntimeError("Memory resource is not IPC-enabled")
174174
if stream is None:
175-
# Note: match this behavior to _MemPool.allocate()
175+
# Buffer.from_ipc_descriptor's stream is not used to schedule work;
176+
# it only seeds the deallocation stream stored in the handle. Like
177+
# Buffer.close(stream=None) and GraphicsResource.unmap(stream=None),
178+
# this is a legitimate exception to the "explicit stream" rule from
179+
# issue #2001: None means "fall back to the default stream when the
180+
# buffer is later released".
176181
stream = default_stream()
177182
cdef Stream s = <Stream>stream
178183
cdef DevicePtrHandle h_ptr = deviceptr_import_ipc(

cuda_core/cuda/core/_memory/_legacy.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,25 +27,25 @@ class LegacyPinnedMemoryResource(MemoryResource):
2727

2828
# TODO: support creating this MR with flags that are later passed to cuMemHostAlloc?
2929

30-
def allocate(self, size, stream=None) -> Buffer:
30+
def allocate(self, size, *, stream) -> Buffer:
3131
"""Allocate a buffer of the requested size.
3232
3333
Parameters
3434
----------
3535
size : int
3636
The size of the buffer to allocate, in bytes.
37-
stream : Stream, optional
38-
Currently ignored
37+
stream : Stream
38+
Keyword-only. Currently ignored, but must be passed explicitly;
39+
pass ``device.default_stream`` to use the default stream.
3940
4041
Returns
4142
-------
4243
Buffer
4344
The allocated buffer object, which is accessible on both host and device.
4445
"""
45-
if stream is None:
46-
from cuda.core._stream import default_stream
46+
from cuda.core._stream import Stream_accept
4747

48-
stream = default_stream()
48+
Stream_accept(stream)
4949
if size:
5050
err, ptr = driver.cuMemAllocHost(size)
5151
raise_if_driver_error(err)
@@ -96,11 +96,10 @@ def __init__(self, device_id):
9696

9797
self._device_id = Device(device_id).device_id
9898

99-
def allocate(self, size, stream=None) -> Buffer:
100-
if stream is None:
101-
from cuda.core._stream import default_stream
99+
def allocate(self, size, *, stream) -> Buffer:
100+
from cuda.core._stream import Stream_accept
102101

103-
stream = default_stream()
102+
Stream_accept(stream)
104103
if size:
105104
err, ptr = driver.cuMemAlloc(size)
106105
raise_if_driver_error(err)

cuda_core/cuda/core/_memory/_memory_pool.pyx

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ from libc.string cimport memset
1111
from cuda.bindings cimport cydriver
1212
from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource
1313
from cuda.core._memory cimport _ipc
14-
from cuda.core._stream cimport default_stream, Stream_accept, Stream
14+
from cuda.core._stream cimport Stream_accept, Stream
1515
from cuda.core._resource_handles cimport (
1616
MemoryPoolHandle,
1717
DevicePtrHandle,
@@ -122,16 +122,17 @@ cdef class _MemPool(MemoryResource):
122122
"""
123123
_MP_close(self)
124124

125-
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
125+
def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
126126
"""Allocate a buffer of the requested size.
127127

128128
Parameters
129129
----------
130130
size : int
131131
The size of the buffer to allocate, in bytes.
132-
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
133-
The stream on which to perform the allocation asynchronously.
134-
If None, an internal stream is used.
132+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
133+
Keyword-only. The stream on which to perform the allocation
134+
asynchronously. Must be passed explicitly; pass
135+
``device.default_stream`` to use the default stream.
135136

136137
Returns
137138
-------
@@ -141,10 +142,10 @@ cdef class _MemPool(MemoryResource):
141142
"""
142143
if self.is_mapped:
143144
raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource")
144-
stream = Stream_accept(stream) if stream is not None else default_stream()
145-
return _MP_allocate(self, size, <Stream> stream)
145+
cdef Stream s = Stream_accept(stream)
146+
return _MP_allocate(self, size, s)
146147

147-
def deallocate(self, ptr: "DevicePointerT", size_t size, stream: Stream | GraphBuilder | None = None):
148+
def deallocate(self, ptr: "DevicePointerT", size_t size, *, stream: Stream | GraphBuilder):
148149
"""Deallocate a buffer previously allocated by this resource.
149150
150151
Parameters
@@ -153,13 +154,13 @@ cdef class _MemPool(MemoryResource):
153154
The pointer or handle to the buffer to deallocate.
154155
size : int
155156
The size of the buffer to deallocate, in bytes.
156-
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
157-
The stream on which to perform the deallocation asynchronously.
158-
If the buffer is deallocated without an explicit stream, the allocation stream
159-
is used.
157+
stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
158+
Keyword-only. The stream on which to perform the deallocation
159+
asynchronously. Must be passed explicitly; pass
160+
``device.default_stream`` to use the default stream.
160161
"""
161-
stream = Stream_accept(stream) if stream is not None else default_stream()
162-
_MP_deallocate(self, <uintptr_t>ptr, size, <Stream> stream)
162+
cdef Stream s = Stream_accept(stream)
163+
_MP_deallocate(self, <uintptr_t>ptr, size, s)
163164

164165
@property
165166
def attributes(self) -> _MemPoolAttributes:

cuda_core/cuda/core/_memory/_virtual_memory_resource.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,7 @@ def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list:
470470

471471
return descs
472472

473-
def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
473+
def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer:
474474
"""
475475
Allocate a buffer of the given size using CUDA virtual memory.
476476
@@ -479,7 +479,9 @@ def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
479479
size : int
480480
The size in bytes of the buffer to allocate.
481481
stream : Stream, optional
482-
CUDA stream to associate with the allocation (not currently supported).
482+
Keyword-only. VMR uses ``cuMemCreate`` / ``cuMemMap`` which are
483+
synchronous and not stream-ordered, so a stream is not needed.
484+
If one is provided, it is validated and otherwise unused.
483485
484486
Returns
485487
-------
@@ -488,8 +490,6 @@ def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
488490
489491
Raises
490492
------
491-
NotImplementedError
492-
If a stream is provided or if the location type is not device memory.
493493
CUDAError
494494
If any CUDA driver API call fails during allocation.
495495
@@ -501,7 +501,9 @@ def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
501501
specified in the resource's configuration.
502502
"""
503503
if stream is not None:
504-
raise NotImplementedError("Stream is not supported with VirtualMemoryResource")
504+
from cuda.core._stream import Stream_accept
505+
506+
Stream_accept(stream)
505507

506508
config = self.config
507509
# ---- Build allocation properties ----
@@ -554,10 +556,18 @@ def allocate(self, size: int, stream: Stream | None = None) -> Buffer:
554556
buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self)
555557
return buf
556558

557-
def deallocate(self, ptr: int, size: int, stream: Stream | None = None) -> None: # noqa: ARG002
559+
def deallocate(self, ptr: int, size: int, stream: Stream | None = None) -> None:
558560
"""
559561
Deallocate memory on the device using CUDA VMM APIs.
562+
563+
``stream`` is unused (VMR is synchronous) but is validated when
564+
provided; ``None`` is accepted because the C++ GC callback passes it
565+
when no allocation stream was recorded.
560566
"""
567+
if stream is not None:
568+
from cuda.core._stream import Stream_accept
569+
570+
Stream_accept(stream)
561571
result, handle = driver.cuMemRetainAllocationHandle(ptr)
562572
raise_if_driver_error(result)
563573
(result,) = driver.cuMemUnmap(ptr, size)

0 commit comments

Comments (0)