
Commit 7c4debe

cuda.core: align deallocate signatures and revert Graph.launch (#2001)
- Make `deallocate` keyword-only on the synchronous resources (`LegacyPinnedMemoryResource`, `_SynchronousMemoryResource`, `VirtualMemoryResource`) so every memory-resource API obeys the kw-only rule, with `stream=None` as the default since these resources do not actually use the stream.
- Revert `Graph.launch` to take `stream` positionally. It is the same shape as the kernel `launch(stream, config, kernel, *args)` API (already exempt in the issue) and shouldn't be the odd one out.
- Tighten `VirtualMemoryResource.deallocate` docstring to match `allocate`.
- Mark unused lambda args in `test_pass_object` as `_stream` to silence ARG005.

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 23d8d8b commit 7c4debe

20 files changed

Lines changed: 124 additions & 130 deletions
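A toy sketch of the keyword-only convention the first bullet enforces; this is illustrative Python, not the real cuda.core classes:

# Toy stand-in for the keyword-only convention; not the real cuda.core classes.
class ToySyncResource:
    def allocate(self, size, *, stream=None):
        # Host memory stands in for a device Buffer here.
        return bytearray(size)

    def deallocate(self, ptr, size, *, stream=None):
        # Mirrors the synchronous resources: sync only when a stream is given.
        if stream is not None:
            stream.sync()


mr = ToySyncResource()
buf = mr.allocate(1024)
mr.deallocate(buf, 1024)                 # stream may be omitted entirely
# mr.deallocate(buf, 1024, some_stream)  # would raise TypeError: stream is keyword-only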

cuda_core/cuda/core/_memory/_buffer.pyx

Lines changed: 2 additions & 4 deletions
@@ -174,10 +174,8 @@ cdef class Buffer:
         ipc_descriptor : :obj:`~_memory.IPCBufferDescriptor`
             The descriptor exported from another process.
         stream : :obj:`~_stream.Stream`
-            Keyword-only. The stream stored in the imported buffer's handle
-            and used for asynchronous deallocation when the buffer is closed
-            or garbage collected. Pass ``device.default_stream`` to use the
-            default stream.
+            Keyword-only. The stream used for asynchronous deallocation when
+            the buffer is closed or garbage collected.
         """
         return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream)
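A call-shape sketch of the importing side, assuming `mr` is an IPC-capable memory resource, `descriptor` arrived from the exporting process, and `device` is the current Device; these names are placeholders and the snippet is not runnable standalone:

# Keyword-only stream, per the docstring above (placeholder objects).
buf = Buffer.from_ipc_descriptor(mr, descriptor, stream=device.default_stream)
# ... use buf ...
buf.close()  # deallocation happens asynchronously on the stream given at import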

cuda_core/cuda/core/_memory/_legacy.py

Lines changed: 8 additions & 6 deletions
@@ -8,6 +8,7 @@
 
 if TYPE_CHECKING:
     from cuda.core._memory._buffer import DevicePointerT
+    from cuda.core._stream import Stream
 
 from cuda.core._memory._buffer import Buffer, MemoryResource
 from cuda.core._utils.cuda_utils import (
@@ -35,8 +36,8 @@ def allocate(self, size, *, stream) -> Buffer:
         size : int
             The size of the buffer to allocate, in bytes.
         stream : Stream
-            Keyword-only. Currently ignored, but must be passed explicitly;
-            pass ``device.default_stream`` to use the default stream.
+            Keyword-only. Currently ignored; pass ``device.default_stream`` to
+            use the default stream.
 
         Returns
         -------
@@ -53,7 +54,7 @@ def allocate(self, size, *, stream) -> Buffer:
             ptr = 0
         return Buffer._init(ptr, size, self)
 
-    def deallocate(self, ptr: DevicePointerT, size, stream):
+    def deallocate(self, ptr: DevicePointerT, size, *, stream: Stream | None = None):
         """Deallocate a buffer previously allocated by this resource.
 
         Parameters
@@ -62,8 +63,9 @@ def deallocate(self, ptr: DevicePointerT, size, stream):
             The pointer or handle to the buffer to deallocate.
         size : int
             The size of the buffer to deallocate, in bytes.
-        stream : Stream
-            The stream on which to perform the deallocation synchronously.
+        stream : Stream, optional
+            Keyword-only. If provided, ``stream.sync()`` is called before the
+            host allocation is freed. ``None`` skips the sync.
         """
         if stream is not None:
             stream.sync()
@@ -107,7 +109,7 @@ def allocate(self, size, *, stream) -> Buffer:
             ptr = 0
         return Buffer._init(ptr, size, self)
 
-    def deallocate(self, ptr, size, stream):
+    def deallocate(self, ptr, size, *, stream: Stream | None = None):
         if stream is not None:
             stream.sync()
         if size:
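What the keyword-only change means for direct callers, sketched with placeholder names (`mr`, `ptr`, `nbytes`, `stream` are not objects from this diff):

# Before: mr.deallocate(ptr, nbytes, stream)   # stream was positional and required
mr.deallocate(ptr, nbytes)                     # now: stream defaults to None, no sync
mr.deallocate(ptr, nbytes, stream=stream)      # stream.sync() runs before the memory is freed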

cuda_core/cuda/core/_memory/_virtual_memory_resource.py

Lines changed: 12 additions & 7 deletions
@@ -479,9 +479,8 @@ def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer:
         size : int
             The size in bytes of the buffer to allocate.
         stream : Stream, optional
-            Keyword-only. VMR uses ``cuMemCreate`` / ``cuMemMap`` which are
-            synchronous and not stream-ordered, so a stream is not needed.
-            If one is provided, it is validated and otherwise unused.
+            Keyword-only. Unused because virtual memory operations are
+            synchronous.
 
         Returns
         -------
@@ -556,13 +555,19 @@ def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer:
         buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self)
         return buf
 
-    def deallocate(self, ptr: int, size: int, stream: Stream | None = None) -> None:
+    def deallocate(self, ptr: int, size: int, *, stream: Stream | None = None) -> None:
         """
         Deallocate memory on the device using CUDA VMM APIs.
 
-        ``stream`` is unused (VMR is synchronous) but is validated when
-        provided; ``None`` is accepted because the C++ GC callback passes it
-        when no allocation stream was recorded.
+        Parameters
+        ----------
+        ptr : int
+            The pointer to the memory to deallocate.
+        size : int
+            The size in bytes of the memory to deallocate.
+        stream : Stream, optional
+            Keyword-only. Unused because virtual memory operations are
+            synchronous.
         """
         if stream is not None:
             from cuda.core._stream import Stream_accept
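Accepted call forms after this change, again with placeholder names; a provided stream is validated but otherwise unused because the virtual-memory operations are synchronous:

vmr.deallocate(ptr, nbytes)                  # typical: no stream needed
vmr.deallocate(ptr, nbytes, stream=stream)   # accepted and validated, otherwise ignored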

cuda_core/cuda/core/graph/_graph_builder.pyx

Lines changed: 2 additions & 2 deletions
@@ -862,13 +862,13 @@ class Graph:
         """
         handle_return(driver.cuGraphUpload(self._mnff.graph, stream.handle))
 
-    def launch(self, *, stream: Stream):
+    def launch(self, stream: Stream):
         """Launches the graph in a stream.
 
         Parameters
         ----------
         stream : :obj:`~_stream.Stream`
-            Keyword-only. The stream in which to launch the graph.
+            The stream in which to launch the graph.
 
         """
         handle_return(driver.cuGraphLaunch(self._mnff.graph, stream.handle))
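The revert makes graph launch line up with the kernel `launch(stream, config, kernel, *args)` shape cited in the commit message; a placeholder sketch (`config`, `kernel`, and the args are stand-ins, and a GPU plus objects built elsewhere are required):

launch(stream, config, kernel, arg0, arg1)   # kernel launch: stream is positional
graph.upload(stream)                         # upload already took stream positionally
graph.launch(stream)                         # graph launch: stream is positional again
stream.sync()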

cuda_core/docs/source/release/1.0.0-notes.rst

Lines changed: 0 additions & 1 deletion
@@ -129,7 +129,6 @@ Breaking changes
 - :meth:`KernelOccupancy.max_potential_cluster_size` and
   :meth:`KernelOccupancy.max_active_clusters`.
 - :meth:`Buffer.from_ipc_descriptor` (``stream`` was previously positional).
-- :meth:`graph.Graph.launch` (``stream`` was previously positional).
 
 :class:`VirtualMemoryResource` is exempt; it now accepts and validates
 an optional stream instead of rejecting any non-``None`` value.

cuda_core/examples/cuda_graphs.py

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@ def main():
     # Execute the entire graph with a single launch
     print("Executing graph...", file=sys.stderr)
     start_time = time.time()
-    graph.launch(stream=stream)
+    graph.launch(stream)
     stream.sync()
     end_time = time.time()

cuda_core/examples/graph_update.py

Lines changed: 2 additions & 2 deletions
@@ -72,13 +72,13 @@ def main():
     graph = initial_capture.complete()
 
     graph.upload(stream)
-    graph.launch(stream=stream)
+    graph.launch(stream)
     stream.sync()
     assert tuple(values) == (2, 0)
 
     graph.update(update_capture)
     graph.upload(stream)
-    graph.launch(stream=stream)
+    graph.launch(stream)
     stream.sync()
     assert tuple(values) == (2, 2)

cuda_core/examples/memory_pool_resources.py

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ def main():
     graph_capture = graph_builder.end_building()
     graph = graph_capture.complete()
     graph.upload(stream)
-    graph.launch(stream=stream)
+    graph.launch(stream)
     stream.sync()
 
     np.testing.assert_allclose(managed_array, managed_original * 2 + 1)

cuda_core/tests/graph/test_device_launch.py

Lines changed: 2 additions & 2 deletions
@@ -115,7 +115,7 @@ def test_device_launch_basic(init_cuda):
     outer_graph = gb_outer.end_building().complete()
 
     # Launch outer graph (which triggers device-side launch of inner graph)
-    outer_graph.launch(stream=stream)
+    outer_graph.launch(stream)
     stream.sync()
 
     # Verify result
@@ -167,7 +167,7 @@ def test_device_launch_multiple(init_cuda):
     # Launch multiple times
     num_launches = 5
     for _ in range(num_launches):
-        outer_graph.launch(stream=stream)
+        outer_graph.launch(stream)
         stream.sync()
 
     # Verify result

cuda_core/tests/graph/test_graph_builder.py

Lines changed: 9 additions & 9 deletions
@@ -34,7 +34,7 @@ def test_graph_straight(init_cuda):
 
     # Sanity upload and launch
     graph.upload(launch_stream)
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
 
@@ -66,7 +66,7 @@ def test_graph_fork_join(init_cuda):
 
     # Sanity upload and launch
     graph.upload(launch_stream)
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
 
@@ -135,7 +135,7 @@ def test_graph_repeat_capture(init_cuda):
     graph = gb.end_building().complete()
 
     # Run the graph once
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
     assert arr[0] == 1
 
@@ -144,9 +144,9 @@
     gb.begin_building()
 
     # Graph can be re-launched
-    graph.launch(stream=launch_stream)
-    graph.launch(stream=launch_stream)
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
+    graph.launch(launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
     assert arr[0] == 4
 
@@ -181,7 +181,7 @@ def my_callback():
     gb.callback(my_callback)
     graph = gb.end_building().complete()
 
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
 
     assert results == [42]
@@ -202,7 +202,7 @@ def read_byte(data):
     gb.callback(read_byte, user_data=bytes([0xAB]))
     graph = gb.end_building().complete()
 
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
 
     assert result[0] == 0xAB
@@ -251,7 +251,7 @@ def test_graph_child_graph(init_cuda):
     # Parent updates first value, child updates second value
     assert arr[0] == 0
     assert arr[1] == 0
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
     assert arr[0] == 2
     assert arr[1] == 3
