
Commit 7c4debe

cuda.core: align deallocate signatures and revert Graph.launch (#2001)
- Make `deallocate` keyword-only on the synchronous resources (`LegacyPinnedMemoryResource`, `_SynchronousMemoryResource`, `VirtualMemoryResource`) so every memory-resource API obeys the kw-only rule, with `stream=None` as the default since these resources do not actually use the stream.
- Revert `Graph.launch` to take `stream` positionally. It is the same shape as the kernel `launch(stream, config, kernel, *args)` API (already exempt in the issue) and shouldn't be the odd one out.
- Tighten `VirtualMemoryResource.deallocate` docstring to match `allocate`.
- Mark unused lambda args in `test_pass_object` as `_stream` to silence ARG005.

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 23d8d8b commit 7c4debe

20 files changed

Lines changed: 124 additions & 130 deletions
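A toy sketch of the keyword-only convention the first bullet enforces; this is illustrative Python, not the real cuda.core classes:

# Toy stand-in for the keyword-only convention; not the real cuda.core classes.
class ToySyncResource:
    def allocate(self, size, *, stream=None):
        # Host memory stands in for a device Buffer here.
        return bytearray(size)

    def deallocate(self, ptr, size, *, stream=None):
        # Mirrors the synchronous resources: sync only when a stream is given.
        if stream is not None:
            stream.sync()


mr = ToySyncResource()
buf = mr.allocate(1024)
mr.deallocate(buf, 1024)                 # stream may be omitted entirely
# mr.deallocate(buf, 1024, some_stream)  # would raise TypeError: stream is keyword-only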

cuda_core/cuda/core/_memory/_buffer.pyx

Lines changed: 2 additions & 4 deletions
@@ -174,10 +174,8 @@ cdef class Buffer:
         ipc_descriptor : :obj:`~_memory.IPCBufferDescriptor`
             The descriptor exported from another process.
         stream : :obj:`~_stream.Stream`
-            Keyword-only. The stream stored in the imported buffer's handle
-            and used for asynchronous deallocation when the buffer is closed
-            or garbage collected. Pass ``device.default_stream`` to use the
-            default stream.
+            Keyword-only. The stream used for asynchronous deallocation when
+            the buffer is closed or garbage collected.
         """
         return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream)
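A call-shape sketch of the importing side, assuming `mr` is an IPC-capable memory resource, `descriptor` arrived from the exporting process, and `device` is the current Device; these names are placeholders and the snippet is not runnable standalone:

# Keyword-only stream, per the docstring above (placeholder objects).
buf = Buffer.from_ipc_descriptor(mr, descriptor, stream=device.default_stream)
# ... use buf ...
buf.close()  # deallocation happens asynchronously on the stream given at import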

cuda_core/cuda/core/_memory/_legacy.py

Lines changed: 8 additions & 6 deletions
@@ -8,6 +8,7 @@
 
 if TYPE_CHECKING:
     from cuda.core._memory._buffer import DevicePointerT
+    from cuda.core._stream import Stream
 
 from cuda.core._memory._buffer import Buffer, MemoryResource
 from cuda.core._utils.cuda_utils import (
@@ -35,8 +36,8 @@ def allocate(self, size, *, stream) -> Buffer:
         size : int
             The size of the buffer to allocate, in bytes.
         stream : Stream
-            Keyword-only. Currently ignored, but must be passed explicitly;
-            pass ``device.default_stream`` to use the default stream.
+            Keyword-only. Currently ignored; pass ``device.default_stream`` to
+            use the default stream.
 
         Returns
         -------
@@ -53,7 +54,7 @@ def allocate(self, size, *, stream) -> Buffer:
             ptr = 0
         return Buffer._init(ptr, size, self)
 
-    def deallocate(self, ptr: DevicePointerT, size, stream):
+    def deallocate(self, ptr: DevicePointerT, size, *, stream: Stream | None = None):
         """Deallocate a buffer previously allocated by this resource.
 
         Parameters
@@ -62,8 +63,9 @@ def deallocate(self, ptr: DevicePointerT, size, stream):
             The pointer or handle to the buffer to deallocate.
         size : int
             The size of the buffer to deallocate, in bytes.
-        stream : Stream
-            The stream on which to perform the deallocation synchronously.
+        stream : Stream, optional
+            Keyword-only. If provided, ``stream.sync()`` is called before the
+            host allocation is freed. ``None`` skips the sync.
         """
         if stream is not None:
             stream.sync()
@@ -107,7 +109,7 @@ def allocate(self, size, *, stream) -> Buffer:
             ptr = 0
         return Buffer._init(ptr, size, self)
 
-    def deallocate(self, ptr, size, stream):
+    def deallocate(self, ptr, size, *, stream: Stream | None = None):
         if stream is not None:
             stream.sync()
         if size:
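What the keyword-only change means for direct callers, sketched with placeholder names (`mr`, `ptr`, `nbytes`, `stream` are not objects from this diff):

# Before: mr.deallocate(ptr, nbytes, stream)   # stream was positional and required
mr.deallocate(ptr, nbytes)                     # now: stream defaults to None, no sync
mr.deallocate(ptr, nbytes, stream=stream)      # stream.sync() runs before the memory is freed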

cuda_core/cuda/core/_memory/_virtual_memory_resource.py

Lines changed: 12 additions & 7 deletions
@@ -479,9 +479,8 @@ def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer:
         size : int
             The size in bytes of the buffer to allocate.
         stream : Stream, optional
-            Keyword-only. VMR uses ``cuMemCreate`` / ``cuMemMap`` which are
-            synchronous and not stream-ordered, so a stream is not needed.
-            If one is provided, it is validated and otherwise unused.
+            Keyword-only. Unused because virtual memory operations are
+            synchronous.
 
         Returns
         -------
@@ -556,13 +555,19 @@ def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer:
         buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self)
         return buf
 
-    def deallocate(self, ptr: int, size: int, stream: Stream | None = None) -> None:
+    def deallocate(self, ptr: int, size: int, *, stream: Stream | None = None) -> None:
         """
         Deallocate memory on the device using CUDA VMM APIs.
 
-        ``stream`` is unused (VMR is synchronous) but is validated when
-        provided; ``None`` is accepted because the C++ GC callback passes it
-        when no allocation stream was recorded.
+        Parameters
+        ----------
+        ptr : int
+            The pointer to the memory to deallocate.
+        size : int
+            The size in bytes of the memory to deallocate.
+        stream : Stream, optional
+            Keyword-only. Unused because virtual memory operations are
+            synchronous.
         """
         if stream is not None:
             from cuda.core._stream import Stream_accept
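Accepted call forms after this change, again with placeholder names; a provided stream is validated but otherwise unused because the virtual-memory operations are synchronous:

vmr.deallocate(ptr, nbytes)                  # typical: no stream needed
vmr.deallocate(ptr, nbytes, stream=stream)   # accepted and validated, otherwise ignored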

cuda_core/cuda/core/graph/_graph_builder.pyx

Lines changed: 2 additions & 2 deletions
@@ -862,13 +862,13 @@ class Graph:
         """
         handle_return(driver.cuGraphUpload(self._mnff.graph, stream.handle))
 
-    def launch(self, *, stream: Stream):
+    def launch(self, stream: Stream):
         """Launches the graph in a stream.
 
         Parameters
         ----------
         stream : :obj:`~_stream.Stream`
-            Keyword-only. The stream in which to launch the graph.
+            The stream in which to launch the graph.
 
         """
         handle_return(driver.cuGraphLaunch(self._mnff.graph, stream.handle))
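The revert makes graph launch line up with the kernel `launch(stream, config, kernel, *args)` shape cited in the commit message; a placeholder sketch (`config`, `kernel`, and the args are stand-ins, and a GPU plus objects built elsewhere are required):

launch(stream, config, kernel, arg0, arg1)   # kernel launch: stream is positional
graph.upload(stream)                         # upload already took stream positionally
graph.launch(stream)                         # graph launch: stream is positional again
stream.sync()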

cuda_core/docs/source/release/1.0.0-notes.rst

Lines changed: 0 additions & 1 deletion
@@ -129,7 +129,6 @@ Breaking changes
 - :meth:`KernelOccupancy.max_potential_cluster_size` and
   :meth:`KernelOccupancy.max_active_clusters`.
 - :meth:`Buffer.from_ipc_descriptor` (``stream`` was previously positional).
-- :meth:`graph.Graph.launch` (``stream`` was previously positional).
 
 :class:`VirtualMemoryResource` is exempt; it now accepts and validates
 an optional stream instead of rejecting any non-``None`` value.

cuda_core/examples/cuda_graphs.py

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@ def main():
     # Execute the entire graph with a single launch
     print("Executing graph...", file=sys.stderr)
     start_time = time.time()
-    graph.launch(stream=stream)
+    graph.launch(stream)
     stream.sync()
     end_time = time.time()

cuda_core/examples/graph_update.py

Lines changed: 2 additions & 2 deletions
@@ -72,13 +72,13 @@ def main():
     graph = initial_capture.complete()
 
     graph.upload(stream)
-    graph.launch(stream=stream)
+    graph.launch(stream)
     stream.sync()
     assert tuple(values) == (2, 0)
 
     graph.update(update_capture)
     graph.upload(stream)
-    graph.launch(stream=stream)
+    graph.launch(stream)
     stream.sync()
     assert tuple(values) == (2, 2)

cuda_core/examples/memory_pool_resources.py

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ def main():
     graph_capture = graph_builder.end_building()
     graph = graph_capture.complete()
     graph.upload(stream)
-    graph.launch(stream=stream)
+    graph.launch(stream)
     stream.sync()
 
     np.testing.assert_allclose(managed_array, managed_original * 2 + 1)

cuda_core/tests/graph/test_device_launch.py

Lines changed: 2 additions & 2 deletions
@@ -115,7 +115,7 @@ def test_device_launch_basic(init_cuda):
     outer_graph = gb_outer.end_building().complete()
 
     # Launch outer graph (which triggers device-side launch of inner graph)
-    outer_graph.launch(stream=stream)
+    outer_graph.launch(stream)
     stream.sync()
 
     # Verify result
@@ -167,7 +167,7 @@ def test_device_launch_multiple(init_cuda):
     # Launch multiple times
     num_launches = 5
     for _ in range(num_launches):
-        outer_graph.launch(stream=stream)
+        outer_graph.launch(stream)
         stream.sync()
 
     # Verify result

cuda_core/tests/graph/test_graph_builder.py

Lines changed: 9 additions & 9 deletions
@@ -34,7 +34,7 @@ def test_graph_straight(init_cuda):
 
     # Sanity upload and launch
     graph.upload(launch_stream)
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
 
@@ -66,7 +66,7 @@ def test_graph_fork_join(init_cuda):
 
     # Sanity upload and launch
     graph.upload(launch_stream)
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
 
@@ -135,7 +135,7 @@ def test_graph_repeat_capture(init_cuda):
     graph = gb.end_building().complete()
 
     # Run the graph once
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
     assert arr[0] == 1
 
@@ -144,9 +144,9 @@
     gb.begin_building()
 
     # Graph can be re-launched
-    graph.launch(stream=launch_stream)
-    graph.launch(stream=launch_stream)
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
+    graph.launch(launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
     assert arr[0] == 4
 
@@ -181,7 +181,7 @@ def my_callback():
     gb.callback(my_callback)
     graph = gb.end_building().complete()
 
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
 
     assert results == [42]
@@ -202,7 +202,7 @@ def read_byte(data):
     gb.callback(read_byte, user_data=bytes([0xAB]))
     graph = gb.end_building().complete()
 
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
 
     assert result[0] == 0xAB
@@ -251,7 +251,7 @@ def test_graph_child_graph(init_cuda):
     # Parent updates first value, child updates second value
     assert arr[0] == 0
     assert arr[1] == 0
-    graph.launch(stream=launch_stream)
+    graph.launch(launch_stream)
     launch_stream.sync()
     assert arr[0] == 2
     assert arr[1] == 3
