Commit b7f6e8c

Merge remote-tracking branch 'origin/main' into ajost/peer-accessible-by-set-proxy
Co-authored-by: Cursor <cursoragent@cursor.com>
2 parents e95b505 + 50c19d0 commit b7f6e8c

44 files changed: 546 additions & 319 deletions


cuda_core/build_hooks.py

Lines changed: 1 addition & 24 deletions
@@ -11,7 +11,6 @@
 import glob
 import os
 import re
-import subprocess
 import sys
 import tempfile
 import zipfile
@@ -185,28 +184,6 @@ def get_sources(mod_name):
 # related to free-threading builds.
 extra_compile_args += ["-DCYTHON_TRACE_NOGIL=1", "-DCYTHON_USE_SYS_MONITORING=0"]

-# On Windows, _tensor_bridge.pyx needs a stub import library so the MSVC
-# linker can resolve the AOTI symbols (they live in torch_cpu.dll at
-# runtime). We generate the .lib from a .def file at build time.
-# Note: aoti_torch_get_current_cuda_stream lives in torch_cuda.dll and
-# is resolved lazily at runtime (not via the stub lib) — see
-# _tensor_bridge.pyx.
-_aoti_extra_link_args = []
-if sys.platform == "win32":
-    _def_file = os.path.join("cuda", "core", "_include", "aoti_shim.def")
-    _lib_file = os.path.join("build", "aoti_shim.lib")
-    os.makedirs("build", exist_ok=True)
-    subprocess.check_call(  # noqa: S603
-        ["lib", f"/DEF:{_def_file}", f"/OUT:{_lib_file}", "/MACHINE:X64"],  # noqa: S607
-        stdout=subprocess.DEVNULL,
-    )
-    _aoti_extra_link_args = [_lib_file]
-
-def get_extra_link_args(mod_name):
-    if mod_name == "_tensor_bridge" and _aoti_extra_link_args:
-        return extra_link_args + _aoti_extra_link_args
-    return extra_link_args
-
 ext_modules = tuple(
     Extension(
         f"cuda.core.{mod.replace(os.path.sep, '.')}",
@@ -218,7 +195,7 @@ def get_extra_link_args(mod_name):
         + all_include_dirs,
         language="c++",
         extra_compile_args=extra_compile_args,
-        extra_link_args=get_extra_link_args(mod),
+        extra_link_args=extra_link_args,
     )
     for mod in module_names()
 )
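
For context, the step removed here is not gone: per the comment updates in aoti_shim.def and aoti_shim.h further down, the stub-library generation now runs from setup.py during build_ext. A minimal sketch of what that step does, assuming only that MSVC's `lib` tool is on PATH (the helper name and output path are illustrative, not the actual setup.py code):

# Hedged sketch: turn a .def export list into a stub import library (.lib).
# Assumes MSVC's `lib` tool is on PATH; helper name and paths are illustrative.
import os
import subprocess
import sys


def build_aoti_stub_lib(def_file: str, out_dir: str = "build") -> str:
    if sys.platform != "win32":
        raise RuntimeError("stub import libraries are only needed for MSVC")
    os.makedirs(out_dir, exist_ok=True)
    lib_file = os.path.join(out_dir, "aoti_shim.lib")
    # `lib /DEF:... /OUT:... /MACHINE:X64` emits an import library containing
    # no object code; the symbols resolve from torch_cpu.dll at runtime.
    subprocess.check_call(
        ["lib", f"/DEF:{def_file}", f"/OUT:{lib_file}", "/MACHINE:X64"],
        stdout=subprocess.DEVNULL,
    )
    return lib_file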

cuda_core/cuda/core/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -57,10 +57,6 @@ def _import_versioned_module():
         VirtualMemoryResource,
         VirtualMemoryResourceOptions,
     )
-    from cuda.core._memoryview import (
-        StridedMemoryView,
-        args_viewable_as_strided_memory,
-    )
     from cuda.core._module import Kernel, ObjectCode
     from cuda.core._program import Program, ProgramOptions
     from cuda.core._stream import (

cuda_core/cuda/core/_device.pyx

Lines changed: 6 additions & 7 deletions
@@ -1394,14 +1394,12 @@ class Device:
         cdef Context ctx = self._context
         return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True)

-    def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
+    def allocate(self, size, *, stream: Stream | GraphBuilder) -> Buffer:
         """Allocate device memory from a specified stream.

         Allocates device memory of `size` bytes on the specified `stream`
         using the memory resource currently associated with this Device.

-        Parameter `stream` is optional, using a default stream by default.
-
         Note
         ----
         Device must be initialized.
@@ -1410,9 +1408,10 @@ class Device:
         ----------
         size : int
             Number of bytes to allocate.
-        stream : :obj:`~_stream.Stream`, optional
-            The stream establishing the stream ordering semantic.
-            Default value of `None` uses default stream.
+        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
+            Keyword-only. The stream establishing the stream ordering semantic.
+            Must be passed explicitly; pass ``self.default_stream`` to use
+            the default stream.

         Returns
         -------
@@ -1421,7 +1420,7 @@ class Device:

         """
         self._check_context_initialized()
-        return self.memory_resource.allocate(size, stream)
+        return self.memory_resource.allocate(size, stream=stream)

     def sync(self):
         """Synchronize the device.

cuda_core/cuda/core/_device_resources.pyx

Lines changed: 18 additions & 1 deletion
@@ -106,11 +106,18 @@ cdef class SMResourceOptions:
         Preferred co-scheduled SM count; the driver tries to satisfy
         this but may fall back to ``coscheduled_sm_count``.
         (Default to ``None``)
+    backfill : bool or Sequence[bool], optional
+        If ``True``, allow the driver to relax the co-scheduling
+        constraint when assigning SMs. This enables requesting
+        arbitrary aligned SM counts that the driver would otherwise
+        reject due to hardware topology constraints.
+        (Default to ``False``)
     """

     count: int | SequenceABC | None = None
     coscheduled_sm_count: int | SequenceABC | None = None
     preferred_coscheduled_sm_count: int | SequenceABC | None = None
+    backfill: bool | SequenceABC = False


 @dataclass
@@ -172,6 +179,12 @@ cdef inline int _resolve_group_count(SMResourceOptions options) except?-1:
         n_groups,
         count_is_scalar,
     )
+    _validate_split_field_length(
+        options.backfill,
+        "backfill",
+        n_groups,
+        count_is_scalar,
+    )
     return n_groups


@@ -243,6 +256,7 @@ IF CUDA_CORE_BUILD_MAJOR >= 13:
         cdef list counts = _broadcast_field(options.count, n_groups)
         cdef list coscheduled = _broadcast_field(options.coscheduled_sm_count, n_groups)
         cdef list preferred = _broadcast_field(options.preferred_coscheduled_sm_count, n_groups)
+        cdef list backfills = _broadcast_field(options.backfill, n_groups)
         cdef int i

         for i in range(n_groups):
@@ -252,7 +266,10 @@ IF CUDA_CORE_BUILD_MAJOR >= 13:
                 params[i].coscheduledSmCount = <unsigned int>(coscheduled[i])
             if preferred[i] is not None:
                 params[i].preferredCoscheduledSmCount = <unsigned int>(preferred[i])
-            params[i].flags = 0
+            params[i].flags = (
+                cydriver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_BACKFILL
+                if backfills[i] else 0
+            )
         return 0

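A hedged sketch of how the new field composes with the existing broadcast semantics (class name and defaults are from this diff; the import path and any consuming API are assumptions):

# Hedged sketch; class and defaults per this diff, import path assumed.
from cuda.core._device_resources import SMResourceOptions

# Scalar fields broadcast across all groups, sequences are per-group, and
# lengths are validated by _validate_split_field_length as shown above.
opts = SMResourceOptions(
    count=[48, 16],
    backfill=[True, False],  # only the first group may relax co-scheduling
)

# A scalar also works: request backfill for every group.
opts_all = SMResourceOptions(count=32, backfill=True)
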
cuda_core/cuda/core/_graphics.pyx

Lines changed: 7 additions & 6 deletions
@@ -12,7 +12,7 @@ from cuda.core._resource_handles cimport (
     as_intptr,
 )
 from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle
-from cuda.core._stream cimport Stream, Stream_accept, default_stream
+from cuda.core._stream cimport Stream, Stream_accept
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN

 __all__ = ['GraphicsResource']
@@ -206,7 +206,7 @@ cdef class GraphicsResource:
             return None
         return self._mapped_buffer

-    def map(self, *, stream: Stream | None = None) -> Buffer:
+    def map(self, *, stream: Stream) -> Buffer:
         """Map this graphics resource for CUDA access.

         After mapping, a CUDA device pointer into the underlying graphics
@@ -220,9 +220,10 @@ cdef class GraphicsResource:

         Parameters
         ----------
-        stream : :class:`~cuda.core.Stream`, optional
-            The CUDA stream on which to perform the mapping. If ``None``,
-            the current default stream is used.
+        stream : :class:`~cuda.core.Stream`
+            Keyword-only. The CUDA stream on which to perform the mapping.
+            Must be passed explicitly; pass ``device.default_stream`` to use
+            the default stream.

         Returns
         -------
@@ -248,7 +249,7 @@ cdef class GraphicsResource:
         if self._get_mapped_buffer() is not None:
             raise RuntimeError("GraphicsResource is already mapped")

-        s_obj = default_stream() if stream is None else Stream_accept(stream)
+        s_obj = Stream_accept(stream)
         raw = as_cu(self._handle)
         cy_stream = as_cu(s_obj._h_stream)
         with nogil:
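
A hedged usage sketch of the new contract (how the GraphicsResource is obtained is elided; the interop registration helper and the `unmap` signature are assumptions):

# Hedged sketch; `make_graphics_resource` is a hypothetical stand-in for
# whatever interop registration produces a GraphicsResource.
from cuda.core import Device  # import path assumed

dev = Device()
dev.set_current()
resource = make_graphics_resource()  # hypothetical

buf = resource.map(stream=dev.default_stream)  # stream must now be explicit
try:
    ...  # kernels reading/writing buf, ordered on the same stream
finally:
    resource.unmap(stream=dev.default_stream)  # unmap signature assumed to mirror map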

cuda_core/cuda/core/_include/aoti_shim.def

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 ; At runtime the symbols resolve from torch_cpu.dll (loaded by 'import torch').
 ;
 ; IMPORTANT: Keep this export list in sync with the AOTI_SHIM_API declarations
-; in aoti_shim.h. build_hooks.py turns this file into the stub import library
+; in aoti_shim.h. setup.py turns this file into the stub import library
 ; that MSVC uses to link _tensor_bridge, so any added/removed/renamed AOTI
 ; symbol must be updated in both files.
 LIBRARY torch_cpu.dll

cuda_core/cuda/core/_include/aoti_shim.h

Lines changed: 4 additions & 4 deletions
@@ -52,10 +52,10 @@ typedef struct AtenTensorOpaque* AtenTensorHandle;

 /*
  * IMPORTANT: Keep the AOTI_SHIM_API declaration list below in sync with
- * aoti_shim.def. On Windows, build_hooks.py turns that .def file into the
- * stub import library that MSVC needs to link _tensor_bridge without making
- * PyTorch a build-time dependency. If you add, remove, or rename an
- * imported AOTI symbol here, update aoti_shim.def in the same change.
+ * aoti_shim.def. On Windows, setup.py generates that stub import library
+ * during build_ext so MSVC can link _tensor_bridge without making PyTorch a
+ * build-time dependency. If you add, remove, or rename an imported AOTI
+ * symbol here, update aoti_shim.def in the same change.
  *
  * Exception: aoti_torch_get_current_cuda_stream lives in torch_cuda (not
  * torch_cpu) and is resolved lazily at runtime — see _tensor_bridge.pyx.

cuda_core/cuda/core/_layout.pyx

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0

@@ -460,7 +460,7 @@ cdef class _StridedLayout:
         required_size = layout.required_size_in_bytes()
         # allocate the memory on the device
         device.set_current()
-        mem = device.allocate(required_size)
+        mem = device.allocate(required_size, stream=device.default_stream)
         # create a view on the newly allocated device memory
         b_view = StridedMemoryView.from_buffer(mem, layout, a_view.dtype)
         return b_view

cuda_core/cuda/core/_memory/_buffer.pyx

Lines changed: 47 additions & 20 deletions
@@ -24,7 +24,7 @@ from cuda.core._resource_handles cimport (
 )
 from cuda.core.typing import DevicePointerType

-from cuda.core._stream cimport Stream, Stream_accept
+from cuda.core._stream cimport Stream, Stream_accept, default_stream
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value

 import sys
@@ -49,12 +49,24 @@ cdef void _mr_dealloc_callback(
     size_t size,
     const StreamHandle& h_stream,
 ) noexcept:
-    """Called by the C++ deleter to deallocate via MemoryResource.deallocate."""
+    """Called by the C++ deleter to deallocate via MemoryResource.deallocate.
+
+    This is the C++ teardown path: there is no Python caller frame from
+    which to obtain a stream. If the device-pointer handle was created
+    without ``set_deallocation_stream`` being called (e.g. buffers minted
+    via ``Buffer.from_handle(ptr, size, mr=mr)`` from DLPack import,
+    third-party adapters, or other foreign sources), ``h_stream`` is
+    empty here. Stream-ordered MR ``deallocate`` overrides reject
+    ``stream=None`` (issue #2001), so without a fallback the destructor
+    would print a warning and leak the allocation. Fall back to the
+    legacy/per-thread default stream so the free still happens; this is
+    the unique exception to the "no implicit default-stream fallback"
+    policy because the teardown has no other source of truth.
+    """
+    cdef Stream stream
     try:
-        stream = None
-        if h_stream:
-            stream = Stream._from_handle(Stream, h_stream)
-        mr.deallocate(int(ptr), size, stream)
+        stream = Stream._from_handle(Stream, h_stream) if h_stream else default_stream()
+        mr.deallocate(int(ptr), size, stream=stream)
     except Exception as exc:
         print(f"Warning: mr.deallocate() failed during Buffer destruction: {exc}",
               file=sys.stderr)
@@ -119,7 +131,11 @@ cdef class Buffer:

     @staticmethod
     def _reduce_helper(mr, ipc_descriptor):
-        return Buffer.from_ipc_descriptor(mr, ipc_descriptor)
+        # The parent process's stream is not portable across processes, so the
+        # pickle path cannot thread an explicit stream through. Seed the
+        # imported buffer's deallocation with the current context's default
+        # stream; the receiver can override via buffer.close(stream).
+        return Buffer.from_ipc_descriptor(mr, ipc_descriptor, stream=default_stream())

     def __reduce__(self):
         # Must not serialize the parent's stream!
@@ -158,9 +174,20 @@ cdef class Buffer:
     @classmethod
     def from_ipc_descriptor(
         cls, mr: DeviceMemoryResource | PinnedMemoryResource, ipc_descriptor: IPCBufferDescriptor,
-        stream: Stream = None
+        *, stream: Stream
     ) -> Buffer:
-        """Import a buffer that was exported from another process."""
+        """Import a buffer that was exported from another process.
+
+        Parameters
+        ----------
+        mr : :obj:`~_memory.DeviceMemoryResource` | :obj:`~_memory.PinnedMemoryResource`
+            The IPC-enabled memory resource matching the exporting process.
+        ipc_descriptor : :obj:`~_memory.IPCBufferDescriptor`
+            The descriptor exported from another process.
+        stream : :obj:`~_stream.Stream`
+            Keyword-only. The stream used for asynchronous deallocation when
+            the buffer is closed or garbage collected.
+        """
         return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream)

     @property
@@ -215,7 +242,7 @@ cdef class Buffer:
         if self._memory_resource is None:
             raise ValueError("a destination buffer must be provided (this "
                              "buffer does not have a memory_resource)")
-        dst = self._memory_resource.allocate(src_size, s)
+        dst = self._memory_resource.allocate(src_size, stream=s)

         cdef size_t dst_size = dst._size
         if dst_size != src_size:
@@ -490,17 +517,17 @@ cdef class MemoryResource:
     resource's respective property.)
     """

-    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
+    def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
         """Allocate a buffer of the requested size.

         Parameters
         ----------
         size : int
             The size of the buffer to allocate, in bytes.
-        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
-            The stream on which to perform the allocation asynchronously.
-            If None, it is up to each memory resource implementation to decide
-            and document the behavior.
+        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
+            Keyword-only. The stream on which to perform the allocation
+            asynchronously. Must be passed explicitly; pass
+            ``device.default_stream`` to use the default stream.

         Returns
         -------
@@ -510,7 +537,7 @@ cdef class MemoryResource:
         """
         raise TypeError("MemoryResource.allocate must be implemented by subclasses.")

-    def deallocate(self, ptr: DevicePointerType, size_t size, stream: Stream | GraphBuilder | None = None):
+    def deallocate(self, ptr: DevicePointerType, size_t size, *, stream: Stream | GraphBuilder):
         """Deallocate a buffer previously allocated by this resource.

         Parameters
@@ -519,10 +546,10 @@
             The pointer or handle to the buffer to deallocate.
         size : int
             The size of the buffer to deallocate, in bytes.
-        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
-            The stream on which to perform the deallocation asynchronously.
-            If None, it is up to each memory resource implementation to decide
-            and document the behavior.
+        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
+            Keyword-only. The stream on which to perform the deallocation
+            asynchronously. Must be passed explicitly; pass
+            ``device.default_stream`` to use the default stream.
         """
         raise TypeError("MemoryResource.deallocate must be implemented by subclasses.")

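Taken together, the new contract for implementers looks roughly like this (a hedged sketch: the base class and ``Buffer.from_handle(ptr, size, mr=...)`` follow this diff, while the async driver bindings are hypothetical placeholders):

# Hedged sketch of a stream-ordered MemoryResource under the new contract.
from cuda.core._memory._buffer import Buffer, MemoryResource  # path per this diff


class MyAsyncMemoryResource(MemoryResource):
    def allocate(self, size, *, stream):
        # Callers must always name a stream; there is no implicit default.
        ptr = my_alloc_async(size, stream)  # hypothetical driver binding
        return Buffer.from_handle(ptr, size, mr=self)

    def deallocate(self, ptr, size, *, stream):
        # `stream` is required by the signature itself, so the only implicit
        # default-stream substitution left is the teardown fallback in
        # _mr_dealloc_callback above.
        my_free_async(ptr, size, stream)  # hypothetical driver binding

The IPC import path follows the same rule: ``Buffer.from_ipc_descriptor(mr, descriptor, stream=...)`` must name the deallocation stream explicitly, except on the pickle path, which seeds it with the importing process's default stream as shown above.
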
cuda_core/cuda/core/_memory/_graph_memory_resource.pyx

Lines changed: 7 additions & 7 deletions
@@ -14,7 +14,7 @@ from cuda.core._resource_handles cimport (
     as_cu,
 )

-from cuda.core._stream cimport default_stream, Stream_accept, Stream
+from cuda.core._stream cimport Stream_accept, Stream
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN

 from functools import cache
@@ -104,19 +104,19 @@ cdef class cyGraphMemoryResource(MemoryResource):
     def __cinit__(self, int device_id):
         self._device_id = device_id

-    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
+    def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
         """
         Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
         """
-        stream = Stream_accept(stream) if stream is not None else default_stream()
-        return GMR_allocate(self, size, <Stream> stream)
+        cdef Stream s = Stream_accept(stream)
+        return GMR_allocate(self, size, s)

-    def deallocate(self, ptr: "DevicePointerType", size_t size, stream: Stream | GraphBuilder | None = None):
+    def deallocate(self, ptr: "DevicePointerType", size_t size, *, stream: Stream | GraphBuilder):
         """
         Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
         """
-        stream = Stream_accept(stream) if stream is not None else default_stream()
-        return GMR_deallocate(ptr, size, <Stream> stream)
+        cdef Stream s = Stream_accept(stream)
+        return GMR_deallocate(ptr, size, s)

     def close(self):
         """No operation (provided for compatibility)."""
