Commit b7f6e8c

Merge remote-tracking branch 'origin/main' into ajost/peer-accessible-by-set-proxy
Co-authored-by: Cursor <cursoragent@cursor.com>
2 parents e95b505 + 50c19d0 commit b7f6e8c

44 files changed: 546 additions & 319 deletions


cuda_core/build_hooks.py

Lines changed: 1 addition & 24 deletions
@@ -11,7 +11,6 @@
 import glob
 import os
 import re
-import subprocess
 import sys
 import tempfile
 import zipfile
@@ -185,28 +184,6 @@ def get_sources(mod_name):
 # related to free-threading builds.
 extra_compile_args += ["-DCYTHON_TRACE_NOGIL=1", "-DCYTHON_USE_SYS_MONITORING=0"]

-# On Windows, _tensor_bridge.pyx needs a stub import library so the MSVC
-# linker can resolve the AOTI symbols (they live in torch_cpu.dll at
-# runtime). We generate the .lib from a .def file at build time.
-# Note: aoti_torch_get_current_cuda_stream lives in torch_cuda.dll and
-# is resolved lazily at runtime (not via the stub lib) — see
-# _tensor_bridge.pyx.
-_aoti_extra_link_args = []
-if sys.platform == "win32":
-    _def_file = os.path.join("cuda", "core", "_include", "aoti_shim.def")
-    _lib_file = os.path.join("build", "aoti_shim.lib")
-    os.makedirs("build", exist_ok=True)
-    subprocess.check_call(  # noqa: S603
-        ["lib", f"/DEF:{_def_file}", f"/OUT:{_lib_file}", "/MACHINE:X64"],  # noqa: S607
-        stdout=subprocess.DEVNULL,
-    )
-    _aoti_extra_link_args = [_lib_file]
-
-def get_extra_link_args(mod_name):
-    if mod_name == "_tensor_bridge" and _aoti_extra_link_args:
-        return extra_link_args + _aoti_extra_link_args
-    return extra_link_args
-
 ext_modules = tuple(
     Extension(
         f"cuda.core.{mod.replace(os.path.sep, '.')}",
@@ -218,7 +195,7 @@ def get_extra_link_args(mod_name):
         + all_include_dirs,
         language="c++",
         extra_compile_args=extra_compile_args,
-        extra_link_args=get_extra_link_args(mod),
+        extra_link_args=extra_link_args,
     )
     for mod in module_names()
 )
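
For context, the step removed here is not gone: per the comment updates in aoti_shim.def and aoti_shim.h further down, the stub-library generation now runs from setup.py during build_ext. A minimal sketch of what that step does, assuming only that MSVC's `lib` tool is on PATH (the helper name and output path are illustrative, not the actual setup.py code):

# Hedged sketch: turn a .def export list into a stub import library (.lib).
# Assumes MSVC's `lib` tool is on PATH; helper name and paths are illustrative.
import os
import subprocess
import sys


def build_aoti_stub_lib(def_file: str, out_dir: str = "build") -> str:
    if sys.platform != "win32":
        raise RuntimeError("stub import libraries are only needed for MSVC")
    os.makedirs(out_dir, exist_ok=True)
    lib_file = os.path.join(out_dir, "aoti_shim.lib")
    # `lib /DEF:... /OUT:... /MACHINE:X64` emits an import library containing
    # no object code; the symbols resolve from torch_cpu.dll at runtime.
    subprocess.check_call(
        ["lib", f"/DEF:{def_file}", f"/OUT:{lib_file}", "/MACHINE:X64"],
        stdout=subprocess.DEVNULL,
    )
    return lib_file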

cuda_core/cuda/core/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -57,10 +57,6 @@ def _import_versioned_module():
         VirtualMemoryResource,
         VirtualMemoryResourceOptions,
     )
-    from cuda.core._memoryview import (
-        StridedMemoryView,
-        args_viewable_as_strided_memory,
-    )
     from cuda.core._module import Kernel, ObjectCode
     from cuda.core._program import Program, ProgramOptions
     from cuda.core._stream import (

cuda_core/cuda/core/_device.pyx

Lines changed: 6 additions & 7 deletions
@@ -1394,14 +1394,12 @@ class Device:
         cdef Context ctx = self._context
         return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True)

-    def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
+    def allocate(self, size, *, stream: Stream | GraphBuilder) -> Buffer:
         """Allocate device memory from a specified stream.

         Allocates device memory of `size` bytes on the specified `stream`
         using the memory resource currently associated with this Device.

-        Parameter `stream` is optional, using a default stream by default.
-
         Note
         ----
         Device must be initialized.
@@ -1410,9 +1408,10 @@ class Device:
         ----------
         size : int
             Number of bytes to allocate.
-        stream : :obj:`~_stream.Stream`, optional
-            The stream establishing the stream ordering semantic.
-            Default value of `None` uses default stream.
+        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
+            Keyword-only. The stream establishing the stream ordering semantic.
+            Must be passed explicitly; pass ``self.default_stream`` to use
+            the default stream.

         Returns
         -------
@@ -1421,7 +1420,7 @@ class Device:

         """
         self._check_context_initialized()
-        return self.memory_resource.allocate(size, stream)
+        return self.memory_resource.allocate(size, stream=stream)

     def sync(self):
         """Synchronize the device.

cuda_core/cuda/core/_device_resources.pyx

Lines changed: 18 additions & 1 deletion
@@ -106,11 +106,18 @@ cdef class SMResourceOptions:
         Preferred co-scheduled SM count; the driver tries to satisfy
         this but may fall back to ``coscheduled_sm_count``.
         (Default to ``None``)
+    backfill : bool or Sequence[bool], optional
+        If ``True``, allow the driver to relax the co-scheduling
+        constraint when assigning SMs. This enables requesting
+        arbitrary aligned SM counts that the driver would otherwise
+        reject due to hardware topology constraints.
+        (Default to ``False``)
     """

     count: int | SequenceABC | None = None
     coscheduled_sm_count: int | SequenceABC | None = None
     preferred_coscheduled_sm_count: int | SequenceABC | None = None
+    backfill: bool | SequenceABC = False


 @dataclass
@@ -172,6 +179,12 @@ cdef inline int _resolve_group_count(SMResourceOptions options) except?-1:
         n_groups,
         count_is_scalar,
     )
+    _validate_split_field_length(
+        options.backfill,
+        "backfill",
+        n_groups,
+        count_is_scalar,
+    )
     return n_groups


@@ -243,6 +256,7 @@ IF CUDA_CORE_BUILD_MAJOR >= 13:
         cdef list counts = _broadcast_field(options.count, n_groups)
         cdef list coscheduled = _broadcast_field(options.coscheduled_sm_count, n_groups)
         cdef list preferred = _broadcast_field(options.preferred_coscheduled_sm_count, n_groups)
+        cdef list backfills = _broadcast_field(options.backfill, n_groups)
         cdef int i

         for i in range(n_groups):
@@ -252,7 +266,10 @@ IF CUDA_CORE_BUILD_MAJOR >= 13:
                 params[i].coscheduledSmCount = <unsigned int>(coscheduled[i])
             if preferred[i] is not None:
                 params[i].preferredCoscheduledSmCount = <unsigned int>(preferred[i])
-            params[i].flags = 0
+            params[i].flags = (
+                cydriver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_BACKFILL
+                if backfills[i] else 0
+            )
         return 0

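A hedged sketch of how the new field composes with the existing broadcast semantics (class name and defaults are from this diff; the import path and any consuming API are assumptions):

# Hedged sketch; class and defaults per this diff, import path assumed.
from cuda.core._device_resources import SMResourceOptions

# Scalar fields broadcast across all groups, sequences are per-group, and
# lengths are validated by _validate_split_field_length as shown above.
opts = SMResourceOptions(
    count=[48, 16],
    backfill=[True, False],  # only the first group may relax co-scheduling
)

# A scalar also works: request backfill for every group.
opts_all = SMResourceOptions(count=32, backfill=True)
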
cuda_core/cuda/core/_graphics.pyx

Lines changed: 7 additions & 6 deletions
@@ -12,7 +12,7 @@ from cuda.core._resource_handles cimport (
     as_intptr,
 )
 from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle
-from cuda.core._stream cimport Stream, Stream_accept, default_stream
+from cuda.core._stream cimport Stream, Stream_accept
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN

 __all__ = ['GraphicsResource']
@@ -206,7 +206,7 @@ cdef class GraphicsResource:
             return None
         return self._mapped_buffer

-    def map(self, *, stream: Stream | None = None) -> Buffer:
+    def map(self, *, stream: Stream) -> Buffer:
         """Map this graphics resource for CUDA access.

         After mapping, a CUDA device pointer into the underlying graphics
@@ -220,9 +220,10 @@ cdef class GraphicsResource:

         Parameters
         ----------
-        stream : :class:`~cuda.core.Stream`, optional
-            The CUDA stream on which to perform the mapping. If ``None``,
-            the current default stream is used.
+        stream : :class:`~cuda.core.Stream`
+            Keyword-only. The CUDA stream on which to perform the mapping.
+            Must be passed explicitly; pass ``device.default_stream`` to use
+            the default stream.

         Returns
         -------
@@ -248,7 +249,7 @@ cdef class GraphicsResource:
         if self._get_mapped_buffer() is not None:
             raise RuntimeError("GraphicsResource is already mapped")

-        s_obj = default_stream() if stream is None else Stream_accept(stream)
+        s_obj = Stream_accept(stream)
         raw = as_cu(self._handle)
         cy_stream = as_cu(s_obj._h_stream)
         with nogil:
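
A hedged usage sketch of the new contract (how the GraphicsResource is obtained is elided; the interop registration helper and the `unmap` signature are assumptions):

# Hedged sketch; `make_graphics_resource` is a hypothetical stand-in for
# whatever interop registration produces a GraphicsResource.
from cuda.core import Device  # import path assumed

dev = Device()
dev.set_current()
resource = make_graphics_resource()  # hypothetical

buf = resource.map(stream=dev.default_stream)  # stream must now be explicit
try:
    ...  # kernels reading/writing buf, ordered on the same stream
finally:
    resource.unmap(stream=dev.default_stream)  # unmap signature assumed to mirror map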

cuda_core/cuda/core/_include/aoti_shim.def

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 ; At runtime the symbols resolve from torch_cpu.dll (loaded by 'import torch').
 ;
 ; IMPORTANT: Keep this export list in sync with the AOTI_SHIM_API declarations
-; in aoti_shim.h. build_hooks.py turns this file into the stub import library
+; in aoti_shim.h. setup.py turns this file into the stub import library
 ; that MSVC uses to link _tensor_bridge, so any added/removed/renamed AOTI
 ; symbol must be updated in both files.
 LIBRARY torch_cpu.dll

cuda_core/cuda/core/_include/aoti_shim.h

Lines changed: 4 additions & 4 deletions
@@ -52,10 +52,10 @@ typedef struct AtenTensorOpaque* AtenTensorHandle;

 /*
  * IMPORTANT: Keep the AOTI_SHIM_API declaration list below in sync with
- * aoti_shim.def. On Windows, build_hooks.py turns that .def file into the
- * stub import library that MSVC needs to link _tensor_bridge without making
- * PyTorch a build-time dependency. If you add, remove, or rename an
- * imported AOTI symbol here, update aoti_shim.def in the same change.
+ * aoti_shim.def. On Windows, setup.py generates that stub import library
+ * during build_ext so MSVC can link _tensor_bridge without making PyTorch a
+ * build-time dependency. If you add, remove, or rename an imported AOTI
+ * symbol here, update aoti_shim.def in the same change.
  *
  * Exception: aoti_torch_get_current_cuda_stream lives in torch_cuda (not
  * torch_cpu) and is resolved lazily at runtime — see _tensor_bridge.pyx.

cuda_core/cuda/core/_layout.pyx

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0

@@ -460,7 +460,7 @@ cdef class _StridedLayout:
         required_size = layout.required_size_in_bytes()
         # allocate the memory on the device
         device.set_current()
-        mem = device.allocate(required_size)
+        mem = device.allocate(required_size, stream=device.default_stream)
         # create a view on the newly allocated device memory
         b_view = StridedMemoryView.from_buffer(mem, layout, a_view.dtype)
         return b_view

cuda_core/cuda/core/_memory/_buffer.pyx

Lines changed: 47 additions & 20 deletions
@@ -24,7 +24,7 @@ from cuda.core._resource_handles cimport (
 )
 from cuda.core.typing import DevicePointerType

-from cuda.core._stream cimport Stream, Stream_accept
+from cuda.core._stream cimport Stream, Stream_accept, default_stream
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value

 import sys
@@ -49,12 +49,24 @@ cdef void _mr_dealloc_callback(
     size_t size,
     const StreamHandle& h_stream,
 ) noexcept:
-    """Called by the C++ deleter to deallocate via MemoryResource.deallocate."""
+    """Called by the C++ deleter to deallocate via MemoryResource.deallocate.
+
+    This is the C++ teardown path: there is no Python caller frame from
+    which to obtain a stream. If the device-pointer handle was created
+    without ``set_deallocation_stream`` being called (e.g. buffers minted
+    via ``Buffer.from_handle(ptr, size, mr=mr)`` from DLPack import,
+    third-party adapters, or other foreign sources), ``h_stream`` is
+    empty here. Stream-ordered MR ``deallocate`` overrides reject
+    ``stream=None`` (issue #2001), so without a fallback the destructor
+    would print a warning and leak the allocation. Fall back to the
+    legacy/per-thread default stream so the free still happens; this is
+    the unique exception to the "no implicit default-stream fallback"
+    policy because the teardown has no other source of truth.
+    """
+    cdef Stream stream
     try:
-        stream = None
-        if h_stream:
-            stream = Stream._from_handle(Stream, h_stream)
-        mr.deallocate(int(ptr), size, stream)
+        stream = Stream._from_handle(Stream, h_stream) if h_stream else default_stream()
+        mr.deallocate(int(ptr), size, stream=stream)
     except Exception as exc:
         print(f"Warning: mr.deallocate() failed during Buffer destruction: {exc}",
               file=sys.stderr)
@@ -119,7 +131,11 @@ cdef class Buffer:

     @staticmethod
     def _reduce_helper(mr, ipc_descriptor):
-        return Buffer.from_ipc_descriptor(mr, ipc_descriptor)
+        # The parent process's stream is not portable across processes, so the
+        # pickle path cannot thread an explicit stream through. Seed the
+        # imported buffer's deallocation with the current context's default
+        # stream; the receiver can override via buffer.close(stream).
+        return Buffer.from_ipc_descriptor(mr, ipc_descriptor, stream=default_stream())

     def __reduce__(self):
         # Must not serialize the parent's stream!
@@ -158,9 +174,20 @@ cdef class Buffer:
     @classmethod
     def from_ipc_descriptor(
         cls, mr: DeviceMemoryResource | PinnedMemoryResource, ipc_descriptor: IPCBufferDescriptor,
-        stream: Stream = None
+        *, stream: Stream
     ) -> Buffer:
-        """Import a buffer that was exported from another process."""
+        """Import a buffer that was exported from another process.
+
+        Parameters
+        ----------
+        mr : :obj:`~_memory.DeviceMemoryResource` | :obj:`~_memory.PinnedMemoryResource`
+            The IPC-enabled memory resource matching the exporting process.
+        ipc_descriptor : :obj:`~_memory.IPCBufferDescriptor`
+            The descriptor exported from another process.
+        stream : :obj:`~_stream.Stream`
+            Keyword-only. The stream used for asynchronous deallocation when
+            the buffer is closed or garbage collected.
+        """
         return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream)

     @property
@@ -215,7 +242,7 @@ cdef class Buffer:
         if self._memory_resource is None:
             raise ValueError("a destination buffer must be provided (this "
                              "buffer does not have a memory_resource)")
-        dst = self._memory_resource.allocate(src_size, s)
+        dst = self._memory_resource.allocate(src_size, stream=s)

         cdef size_t dst_size = dst._size
         if dst_size != src_size:
@@ -490,17 +517,17 @@ cdef class MemoryResource:
     resource's respective property.)
     """

-    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
+    def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
         """Allocate a buffer of the requested size.

         Parameters
         ----------
         size : int
             The size of the buffer to allocate, in bytes.
-        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
-            The stream on which to perform the allocation asynchronously.
-            If None, it is up to each memory resource implementation to decide
-            and document the behavior.
+        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
+            Keyword-only. The stream on which to perform the allocation
+            asynchronously. Must be passed explicitly; pass
+            ``device.default_stream`` to use the default stream.

         Returns
         -------
@@ -510,7 +537,7 @@ cdef class MemoryResource:
         """
         raise TypeError("MemoryResource.allocate must be implemented by subclasses.")

-    def deallocate(self, ptr: DevicePointerType, size_t size, stream: Stream | GraphBuilder | None = None):
+    def deallocate(self, ptr: DevicePointerType, size_t size, *, stream: Stream | GraphBuilder):
         """Deallocate a buffer previously allocated by this resource.

         Parameters
@@ -519,10 +546,10 @@
             The pointer or handle to the buffer to deallocate.
         size : int
             The size of the buffer to deallocate, in bytes.
-        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional
-            The stream on which to perform the deallocation asynchronously.
-            If None, it is up to each memory resource implementation to decide
-            and document the behavior.
+        stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`
+            Keyword-only. The stream on which to perform the deallocation
+            asynchronously. Must be passed explicitly; pass
+            ``device.default_stream`` to use the default stream.
         """
         raise TypeError("MemoryResource.deallocate must be implemented by subclasses.")

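Taken together, the new contract for implementers looks roughly like this (a hedged sketch: the base class and ``Buffer.from_handle(ptr, size, mr=...)`` follow this diff, while the async driver bindings are hypothetical placeholders):

# Hedged sketch of a stream-ordered MemoryResource under the new contract.
from cuda.core._memory._buffer import Buffer, MemoryResource  # path per this diff


class MyAsyncMemoryResource(MemoryResource):
    def allocate(self, size, *, stream):
        # Callers must always name a stream; there is no implicit default.
        ptr = my_alloc_async(size, stream)  # hypothetical driver binding
        return Buffer.from_handle(ptr, size, mr=self)

    def deallocate(self, ptr, size, *, stream):
        # `stream` is required by the signature itself, so the only implicit
        # default-stream substitution left is the teardown fallback in
        # _mr_dealloc_callback above.
        my_free_async(ptr, size, stream)  # hypothetical driver binding

The IPC import path follows the same rule: ``Buffer.from_ipc_descriptor(mr, descriptor, stream=...)`` must name the deallocation stream explicitly, except on the pickle path, which seeds it with the importing process's default stream as shown above.
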
cuda_core/cuda/core/_memory/_graph_memory_resource.pyx

Lines changed: 7 additions & 7 deletions
@@ -14,7 +14,7 @@ from cuda.core._resource_handles cimport (
     as_cu,
 )

-from cuda.core._stream cimport default_stream, Stream_accept, Stream
+from cuda.core._stream cimport Stream_accept, Stream
 from cuda.core._utils.cuda_utils cimport HANDLE_RETURN

 from functools import cache
@@ -104,19 +104,19 @@ cdef class cyGraphMemoryResource(MemoryResource):
     def __cinit__(self, int device_id):
         self._device_id = device_id

-    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
+    def allocate(self, size_t size, *, stream: Stream | GraphBuilder) -> Buffer:
         """
         Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
         """
-        stream = Stream_accept(stream) if stream is not None else default_stream()
-        return GMR_allocate(self, size, <Stream> stream)
+        cdef Stream s = Stream_accept(stream)
+        return GMR_allocate(self, size, s)

-    def deallocate(self, ptr: "DevicePointerType", size_t size, stream: Stream | GraphBuilder | None = None):
+    def deallocate(self, ptr: "DevicePointerType", size_t size, *, stream: Stream | GraphBuilder):
         """
         Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`.
         """
-        stream = Stream_accept(stream) if stream is not None else default_stream()
-        return GMR_deallocate(ptr, size, <Stream> stream)
+        cdef Stream s = Stream_accept(stream)
+        return GMR_deallocate(ptr, size, s)

     def close(self):
         """No operation (provided for compatibility)."""
