Skip to content

Commit d554acd

Browse files
Fix Buffer.from_handle(mr=...) not calling mr.deallocate() (#1619) (#1625)
The RAII resource handle migration broke the contract where Buffer.from_handle(mr=mr) calls mr.deallocate() on close or GC. Add deviceptr_create_with_mr() which invokes a registered callback at destruction time, passing the deallocation stream from the handle. Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 0a00e4c commit d554acd

File tree

8 files changed

+193
-14
lines changed

8 files changed

+193
-14
lines changed

cuda_core/cuda/core/_cpp/resource_handles.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,42 @@ DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
565565
return DevicePtrHandle(box, &box->resource);
566566
}
567567

568+
// ============================================================================
569+
// MemoryResource-owned Device Pointer Handles
570+
// ============================================================================
571+
572+
static MRDeallocCallback mr_dealloc_cb = nullptr;
573+
574+
void register_mr_dealloc_callback(MRDeallocCallback cb) {
575+
mr_dealloc_cb = cb;
576+
}
577+
578+
DevicePtrHandle deviceptr_create_with_mr(CUdeviceptr ptr, size_t size, PyObject* mr) {
579+
if (!mr) {
580+
return deviceptr_create_ref(ptr);
581+
}
582+
// GIL required when mr is provided
583+
GILAcquireGuard gil;
584+
if (!gil.acquired()) {
585+
return deviceptr_create_ref(ptr);
586+
}
587+
Py_INCREF(mr);
588+
auto box = std::shared_ptr<DevicePtrBox>(
589+
new DevicePtrBox{ptr, StreamHandle{}},
590+
[mr, size](DevicePtrBox* b) {
591+
GILAcquireGuard gil;
592+
if (gil.acquired()) {
593+
if (mr_dealloc_cb) {
594+
mr_dealloc_cb(mr, b->resource, size, b->h_stream);
595+
}
596+
Py_DECREF(mr);
597+
}
598+
delete b;
599+
}
600+
);
601+
return DevicePtrHandle(box, &box->resource);
602+
}
603+
568604
// ============================================================================
569605
// IPC Pointer Cache
570606
// ============================================================================

cuda_core/cuda/core/_cpp/resource_handles.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,23 @@ DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr);
239239
// If owner is nullptr, equivalent to deviceptr_create_ref.
240240
DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner);
241241

242+
// Callback type for MemoryResource deallocation.
243+
// Called from the shared_ptr deleter when a handle created via
244+
// deviceptr_create_with_mr is destroyed. The implementation is responsible
245+
// for converting raw C types to Python objects and calling
246+
// mr.deallocate(ptr, size, stream).
247+
using MRDeallocCallback = void (*)(PyObject* mr, CUdeviceptr ptr,
248+
size_t size, const StreamHandle& stream);
249+
250+
// Register the MR deallocation callback.
251+
void register_mr_dealloc_callback(MRDeallocCallback cb);
252+
253+
// Create a device pointer handle whose destructor calls mr.deallocate()
254+
// via the registered callback. The mr's refcount is incremented and
255+
// decremented when the handle is released.
256+
// If mr is nullptr, equivalent to deviceptr_create_ref.
257+
DevicePtrHandle deviceptr_create_with_mr(CUdeviceptr ptr, size_t size, PyObject* mr);
258+
242259
// Import a device pointer from IPC via cuMemPoolImportPointer.
243260
// When the last reference is released, cuMemFreeAsync is called on the stored stream.
244261
// Note: Does not yet implement reference counting for nvbug 5570902.

cuda_core/cuda/core/_memory/_buffer.pyx

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@ from cuda.core._resource_handles cimport (
1717
DevicePtrHandle,
1818
StreamHandle,
1919
deviceptr_create_with_owner,
20+
deviceptr_create_with_mr,
21+
register_mr_dealloc_callback,
2022
as_intptr,
2123
as_cu,
2224
set_deallocation_stream,
2325
)
2426

25-
from cuda.core._stream cimport Stream_accept, Stream
27+
from cuda.core._stream cimport Stream, Stream_accept
2628
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
2729

2830
import sys
@@ -37,6 +39,30 @@ from cuda.core._dlpack import DLDeviceType, make_py_capsule
3739
from cuda.core._utils.cuda_utils import driver
3840
from cuda.core._device import Device
3941

42+
43+
# =============================================================================
44+
# MR deallocation callback (invoked from C++ shared_ptr deleter)
45+
# =============================================================================
46+
47+
cdef void _mr_dealloc_callback(
48+
object mr,
49+
cydriver.CUdeviceptr ptr,
50+
size_t size,
51+
const StreamHandle& h_stream,
52+
) noexcept:
53+
"""Called by the C++ deleter to deallocate via MemoryResource.deallocate."""
54+
try:
55+
stream = None
56+
if h_stream:
57+
stream = Stream._from_handle(Stream, h_stream)
58+
mr.deallocate(int(ptr), size, stream)
59+
except Exception as exc:
60+
print(f"Warning: mr.deallocate() failed during Buffer destruction: {exc}",
61+
file=sys.stderr)
62+
63+
register_mr_dealloc_callback(_mr_dealloc_callback)
64+
65+
4066
__all__ = ['Buffer', 'MemoryResource']
4167

4268

@@ -73,27 +99,30 @@ cdef class Buffer:
7399
@classmethod
74100
def _init(
75101
cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None,
76-
stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None,
102+
ipc_descriptor: IPCBufferDescriptor | None = None,
77103
owner : object | None = None
78104
):
79-
"""Legacy init for compatibility - creates a non-owning ref handle.
105+
"""Create a Buffer from a raw pointer.
80106
81-
Note: The stream parameter is accepted for API compatibility but is
82-
ignored since non-owning refs are never freed by the handle.
107+
When ``mr`` is provided, the buffer takes ownership: ``mr.deallocate()``
108+
is called when the buffer is closed or garbage collected. When ``owner``
109+
is provided, the owner is kept alive but no deallocation is performed.
83110
"""
84111
if mr is not None and owner is not None:
85112
raise ValueError("owner and memory resource cannot be both specified together")
86113
cdef Buffer self = Buffer.__new__(cls)
87-
self._h_ptr = deviceptr_create_with_owner(<uintptr_t>(int(ptr)), owner)
114+
cdef uintptr_t c_ptr = <uintptr_t>(int(ptr))
115+
if mr is not None:
116+
self._h_ptr = deviceptr_create_with_mr(c_ptr, size, mr)
117+
else:
118+
self._h_ptr = deviceptr_create_with_owner(c_ptr, owner)
88119
self._size = size
89120
self._memory_resource = mr
90121
self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
91122
self._owner = owner
92123
self._mem_attrs_inited = False
93124
return self
94125

95-
# No __dealloc__ needed - RAII handles cleanup via _h_ptr destructor
96-
97126
def __reduce__(self):
98127
# Must not serialize the parent's stream!
99128
return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor())
@@ -112,16 +141,19 @@ cdef class Buffer:
112141
size : int
113142
Memory size of the buffer
114143
mr : :obj:`~_memory.MemoryResource`, optional
115-
Memory resource associated with the buffer
144+
Memory resource associated with the buffer. When provided,
145+
:meth:`MemoryResource.deallocate` is called when the buffer is
146+
closed or garbage collected.
116147
owner : object, optional
117148
An object holding external allocation that the ``ptr`` points to.
118149
The reference is kept as long as the buffer is alive.
119150
The ``owner`` and ``mr`` cannot be specified together.
120151

121152
Note
122153
----
123-
This creates a non-owning reference. The pointer will NOT be freed
124-
when the Buffer is closed or garbage collected.
154+
When neither ``mr`` nor ``owner`` is specified, this creates a
155+
non-owning reference. The pointer will NOT be freed when the
156+
:class:`Buffer` is closed or garbage collected.
125157
"""
126158
return Buffer._init(ptr, size, mr=mr, owner=owner)
127159

cuda_core/cuda/core/_memory/_legacy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def allocate(self, size, stream=None) -> Buffer:
5151
raise_if_driver_error(err)
5252
else:
5353
ptr = 0
54-
return Buffer._init(ptr, size, self, stream)
54+
return Buffer._init(ptr, size, self)
5555

5656
def deallocate(self, ptr: DevicePointerT, size, stream):
5757
"""Deallocate a buffer previously allocated by this resource.
@@ -106,7 +106,7 @@ def allocate(self, size, stream=None) -> Buffer:
106106
raise_if_driver_error(err)
107107
else:
108108
ptr = 0
109-
return Buffer._init(ptr, size, self, stream)
109+
return Buffer._init(ptr, size, self)
110110

111111
def deallocate(self, ptr, size, stream):
112112
if stream is not None:

cuda_core/cuda/core/_resource_handles.pxd

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,15 @@ cdef DevicePtrHandle deviceptr_alloc(size_t size) except+ nogil
108108
cdef DevicePtrHandle deviceptr_alloc_host(size_t size) except+ nogil
109109
cdef DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) except+ nogil
110110
cdef DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner) except+ nogil
111+
cdef DevicePtrHandle deviceptr_create_with_mr(
112+
cydriver.CUdeviceptr ptr, size_t size, object mr) except+ nogil
113+
114+
# MR deallocation callback type and registration
115+
ctypedef void (*MRDeallocCallback)(
116+
object mr, cydriver.CUdeviceptr ptr, size_t size,
117+
const StreamHandle& stream) noexcept
118+
cdef void register_mr_dealloc_callback(MRDeallocCallback cb) noexcept
119+
111120
cdef DevicePtrHandle deviceptr_import_ipc(
112121
const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) except+ nogil
113122
cdef StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil

cuda_core/cuda/core/_resource_handles.pyx

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,16 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
9292
cydriver.CUdeviceptr ptr) except+ nogil
9393
DevicePtrHandle deviceptr_create_with_owner "cuda_core::deviceptr_create_with_owner" (
9494
cydriver.CUdeviceptr ptr, object owner) except+ nogil
95+
96+
# MR deallocation callback
97+
ctypedef void (*MRDeallocCallback)(
98+
object mr, cydriver.CUdeviceptr ptr, size_t size,
99+
const StreamHandle& stream) noexcept
100+
void register_mr_dealloc_callback "cuda_core::register_mr_dealloc_callback" (
101+
MRDeallocCallback cb) noexcept
102+
DevicePtrHandle deviceptr_create_with_mr "cuda_core::deviceptr_create_with_mr" (
103+
cydriver.CUdeviceptr ptr, size_t size, object mr) except+ nogil
104+
95105
DevicePtrHandle deviceptr_import_ipc "cuda_core::deviceptr_import_ipc" (
96106
const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) except+ nogil
97107
StreamHandle deallocation_stream "cuda_core::deallocation_stream" (

cuda_core/tests/helpers/buffers.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"DummyUnifiedMemoryResource",
1515
"make_scratch_buffer",
1616
"PatternGen",
17+
"TrackingMR",
1718
]
1819

1920

@@ -41,6 +42,37 @@ def device_id(self) -> int:
4142
return self.device
4243

4344

45+
class TrackingMR(MemoryResource):
46+
"""A MemoryResource that tracks active allocations via a dict.
47+
48+
Useful for verifying that deallocate is called at the expected time.
49+
"""
50+
51+
def __init__(self):
52+
self.active = {}
53+
54+
def allocate(self, size, stream=None):
55+
ptr = handle_return(driver.cuMemAlloc(size))
56+
self.active[int(ptr)] = size
57+
return Buffer.from_handle(ptr=ptr, size=size, mr=self)
58+
59+
def deallocate(self, ptr, size, stream=None):
60+
handle_return(driver.cuMemFree(ptr))
61+
del self.active[int(ptr)]
62+
63+
@property
64+
def is_device_accessible(self):
65+
return True
66+
67+
@property
68+
def is_host_accessible(self):
69+
return False
70+
71+
@property
72+
def device_id(self):
73+
return 0
74+
75+
4476
class PatternGen:
4577
"""
4678
Provides methods to fill a target buffer with known test patterns and

cuda_core/tests/test_memory.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
from cuda.core._utils.cuda_utils import CUDAError, handle_return
4040
from cuda.core.utils import StridedMemoryView
4141
from helpers import IS_WINDOWS, supports_ipc_mempool
42-
from helpers.buffers import DummyUnifiedMemoryResource
42+
from helpers.buffers import DummyUnifiedMemoryResource, TrackingMR
4343

4444
from conftest import (
4545
create_managed_memory_resource_or_skip,
@@ -471,6 +471,49 @@ def test_buffer_external_managed(change_device):
471471
handle_return(driver.cuMemFree(ptr))
472472

473473

474+
def test_mr_deallocate_called_on_close():
475+
"""Buffer.from_handle(mr=mr) calls mr.deallocate() on close (issue #1619)."""
476+
device = Device()
477+
device.set_current()
478+
mr = TrackingMR()
479+
buf = mr.allocate(1024)
480+
assert len(mr.active) == 1
481+
buf.close()
482+
assert len(mr.active) == 0
483+
484+
485+
def test_mr_deallocate_called_on_gc():
486+
"""Buffer.from_handle(mr=mr) calls mr.deallocate() on GC (issue #1619)."""
487+
import gc
488+
489+
device = Device()
490+
device.set_current()
491+
mr = TrackingMR()
492+
buf = mr.allocate(1024)
493+
assert len(mr.active) == 1
494+
del buf
495+
gc.collect()
496+
assert len(mr.active) == 0
497+
498+
499+
def test_mr_deallocate_receives_stream():
500+
"""Buffer.close(stream) forwards the stream to mr.deallocate() (issue #1619)."""
501+
device = Device()
502+
device.set_current()
503+
stream = device.create_stream()
504+
received = {}
505+
506+
class StreamCaptureMR(TrackingMR):
507+
def deallocate(self, ptr, size, stream=None):
508+
received["stream"] = stream
509+
super().deallocate(ptr, size, stream)
510+
511+
mr = StreamCaptureMR()
512+
buf = mr.allocate(1024)
513+
buf.close(stream)
514+
assert received["stream"].handle == stream.handle
515+
516+
474517
def test_memory_resource_and_owner_disallowed():
475518
with pytest.raises(ValueError, match="cannot be both specified together"):
476519
a = (ctypes.c_byte * 20)()

0 commit comments

Comments
 (0)