Skip to content

Commit ac3b15d

Browse files
authored
Fully clean up buffers/memory resources in tests (#2243)
Add explicit sync points and notes around `mr.close()` to make sure that no memory resource (MR) is shut down while asynchronous buffer frees are still pending. Such shutdowns can currently cause occasional deadlocks when many MRs are in use in parallel. Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
1 parent 1e33ace commit ac3b15d

6 files changed

Lines changed: 42 additions & 4 deletions

File tree

cuda_core/tests/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,9 @@ def ipc_memory_resource(request, ipc_device):
290290
assert mr.is_ipc_enabled
291291
yield mr
292292
mr.close()
293+
# TODO(seberg): Make sure the `mr` and its buffers are fully torn down.
294+
# May be unnecessary as `mr.close()` is not parallel with other work.
295+
ipc_device.sync()
293296

294297

295298
@pytest.fixture

cuda_core/tests/memory_ipc/test_peer_access.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent):
9292
assert process.exitcode == 0
9393

9494
buffer.close()
95+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
96+
dev1.sync()
9597
mr.close()
9698

9799
def child_main(self, mr, buffer):
@@ -129,4 +131,6 @@ def child_main(self, mr, buffer):
129131
PatternGen(dev0, NBYTES).verify_buffer(buffer, seed=False)
130132

131133
buffer.close()
134+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
135+
dev1.sync()
132136
mr.close()

cuda_core/tests/memory_ipc/test_send_buffers.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def test_main(self, ipc_device, nmrs):
2929
device = ipc_device
3030
options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
3131
mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)]
32+
buffers = []
3233

3334
try:
3435
# Allocate and fill memory.
@@ -54,6 +55,10 @@ def test_main(self, ipc_device, nmrs):
5455
pgen.verify_buffer(buffer, seed=True)
5556
buffer.close()
5657
finally:
58+
for buffer in buffers:
59+
buffer.close()
60+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
61+
device.sync()
5762
for mr in mrs:
5863
mr.close()
5964

cuda_core/tests/memory_ipc/test_workerpool.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def test_main(self, ipc_device, nmrs):
3535
device = ipc_device
3636
options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
3737
mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)]
38+
buffers = []
3839

3940
try:
4041
buffers = [mr.allocate(NBYTES, stream=device.default_stream) for mr, _ in zip(cycle(mrs), range(NTASKS))]
@@ -45,8 +46,11 @@ def test_main(self, ipc_device, nmrs):
4546
pgen = PatternGen(device, NBYTES)
4647
for buffer in buffers:
4748
pgen.verify_buffer(buffer, seed=True)
48-
buffer.close()
4949
finally:
50+
for buffer in buffers:
51+
buffer.close()
52+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
53+
device.sync()
5054
for mr in mrs:
5155
mr.close()
5256

@@ -77,6 +81,7 @@ def test_main(self, ipc_device, nmrs):
7781
device = ipc_device
7882
options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
7983
mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)]
84+
buffers = []
8085

8186
try:
8287
buffers = [mr.allocate(NBYTES, stream=device.default_stream) for mr, _ in zip(cycle(mrs), range(NTASKS))]
@@ -90,8 +95,11 @@ def test_main(self, ipc_device, nmrs):
9095
pgen = PatternGen(device, NBYTES)
9196
for buffer in buffers:
9297
pgen.verify_buffer(buffer, seed=True)
93-
buffer.close()
9498
finally:
99+
for buffer in buffers:
100+
buffer.close()
101+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
102+
device.sync()
95103
for mr in mrs:
96104
mr.close()
97105

@@ -127,6 +135,7 @@ def test_main(self, ipc_device, nmrs):
127135
device = ipc_device
128136
options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
129137
mrs = [DeviceMemoryResource(device, options=options) for _ in range(nmrs)]
138+
buffers = []
130139

131140
try:
132141
buffers = [mr.allocate(NBYTES, stream=device.default_stream) for mr, _ in zip(cycle(mrs), range(NTASKS))]
@@ -137,8 +146,11 @@ def test_main(self, ipc_device, nmrs):
137146
pgen = PatternGen(device, NBYTES)
138147
for buffer in buffers:
139148
pgen.verify_buffer(buffer, seed=True)
140-
buffer.close()
141149
finally:
150+
for buffer in buffers:
151+
buffer.close()
152+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
153+
device.sync()
142154
for mr in mrs:
143155
mr.close()
144156

cuda_core/tests/test_memory.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1103,6 +1103,8 @@ def test_device_memory_resource_with_options(init_cuda):
11031103
device.sync()
11041104
dst_buffer.close()
11051105
src_buffer.close()
1106+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
1107+
device.sync()
11061108

11071109

11081110
def test_pinned_memory_resource_with_options(init_cuda):
@@ -1149,6 +1151,8 @@ def test_pinned_memory_resource_with_options(init_cuda):
11491151
device.sync()
11501152
dst_buffer.close()
11511153
src_buffer.close()
1154+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
1155+
device.sync()
11521156

11531157

11541158
def test_managed_memory_resource_with_options(init_cuda):
@@ -1365,6 +1369,8 @@ def test_mempool_ipc_errors(mempool_device):
13651369
Buffer.from_ipc_descriptor(mr, handle, stream=device.default_stream)
13661370

13671371
buffer.close()
1372+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
1373+
device.sync()
13681374

13691375

13701376
def test_pinned_mempool_ipc_basic():
@@ -1405,6 +1411,8 @@ def test_pinned_mempool_ipc_basic():
14051411
assert ipc_desc.size == 1024
14061412

14071413
buffer.close()
1414+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
1415+
device.sync()
14081416
mr.close()
14091417

14101418

@@ -1436,6 +1444,8 @@ def test_pinned_mempool_ipc_errors():
14361444
Buffer.from_ipc_descriptor(mr, handle, stream=device.default_stream)
14371445

14381446
buffer.close()
1447+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
1448+
device.sync()
14391449
mr.close()
14401450

14411451

cuda_core/tests/test_object_protocols.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,11 @@ def sample_ipc_buffer_descriptor(ipc_device):
233233
options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
234234
mr = DeviceMemoryResource(ipc_device, options=options)
235235
buf = mr.allocate(64, stream=ipc_device.default_stream)
236-
return buf.ipc_descriptor
236+
descriptor = buf.ipc_descriptor
237+
buf.close()
238+
# TODO(seberg): 2026-06: mr close may be unsafe with incomplete `buf.close()`
239+
ipc_device.sync()
240+
return descriptor
237241

238242

239243
@pytest.fixture

0 commit comments

Comments
 (0)