Skip to content

Commit 6fb39c1

Browse files
cpcloud and claude committed
Fix IPC memory pool leak in nightly repeated test runs
IPC test fixtures and test methods create DeviceMemoryResource and PinnedMemoryResource instances that are never closed. Under single-run CI this is harmless, but nightly CI runs tests with pytest-repeat --count=100, accumulating ~200+ leaked CUDA mempools and POSIX fd exports per test file until the process segfaults around iteration 83.

Convert the ipc_memory_resource fixture from return to yield with explicit close() teardown, and close locally-created memory resources in test_errors, test_send_buffers, and test_workerpool after use.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7a9a248 commit 6fb39c1

File tree

4 files changed

+16
-1
lines changed

4 files changed

+16
-1
lines changed

cuda_core/tests/conftest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,8 @@ def ipc_memory_resource(request, ipc_device):
183183
mr = PinnedMemoryResource(options=options)
184184

185185
assert mr.is_ipc_enabled
186-
return mr
186+
yield mr
187+
mr.close()
187188

188189

189190
@pytest.fixture

cuda_core/tests/memory_ipc/test_errors.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def test_main(self, ipc_device, ipc_memory_resource):
2626
# from PARENT_ACTION.
2727
self.device = ipc_device
2828
self.mr = ipc_memory_resource
29+
self._extra_mrs = []
2930

3031
# Start a child process to generate error info.
3132
pipe = [multiprocessing.Queue() for _ in range(2)]
@@ -43,6 +44,9 @@ def test_main(self, ipc_device, ipc_memory_resource):
4344
process.join(timeout=CHILD_TIMEOUT_SEC)
4445
assert process.exitcode == 0
4546

47+
for mr in self._extra_mrs:
48+
mr.close()
49+
4650
def child_main(self, pipe, device, mr):
4751
"""Child process that pushes IPC errors to a shared pipe for testing."""
4852
self.device = device
@@ -78,6 +82,7 @@ class TestImportWrongMR(ChildErrorHarness):
7882
def PARENT_ACTION(self, queue):
7983
options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
8084
mr2 = DeviceMemoryResource(self.device, options=options)
85+
self._extra_mrs.append(mr2)
8186
buffer = mr2.allocate(NBYTES)
8287
queue.put([self.mr, buffer.get_ipc_descriptor()]) # Note: mr does not own this buffer
8388

@@ -117,6 +122,7 @@ class TestDanglingBuffer(ChildErrorHarness):
117122
def PARENT_ACTION(self, queue):
118123
options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
119124
mr2 = DeviceMemoryResource(self.device, options=options)
125+
self._extra_mrs.append(mr2)
120126
self.buffer = mr2.allocate(NBYTES)
121127
buffer_s = pickle.dumps(self.buffer)
122128
queue.put(buffer_s) # Note: mr2 not sent

cuda_core/tests/memory_ipc/test_send_buffers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ def test_main(self, ipc_device, nmrs):
4545
for buffer in buffers:
4646
pgen.verify_buffer(buffer, seed=True)
4747
buffer.close()
48+
for mr in mrs:
49+
mr.close()
4850

4951
def child_main(self, device, buffers):
5052
device.set_current()

cuda_core/tests/memory_ipc/test_workerpool.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ def test_main(self, ipc_device, nmrs):
4242
for buffer in buffers:
4343
pgen.verify_buffer(buffer, seed=True)
4444
buffer.close()
45+
for mr in mrs:
46+
mr.close()
4547

4648
def process_buffer(self, buffer):
4749
device = Device(buffer.memory_resource.device_id)
@@ -82,6 +84,8 @@ def test_main(self, ipc_device, nmrs):
8284
for buffer in buffers:
8385
pgen.verify_buffer(buffer, seed=True)
8486
buffer.close()
87+
for mr in mrs:
88+
mr.close()
8589

8690
def process_buffer(self, mr_idx, buffer_desc):
8791
mr = self.mrs[mr_idx]
@@ -124,6 +128,8 @@ def test_main(self, ipc_device, nmrs):
124128
for buffer in buffers:
125129
pgen.verify_buffer(buffer, seed=True)
126130
buffer.close()
131+
for mr in mrs:
132+
mr.close()
127133

128134
def process_buffer(self, device, buffer_s):
129135
device.set_current()

0 commit comments

Comments
 (0)