Skip to content

Commit 494f53c

Browse files
Mark IPC tests as flaky and increase child timeout (#1686)
Add pytest-rerunfailures to test dependencies and apply @pytest.mark.flaky(reruns=2) to all IPC multiprocessing tests. Increase CHILD_TIMEOUT_SEC from 20 to 30 seconds to reduce spurious failures on busy CI runners.

Closes #1622

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent b6f2344 commit 494f53c

File tree

10 files changed

+44
-19
lines changed

10 files changed

+44
-19
lines changed

cuda_core/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ cu12 = ["cuda-bindings[all]==12.*"]
5555
cu13 = ["cuda-bindings[all]==13.*"]
5656

5757
[dependency-groups]
58-
test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-randomly", "pytest-repeat"]
58+
test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures"]
5959
ml-dtypes = ["ml-dtypes>=0.5.4,<0.6.0"]
6060
test-cu12 = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy
6161
test-cu13 = [ {include-group = "ml-dtypes" }, "cuda-core[test]", "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy

cuda_core/tests/memory_ipc/test_errors.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
import multiprocessing
55
import pickle
66
import re
77

8+
import pytest
89
from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions
910
from cuda.core._utils.cuda_utils import CUDAError
1011

11-
CHILD_TIMEOUT_SEC = 20
12+
CHILD_TIMEOUT_SEC = 30
1213
NBYTES = 64
1314
POOL_SIZE = 2097152
1415

@@ -17,6 +18,7 @@ class ChildErrorHarness:
1718
"""Test harness for checking errors in child processes. Subclasses override
1819
PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples)."""
1920

21+
@pytest.mark.flaky(reruns=2)
2022
def test_main(self, ipc_device, ipc_memory_resource):
2123
"""Parent process that checks child errors."""
2224
# Attach fixtures to this object for convenience. These can be accessed

cuda_core/tests/memory_ipc/test_event_ipc.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
import multiprocessing as mp
@@ -10,13 +10,14 @@
1010
from helpers.logging import TimestampedLogger
1111

1212
ENABLE_LOGGING = False # Set True for test debugging and development
13-
CHILD_TIMEOUT_SEC = 20
13+
CHILD_TIMEOUT_SEC = 30
1414
NBYTES = 64
1515

1616

1717
class TestEventIpc:
1818
"""Check the basic usage of IPC-enabled events with a latch kernel."""
1919

20+
@pytest.mark.flaky(reruns=2)
2021
def test_main(self, ipc_device, ipc_memory_resource):
2122
log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
2223
device = ipc_device
@@ -93,6 +94,7 @@ def child_main(self, log, q_in, q_out):
9394
log("done")
9495

9596

97+
@pytest.mark.flaky(reruns=2)
9698
def test_event_is_monadic(ipc_device):
9799
"""Check that IPC-enabled events are always bound and cannot be reset."""
98100
device = ipc_device
@@ -108,6 +110,7 @@ def test_event_is_monadic(ipc_device):
108110
stream.record(e)
109111

110112

113+
@pytest.mark.flaky(reruns=2)
111114
@pytest.mark.parametrize(
112115
"options", [{"ipc_enabled": True, "enable_timing": True}, EventOptions(ipc_enabled=True, enable_timing=True)]
113116
)
@@ -125,6 +128,7 @@ class TestIpcEventProperties:
125128
process.
126129
"""
127130

131+
@pytest.mark.flaky(reruns=2)
128132
@pytest.mark.parametrize("busy_waited_sync", [True, False])
129133
@pytest.mark.parametrize("use_options_cls", [True, False])
130134
@pytest.mark.parametrize("use_option_kw", [True, False])

cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
"""Test for duplicate IPC buffer imports.
@@ -16,7 +16,7 @@
1616
from cuda.core import Buffer, Device
1717
from helpers.logging import TimestampedLogger
1818

19-
CHILD_TIMEOUT_SEC = 20
19+
CHILD_TIMEOUT_SEC = 30
2020
NBYTES = 64
2121
POOL_SIZE = 2097152
2222

@@ -60,6 +60,7 @@ def _set_start_method(self):
6060
with contextlib.suppress(RuntimeError):
6161
mp.set_start_method("spawn", force=True)
6262

63+
@pytest.mark.flaky(reruns=2)
6364
def test_main(self, ipc_device, ipc_memory_resource):
6465
log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
6566
ipc_device.set_current()

cuda_core/tests/memory_ipc/test_leaks.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
import contextlib
@@ -14,7 +14,7 @@
1414
HAVE_PSUTIL = True
1515
import pytest
1616

17-
CHILD_TIMEOUT_SEC = 20
17+
CHILD_TIMEOUT_SEC = 30
1818
NBYTES = 64
1919

2020
USING_FDS = platform.system() == "Linux"
@@ -23,6 +23,7 @@
2323
)
2424

2525

26+
@pytest.mark.flaky(reruns=2)
2627
@skip_if_unrunnable
2728
def test_alloc_handle(ipc_memory_resource):
2829
"""Check for fd leaks in get_allocation_handle."""
@@ -79,6 +80,7 @@ def __reduce__(self):
7980
raise RuntimeError("Irreducible")
8081

8182

83+
@pytest.mark.flaky(reruns=2)
8284
@skip_if_unrunnable
8385
@pytest.mark.parametrize(
8486
"getobject",

cuda_core/tests/memory_ipc/test_memory_ipc.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
import multiprocessing as mp
55

6+
import pytest
67
from cuda.core import Buffer, DeviceMemoryResource
78
from helpers.buffers import PatternGen
89

9-
CHILD_TIMEOUT_SEC = 20
10+
CHILD_TIMEOUT_SEC = 30
1011
NBYTES = 64
1112
NWORKERS = 2
1213
NTASKS = 2
1314

1415

1516
class TestIpcMempool:
17+
@pytest.mark.flaky(reruns=2)
1618
def test_main(self, ipc_device, ipc_memory_resource):
1719
"""Test IPC with memory pools."""
1820
# Set up the IPC-enabled memory pool and share it.
@@ -54,6 +56,7 @@ def child_main(self, device, mr, queue):
5456

5557

5658
class TestIPCMempoolMultiple:
59+
@pytest.mark.flaky(reruns=2)
5760
def test_main(self, ipc_device, ipc_memory_resource):
5861
"""Test IPC with memory pools using multiple processes."""
5962
# Construct an IPC-enabled memory resource and share it with two children.
@@ -104,6 +107,7 @@ def child_main(self, device, mr, seed, queue):
104107

105108

106109
class TestIPCSharedAllocationHandleAndBufferDescriptors:
110+
@pytest.mark.flaky(reruns=2)
107111
def test_main(self, ipc_device, ipc_memory_resource):
108112
"""
109113
Demonstrate that a memory pool allocation handle can be reused for IPC
@@ -154,6 +158,7 @@ def child_main(self, device, alloc_handle, seed, queue):
154158

155159

156160
class TestIPCSharedAllocationHandleAndBufferObjects:
161+
@pytest.mark.flaky(reruns=2)
157162
def test_main(self, ipc_device, ipc_memory_resource):
158163
"""
159164
Demonstrate that a memory pool allocation handle can be reused for IPC

cuda_core/tests/memory_ipc/test_peer_access.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
import multiprocessing as mp
@@ -8,7 +8,7 @@
88
from cuda.core._utils.cuda_utils import CUDAError
99
from helpers.buffers import PatternGen
1010

11-
CHILD_TIMEOUT_SEC = 20
11+
CHILD_TIMEOUT_SEC = 30
1212
NBYTES = 64
1313
POOL_SIZE = 2097152
1414

@@ -19,6 +19,7 @@ class TestPeerAccessNotPreservedOnImport:
1919
is sent to another process via IPC, and that peer access can be set after import.
2020
"""
2121

22+
@pytest.mark.flaky(reruns=2)
2223
def test_main(self, mempool_device_x2):
2324
dev0, dev1 = mempool_device_x2
2425

@@ -57,6 +58,7 @@ class TestBufferPeerAccessAfterImport:
5758
setting peer access on the imported memory resource, and that access can be revoked.
5859
"""
5960

61+
@pytest.mark.flaky(reruns=2)
6062
@pytest.mark.parametrize("grant_access_in_parent", [True, False])
6163
def test_main(self, mempool_device_x2, grant_access_in_parent):
6264
dev0, dev1 = mempool_device_x2

cuda_core/tests/memory_ipc/test_send_buffers.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
import multiprocessing as mp
@@ -8,14 +8,15 @@
88
from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions
99
from helpers.buffers import PatternGen
1010

11-
CHILD_TIMEOUT_SEC = 20
11+
CHILD_TIMEOUT_SEC = 30
1212
NBYTES = 64
1313
NMRS = 3
1414
NTASKS = 7
1515
POOL_SIZE = 2097152
1616

1717

1818
class TestIpcSendBuffers:
19+
@pytest.mark.flaky(reruns=2)
1920
@pytest.mark.parametrize("nmrs", (1, NMRS))
2021
def test_main(self, ipc_device, nmrs):
2122
"""Test passing buffers sourced from multiple memory resources."""
@@ -67,6 +68,7 @@ class TestIpcReexport:
6768
re-exported from B to C.
6869
"""
6970

71+
@pytest.mark.flaky(reruns=2)
7072
def test_main(self, ipc_device, ipc_memory_resource):
7173
# Set up the device.
7274
device = ipc_device

cuda_core/tests/memory_ipc/test_serialize.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
import multiprocessing as mp
55
import multiprocessing.reduction
66
import os
77

8+
import pytest
89
from cuda.core import Buffer, Device, DeviceMemoryResource
910
from helpers.buffers import PatternGen
1011

11-
CHILD_TIMEOUT_SEC = 20
12+
CHILD_TIMEOUT_SEC = 30
1213
NBYTES = 64
1314
POOL_SIZE = 2097152
1415

@@ -21,6 +22,7 @@ class TestObjectSerializationDirect:
2122
it on the other end and demonstrate buffer sharing.
2223
"""
2324

25+
@pytest.mark.flaky(reruns=2)
2426
def test_main(self, ipc_device, ipc_memory_resource):
2527
device = ipc_device
2628
mr = ipc_memory_resource
@@ -76,6 +78,7 @@ def child_main(self, conn):
7678

7779

7880
class TestObjectSerializationWithMR:
81+
@pytest.mark.flaky(reruns=2)
7982
def test_main(self, ipc_device, ipc_memory_resource):
8083
"""Test sending IPC memory objects to a child through a queue."""
8184
device = ipc_device
@@ -131,6 +134,7 @@ class TestObjectPassing:
131134
in multiprocessing (e.g., Queue) work.
132135
"""
133136

137+
@pytest.mark.flaky(reruns=2)
134138
def test_main(self, ipc_device, ipc_memory_resource):
135139
# Define the objects.
136140
device = ipc_device

cuda_core/tests/memory_ipc/test_workerpool.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
import multiprocessing as mp
@@ -9,7 +9,7 @@
99
from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions
1010
from helpers.buffers import PatternGen
1111

12-
CHILD_TIMEOUT_SEC = 20
12+
CHILD_TIMEOUT_SEC = 30
1313
NBYTES = 64
1414
NWORKERS = 2
1515
NMRS = 3
@@ -26,6 +26,7 @@ class TestIpcWorkerPool:
2626
resource (duplicates are ignored on the receiving end).
2727
"""
2828

29+
@pytest.mark.flaky(reruns=2)
2930
@pytest.mark.parametrize("nmrs", (1, NMRS))
3031
def test_main(self, ipc_device, nmrs):
3132
device = ipc_device
@@ -62,6 +63,7 @@ def init_worker(mrs):
6263
"""Called during child process initialization to store received memory resources."""
6364
TestIpcWorkerPoolUsingIPCDescriptors.mrs = mrs
6465

66+
@pytest.mark.flaky(reruns=2)
6567
@pytest.mark.parametrize("nmrs", (1, NMRS))
6668
def test_main(self, ipc_device, nmrs):
6769
device = ipc_device
@@ -106,6 +108,7 @@ def init_worker(mrs):
106108
# Passing mrs implicitly registers them.
107109
pass
108110

111+
@pytest.mark.flaky(reruns=2)
109112
@pytest.mark.parametrize("nmrs", (1, NMRS))
110113
def test_main(self, ipc_device, nmrs):
111114
device = ipc_device

0 commit comments

Comments
 (0)