
Commit 2849053

rwgk and cursoragent authored
test: xfail Windows MCDM mempool OOM setup failures (#2000)
* test: xfail Windows mempool OOM cases

  Work around nvbugs5815123 by treating OOM returns from mempool setup in
  affected tests as expected failures on Windows. Unsupported configurations
  still skip normally, while other platforms continue to fail on unexpected
  OOMs.

* test: limit mempool OOM xfail to MCDM

  Use NVML to confirm the CUDA device is running on Windows MCDM before
  treating mempool OOM setup failures as expected. If the MCDM check cannot
  be completed, leave the original test failure visible.

* test: centralize mempool OOM xfail helper

  Move the Windows MCDM detection and mempool OOM xfail handling into a
  shared test helper so cuda.bindings and cuda.core tests use the same
  workaround logic.

* test: keep MCDM detection fallback in xfail helper

  Let the MCDM detector report only the detected state and keep the broad
  fallback in the mempool OOM xfail path, where detection failures should
  leave the original test failure visible.

* test: simplify MCDM helper device lookup

  Use getattr for the shared mempool helper so it accepts device objects and
  raw ordinals without extra branching.

* test: restore managed helper skip naming

  Keep the established managed-memory test helper name so call sites stay
  readable, while documenting that Windows MCDM mempool OOM setup failures
  are xfailed rather than skipped.

* test: rename pinned helper for xfail flow

  Clarify pinned mempool test setup by keeping skip for capability checks
  and using xfail naming for the Windows MCDM constructor workaround.

* test: tolerate missing mempool xfail helper

  Allow cuda_core tests to run against older cuda.bindings artifacts by
  falling back when the mempool xfail helper is unavailable, so collection
  succeeds without the new OOM xfail behavior.

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent ad9bc92 · commit 2849053

6 files changed: 123 additions & 7 deletions
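At a call site, the new helper sits between the mempool setup call and its success assertion. Below is a minimal sketch of that pattern, based on the test_cuda.py hunk further down; it assumes cuda-python is installed and a CUDA context is already current, and the pool properties are illustrative rather than copied from the tests.

    from cuda.bindings import driver
    from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom

    def test_mempool_create_sketch():
        # Illustrative pool properties; real tests derive these from the device under test.
        props = driver.CUmemPoolProps()
        props.allocType = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
        props.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
        props.location.id = 0

        err, pool = driver.cuMemPoolCreate(props)
        # On a Windows MCDM device, an OOM here is the known setup failure
        # (nvbugs5815123); the helper converts it to pytest.xfail. Everywhere
        # else the helper returns and the normal assertion fires.
        xfail_if_mempool_oom(err, "cuMemPoolCreate", props.location.id)
        assert err == driver.CUresult.CUDA_SUCCESS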

New shared test helper (imported as cuda.bindings._test_helpers.mempool)

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import sys
+
+import pytest
+
+from cuda.bindings import driver, runtime
+
+
+def is_windows_mcdm_device(device=0):
+    if sys.platform != "win32":
+        return False
+    import cuda.bindings.nvml as nvml
+
+    device_id = int(getattr(device, "device_id", device))
+    (err,) = driver.cuInit(0)
+    if err != driver.CUresult.CUDA_SUCCESS:
+        return False
+    err, pci_bus_id = driver.cuDeviceGetPCIBusId(13, device_id)
+    if err != driver.CUresult.CUDA_SUCCESS:
+        return False
+    pci_bus_id = pci_bus_id.split(b"\x00", 1)[0].decode("ascii")
+    nvml.init_v2()
+    try:
+        handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)
+        current, _ = nvml.device_get_driver_model_v2(handle)
+        return current == nvml.DriverModel.DRIVER_MCDM
+    finally:
+        nvml.shutdown()
+
+
+def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
+    if api_name is not None and not isinstance(api_name, str):
+        device = api_name
+        api_name = None
+
+    is_oom = err_or_exc in (
+        driver.CUresult.CUDA_ERROR_OUT_OF_MEMORY,
+        runtime.cudaError_t.cudaErrorMemoryAllocation,
+    ) or "CUDA_ERROR_OUT_OF_MEMORY" in str(err_or_exc)
+
+    if not is_oom:
+        return
+    try:
+        is_windows_mcdm = is_windows_mcdm_device(device)
+    except Exception:
+        # If MCDM detection fails, leave the primary test failure visible.
+        return
+    if not is_windows_mcdm:
+        return
+
+    api_context = f"{api_name} " if api_name else ""
+    pytest.xfail(f"{api_context}could not reserve VA for mempool operations on Windows MCDM")
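The detector resolves its device argument to an ordinal with getattr, so it accepts both cuda.core Device objects and raw ints, then converts the ordinal to a PCI bus id via cuDeviceGetPCIBusId (the 13-byte buffer fits the "0000:00:00.0" form plus its NUL terminator) and asks NVML for that device's driver model; only DRIVER_MCDM triggers the xfail. The positional-argument shuffle at the top of xfail_if_mempool_oom lets callers pass a device where api_name would go, which the isinstance check untangles.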

cuda_bindings/tests/test_cuda.py

Lines changed: 8 additions & 0 deletions

@@ -12,6 +12,7 @@
 import cuda.bindings.driver as cuda
 import cuda.bindings.runtime as cudart
 from cuda.bindings import driver
+from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
 
 
 def driverVersionLessThan(target):
@@ -270,6 +271,7 @@ def test_cuda_memPool_attr():
 
     attr_list = [None] * 8
     err, pool = cuda.cuMemPoolCreate(poolProps)
+    xfail_if_mempool_oom(err, "cuMemPoolCreate", poolProps.location.id)
     assert err == cuda.CUresult.CUDA_SUCCESS
 
     for idx, attr in enumerate(
@@ -468,6 +470,12 @@ def test_cuda_graphMem_attr(device):
     params.bytesize = allocSize
 
     err, allocNode = cuda.cuGraphAddMemAllocNode(graph, None, 0, params)
+    if err == cuda.CUresult.CUDA_ERROR_OUT_OF_MEMORY:
+        (destroy_err,) = cuda.cuGraphDestroy(graph)
+        assert destroy_err == cuda.CUresult.CUDA_SUCCESS
+        (destroy_err,) = cuda.cuStreamDestroy(stream)
+        assert destroy_err == cuda.CUresult.CUDA_SUCCESS
+        xfail_if_mempool_oom(err, "cuGraphAddMemAllocNode", device)
     assert err == cuda.CUresult.CUDA_SUCCESS
     err, freeNode = cuda.cuGraphAddMemFreeNode(graph, [allocNode], 1, params.dptr)
     assert err == cuda.CUresult.CUDA_SUCCESS
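Note the ordering in test_cuda_graphMem_attr: the graph and stream are destroyed before xfail_if_mempool_oom is called, because pytest.xfail raises immediately and any cleanup placed after it would never run. If the OOM turns out not to be the Windows MCDM case, the helper returns and the assert on the original error still fails the test.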

cuda_bindings/tests/test_cudart.py

Lines changed: 2 additions & 0 deletions

@@ -11,6 +11,7 @@
 import cuda.bindings.runtime as cudart
 from cuda import pathfinder
 from cuda.bindings import runtime
+from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
 
 
 def isSuccess(err):
@@ -432,6 +433,7 @@ def test_cudart_MemPool_attr():
 
     attr_list = [None] * 8
     err, pool = cudart.cudaMemPoolCreate(poolProps)
+    xfail_if_mempool_oom(err, "cudaMemPoolCreate", poolProps.location.id)
     assertSuccess(err)
 
     for idx, attr in enumerate(

cuda_core/tests/conftest.py

Lines changed: 47 additions & 2 deletions

@@ -27,7 +27,17 @@
     PinnedMemoryResourceOptions,
     _device,
 )
-from cuda.core._utils.cuda_utils import handle_return
+from cuda.core._utils.cuda_utils import CUDAError, handle_return
+
+try:
+    from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
+except ModuleNotFoundError:
+    # Older cuda.bindings artifacts (for example 12.9.x backports) do not ship
+    # this helper yet. In that case, keep the primary failure visible instead of
+    # xfail-ing the known Windows MCDM mempool setup issue.
+    def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
+        return
+
 
 # Import shared test helpers for tests across subprojects.
 # PLEASE KEEP IN SYNC with copies in other conftest.py in this repo.
@@ -61,21 +71,56 @@ def skip_if_managed_memory_unsupported(device):
         pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
     try:
         ManagedMemoryResource()
+    except CUDAError as e:
+        xfail_if_mempool_oom(e, device)
+        raise
     except RuntimeError as e:
         if "requires CUDA 13.0" in str(e):
             pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
         raise
 
 
-def create_managed_memory_resource_or_skip(*args, **kwargs):
+def create_managed_memory_resource_or_skip(*args, xfail_device=None, **kwargs):
+    # Keep the established "skip" helper name for call-site readability, even though
+    # Windows MCDM mempool OOM setup failures are xfailed instead of skipped.
     try:
         return ManagedMemoryResource(*args, **kwargs)
+    except CUDAError as e:
+        xfail_if_mempool_oom(e, _device_id_from_resource_options(xfail_device, args, kwargs))
+        raise
     except RuntimeError as e:
         if "requires CUDA 13.0" in str(e):
             pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
         raise
 
 
+def create_pinned_memory_resource_or_xfail(*args, xfail_device=None, **kwargs):
+    try:
+        return PinnedMemoryResource(*args, **kwargs)
+    except CUDAError as e:
+        xfail_if_mempool_oom(e, xfail_device)
+        raise
+
+
+def _device_id_from_resource_options(device, args, kwargs):
+    if device is not None:
+        return device
+    options = kwargs.get("options")
+    if options is None and args:
+        options = args[0]
+    if options is None:
+        return 0
+    if isinstance(options, dict):
+        preferred_location = options.get("preferred_location")
+        preferred_location_type = options.get("preferred_location_type")
+    else:
+        preferred_location = getattr(options, "preferred_location", None)
+        preferred_location_type = getattr(options, "preferred_location_type", None)
+    if preferred_location_type in (None, "device") and isinstance(preferred_location, int) and preferred_location >= 0:
+        return preferred_location
+    return 0
+
+
 @pytest.fixture(scope="session", autouse=True)
 def session_setup():
     # Always init CUDA.
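_device_id_from_resource_options works through a fallback chain: an explicit xfail_device wins, then a preferred_location pulled from the resource options (passed positionally or as options=), and finally device ordinal 0. A hypothetical illustration of that chain (not part of the diff):

    _device_id_from_resource_options(3, (), {})                               # explicit device wins -> 3
    _device_id_from_resource_options(None, ({"preferred_location": 1},), {})  # taken from options -> 1
    _device_id_from_resource_options(None, (), {})                            # no hints -> 0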

cuda_core/tests/test_managed_memory_warning.py

Lines changed: 8 additions & 2 deletions

@@ -13,8 +13,10 @@
 import pytest
 
 import cuda.bindings
+from conftest import xfail_if_mempool_oom
 from cuda.core import Device, ManagedMemoryResource, ManagedMemoryResourceOptions
 from cuda.core._memory._managed_memory_resource import reset_concurrent_access_warning
+from cuda.core._utils.cuda_utils import CUDAError
 
 _cuda_major = int(cuda.bindings.__version__.split(".")[0])
 
@@ -47,8 +49,12 @@ def device_without_concurrent_managed_access(init_cuda):
 @requires_cuda_13
 def test_default_pool_error_without_concurrent_access(device_without_concurrent_managed_access):
     """ManagedMemoryResource() raises RuntimeError when the default pool doesn't support managed."""
-    with pytest.raises(RuntimeError, match="does not support managed allocations"):
-        ManagedMemoryResource()
+    try:
+        with pytest.raises(RuntimeError, match="does not support managed allocations"):
+            ManagedMemoryResource()
+    except CUDAError as exc:
+        xfail_if_mempool_oom(exc, device_without_concurrent_managed_access)
+        raise
 
 
 @requires_cuda_13
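The try/except here covers the case where ManagedMemoryResource() fails with a CUDAError (the mempool OOM) before the expected RuntimeError can be raised; that exception propagates out of the pytest.raises block, so it is inspected separately and re-raised whenever it is not the Windows MCDM case.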

cuda_core/tests/test_memory.py

Lines changed: 4 additions & 3 deletions

@@ -22,6 +22,7 @@
 
 from conftest import (
     create_managed_memory_resource_or_skip,
+    create_pinned_memory_resource_or_xfail,
     skip_if_managed_memory_unsupported,
     skip_if_pinned_memory_unsupported,
 )
@@ -639,7 +640,7 @@ def test_non_managed_resources_report_not_managed(mr_kind):
         mr = DeviceMemoryResource(device)
     else:
         skip_if_pinned_memory_unsupported(device)
-        mr = PinnedMemoryResource()
+        mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     assert mr.is_managed is False
     buf = mr.allocate(1024)
     assert buf.is_managed is False
@@ -684,7 +685,7 @@ def test_pinned_memory_resource_initialization(init_cuda):
 
     device.set_current()
 
-    mr = PinnedMemoryResource()
+    mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     assert mr.is_device_accessible
     assert mr.is_host_accessible
 
@@ -1581,7 +1582,7 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource:
         skip_if_pinned_memory_unsupported(device)
-        mr = MR()
+        mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     elif MR is ManagedMemoryResource:
         skip_if_managed_memory_unsupported(device)
         mr = create_managed_memory_resource_or_skip(MROps(preferred_location=device.device_id))
