
Commit 2849053

rwgk and cursoragent authored
test: xfail Windows MCDM mempool OOM setup failures (#2000)
* test: xfail Windows mempool OOM cases

  Work around nvbugs5815123 by treating OOM returns from mempool setup in
  affected tests as expected failures on Windows. Unsupported configurations
  still skip normally, while other platforms continue to fail on unexpected
  OOMs.

* test: limit mempool OOM xfail to MCDM

  Use NVML to confirm the CUDA device is running on Windows MCDM before
  treating mempool OOM setup failures as expected. If the MCDM check cannot
  be completed, leave the original test failure visible.

* test: centralize mempool OOM xfail helper

  Move the Windows MCDM detection and mempool OOM xfail handling into a
  shared test helper so cuda.bindings and cuda.core tests use the same
  workaround logic.

* test: keep MCDM detection fallback in xfail helper

  Let the MCDM detector report only the detected state and keep the broad
  fallback in the mempool OOM xfail path, where detection failures should
  leave the original test failure visible.

* test: simplify MCDM helper device lookup

  Use getattr for the shared mempool helper so it accepts device objects and
  raw ordinals without extra branching.

* test: restore managed helper skip naming

  Keep the established managed-memory test helper name so call sites stay
  readable, while documenting that Windows MCDM mempool OOM setup failures
  are xfailed rather than skipped.

* test: rename pinned helper for xfail flow

  Clarify pinned mempool test setup by keeping skip for capability checks
  and using xfail naming for the Windows MCDM constructor workaround.

* test: tolerate missing mempool xfail helper

  Allow cuda_core tests to run against older cuda.bindings artifacts by
  falling back when the mempool xfail helper is unavailable, so collection
  succeeds without the new OOM xfail behavior.

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent ad9bc92 · commit 2849053

6 files changed: 123 additions & 7 deletions
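At a call site, the new helper sits between the mempool setup call and its success assertion. Below is a minimal sketch of that pattern, based on the test_cuda.py hunk further down; it assumes cuda-python is installed and a CUDA context is already current, and the pool properties are illustrative rather than copied from the tests.

    from cuda.bindings import driver
    from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom

    def test_mempool_create_sketch():
        # Illustrative pool properties; real tests derive these from the device under test.
        props = driver.CUmemPoolProps()
        props.allocType = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
        props.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
        props.location.id = 0

        err, pool = driver.cuMemPoolCreate(props)
        # On a Windows MCDM device, an OOM here is the known setup failure
        # (nvbugs5815123); the helper converts it to pytest.xfail. Everywhere
        # else the helper returns and the normal assertion fires.
        xfail_if_mempool_oom(err, "cuMemPoolCreate", props.location.id)
        assert err == driver.CUresult.CUDA_SUCCESS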

New shared test helper (imported as cuda.bindings._test_helpers.mempool)

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import sys
+
+import pytest
+
+from cuda.bindings import driver, runtime
+
+
+def is_windows_mcdm_device(device=0):
+    if sys.platform != "win32":
+        return False
+    import cuda.bindings.nvml as nvml
+
+    device_id = int(getattr(device, "device_id", device))
+    (err,) = driver.cuInit(0)
+    if err != driver.CUresult.CUDA_SUCCESS:
+        return False
+    err, pci_bus_id = driver.cuDeviceGetPCIBusId(13, device_id)
+    if err != driver.CUresult.CUDA_SUCCESS:
+        return False
+    pci_bus_id = pci_bus_id.split(b"\x00", 1)[0].decode("ascii")
+    nvml.init_v2()
+    try:
+        handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)
+        current, _ = nvml.device_get_driver_model_v2(handle)
+        return current == nvml.DriverModel.DRIVER_MCDM
+    finally:
+        nvml.shutdown()
+
+
+def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
+    if api_name is not None and not isinstance(api_name, str):
+        device = api_name
+        api_name = None
+
+    is_oom = err_or_exc in (
+        driver.CUresult.CUDA_ERROR_OUT_OF_MEMORY,
+        runtime.cudaError_t.cudaErrorMemoryAllocation,
+    ) or "CUDA_ERROR_OUT_OF_MEMORY" in str(err_or_exc)
+
+    if not is_oom:
+        return
+    try:
+        is_windows_mcdm = is_windows_mcdm_device(device)
+    except Exception:
+        # If MCDM detection fails, leave the primary test failure visible.
+        return
+    if not is_windows_mcdm:
+        return
+
+    api_context = f"{api_name} " if api_name else ""
+    pytest.xfail(f"{api_context}could not reserve VA for mempool operations on Windows MCDM")
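The detector resolves its device argument to an ordinal with getattr, so it accepts both cuda.core Device objects and raw ints, then converts the ordinal to a PCI bus id via cuDeviceGetPCIBusId (the 13-byte buffer fits the "0000:00:00.0" form plus its NUL terminator) and asks NVML for that device's driver model; only DRIVER_MCDM triggers the xfail. The positional-argument shuffle at the top of xfail_if_mempool_oom lets callers pass a device where api_name would go, which the isinstance check untangles.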

cuda_bindings/tests/test_cuda.py

Lines changed: 8 additions & 0 deletions

@@ -12,6 +12,7 @@
 import cuda.bindings.driver as cuda
 import cuda.bindings.runtime as cudart
 from cuda.bindings import driver
+from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
 
 
 def driverVersionLessThan(target):
@@ -270,6 +271,7 @@ def test_cuda_memPool_attr():
 
     attr_list = [None] * 8
     err, pool = cuda.cuMemPoolCreate(poolProps)
+    xfail_if_mempool_oom(err, "cuMemPoolCreate", poolProps.location.id)
     assert err == cuda.CUresult.CUDA_SUCCESS
 
     for idx, attr in enumerate(
@@ -468,6 +470,12 @@ def test_cuda_graphMem_attr(device):
     params.bytesize = allocSize
 
     err, allocNode = cuda.cuGraphAddMemAllocNode(graph, None, 0, params)
+    if err == cuda.CUresult.CUDA_ERROR_OUT_OF_MEMORY:
+        (destroy_err,) = cuda.cuGraphDestroy(graph)
+        assert destroy_err == cuda.CUresult.CUDA_SUCCESS
+        (destroy_err,) = cuda.cuStreamDestroy(stream)
+        assert destroy_err == cuda.CUresult.CUDA_SUCCESS
+        xfail_if_mempool_oom(err, "cuGraphAddMemAllocNode", device)
     assert err == cuda.CUresult.CUDA_SUCCESS
     err, freeNode = cuda.cuGraphAddMemFreeNode(graph, [allocNode], 1, params.dptr)
     assert err == cuda.CUresult.CUDA_SUCCESS
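Note the ordering in test_cuda_graphMem_attr: the graph and stream are destroyed before xfail_if_mempool_oom is called, because pytest.xfail raises immediately and any cleanup placed after it would never run. If the OOM turns out not to be the Windows MCDM case, the helper returns and the assert on the original error still fails the test.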

cuda_bindings/tests/test_cudart.py

Lines changed: 2 additions & 0 deletions

@@ -11,6 +11,7 @@
 import cuda.bindings.runtime as cudart
 from cuda import pathfinder
 from cuda.bindings import runtime
+from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
 
 
 def isSuccess(err):
@@ -432,6 +433,7 @@ def test_cudart_MemPool_attr():
 
     attr_list = [None] * 8
     err, pool = cudart.cudaMemPoolCreate(poolProps)
+    xfail_if_mempool_oom(err, "cudaMemPoolCreate", poolProps.location.id)
     assertSuccess(err)
 
     for idx, attr in enumerate(

cuda_core/tests/conftest.py

Lines changed: 47 additions & 2 deletions

@@ -27,7 +27,17 @@
     PinnedMemoryResourceOptions,
     _device,
 )
-from cuda.core._utils.cuda_utils import handle_return
+from cuda.core._utils.cuda_utils import CUDAError, handle_return
+
+try:
+    from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
+except ModuleNotFoundError:
+    # Older cuda.bindings artifacts (for example 12.9.x backports) do not ship
+    # this helper yet. In that case, keep the primary failure visible instead of
+    # xfail-ing the known Windows MCDM mempool setup issue.
+    def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
+        return
+
 
 # Import shared test helpers for tests across subprojects.
 # PLEASE KEEP IN SYNC with copies in other conftest.py in this repo.
@@ -61,21 +71,56 @@ def skip_if_managed_memory_unsupported(device):
         pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
     try:
         ManagedMemoryResource()
+    except CUDAError as e:
+        xfail_if_mempool_oom(e, device)
+        raise
     except RuntimeError as e:
         if "requires CUDA 13.0" in str(e):
             pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
         raise
 
 
-def create_managed_memory_resource_or_skip(*args, **kwargs):
+def create_managed_memory_resource_or_skip(*args, xfail_device=None, **kwargs):
+    # Keep the established "skip" helper name for call-site readability, even though
+    # Windows MCDM mempool OOM setup failures are xfailed instead of skipped.
     try:
         return ManagedMemoryResource(*args, **kwargs)
+    except CUDAError as e:
+        xfail_if_mempool_oom(e, _device_id_from_resource_options(xfail_device, args, kwargs))
+        raise
     except RuntimeError as e:
         if "requires CUDA 13.0" in str(e):
             pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
         raise
 
 
+def create_pinned_memory_resource_or_xfail(*args, xfail_device=None, **kwargs):
+    try:
+        return PinnedMemoryResource(*args, **kwargs)
+    except CUDAError as e:
+        xfail_if_mempool_oom(e, xfail_device)
+        raise
+
+
+def _device_id_from_resource_options(device, args, kwargs):
+    if device is not None:
+        return device
+    options = kwargs.get("options")
+    if options is None and args:
+        options = args[0]
+    if options is None:
+        return 0
+    if isinstance(options, dict):
+        preferred_location = options.get("preferred_location")
+        preferred_location_type = options.get("preferred_location_type")
+    else:
+        preferred_location = getattr(options, "preferred_location", None)
+        preferred_location_type = getattr(options, "preferred_location_type", None)
+    if preferred_location_type in (None, "device") and isinstance(preferred_location, int) and preferred_location >= 0:
+        return preferred_location
+    return 0
+
+
 @pytest.fixture(scope="session", autouse=True)
 def session_setup():
     # Always init CUDA.
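_device_id_from_resource_options works through a fallback chain: an explicit xfail_device wins, then a preferred_location pulled from the resource options (passed positionally or as options=), and finally device ordinal 0. A hypothetical illustration of that chain (not part of the diff):

    _device_id_from_resource_options(3, (), {})                               # explicit device wins -> 3
    _device_id_from_resource_options(None, ({"preferred_location": 1},), {})  # taken from options -> 1
    _device_id_from_resource_options(None, (), {})                            # no hints -> 0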

cuda_core/tests/test_managed_memory_warning.py

Lines changed: 8 additions & 2 deletions

@@ -13,8 +13,10 @@
 import pytest
 
 import cuda.bindings
+from conftest import xfail_if_mempool_oom
 from cuda.core import Device, ManagedMemoryResource, ManagedMemoryResourceOptions
 from cuda.core._memory._managed_memory_resource import reset_concurrent_access_warning
+from cuda.core._utils.cuda_utils import CUDAError
 
 _cuda_major = int(cuda.bindings.__version__.split(".")[0])
 
@@ -47,8 +49,12 @@ def device_without_concurrent_managed_access(init_cuda):
 @requires_cuda_13
 def test_default_pool_error_without_concurrent_access(device_without_concurrent_managed_access):
     """ManagedMemoryResource() raises RuntimeError when the default pool doesn't support managed."""
-    with pytest.raises(RuntimeError, match="does not support managed allocations"):
-        ManagedMemoryResource()
+    try:
+        with pytest.raises(RuntimeError, match="does not support managed allocations"):
+            ManagedMemoryResource()
+    except CUDAError as exc:
+        xfail_if_mempool_oom(exc, device_without_concurrent_managed_access)
+        raise
 
 
 @requires_cuda_13
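The try/except here covers the case where ManagedMemoryResource() fails with a CUDAError (the mempool OOM) before the expected RuntimeError can be raised; that exception propagates out of the pytest.raises block, so it is inspected separately and re-raised whenever it is not the Windows MCDM case.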

cuda_core/tests/test_memory.py

Lines changed: 4 additions & 3 deletions

@@ -22,6 +22,7 @@
 
 from conftest import (
     create_managed_memory_resource_or_skip,
+    create_pinned_memory_resource_or_xfail,
     skip_if_managed_memory_unsupported,
     skip_if_pinned_memory_unsupported,
 )
@@ -639,7 +640,7 @@ def test_non_managed_resources_report_not_managed(mr_kind):
         mr = DeviceMemoryResource(device)
     else:
         skip_if_pinned_memory_unsupported(device)
-        mr = PinnedMemoryResource()
+        mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     assert mr.is_managed is False
     buf = mr.allocate(1024)
     assert buf.is_managed is False
@@ -684,7 +685,7 @@ def test_pinned_memory_resource_initialization(init_cuda):
 
     device.set_current()
 
-    mr = PinnedMemoryResource()
+    mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     assert mr.is_device_accessible
     assert mr.is_host_accessible
 
@@ -1581,7 +1582,7 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource:
         skip_if_pinned_memory_unsupported(device)
-        mr = MR()
+        mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     elif MR is ManagedMemoryResource:
         skip_if_managed_memory_unsupported(device)
         mr = create_managed_memory_resource_or_skip(MROps(preferred_location=device.device_id))
