Skip to content
54 changes: 54 additions & 0 deletions cuda_bindings/cuda/bindings/_test_helpers/mempool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import sys

import pytest

from cuda.bindings import driver, runtime


def is_windows_mcdm_device(device=0):
if sys.platform != "win32":
return False
import cuda.bindings.nvml as nvml

device_id = int(device.device_id if hasattr(device, "device_id") else device)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: getattr(device, "device_id", device)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done: commit 20c8a7a

Thanks for catching this!

(err,) = driver.cuInit(0)
if err != driver.CUresult.CUDA_SUCCESS:
return False
err, pci_bus_id = driver.cuDeviceGetPCIBusId(13, device_id)
if err != driver.CUresult.CUDA_SUCCESS:
return False
pci_bus_id = pci_bus_id.split(b"\x00", 1)[0].decode("ascii")
nvml.init_v2()
try:
handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)
current, _ = nvml.device_get_driver_model_v2(handle)
return current == nvml.DriverModel.DRIVER_MCDM
finally:
nvml.shutdown()

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't this assume that nvml was uninitialized on entry to this function? Would it break callers that initialized nvml?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked the NVML API contract directly instead of relying on memory. The short answer is that nvmlInit_v2() and nvmlShutdown() are reference-counted, so the balanced nvml.init_v2() / nvml.shutdown() pair in our helper should not break callers that had already initialized NVML.

The most relevant NVIDIA doc is the current NVML "Initialization and Cleanup" page: https://docs.nvidia.com/deploy/nvml-api/group__nvmlInitializationAndCleanup.html.

Cursor-generated supporting details:

  • The current NVML docs for nvmlInit_v2() say: "A reference count of the number of initializations is maintained. Shutdown only occurs when the reference count reaches zero."
  • The current NVML docs for nvmlShutdown() say: "This method should be called ... once for each call to nvmlInit_v2(). A reference count of the number of initializations is maintained. Shutdown only occurs when the reference count reaches zero."
  • The same current docs also say this applies "For all products." Separately, the NVML API reference lists Windows as a supported OS platform, so there is no indication that the ref-count behavior is Linux-only.
  • The archived R525 docs use the same ref-count language, which suggests this is not a recent or unstable contract.
  • Our cuda.bindings.nvml layer is a thin pass-through here: init_v2() calls nvmlInit_v2() directly and shutdown() calls nvmlShutdown() directly, so there is no extra Python-side lifecycle logic changing the semantics.
  • The generated binding text in cuda_bindings/cuda/bindings/nvml.pyx also reflects the same contract: ERROR_ALREADY_INITIALIZED is described as deprecated because "Multiple initializations are now allowed through ref counting."
  • The repo already encodes this assumption in cuda_bindings/tests/nvml/test_init.py, whose test_init_ref_count() explicitly exercises repeated init_v2() / shutdown() calls and checks that NVML remains initialized until the matching final shutdown. That test is skipped on Windows, so it is not direct Windows coverage, but it does show the intended interpretation inside this repo.
  • One unrelated wrinkle: the current docs say extra nvmlShutdown() calls beyond the init count are tolerated for backwards compatibility, while our local test expects UninitializedError on a naked shutdown(). That mismatch is worth keeping in mind, but it does not affect this helper because the helper uses a balanced init/shutdown pair.



def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
    """Mark the running test as xfail for a mempool OOM on a Windows MCDM device.

    Returns without doing anything unless ``err_or_exc`` is recognized as an
    out-of-memory condition and ``is_windows_mcdm_device(device)`` is truthy.

    Args:
        err_or_exc: A driver/runtime status code, or any object (for example
            an exception) whose ``str()`` contains ``CUDA_ERROR_OUT_OF_MEMORY``.
        api_name: Optional name of the failing API; when given it prefixes the
            xfail reason. A value that is neither ``None`` nor a ``str`` is
            taken to be the device, which permits the two-argument call
            ``xfail_if_mempool_oom(err, device)``.
        device: Device forwarded to ``is_windows_mcdm_device``.
    """
    # Two-argument form: the second positional argument carries the device.
    if not (api_name is None or isinstance(api_name, str)):
        device, api_name = api_name, None

    oom_codes = (
        driver.CUresult.CUDA_ERROR_OUT_OF_MEMORY,
        runtime.cudaError_t.cudaErrorMemoryAllocation,
    )
    if err_or_exc not in oom_codes and "CUDA_ERROR_OUT_OF_MEMORY" not in str(err_or_exc):
        return

    try:
        on_mcdm = is_windows_mcdm_device(device)
    except Exception:
        # If MCDM detection fails, leave the primary test failure visible.
        return

    if on_mcdm:
        prefix = f"{api_name} " if api_name else ""
        pytest.xfail(f"{prefix}could not reserve VA for mempool operations on Windows MCDM")
8 changes: 8 additions & 0 deletions cuda_bindings/tests/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import cuda.bindings.driver as cuda
import cuda.bindings.runtime as cudart
from cuda.bindings import driver
from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom


def driverVersionLessThan(target):
Expand Down Expand Up @@ -270,6 +271,7 @@ def test_cuda_memPool_attr():

attr_list = [None] * 8
err, pool = cuda.cuMemPoolCreate(poolProps)
xfail_if_mempool_oom(err, "cuMemPoolCreate", poolProps.location.id)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was (perhaps naively) expecting the xfail logic to appear as a decorator on the test itself, or, at worst, a context manager. I guess that's not practical?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Helper-based local skip/xfail logic is a widely-used pattern in this repo, especially under cuda_core/tests. Also, I believe for this specific issue, a local helper is the right tool for the job because the condition is only knowable after a specific CUDA API call returns a specific failure on a specific runtime configuration.

Cursor-generated supporting details:

  • Under cuda_core/tests, runtime gating is frequently factored into helpers and fixtures rather than only using decorators. Examples include skip_if_pinned_memory_unsupported() and skip_if_managed_memory_unsupported() in cuda_core/tests/conftest.py, plus local helpers like _skip_if_no_mempool() / _skip_if_no_managed_mempool() in cuda_core/tests/graph/test_graph_definition.py, similar _skip_if_no_mempool() helpers in several other graph/object-protocol modules, and fixture-style runtime gates like skip_if_no_tma in cuda_core/tests/test_tensor_map.py.
  • So I think it is fair to describe helper-based runtime skip/xfail logic as a commonly used pattern under cuda-python, with the strongest examples living in cuda_core/tests.
  • Decorators are most natural when the condition is static up front: platform, version, missing import, permanently absent feature, etc. Here the interesting condition is narrower: a particular mempool setup call fails with the known Windows MCDM OOM-like failure. A decorator would tend to mark the whole test based on environment rather than on the actually observed failure.
  • A context manager is more plausible than a decorator, but still not a great fit here because cuda_bindings/tests is largely return-code driven. The test gets an err back from the CUDA API and then decides what to do. In that style, a helper like xfail_if_mempool_oom(err, api_name, device) is more natural than building an exception-oriented context manager around a return-code check.
  • The local helper also keeps the xfail narrowly scoped. Unaffected systems still pass normally, affected systems only xfail when the known bug actually reproduces, and once the underlying issue is fixed the test can begin passing immediately instead of remaining broadly pre-marked.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree this follows the existing pattern. I'd be interested in exploring options to diminish the reliance on these helpers.

At this particular line of code, errors are being checked manually, so a helper makes sense. More broadly, it would be better if the tests could be written directly and some other mechanism could translate failures into skips or xfails, as needed. An aspiration.

assert err == cuda.CUresult.CUDA_SUCCESS

for idx, attr in enumerate(
Expand Down Expand Up @@ -468,6 +470,12 @@ def test_cuda_graphMem_attr(device):
params.bytesize = allocSize

err, allocNode = cuda.cuGraphAddMemAllocNode(graph, None, 0, params)
if err == cuda.CUresult.CUDA_ERROR_OUT_OF_MEMORY:
(destroy_err,) = cuda.cuGraphDestroy(graph)
assert destroy_err == cuda.CUresult.CUDA_SUCCESS
(destroy_err,) = cuda.cuStreamDestroy(stream)
assert destroy_err == cuda.CUresult.CUDA_SUCCESS
xfail_if_mempool_oom(err, "cuGraphAddMemAllocNode", device)
assert err == cuda.CUresult.CUDA_SUCCESS
err, freeNode = cuda.cuGraphAddMemFreeNode(graph, [allocNode], 1, params.dptr)
assert err == cuda.CUresult.CUDA_SUCCESS
Expand Down
2 changes: 2 additions & 0 deletions cuda_bindings/tests/test_cudart.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import cuda.bindings.runtime as cudart
from cuda import pathfinder
from cuda.bindings import runtime
from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom


def isSuccess(err):
Expand Down Expand Up @@ -432,6 +433,7 @@ def test_cudart_MemPool_attr():

attr_list = [None] * 8
err, pool = cudart.cudaMemPoolCreate(poolProps)
xfail_if_mempool_oom(err, "cudaMemPoolCreate", poolProps.location.id)
assertSuccess(err)

for idx, attr in enumerate(
Expand Down
38 changes: 36 additions & 2 deletions cuda_core/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from cuda import cuda as driver

import cuda.core
from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
from cuda.core import (
Device,
DeviceMemoryResource,
Expand All @@ -27,7 +28,7 @@
PinnedMemoryResourceOptions,
_device,
)
from cuda.core._utils.cuda_utils import handle_return
from cuda.core._utils.cuda_utils import CUDAError, handle_return

# Import shared test helpers for tests across subprojects.
# PLEASE KEEP IN SYNC with copies in other conftest.py in this repo.
Expand Down Expand Up @@ -61,21 +62,54 @@ def skip_if_managed_memory_unsupported(device):
pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
try:
ManagedMemoryResource()
except CUDAError as e:
xfail_if_mempool_oom(e, device)
raise
except RuntimeError as e:
if "requires CUDA 13.0" in str(e):
pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
raise


def create_managed_memory_resource_or_skip(*args, **kwargs):
def create_managed_memory_resource_or_xfail(*args, xfail_device=None, **kwargs):
    """Build a ``ManagedMemoryResource``, converting known failures to skip/xfail.

    Args:
        *args: Positional arguments forwarded to ``ManagedMemoryResource``.
        xfail_device: Optional device handed to ``xfail_if_mempool_oom``; when
            ``None`` the device is derived from the resource options by
            ``_device_id_from_resource_options``.
        **kwargs: Keyword arguments forwarded to ``ManagedMemoryResource``.

    Returns:
        The constructed ``ManagedMemoryResource``.

    Raises:
        CUDAError: Re-raised when ``xfail_if_mempool_oom`` does not xfail.
        RuntimeError: Re-raised unless its message contains
            ``"requires CUDA 13.0"``, in which case the test is skipped.
    """
    try:
        resource = ManagedMemoryResource(*args, **kwargs)
    except CUDAError as exc:
        # Handled ahead of RuntimeError so the OOM check is tried first.
        target = _device_id_from_resource_options(xfail_device, args, kwargs)
        xfail_if_mempool_oom(exc, target)
        raise
    except RuntimeError as exc:
        if "requires CUDA 13.0" not in str(exc):
            raise
        pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
        raise
    return resource


def create_pinned_memory_resource_or_skip(*args, xfail_device=None, **kwargs):
    """Build a ``PinnedMemoryResource``, giving the mempool-OOM xfail a chance.

    Despite the ``_or_skip`` suffix, this helper never calls ``pytest.skip``;
    its only special handling is the ``xfail_if_mempool_oom`` call below.

    Args:
        *args: Positional arguments forwarded to ``PinnedMemoryResource``.
        xfail_device: Passed as the second positional argument of
            ``xfail_if_mempool_oom``.
        **kwargs: Keyword arguments forwarded to ``PinnedMemoryResource``.

    Returns:
        The constructed ``PinnedMemoryResource``.

    Raises:
        CUDAError: Re-raised when ``xfail_if_mempool_oom`` does not xfail.
    """
    try:
        resource = PinnedMemoryResource(*args, **kwargs)
    except CUDAError as exc:
        xfail_if_mempool_oom(exc, xfail_device)
        raise
    else:
        return resource


def _device_id_from_resource_options(device, args, kwargs):
if device is not None:
return device
options = kwargs.get("options")
if options is None and args:
options = args[0]
if options is None:
return 0
if isinstance(options, dict):
preferred_location = options.get("preferred_location")
preferred_location_type = options.get("preferred_location_type")
else:
preferred_location = getattr(options, "preferred_location", None)
preferred_location_type = getattr(options, "preferred_location_type", None)
if preferred_location_type in (None, "device") and isinstance(preferred_location, int) and preferred_location >= 0:
return preferred_location
return 0


@pytest.fixture(scope="session", autouse=True)
def session_setup():
# Always init CUDA.
Expand Down
10 changes: 8 additions & 2 deletions cuda_core/tests/test_managed_memory_warning.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
import pytest

import cuda.bindings
from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
from cuda.core import Device, ManagedMemoryResource, ManagedMemoryResourceOptions
from cuda.core._memory._managed_memory_resource import reset_concurrent_access_warning
from cuda.core._utils.cuda_utils import CUDAError

_cuda_major = int(cuda.bindings.__version__.split(".")[0])

Expand Down Expand Up @@ -47,8 +49,12 @@ def device_without_concurrent_managed_access(init_cuda):
@requires_cuda_13
def test_default_pool_error_without_concurrent_access(device_without_concurrent_managed_access):
"""ManagedMemoryResource() raises RuntimeError when the default pool doesn't support managed."""
with pytest.raises(RuntimeError, match="does not support managed allocations"):
ManagedMemoryResource()
try:
with pytest.raises(RuntimeError, match="does not support managed allocations"):
ManagedMemoryResource()
except CUDAError as exc:
xfail_if_mempool_oom(exc, device_without_concurrent_managed_access)
raise


@requires_cuda_13
Expand Down
37 changes: 19 additions & 18 deletions cuda_core/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
from helpers.buffers import DummyUnifiedMemoryResource, TrackingMR

from conftest import (
create_managed_memory_resource_or_skip,
create_managed_memory_resource_or_xfail,
create_pinned_memory_resource_or_skip,
skip_if_managed_memory_unsupported,
skip_if_pinned_memory_unsupported,
)
Expand Down Expand Up @@ -617,7 +618,7 @@ def test_managed_memory_resource_buffer_dlpack_device_type():
device = Device()
device.set_current()
skip_if_managed_memory_unsupported(device)
mr = create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=device.device_id))
mr = create_managed_memory_resource_or_xfail(ManagedMemoryResourceOptions(preferred_location=device.device_id))
buf = mr.allocate(1024)

assert mr.is_managed
Expand All @@ -639,7 +640,7 @@ def test_non_managed_resources_report_not_managed(mr_kind):
mr = DeviceMemoryResource(device)
else:
skip_if_pinned_memory_unsupported(device)
mr = PinnedMemoryResource()
mr = create_pinned_memory_resource_or_skip(xfail_device=device)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sequence looks odd. It checks whether PinnedMemoryResource is supported, but even when that check passes an additional "guarded construction" is necessary.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, sorry I was too rushed last night and got mixed up with the skip/xfail name change. Fixed in these commits:

The diff to main is smaller now, and I believe the "first skip if the capability is missing, then xfail if there is a known issue" sequence here is much clearer.

assert mr.is_managed is False
buf = mr.allocate(1024)
assert buf.is_managed is False
Expand Down Expand Up @@ -684,7 +685,7 @@ def test_pinned_memory_resource_initialization(init_cuda):

device.set_current()

mr = PinnedMemoryResource()
mr = create_pinned_memory_resource_or_skip(xfail_device=device)
assert mr.is_device_accessible
assert mr.is_host_accessible

Expand Down Expand Up @@ -713,7 +714,7 @@ def test_managed_memory_resource_initialization(init_cuda):

device.set_current()

mr = create_managed_memory_resource_or_skip()
mr = create_managed_memory_resource_or_xfail()
assert mr.is_device_accessible
assert mr.is_host_accessible

Expand Down Expand Up @@ -1028,7 +1029,7 @@ def test_managed_memory_resource_with_options(init_cuda):

# Test basic pool creation
options = ManagedMemoryResourceOptions()
mr = create_managed_memory_resource_or_skip(options)
mr = create_managed_memory_resource_or_xfail(options)
assert mr.is_device_accessible
assert mr.is_host_accessible
assert not mr.is_ipc_enabled
Expand Down Expand Up @@ -1071,7 +1072,7 @@ def test_managed_memory_resource_preferred_location_default(init_cuda):
skip_if_managed_memory_unsupported(device)
device.set_current()

mr = create_managed_memory_resource_or_skip()
mr = create_managed_memory_resource_or_xfail()
assert mr.preferred_location is None


Expand All @@ -1083,15 +1084,15 @@ def test_managed_memory_resource_preferred_location_device(init_cuda):

# Legacy style
opts = ManagedMemoryResourceOptions(preferred_location=device.device_id)
mr = create_managed_memory_resource_or_skip(opts)
mr = create_managed_memory_resource_or_xfail(opts)
assert mr.preferred_location == ("device", device.device_id)

# Explicit style
opts = ManagedMemoryResourceOptions(
preferred_location=device.device_id,
preferred_location_type="device",
)
mr = create_managed_memory_resource_or_skip(opts)
mr = create_managed_memory_resource_or_xfail(opts)
assert mr.preferred_location == ("device", device.device_id)


Expand All @@ -1103,12 +1104,12 @@ def test_managed_memory_resource_preferred_location_host(init_cuda):

# Legacy style
opts = ManagedMemoryResourceOptions(preferred_location=-1)
mr = create_managed_memory_resource_or_skip(opts)
mr = create_managed_memory_resource_or_xfail(opts)
assert mr.preferred_location == ("host", None)

# Explicit style
opts = ManagedMemoryResourceOptions(preferred_location_type="host")
mr = create_managed_memory_resource_or_skip(opts)
mr = create_managed_memory_resource_or_xfail(opts)
assert mr.preferred_location == ("host", None)


Expand All @@ -1124,15 +1125,15 @@ def test_managed_memory_resource_preferred_location_host_numa(init_cuda):

# Auto-resolved from current device
opts = ManagedMemoryResourceOptions(preferred_location_type="host_numa")
mr = create_managed_memory_resource_or_skip(opts)
mr = create_managed_memory_resource_or_xfail(opts)
assert mr.preferred_location == ("host_numa", numa_id)

# Explicit NUMA node ID
opts = ManagedMemoryResourceOptions(
preferred_location=numa_id,
preferred_location_type="host_numa",
)
mr = create_managed_memory_resource_or_skip(opts)
mr = create_managed_memory_resource_or_xfail(opts)
assert mr.preferred_location == ("host_numa", numa_id)


Expand Down Expand Up @@ -1423,7 +1424,7 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory, property_name,
assert mr.is_ipc_enabled == ipc_enabled
elif MR is ManagedMemoryResource:
options = MRops()
mr = create_managed_memory_resource_or_skip(options)
mr = create_managed_memory_resource_or_xfail(options)
assert not mr.is_ipc_enabled

# Get the property value
Expand Down Expand Up @@ -1476,7 +1477,7 @@ def test_mempool_attributes_repr(memory_resource_factory):
elif MR is PinnedMemoryResource:
mr = MR(options={"max_size": 2048})
elif MR is ManagedMemoryResource:
mr = create_managed_memory_resource_or_skip(options={})
mr = create_managed_memory_resource_or_xfail(options={})

buffer1 = mr.allocate(64)
buffer2 = mr.allocate(64)
Expand Down Expand Up @@ -1513,7 +1514,7 @@ def test_mempool_attributes_ownership(memory_resource_factory):
elif MR is PinnedMemoryResource:
mr = MR({"max_size": POOL_SIZE})
elif MR is ManagedMemoryResource:
mr = create_managed_memory_resource_or_skip({})
mr = create_managed_memory_resource_or_xfail({})

attributes = mr.attributes
mr.close()
Expand Down Expand Up @@ -1581,10 +1582,10 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
pytest.skip("Device does not support mempool operations")
elif MR is PinnedMemoryResource:
skip_if_pinned_memory_unsupported(device)
mr = MR()
mr = create_pinned_memory_resource_or_skip(xfail_device=device)
elif MR is ManagedMemoryResource:
skip_if_managed_memory_unsupported(device)
mr = create_managed_memory_resource_or_skip(MROps(preferred_location=device.device_id))
mr = create_managed_memory_resource_or_xfail(MROps(preferred_location=device.device_id))
else:
assert MR is DeviceMemoryResource
mr = MR(device)
Expand Down
6 changes: 3 additions & 3 deletions cuda_core/tests/test_tensor_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pytest

from conftest import create_managed_memory_resource_or_skip, skip_if_managed_memory_unsupported
from conftest import create_managed_memory_resource_or_xfail, skip_if_managed_memory_unsupported
from cuda.core import (
Device,
ManagedMemoryResourceOptions,
Expand Down Expand Up @@ -403,7 +403,7 @@ def test_replace_address_accepts_managed_buffer_on_nonzero_device(self, init_cud
data_type=TensorMapDataType.FLOAT32,
)

mr = create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=dev1.device_id))
mr = create_managed_memory_resource_or_xfail(ManagedMemoryResourceOptions(preferred_location=dev1.device_id))
managed_buf = mr.allocate(1024 * 4)

desc.replace_address(managed_buf)
Expand Down Expand Up @@ -442,7 +442,7 @@ def test_from_tiled_accepts_managed_buffer_on_nonzero_device(self, init_cuda):
skip_if_managed_memory_unsupported(dev1)

dev1.set_current()
mr = create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=dev1.device_id))
mr = create_managed_memory_resource_or_xfail(ManagedMemoryResourceOptions(preferred_location=dev1.device_id))
managed_buf = mr.allocate(1024 * 4)

desc = _as_view(managed_buf).as_tensor_map(
Expand Down
Loading