-
Notifications
You must be signed in to change notification settings - Fork 305
test: xfail Windows MCDM mempool OOM setup failures #2000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
279935a
6f3ac7f
d89a77b
79543e4
1337557
a181fd6
4517648
20c8a7a
6e790f7
d757fa0
04307fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE | ||
|
|
||
| import sys | ||
|
|
||
| import pytest | ||
|
|
||
| from cuda.bindings import driver, runtime | ||
|
|
||
|
|
||
def is_windows_mcdm_device(device=0):
    """Return True if *device* is running under the Windows MCDM driver model.

    Args:
        device: A CUDA device ordinal, or an object exposing a ``device_id``
            attribute that holds the ordinal.

    Returns:
        ``False`` on non-Windows platforms, or when ``cuInit`` or
        ``cuDeviceGetPCIBusId`` reports an error. Otherwise, whether NVML
        reports ``DriverModel.DRIVER_MCDM`` as the current driver model of
        the device with the matching PCI bus id.

    Exceptions raised by the NVML calls are not handled here and propagate
    to the caller.
    """
    if sys.platform != "win32":
        return False
    # Imported only after the platform check, so non-Windows runs never load NVML.
    import cuda.bindings.nvml as nvml

    device_id = int(getattr(device, "device_id", device))
    (err,) = driver.cuInit(0)
    if err != driver.CUresult.CUDA_SUCCESS:
        return False

    # "[domain]:[bus]:[device].[function]" needs 13 bytes including the NUL
    # terminator, per the cuDeviceGetPCIBusId documentation.
    pci_bus_id_len = 13
    err, pci_bus_id = driver.cuDeviceGetPCIBusId(pci_bus_id_len, device_id)
    if err != driver.CUresult.CUDA_SUCCESS:
        return False
    # The driver hands back a fixed-size byte buffer; keep only the text
    # before the first NUL.
    pci_bus_id = pci_bus_id.split(b"\x00", 1)[0].decode("ascii")

    # NOTE(review): NVML documents init/shutdown as reference-counted, so this
    # paired call should not tear down another user's session -- confirm for
    # the bindings in use.
    nvml.init_v2()
    try:
        handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)
        current, _ = nvml.device_get_driver_model_v2(handle)
        return current == nvml.DriverModel.DRIVER_MCDM
    finally:
        nvml.shutdown()
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't this assume that
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I checked the NVML API contract directly instead of relying on memory. The short answer is that The most relevant NVIDIA doc is the current NVML "Initialization and Cleanup" page: https://docs.nvidia.com/deploy/nvml-api/group__nvmlInitializationAndCleanup.html. Cursor generated supporting details:
|
||
|
|
||
|
|
||
def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
    """Mark the running test as xfail for an out-of-memory result on Windows MCDM.

    Args:
        err_or_exc: A driver/runtime error code, or any object (such as an
            exception) whose ``str()`` may contain ``CUDA_ERROR_OUT_OF_MEMORY``.
        api_name: Optional name of the failing API, used as a prefix in the
            xfail message. A non-string value is treated as ``device``.
        device: Device forwarded to ``is_windows_mcdm_device``.

    Returns without calling ``pytest.xfail`` when the error is not an
    out-of-memory condition, when MCDM detection raises, or when the device
    is not reported as Windows MCDM.
    """
    # Accept the positional form ``xfail_if_mempool_oom(err, device)``.
    if not (api_name is None or isinstance(api_name, str)):
        device, api_name = api_name, None

    oom_codes = (
        driver.CUresult.CUDA_ERROR_OUT_OF_MEMORY,
        runtime.cudaError_t.cudaErrorMemoryAllocation,
    )
    if err_or_exc not in oom_codes and "CUDA_ERROR_OUT_OF_MEMORY" not in str(err_or_exc):
        return

    try:
        on_mcdm = is_windows_mcdm_device(device)
    except Exception:
        # If MCDM detection fails, leave the primary test failure visible.
        return

    if on_mcdm:
        prefix = f"{api_name} " if api_name else ""
        pytest.xfail(f"{prefix}could not reserve VA for mempool operations on Windows MCDM")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |
| import cuda.bindings.driver as cuda | ||
| import cuda.bindings.runtime as cudart | ||
| from cuda.bindings import driver | ||
| from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom | ||
|
|
||
|
|
||
| def driverVersionLessThan(target): | ||
|
|
@@ -270,6 +271,7 @@ def test_cuda_memPool_attr(): | |
|
|
||
| attr_list = [None] * 8 | ||
| err, pool = cuda.cuMemPoolCreate(poolProps) | ||
| xfail_if_mempool_oom(err, "cuMemPoolCreate", poolProps.location.id) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was (perhaps naively) expecting the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Helper-based local Cursor generated supporting details:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree this follows the existing pattern. I'd be interested in exploring options to diminish the reliance on these helpers. At this particular line of code, errors are being checked manually, so a helper makes sense. More broadly, it would be better if the tests could be written directly and some other mechanism could translate failures into skips or xfails, as needed. An aspiration. |
||
| assert err == cuda.CUresult.CUDA_SUCCESS | ||
|
|
||
| for idx, attr in enumerate( | ||
|
|
@@ -468,6 +470,12 @@ def test_cuda_graphMem_attr(device): | |
| params.bytesize = allocSize | ||
|
|
||
| err, allocNode = cuda.cuGraphAddMemAllocNode(graph, None, 0, params) | ||
| if err == cuda.CUresult.CUDA_ERROR_OUT_OF_MEMORY: | ||
| (destroy_err,) = cuda.cuGraphDestroy(graph) | ||
| assert destroy_err == cuda.CUresult.CUDA_SUCCESS | ||
| (destroy_err,) = cuda.cuStreamDestroy(stream) | ||
| assert destroy_err == cuda.CUresult.CUDA_SUCCESS | ||
| xfail_if_mempool_oom(err, "cuGraphAddMemAllocNode", device) | ||
| assert err == cuda.CUresult.CUDA_SUCCESS | ||
| err, freeNode = cuda.cuGraphAddMemFreeNode(graph, [allocNode], 1, params.dptr) | ||
| assert err == cuda.CUresult.CUDA_SUCCESS | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,8 @@ | |
| from helpers.buffers import DummyUnifiedMemoryResource, TrackingMR | ||
|
|
||
| from conftest import ( | ||
| create_managed_memory_resource_or_skip, | ||
| create_managed_memory_resource_or_xfail, | ||
| create_pinned_memory_resource_or_skip, | ||
| skip_if_managed_memory_unsupported, | ||
| skip_if_pinned_memory_unsupported, | ||
| ) | ||
|
|
@@ -617,7 +618,7 @@ def test_managed_memory_resource_buffer_dlpack_device_type(): | |
| device = Device() | ||
| device.set_current() | ||
| skip_if_managed_memory_unsupported(device) | ||
| mr = create_managed_memory_resource_or_skip(ManagedMemoryResourceOptions(preferred_location=device.device_id)) | ||
| mr = create_managed_memory_resource_or_xfail(ManagedMemoryResourceOptions(preferred_location=device.device_id)) | ||
| buf = mr.allocate(1024) | ||
|
|
||
| assert mr.is_managed | ||
|
|
@@ -639,7 +640,7 @@ def test_non_managed_resources_report_not_managed(mr_kind): | |
| mr = DeviceMemoryResource(device) | ||
| else: | ||
| skip_if_pinned_memory_unsupported(device) | ||
| mr = PinnedMemoryResource() | ||
| mr = create_pinned_memory_resource_or_skip(xfail_device=device) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This sequence looks odd. It checks whether
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, sorry, I was too rushed last night and got mixed up with the skip/xfail name change. Fixed in these commits: The diff to main is smaller now, and I believe the "first skip if the capability is missing, then xfail if there is a known issue" sequence here looks much clearer now. |
||
| assert mr.is_managed is False | ||
| buf = mr.allocate(1024) | ||
| assert buf.is_managed is False | ||
|
|
@@ -684,7 +685,7 @@ def test_pinned_memory_resource_initialization(init_cuda): | |
|
|
||
| device.set_current() | ||
|
|
||
| mr = PinnedMemoryResource() | ||
| mr = create_pinned_memory_resource_or_skip(xfail_device=device) | ||
| assert mr.is_device_accessible | ||
| assert mr.is_host_accessible | ||
|
|
||
|
|
@@ -713,7 +714,7 @@ def test_managed_memory_resource_initialization(init_cuda): | |
|
|
||
| device.set_current() | ||
|
|
||
| mr = create_managed_memory_resource_or_skip() | ||
| mr = create_managed_memory_resource_or_xfail() | ||
| assert mr.is_device_accessible | ||
| assert mr.is_host_accessible | ||
|
|
||
|
|
@@ -1028,7 +1029,7 @@ def test_managed_memory_resource_with_options(init_cuda): | |
|
|
||
| # Test basic pool creation | ||
| options = ManagedMemoryResourceOptions() | ||
| mr = create_managed_memory_resource_or_skip(options) | ||
| mr = create_managed_memory_resource_or_xfail(options) | ||
| assert mr.is_device_accessible | ||
| assert mr.is_host_accessible | ||
| assert not mr.is_ipc_enabled | ||
|
|
@@ -1071,7 +1072,7 @@ def test_managed_memory_resource_preferred_location_default(init_cuda): | |
| skip_if_managed_memory_unsupported(device) | ||
| device.set_current() | ||
|
|
||
| mr = create_managed_memory_resource_or_skip() | ||
| mr = create_managed_memory_resource_or_xfail() | ||
| assert mr.preferred_location is None | ||
|
|
||
|
|
||
|
|
@@ -1083,15 +1084,15 @@ def test_managed_memory_resource_preferred_location_device(init_cuda): | |
|
|
||
| # Legacy style | ||
| opts = ManagedMemoryResourceOptions(preferred_location=device.device_id) | ||
| mr = create_managed_memory_resource_or_skip(opts) | ||
| mr = create_managed_memory_resource_or_xfail(opts) | ||
| assert mr.preferred_location == ("device", device.device_id) | ||
|
|
||
| # Explicit style | ||
| opts = ManagedMemoryResourceOptions( | ||
| preferred_location=device.device_id, | ||
| preferred_location_type="device", | ||
| ) | ||
| mr = create_managed_memory_resource_or_skip(opts) | ||
| mr = create_managed_memory_resource_or_xfail(opts) | ||
| assert mr.preferred_location == ("device", device.device_id) | ||
|
|
||
|
|
||
|
|
@@ -1103,12 +1104,12 @@ def test_managed_memory_resource_preferred_location_host(init_cuda): | |
|
|
||
| # Legacy style | ||
| opts = ManagedMemoryResourceOptions(preferred_location=-1) | ||
| mr = create_managed_memory_resource_or_skip(opts) | ||
| mr = create_managed_memory_resource_or_xfail(opts) | ||
| assert mr.preferred_location == ("host", None) | ||
|
|
||
| # Explicit style | ||
| opts = ManagedMemoryResourceOptions(preferred_location_type="host") | ||
| mr = create_managed_memory_resource_or_skip(opts) | ||
| mr = create_managed_memory_resource_or_xfail(opts) | ||
| assert mr.preferred_location == ("host", None) | ||
|
|
||
|
|
||
|
|
@@ -1124,15 +1125,15 @@ def test_managed_memory_resource_preferred_location_host_numa(init_cuda): | |
|
|
||
| # Auto-resolved from current device | ||
| opts = ManagedMemoryResourceOptions(preferred_location_type="host_numa") | ||
| mr = create_managed_memory_resource_or_skip(opts) | ||
| mr = create_managed_memory_resource_or_xfail(opts) | ||
| assert mr.preferred_location == ("host_numa", numa_id) | ||
|
|
||
| # Explicit NUMA node ID | ||
| opts = ManagedMemoryResourceOptions( | ||
| preferred_location=numa_id, | ||
| preferred_location_type="host_numa", | ||
| ) | ||
| mr = create_managed_memory_resource_or_skip(opts) | ||
| mr = create_managed_memory_resource_or_xfail(opts) | ||
| assert mr.preferred_location == ("host_numa", numa_id) | ||
|
|
||
|
|
||
|
|
@@ -1423,7 +1424,7 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory, property_name, | |
| assert mr.is_ipc_enabled == ipc_enabled | ||
| elif MR is ManagedMemoryResource: | ||
| options = MRops() | ||
| mr = create_managed_memory_resource_or_skip(options) | ||
| mr = create_managed_memory_resource_or_xfail(options) | ||
| assert not mr.is_ipc_enabled | ||
|
|
||
| # Get the property value | ||
|
|
@@ -1476,7 +1477,7 @@ def test_mempool_attributes_repr(memory_resource_factory): | |
| elif MR is PinnedMemoryResource: | ||
| mr = MR(options={"max_size": 2048}) | ||
| elif MR is ManagedMemoryResource: | ||
| mr = create_managed_memory_resource_or_skip(options={}) | ||
| mr = create_managed_memory_resource_or_xfail(options={}) | ||
|
|
||
| buffer1 = mr.allocate(64) | ||
| buffer2 = mr.allocate(64) | ||
|
|
@@ -1513,7 +1514,7 @@ def test_mempool_attributes_ownership(memory_resource_factory): | |
| elif MR is PinnedMemoryResource: | ||
| mr = MR({"max_size": POOL_SIZE}) | ||
| elif MR is ManagedMemoryResource: | ||
| mr = create_managed_memory_resource_or_skip({}) | ||
| mr = create_managed_memory_resource_or_xfail({}) | ||
|
|
||
| attributes = mr.attributes | ||
| mr.close() | ||
|
|
@@ -1581,10 +1582,10 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory): | |
| pytest.skip("Device does not support mempool operations") | ||
| elif MR is PinnedMemoryResource: | ||
| skip_if_pinned_memory_unsupported(device) | ||
| mr = MR() | ||
| mr = create_pinned_memory_resource_or_skip(xfail_device=device) | ||
| elif MR is ManagedMemoryResource: | ||
| skip_if_managed_memory_unsupported(device) | ||
| mr = create_managed_memory_resource_or_skip(MROps(preferred_location=device.device_id)) | ||
| mr = create_managed_memory_resource_or_xfail(MROps(preferred_location=device.device_id)) | ||
| else: | ||
| assert MR is DeviceMemoryResource | ||
| mr = MR(device) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
`getattr(device, "device_id", device)`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done: commit 20c8a7a
Thanks for catching this!