Skip to content

Commit 870956d

Browse files
authored
Fix MIG-related tests in cuda.core.system (#2065)
* Fix MIG-related tests in cuda.core.system
* Improve docs and exception handling
* Add release notes
* Fix cuda_bindings part of the tests
1 parent d62ff30 commit 870956d

9 files changed

Lines changed: 79 additions & 13 deletions

File tree

cuda_bindings/tests/nvml/conftest.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,23 @@ def nmigs(handles):
106106

107107
@pytest.fixture
108108
def mig_handles(nmigs):
109-
handles = [nvml.device_get_mig_device_handle_by_index(i) for i in range(nmigs)]
110-
assert len(handles) == nmigs
109+
handles = []
110+
with NVMLInitializer():
111+
dev_count = nvml.device_get_count_v2()
112+
113+
for dev_idx in range(dev_count):
114+
try:
115+
dev = nvml.device_get_handle_by_index_v2(dev_idx)
116+
except nvml.NoPermissionError:
117+
continue
118+
for mig_idx in range(nmigs):
119+
try:
120+
mig = nvml.device_get_mig_device_handle_by_index(dev, mig_idx)
121+
except nvml.NotFoundError:
122+
# Not all MIG devices may be available
123+
continue
124+
else:
125+
handles.append(mig)
111126
return handles
112127

113128

cuda_bindings/tests/nvml/test_cuda.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
import os
55

6+
import pytest
7+
68
import cuda.bindings.driver as cuda
79
from cuda.bindings import nvml
810

@@ -56,6 +58,10 @@ def test_cuda_device_order():
5658
cuda_devices = get_cuda_device_names()
5759
nvml_devices = get_nvml_device_names()
5860

61+
if any("Thor" in device["name"] for device in nvml_devices):
62+
pytest.skip("Skipping test on Thor, which has non-standard device naming")
63+
return
64+
5965
if "CUDA_VISIBLE_DEVICES" not in os.environ:
6066
# If that environment variable isn't set, the device lists should match exactly
6167
assert cuda_devices == nvml_devices, "CUDA and NVML device lists do not match"

cuda_bindings/tests/nvml/test_pynvml.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def test_device_get_memory_info(ngpus, handles):
187187

188188
def test_device_get_utilization_rates(ngpus, handles):
189189
for i in range(ngpus):
190-
with unsupported_before(handles[i], "FERMI"):
190+
with unsupported_before(handles[i], None):
191191
urate = nvml.device_get_utilization_rates(handles[i])
192192
assert urate.gpu >= 0
193193
assert urate.memory >= 0

cuda_core/cuda/core/system/_device.pyx

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,15 @@ cdef class Device:
376376
-------
377377
cuda.core.Device
378378
The corresponding CUDA device.
379+
380+
Raises
381+
------
382+
RuntimeError
383+
No corresponding CUDA device is found for this NVML device.
384+
385+
For example, on a MIG system, the physical GPU will not have an
386+
available CUDA device, since it cannot be used directly, even
387+
though it can be enumerated from NVML.
379388
"""
380389
from cuda.core import Device as CudaDevice
381390

@@ -890,8 +899,16 @@ cdef class Device:
890899
def pci_info(self) -> PciInfo:
891900
"""
892901
:obj:`~_device.PciInfo` object with the PCI attributes of this device.
902+
903+
Non-physical devices, such as MIG devices, may not have PCI attributes.
904+
In that case, this property will raise a `RuntimeError`.
893905
"""
894-
return PciInfo(nvml.device_get_pci_info_ext(self._handle), self._handle)
906+
try:
907+
pci_info = nvml.device_get_pci_info_ext(self._handle)
908+
except nvml.InvalidArgumentError:
909+
raise RuntimeError("This device does not have PCI attributes") from None
910+
else:
911+
return PciInfo(pci_info, self._handle)
895912
896913
##########################################################################
897914
# PERFORMANCE

cuda_core/cuda/core/system/_mig.pxi

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,4 +163,8 @@ cdef class MigInfo:
163163
A list of all MIG devices corresponding to this GPU.
164164
"""
165165
for i in range(self.device_count):
166-
yield self.get_device_by_index(i)
166+
try:
167+
yield self.get_device_by_index(i)
168+
except nvml.NotFoundError:
169+
# Not all MIG devices may be available
170+
continue
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
.. SPDX-License-Identifier: Apache-2.0
3+
4+
.. currentmodule:: cuda.core
5+
6+
``cuda.core`` 1.0.1 Release Notes
7+
=================================
8+
9+
10+
Fixes and enhancements
11+
----------------------
12+
13+
- When iterating over MIG devices with
14+
``cuda.core.system.Device.mig.get_all_devices``, only available MIG devices will
15+
be returned. Previously, if any MIG device was unavailable, an exception would
16+
be raised. (`#2065 <https://github.com/NVIDIA/cuda-python/issues/2065>`__)
17+
- When converting an NVML device (``cuda.core.system.Device``) to a CUDA device
18+
(``cuda.core.Device``), using ``cuda.core.system.Device.to_cuda_device``, if the
19+
device does not have a matching CUDA device, a ``RuntimeError`` will be raised.
20+
Previously, a ``cuda.core.system.InvalidArgumentError`` would be raised. For
21+
example, a physical GPU on a MIG system has no matching CUDA device.

cuda_core/tests/system/test_system_device.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,12 @@ def test_to_cuda_device():
5656
from cuda.core import Device as CudaDevice
5757

5858
for device in system.Device.get_all_devices():
59-
cuda_device = device.to_cuda_device()
59+
try:
60+
cuda_device = device.to_cuda_device()
61+
except RuntimeError:
62+
# Not all physical NVML devices may have a matching CUDA device
63+
# when MIG is involved.
64+
continue
6065

6166
assert isinstance(cuda_device, CudaDevice)
6267
assert cuda_device.uuid == device.uuid_without_prefix

cuda_core/tests/system/test_system_system.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,6 @@ def test_kernel_mode_driver_version():
4040
assert 0 <= ver_patch[0] <= 99
4141

4242

43-
def test_num_devices():
44-
num_devices = system.get_num_devices()
45-
expected_num_devices = handle_return(runtime.cudaGetDeviceCount())
46-
assert num_devices == expected_num_devices, "Number of devices does not match expected value"
47-
48-
4943
def test_devices():
5044
devices = Device.get_all_devices()
5145
expected_num_devices = handle_return(runtime.cudaGetDeviceCount())

cuda_core/tests/test_device.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4+
import contextlib
5+
46
try:
57
from cuda.bindings import driver, runtime
68
except ImportError:
@@ -45,7 +47,9 @@ def test_to_system_device(deinit_cuda):
4547

4648
# CUDA only returns a 2-byte PCI bus ID domain, whereas NVML returns a
4749
# 4-byte domain
48-
assert device.pci_bus_id == system_device.pci_info.bus_id[4:]
50+
# MIG devices don't have pci_info, so skip the bus ID check if it's missing
51+
with contextlib.suppress(RuntimeError):
52+
assert device.pci_bus_id == system_device.pci_info.bus_id[4:]
4953

5054

5155
def test_device_set_current(deinit_cuda):

0 commit comments

Comments
 (0)