Skip to content

Commit 5dd4ac9

Browse files
mdboom and Copilot authored
cuda.core.system: Better checks for when we expect APIs to be unsupported (#1510)
* cuda.core.system: Better checks for when we expect APIs to be unsupported * Update cuda_core/tests/system/test_system_device.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update cuda_core/tests/system/conftest.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update cuda_bindings/tests/nvml/conftest.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Fix importing of conftest.py * Fix tests * Fix tests * Fix a test on T4 * Refactor test helper * Use Unicode character * Fix imports * Add warning if devices are different architectures * Add some comments --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 9cc2ae5 commit 5dd4ac9

11 files changed

Lines changed: 245 additions & 234 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
3+
4+
5+
# This package contains test helper utilities that may also be useful for other libraries outside of `cuda.bindings`,
6+
# such as `cuda.core`. These utilities are not part of the public API of `cuda.bindings` and may change without notice.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
3+
4+
5+
from contextlib import contextmanager
6+
7+
import pytest
8+
from cuda.bindings import _nvml as nvml
9+
10+
11+
@contextmanager
def unsupported_before(device: int, expected_device_arch: "nvml.DeviceArch | str | None"):
    """Context manager asserting correct behavior for APIs unsupported on older devices.

    Parameters
    ----------
    device : int
        NVML device handle to query.
    expected_device_arch : nvml.DeviceArch | str | None
        Earliest architecture expected to support the wrapped API call.
        ``None`` (or the sentinel string ``"HAS_INFOROM"``) means support is
        unknown: a ``NotSupportedError`` skips the test rather than failing it.
        The string ``"FERMI"`` marks architectures that predate the first
        ``DeviceArch`` enum member.
    """
    device_arch = nvml.device_get_architecture(device)

    # Map the expected architecture onto an integer for comparison. "FERMI"
    # predates the first real enum member, so it is given the value 1, below
    # every known DeviceArch but above the "anything goes" value 0.
    if isinstance(expected_device_arch, nvml.DeviceArch):
        expected_device_arch_int = int(expected_device_arch)
    elif expected_device_arch == "FERMI":
        expected_device_arch_int = 1
    else:
        expected_device_arch_int = 0

    if expected_device_arch is None or expected_device_arch == "HAS_INFOROM" or device_arch == nvml.DeviceArch.UNKNOWN:
        # In this case, we don't /know/ if it will fail, but we are ok if it
        # does or does not.

        # TODO: There are APIs that are documented as supported only if the
        # device has an InfoROM, but I couldn't find a way to detect that. For
        # now, they are just handled as "possibly failing".
        try:
            yield
        except nvml.NotSupportedError:
            # The API call raised NotSupportedError, so we skip the test, but
            # don't fail it
            pytest.skip(
                f"Unsupported call for device architecture {nvml.DeviceArch(device_arch).name} "
                f"on device '{nvml.device_get_name(device)}'"
            )
        # If the API call worked, just continue
    elif int(device_arch) < expected_device_arch_int:
        # In this case, we /know/ it will fail, and we want to assert that it does.
        with pytest.raises(nvml.NotSupportedError):
            yield
        # The above call was unsupported, so the rest of the test is skipped.
        # NOTE: `expected_device_arch` may be a plain string here (e.g.
        # "FERMI"), which has no `.name` attribute — format it safely instead
        # of unconditionally reading `.name`.
        expected_name = (
            expected_device_arch.name
            if isinstance(expected_device_arch, nvml.DeviceArch)
            else expected_device_arch
        )
        pytest.skip(f"Unsupported before {expected_name}, got {nvml.device_get_name(device)}")
    else:
        # In this case, we /know/ it should work, and if it fails, the test should fail.
        yield

cuda_bindings/tests/nvml/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pytest
77
from cuda.bindings import _nvml as nvml
8+
from cuda.bindings._test_helpers.arch_check import unsupported_before # noqa: F401
89

910

1011
class NVMLInitializer:

cuda_bindings/tests/nvml/test_compute_mode.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import pytest
88
from cuda.bindings import _nvml as nvml
99

10+
from .conftest import unsupported_before
11+
1012
COMPUTE_MODES = [
1113
nvml.ComputeMode.COMPUTEMODE_DEFAULT,
1214
nvml.ComputeMode.COMPUTEMODE_PROHIBITED,
@@ -16,18 +18,11 @@
1618

1719
@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
1820
def test_compute_mode_supported_nonroot(all_devices):
19-
skip_reasons = set()
2021
for device in all_devices:
21-
try:
22+
with unsupported_before(device, None):
2223
original_compute_mode = nvml.device_get_compute_mode(device)
23-
except nvml.NotSupportedError:
24-
skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
25-
continue
2624

2725
for cm in COMPUTE_MODES:
2826
with pytest.raises(nvml.NoPermissionError):
2927
nvml.device_set_compute_mode(device, cm)
3028
assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
31-
32-
if skip_reasons:
33-
pytest.skip(" ; ".join(skip_reasons))

cuda_bindings/tests/nvml/test_gpu.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from cuda.bindings import _nvml as nvml
66

77
from . import util
8+
from .conftest import unsupported_before
89

910

1011
def test_gpu_get_module_id(nvml_init):
@@ -23,23 +24,14 @@ def test_gpu_get_module_id(nvml_init):
2324

2425

2526
def test_gpu_get_platform_info(all_devices):
26-
skip_reasons = set()
2727
for device in all_devices:
2828
if util.is_vgpu(device):
29-
skip_reasons.add(f"Not supported on vGPU device {device}")
30-
continue
29+
pytest.skip(f"Not supported on vGPU device {device}")
3130

32-
# TODO
33-
# if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
34-
# test_utils.skip_test("Not supported on chip before Blackwell")
31+
# Documentation says Blackwell or newer only, but this does seem to pass
32+
# on some newer GPUs.
3533

36-
try:
34+
with unsupported_before(device, None):
3735
platform_info = nvml.device_get_platform_info(device)
38-
except nvml.NotSupportedError:
39-
skip_reasons.add(f"Not supported returned, linkely NVLink is disable for {device}")
40-
continue
4136

4237
assert isinstance(platform_info, nvml.PlatformInfo_v2)
43-
44-
if skip_reasons:
45-
pytest.skip(" ; ".join(skip_reasons))

cuda_bindings/tests/nvml/test_init.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
33

44
import sys
5+
import warnings
56

67
import pytest
78
from cuda.bindings import _nvml as nvml
@@ -16,6 +17,23 @@ def assert_nvml_is_uninitialized():
1617
nvml.device_get_count_v2()
1718

1819

20+
def test_devices_are_the_same_architecture(all_devices):
    """Warn when the system mixes GPU architectures.

    The tests in this directory that use `unsupported_before` will generally
    skip the entire test after the first device that isn't supported is found.
    This means that if subsequent devices are of a different architecture,
    they won't be tested properly. This tests for the (hopefully rare) case
    where a system has devices of different architectures and produces a warning.
    """
    all_arches = {nvml.DeviceArch(nvml.device_get_architecture(device)) for device in all_devices}

    if len(all_arches) > 1:
        warnings.warn(  # noqa: B028
            f"System has devices of multiple architectures ({', '.join(x.name for x in all_arches)}). "
            "Some tests may be skipped unexpectedly",
            UserWarning,
        )
35+
36+
1937
@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
2038
def test_init_ref_count():
2139
"""

cuda_bindings/tests/nvml/test_pynvml.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from cuda.bindings import _nvml as nvml
1111

1212
from . import util
13+
from .conftest import unsupported_before
1314

1415
XFAIL_LEGACY_NVLINK_MSG = "Legacy NVLink test expected to fail."
1516

@@ -66,7 +67,8 @@ def test_device_get_handle_by_pci_bus_id(ngpus, pci_info):
6667
def test_device_get_memory_affinity(handles, scope):
6768
size = 1024
6869
for handle in handles:
69-
node_set = nvml.device_get_memory_affinity(handle, size, scope)
70+
with unsupported_before(handle, nvml.DeviceArch.KEPLER):
71+
node_set = nvml.device_get_memory_affinity(handle, size, scope)
7072
assert node_set is not None
7173
assert len(node_set) == size
7274

@@ -76,7 +78,8 @@ def test_device_get_memory_affinity(handles, scope):
7678
def test_device_get_cpu_affinity_within_scope(handles, scope):
7779
size = 1024
7880
for handle in handles:
79-
cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
81+
with unsupported_before(handle, nvml.DeviceArch.KEPLER):
82+
cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
8083
assert cpu_set is not None
8184
assert len(cpu_set) == size
8285

@@ -136,22 +139,22 @@ def test_device_get_p2p_status(handles, index):
136139

137140
def test_device_get_power_usage(ngpus, handles):
138141
for i in range(ngpus):
139-
try:
142+
# Note: documentation says this is supported on Fermi or newer,
143+
# but in practice it fails on some later architectures.
144+
with unsupported_before(handles[i], None):
140145
power_mwatts = nvml.device_get_power_usage(handles[i])
141-
except nvml.NotSupportedError:
142-
pytest.skip("device_get_power_usage not supported")
143146
assert power_mwatts >= 0.0
144147

145148

146149
def test_device_get_total_energy_consumption(ngpus, handles):
147150
for i in range(ngpus):
148-
try:
151+
with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
149152
energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
150-
except nvml.NotSupportedError:
151-
pytest.skip("device_get_total_energy_consumption not supported")
153+
152154
for j in range(10): # idle for 150 ms
153155
time.sleep(0.015) # and check for increase every 15 ms
154-
energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
156+
with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
157+
energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
155158
assert energy_mjoules2 >= energy_mjoules1
156159
if energy_mjoules2 > energy_mjoules1:
157160
break
@@ -182,7 +185,8 @@ def test_device_get_memory_info(ngpus, handles):
182185

183186
def test_device_get_utilization_rates(ngpus, handles):
184187
for i in range(ngpus):
185-
urate = nvml.device_get_utilization_rates(handles[i])
188+
with unsupported_before(handles[i], "FERMI"):
189+
urate = nvml.device_get_utilization_rates(handles[i])
186190
assert urate.gpu >= 0
187191
assert urate.memory >= 0
188192

@@ -239,7 +243,8 @@ def test_device_get_utilization_rates(ngpus, handles):
239243

240244
def test_device_get_pcie_throughput(ngpus, handles):
241245
for i in range(ngpus):
242-
tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
246+
with unsupported_before(handles[i], nvml.DeviceArch.MAXWELL):
247+
tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
243248
assert tx_bytes_tp >= 0
244249
rx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_RX_BYTES)
245250
assert rx_bytes_tp >= 0
@@ -271,10 +276,10 @@ def test_device_get_pcie_throughput(ngpus, handles):
271276
def test_device_get_nvlink_capability(ngpus, handles, cap_type):
272277
for i in range(ngpus):
273278
for j in range(nvml.NVLINK_MAX_LINKS):
274-
try:
279+
# By the documentation, this should be supported on PASCAL or newer,
280+
# but this also seems to fail on newer.
281+
with unsupported_before(handles[i], None):
275282
cap = nvml.device_get_nvlink_capability(handles[i], j, cap_type)
276-
except nvml.NotSupportedError:
277-
pytest.skip("NVLink capability not supported")
278283
assert cap >= 0
279284

280285

cuda_core/cuda/core/system/_device.pyx

Lines changed: 15 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ ClocksEventReasons = nvml.ClocksEventReasons
2121
ClockType = nvml.ClockType
2222
CoolerControl = nvml.CoolerControl
2323
CoolerTarget = nvml.CoolerTarget
24+
DeviceArch = nvml.DeviceArch
2425
EventType = nvml.EventType
2526
FanControlPolicy = nvml.FanControlPolicy
2627
FieldId = nvml.FieldId
@@ -45,41 +46,6 @@ include "_performance.pxi"
4546
include "_temperature.pxi"
4647

4748

48-
class DeviceArchitecture:
49-
"""
50-
Device architecture enumeration.
51-
"""
52-
53-
def __init__(self, architecture: int):
54-
try:
55-
self._architecture = nvml.DeviceArch(architecture)
56-
except ValueError:
57-
self._architecture = None
58-
59-
@property
60-
def id(self) -> int:
61-
"""
62-
The numeric id of the device architecture.
63-
64-
Returns -1 if the device is unknown.
65-
"""
66-
if self._architecture is None:
67-
return -1
68-
return int(self._architecture)
69-
70-
@property
71-
def name(self) -> str:
72-
"""
73-
The name of the device architecture.
74-
75-
Returns "Unlisted" if the device is unknown.
76-
"""
77-
if self._architecture is None:
78-
return "Unlisted"
79-
name = self._architecture.name
80-
return name[name.rfind("_") + 1 :].title()
81-
82-
8349
cdef class MemoryInfo:
8450
"""
8551
Memory allocation information for a device.
@@ -692,7 +658,8 @@ cdef class Device:
692658
If anything other than a single `index`, `uuid` or `pci_bus_id` are specified.
693659
"""
694660

695-
cdef intptr_t _handle
661+
# This is made public for testing purposes only
662+
cdef public intptr_t _handle
696663

697664
def __init__(
698665
self,
@@ -976,16 +943,15 @@ cdef class Device:
976943
return [Pstates(x) for x in nvml.device_get_supported_performance_states(self._handle)]
977944

978945
@property
979-
def architecture(self) -> DeviceArchitecture:
946+
def arch(self) -> DeviceArch:
980947
"""
981-
Device architecture. For example, a Tesla V100 will report
982-
``DeviceArchitecture.name == "Volta"``, and RTX A6000 will report
983-
``DeviceArchitecture.name == "Ampere"``. If the device returns an
984-
architecture that is unknown to NVML then ``DeviceArchitecture.name ==
985-
"Unknown"`` is reported, whereas an architecture that is unknown to
986-
cuda.core.system is reported as ``DeviceArchitecture.name == "Unlisted"``.
948+
Device architecture.
949+
950+
For example, a Tesla V100 will report ``DeviceArchitecture.name ==
951+
"VOLTA"``, and RTX A6000 will report ``DeviceArchitecture.name ==
952+
"AMPERE"``.
987953
"""
988-
return DeviceArchitecture(nvml.device_get_architecture(self._handle))
954+
return DeviceArch(nvml.device_get_architecture(self._handle))
989955

990956
@property
991957
def bar1_memory_info(self) -> BAR1MemoryInfo:
@@ -1051,6 +1017,8 @@ cdef class Device:
10511017
"""
10521018
Retrieves the globally unique board serial number associated with this
10531019
device's board.
1020+
1021+
For all products with an InfoROM.
10541022
"""
10551023
return nvml.device_get_serial(self._handle)
10561024

@@ -1300,6 +1268,8 @@ cdef class Device:
13001268
"""
13011269
Get the addressing mode of the device.
13021270

1271+
For Turing™ or newer fully supported devices.
1272+
13031273
Addressing modes can be one of:
13041274

13051275
- :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_HMM`: System allocated
@@ -1518,7 +1488,7 @@ __all__ = [
15181488
"CoolerInfo",
15191489
"CoolerTarget",
15201490
"Device",
1521-
"DeviceArchitecture",
1491+
"DeviceArch",
15221492
"DeviceAttributes",
15231493
"DeviceEvents",
15241494
"EventData",

cuda_core/docs/source/api.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ CUDA system information and NVIDIA Management Library (NVML)
103103
system.CoolerControl
104104
system.CoolerInfo
105105
system.CoolerTarget
106-
system.DeviceArchitecture
106+
system.DeviceArch
107107
system.DeviceAttributes
108108
system.DeviceEvents
109109
system.EventData

cuda_core/tests/system/conftest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,9 @@
99
skip_if_nvml_unsupported = pytest.mark.skipif(
1010
not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE, reason="NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+"
1111
)
12+
13+
14+
def unsupported_before(device, expected_device_arch):
    """Adapt the `cuda.bindings` arch-check helper to `cuda.core` Device objects.

    Unwraps the `cuda.core` ``Device`` to its raw NVML handle (``_handle``) and
    delegates to the shared helper from ``cuda.bindings._test_helpers``.
    NOTE(review): the import is function-local — presumably so that test
    collection does not hard-require the `cuda.bindings` test helpers; confirm.
    """
    from cuda.bindings._test_helpers.arch_check import unsupported_before as nvml_unsupported_before

    return nvml_unsupported_before(device._handle, expected_device_arch)

0 commit comments

Comments
 (0)