Skip to content

Commit f993098

Browse files
mdboom and cpcloud authored
nvbug6084457: Fix device architecture handling and NVLink link count query (#1937)
* nvbug6084457: Fix device architecture handling and NVLink link count query
* Apply suggestion from @cpcloud

  Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
* Simplify code

---------

Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
1 parent 44b80d8 commit f993098

3 files changed

Lines changed: 14 additions & 4 deletions

File tree

cuda_bindings/tests/nvml/test_init.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,11 +25,17 @@ def test_devices_are_the_same_architecture(all_devices):
2525
# they won't be tested properly. This tests for the (hopefully rare) case
2626
# where a system has devices of different architectures and produces a warning.
2727

28-
all_arches = {nvml.DeviceArch(nvml.device_get_architecture(device)) for device in all_devices}
28+
def get_architecture_name(arch):
29+
try:
30+
return nvml.DeviceArch(arch).name
31+
except ValueError:
32+
return f"UNKNOWN_ARCH_ID({arch})"
33+
34+
all_arches = {nvml.device_get_architecture(device) for device in all_devices}
2935

3036
if len(all_arches) > 1:
3137
warnings.warn(
32-
f"System has devices of multiple architectures ({', '.join(x.name for x in all_arches)}). "
38+
f"System has devices of multiple architectures ({', '.join(get_architecture_name(x) for x in all_arches)}). "
3339
f" Some tests may be skipped unexpectedly",
3440
UserWarning,
3541
)

cuda_bindings/tests/nvml/test_nvlink.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,4 +26,4 @@ def test_nvlink_get_link_count(all_devices):
2626
# The feature_nvlink_supported detection is not robust, so we
2727
# can't be more specific about how many links we should find.
2828
if value.nvml_return == nvml.Return.SUCCESS:
29-
assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}"
29+
assert value.value.ui_val[0] <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val[0]}"

cuda_core/cuda/core/system/_device.pyx

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,11 @@ cdef class Device:
180180
"VOLTA"``, and RTX A6000 will report ``DeviceArchitecture.name ==
181181
"AMPERE"``.
182182
"""
183-
return DeviceArch(nvml.device_get_architecture(self._handle))
183+
arch = nvml.device_get_architecture(self._handle)
184+
try:
185+
return DeviceArch(arch)
186+
except ValueError:
187+
return nvml.DeviceArch.UNKNOWN
184188

185189
@property
186190
def name(self) -> str:

0 commit comments

Comments
 (0)