Skip to content

Commit c747f7b

Browse files
authored
cuda.core.system: Add basic Nvlink and Utilization support (#1918)
* cuda.core.system: Add basic Nvlink and Utilization support * Address comments in the PR * Fix test
1 parent aa10843 commit c747f7b

6 files changed

Lines changed: 161 additions & 0 deletions

File tree

cuda_core/cuda/core/system/_device.pyx

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,12 @@ include "_field_values.pxi"
3333
include "_inforom.pxi"
3434
include "_memory.pxi"
3535
include "_mig.pxi"
36+
include "_nvlink.pxi"
3637
include "_pci_info.pxi"
3738
include "_performance.pxi"
3839
include "_repair_status.pxi"
3940
include "_temperature.pxi"
41+
include "_utilization.pxi"
4042

4143

4244
cdef class Device:
@@ -702,6 +704,20 @@ cdef class Device:
702704
"""
703705
return MemoryInfo(nvml.device_get_memory_info_v2(self._handle))
704706
707+
##########################################################################
708+
# NVLINK
709+
# See external class definitions in _nvlink.pxi
710+
711+
def get_nvlink(self, link: int) -> NvlinkInfo:
    """
    Get :obj:`~NvlinkInfo` for one NVLink link of this device.

    For devices with NVLink support.

    Parameters
    ----------
    link: int
        Index of the link to query. Must lie in the half-open range
        ``[0, NvlinkInfo.max_links)``.

    Returns
    -------
    NvlinkInfo
        An object bound to this device and the given link index.

    Raises
    ------
    ValueError
        If ``link`` is negative or not less than ``NvlinkInfo.max_links``.
    """
    # Reject bad indices up front so the error names the valid range.
    out_of_range = link < 0 or link >= NvlinkInfo.max_links
    if out_of_range:
        raise ValueError(f"Link index {link} is out of range [0, {NvlinkInfo.max_links})")
    return NvlinkInfo(self, link)
720+
705721
##########################################################################
706722
# PCI INFO
707723
# See external class definitions in _pci_info.pxi
@@ -798,6 +814,31 @@ cdef class Device:
798814
device._handle = handle
799815
yield device
800816
817+
#######################################################################
818+
# UTILIZATION
819+
820+
@property
def utilization(self) -> Utilization:
    """
    The current :obj:`~Utilization` rates for the device's major subsystems.

    For Fermi™ or newer fully supported devices.

    Note: While the driver is initializing with ECC enabled, GPU and memory
    utilization readings can be high. This is caused by the ECC memory
    scrubbing mechanism that runs during driver initialization.

    Note: On MIG-enabled GPUs, querying device utilization rates is not
    currently supported.

    Returns
    -------
    Utilization
        An object containing the current utilization rates for the device.
    """
    # Query NVML first, then wrap the raw result in the public type.
    rates = nvml.device_get_utilization_rates(self._handle)
    return Utilization(rates)
841+
801842
802843
def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel:
803844
"""
@@ -872,10 +913,12 @@ __all__ = [
872913
"GpuP2PStatus",
873914
"GpuTopologyLevel",
874915
"InforomObject",
916+
"NvlinkVersion",
875917
"PcieUtilCounter",
876918
"Pstates",
877919
"TemperatureSensors",
878920
"TemperatureThresholds",
879921
"ThermalController",
880922
"ThermalTarget",
923+
"Utilization",
881924
]
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
6+
# Alias of the NVML binding's ``NvlinkVersion`` so it is available under this module's namespace.
NvlinkVersion = nvml.NvlinkVersion
7+
8+
9+
cdef class NvlinkInfo:
    """
    NVLink information for a single link of a device.

    Instances hold the owning device and a link index; each property
    queries NVML for that (device, link) pair when it is read.
    """
    cdef Device _device
    cdef int _link

    def __init__(self, device: Device, link: int):
        self._device = device
        self._link = link

    @property
    def version(self) -> NvlinkVersion:
        """
        The :obj:`~NvlinkVersion` of this device's link.

        For all products with NVLink support.

        Returns
        -------
        NvlinkVersion
            The NVLink version.
        """
        raw_version = nvml.device_get_nvlink_version(self._device._handle, self._link)
        return NvlinkVersion(raw_version)

    @property
    def state(self) -> bool:
        """
        Whether this device's link is currently active.

        For Pascal™ or newer fully supported devices with NVLink support.

        Returns
        -------
        bool
            `True` if the NVLink is active.
        """
        link_state = nvml.device_get_nvlink_state(self._device._handle, self._link)
        # The link counts as active only when NVML reports FEATURE_ENABLED.
        return link_state == nvml.EnableState.FEATURE_ENABLED

    # Exclusive upper bound for link indices, taken from the NVML binding.
    max_links = nvml.NVLINK_MAX_LINKS
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
6+
cdef class Utilization:
    """
    Utilization rates for a device.

    For devices with compute capability 2.0 or higher.

    Wraps the utilization object passed to the constructor and exposes its
    fields as read-only properties.
    """
    cdef object _utilization

    def __init__(self, utilization: nvml.Utilization):
        self._utilization = utilization

    @property
    def gpu(self) -> int:
        """
        Percent of time over the past sample period during which one or more kernels was executing on the GPU.
        """
        rates = self._utilization
        return rates.gpu

    @property
    def memory(self) -> int:
        """
        Percent of time over the past sample period during which global (device) memory was being read or written.
        """
        rates = self._utilization
        return rates.memory

cuda_core/docs/source/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ Enums
220220
system.FanControlPolicy
221221
system.FieldId
222222
system.InforomObject
223+
system.NvlinkVersion
223224
system.PcieUtilCounter
224225
system.Pstates
225226
system.TemperatureSensors

cuda_core/docs/source/api_private.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ NVML
7777
system._device.InforomInfo
7878
system._device.MemoryInfo
7979
system._device.MigInfo
80+
system._device.NvlinkInfo
8081
system._device.PciInfo
8182
system._device.RepairStatus
8283
system._device.Temperature

cuda_core/tests/system/test_system_device.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,41 @@ def test_pstates():
731731
assert isinstance(utilization.dec_threshold, int)
732732

733733

734+
def test_nvlink():
    """Check the types returned by the NVLink accessors for every link index of every device."""
    for device in system.Device.get_all_devices():
        link_count = _device.NvlinkInfo.max_links
        assert isinstance(link_count, int)
        assert link_count > 0

        for link_index in range(link_count):
            # Each accessor is wrapped separately in `unsupported_before`.
            with unsupported_before(device, None):
                link_info = device.get_nvlink(link_index)
            assert isinstance(link_info, _device.NvlinkInfo)

            with unsupported_before(device, None):
                link_version = link_info.version
            assert isinstance(link_version, system.NvlinkVersion)

            with unsupported_before(device, None):
                link_state = link_info.state
            assert isinstance(link_state, bool)
752+
753+
754+
def test_utilization():
    """Check that each device reports integer gpu/memory utilization percentages within 0..100."""
    for device in system.Device.get_all_devices():
        with unsupported_before(device, None):
            rates = device.utilization
        assert isinstance(rates, system.Utilization)

        # Same checks for both fields, read in the original order (gpu, then memory).
        for field_name in ("gpu", "memory"):
            percent = getattr(rates, field_name)
            assert isinstance(percent, int)
            assert 0 <= percent <= 100
767+
768+
734769
@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="MIG not supported on WSL or Windows")
735770
def test_mig():
736771
for device in system.Device.get_all_devices():

0 commit comments

Comments
 (0)