Skip to content

Commit 9013c64

Browse files
committed
cuda.core.system: Add basic Nvlink and Utilization support
1 parent 82e6bb8 commit 9013c64

File tree

5 files changed

+138
-0
lines changed

5 files changed

+138
-0
lines changed

cuda_core/cuda/core/system/_device.pyx

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,12 @@ include "_fan.pxi"
3232
include "_field_values.pxi"
3333
include "_inforom.pxi"
3434
include "_memory.pxi"
35+
include "_nvlink.pxi"
3536
include "_pci_info.pxi"
3637
include "_performance.pxi"
3738
include "_repair_status.pxi"
3839
include "_temperature.pxi"
40+
include "_utilization.pxi"
3941

4042

4143
cdef class Device:
@@ -674,6 +676,18 @@ cdef class Device:
674676
"""
675677
return MemoryInfo(nvml.device_get_memory_info_v2(self._handle))
676678
679+
##########################################################################
680+
# NVLINK
681+
# See external class definitions in _nvlink.pxi
682+
683+
def nvlink(self, link: int) -> NvlinkInfo:
    """
    Get information about one NVLink link on this device.

    For devices with NVLink support.

    Parameters
    ----------
    link : int
        Index of the link to query. ``NvlinkInfo.max_links`` exposes NVML's
        maximum link count; presumably valid indices are below that value
        (TODO confirm against the NVML documentation).

    Returns
    -------
    NvlinkInfo
        An object bound to this device and ``link``. Creating it only
        stores the device and link index; NVML is queried when the
        object's properties are read.
    """
    return NvlinkInfo(self, link)
690+
677691
##########################################################################
678692
# PCI INFO
679693
# See external class definitions in _pci_info.pxi
@@ -765,6 +779,30 @@ cdef class Device:
765779
device._handle = handle
766780
yield device
767781
782+
#######################################################################
783+
# UTILIZATION
784+
785+
@property
def utilization(self) -> Utilization:
    """
    Retrieves the current utilization rates for the device's major subsystems.

    For Fermi™ or newer fully supported devices.

    Note: During driver initialization when ECC is enabled one can see high
    GPU and Memory Utilization readings. This is caused by ECC Memory
    Scrubbing mechanism that is performed during driver initialization.

    Note: On MIG-enabled GPUs, querying device utilization rates is not
    currently supported.

    Returns
    -------
    Utilization
        An object containing the current utilization rates for the device.
    """
    # Wrap the raw NVML result so callers get the public Utilization type.
    return Utilization(nvml.device_get_utilization_rates(self._handle))
805+
768806
769807
def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel:
770808
"""
@@ -853,6 +891,8 @@ __all__ = [
853891
"InforomInfo",
854892
"InforomObject",
855893
"MemoryInfo",
894+
"NvlinkInfo",
895+
"NvlinkVersion",
856896
"PcieUtilCounter",
857897
"PciInfo",
858898
"Pstates",
@@ -864,4 +904,5 @@ __all__ = [
864904
"ThermalSensor",
865905
"ThermalSettings",
866906
"ThermalTarget",
907+
"Utilization",
867908
]
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
6+
NvlinkVersion = nvml.NvlinkVersion
7+
8+
9+
cdef class NvlinkInfo:
    """
    NVLink information for a single link on a device.

    Instances are normally obtained through ``Device.nvlink(link)``.

    Attributes
    ----------
    max_links : int
        Class-level constant taken from ``nvml.NVLINK_MAX_LINKS``.
    """
    # Device whose NVML handle is used for every query made by this object.
    cdef Device _device
    # Index of the link on that device.
    cdef int _link

    def __init__(self, device: Device, link: int):
        # Only records the target; NVML is not queried until a property is read.
        self._device = device
        self._link = link

    @property
    def version(self) -> NvlinkVersion:
        """
        Retrieves the NVLink version of this link.

        For all products with NVLink support.

        Returns
        -------
        NvlinkVersion
            The NVLink version.
        """
        return NvlinkVersion(nvml.device_get_nvlink_version(self._device._handle, self._link))

    max_links = nvml.NVLINK_MAX_LINKS
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
6+
cdef class Utilization:
    """
    Utilization rates for a device.

    Thin read-only wrapper around the utilization object returned by NVML;
    each property forwards to the attribute of the same name on that object.

    For devices with compute capability 2.0 or higher.
    """
    # Underlying NVML utilization object; held as a generic Python object.
    cdef object _utilization

    def __init__(self, utilization: nvml.Utilization):
        self._utilization = utilization

    @property
    def gpu(self) -> int:
        """
        Percent of time over the past sample period during which one or more kernels was executing on the GPU.
        """
        return self._utilization.gpu

    @property
    def memory(self) -> int:
        """
        Percent of time over the past sample period during which global (device) memory was being read or written.
        """
        return self._utilization.memory

cuda_core/docs/source/api.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ Enums
222222
system.FanControlPolicy
223223
system.FieldId
224224
system.InforomObject
225+
system.NvlinkVersion
225226
system.PcieUtilCounter
226227
system.Pstates
227228
system.TemperatureSensors
@@ -256,11 +257,13 @@ Types
256257
system.GpuTopologyLevel
257258
system.InforomInfo
258259
system.MemoryInfo
260+
system.NvlinkInfo
259261
system.PciInfo
260262
system.RepairStatus
261263
system.Temperature
262264
system.ThermalSensor
263265
system.ThermalSettings
266+
system.Utilization
264267

265268
.. module:: cuda.core.utils
266269

cuda_core/tests/system/test_system_device.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -729,3 +729,34 @@ def test_pstates():
729729
assert isinstance(utilization.percentage, int)
730730
assert isinstance(utilization.inc_threshold, int)
731731
assert isinstance(utilization.dec_threshold, int)
732+
733+
734+
def test_nvlink():
735+
for device in system.Device.get_all_devices():
736+
max_links = system.NvlinkInfo.max_links
737+
assert isinstance(max_links, int)
738+
assert max_links > 0
739+
740+
for link in range(max_links):
741+
with unsupported_before(device, None):
742+
nvlink_info = device.nvlink(link)
743+
assert isinstance(nvlink_info, system.NvlinkInfo)
744+
745+
with unsupported_before(device, None):
746+
version = nvlink_info.version
747+
assert isinstance(version, system.NvlinkVersion)
748+
749+
750+
def test_utilization():
751+
for device in system.Device.get_all_devices():
752+
with unsupported_before(device, None):
753+
utilization = device.utilization
754+
assert isinstance(utilization, system.Utilization)
755+
756+
gpu = utilization.gpu
757+
assert isinstance(gpu, int)
758+
assert 0 <= gpu <= 100
759+
760+
memory = utilization.memory
761+
assert isinstance(memory, int)
762+
assert 0 <= memory <= 100

0 commit comments

Comments
 (0)