diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 23fcf81e92..f3013342e6 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -32,10 +32,12 @@ include "_fan.pxi" include "_field_values.pxi" include "_inforom.pxi" include "_memory.pxi" +include "_nvlink.pxi" include "_pci_info.pxi" include "_performance.pxi" include "_repair_status.pxi" include "_temperature.pxi" +include "_utilization.pxi" cdef class Device: @@ -674,6 +676,20 @@ cdef class Device: """ return MemoryInfo(nvml.device_get_memory_info_v2(self._handle)) + ########################################################################## + # NVLINK + # See external class definitions in _nvlink.pxi + + def nvlink(self, link: int) -> NvlinkInfo: + """ + Get :obj:`~NvlinkInfo` about this device. + + For devices with NVLink support. + """ + if link < 0 or link >= NvlinkInfo.max_links: + raise ValueError(f"Link index {link} is out of range [0, {NvlinkInfo.max_links})") + return NvlinkInfo(self, link) + ########################################################################## # PCI INFO # See external class definitions in _pci_info.pxi @@ -770,6 +786,31 @@ cdef class Device: device._handle = handle yield device + ####################################################################### + # UTILIZATION + + @property + def utilization(self) -> Utilization: + """ + Retrieves the current :obj:`~Utilization` rates for the device's major + subsystems. + + For Fermi™ or newer fully supported devices. + + Note: During driver initialization when ECC is enabled one can see high + GPU and Memory Utilization readings. This is caused by ECC Memory + Scrubbing mechanism that is performed during driver initialization. + + Note: On MIG-enabled GPUs, querying device utilization rates is not + currently supported. + + Returns + ------- + Utilization + An object containing the current utilization rates for the device. + """ + return Utilization(nvml.device_get_utilization_rates(self._handle)) + def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel: """ @@ -844,10 +885,12 @@ __all__ = [ "GpuP2PStatus", "GpuTopologyLevel", "InforomObject", + "NvlinkVersion", "PcieUtilCounter", "Pstates", "TemperatureSensors", "TemperatureThresholds", "ThermalController", "ThermalTarget", + "Utilization", ] diff --git a/cuda_core/cuda/core/system/_nvlink.pxi b/cuda_core/cuda/core/system/_nvlink.pxi new file mode 100644 index 0000000000..aeee3af153 --- /dev/null +++ b/cuda_core/cuda/core/system/_nvlink.pxi @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +NvlinkVersion = nvml.NvlinkVersion + + +cdef class NvlinkInfo: + """ + Nvlink information for a device. + """ + cdef Device _device + cdef int _link + + def __init__(self, device: Device, link: int): + self._device = device + self._link = link + + @property + def version(self) -> NvlinkVersion: + """ + Retrieves the :obj:`~NvlinkVersion` for the device and link. + + For all products with NvLink support. + + Returns + ------- + NvlinkVersion + The Nvlink version. + """ + return NvlinkVersion(nvml.device_get_nvlink_version(self._device._handle, self._link)) + + @property + def state(self) -> bool: + """ + Retrieves the state of the device's Nvlink for the device and link specified. + + For Pascal™ or newer fully supported devices. + + For all products with Nvlink support. + + Returns + ------- + bool + `True` if the Nvlink is active. + """ + return ( + nvml.device_get_nvlink_state(self._device._handle, self._link) == nvml.EnableState.FEATURE_ENABLED + ) + + max_links = nvml.NVLINK_MAX_LINKS diff --git a/cuda_core/cuda/core/system/_utilization.pxi b/cuda_core/cuda/core/system/_utilization.pxi new file mode 100644 index 0000000000..689b7dc67f --- /dev/null +++ b/cuda_core/cuda/core/system/_utilization.pxi @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class Utilization: + """ + Utilization rates for a device. + + For devices with compute capability 2.0 or higher. + """ + cdef object _utilization + + def __init__(self, utilization: nvml.Utilization): + self._utilization = utilization + + @property + def gpu(self) -> int: + """ + Percent of time over the past sample period during which one or more kernels was executing on the GPU. + """ + return self._utilization.gpu + + @property + def memory(self) -> int: + """ + Percent of time over the past sample period during which global (device) memory was being read or written. + """ + return self._utilization.memory diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 8bd3638da0..88780732d5 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -220,6 +220,7 @@ Enums system.FanControlPolicy system.FieldId system.InforomObject + system.NvlinkVersion system.PcieUtilCounter system.Pstates system.TemperatureSensors diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index de3e6bf77f..8b1f5065c6 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -1,4 +1,4 @@ -.. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 :orphan: @@ -76,6 +76,7 @@ NVML system._device.GpuTopologyLevel system._device.InforomInfo system._device.MemoryInfo + system._device.NvlinkInfo system._device.PciInfo system._device.RepairStatus system._device.Temperature diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 85a541018d..3180425ac8 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -729,3 +729,38 @@ def test_pstates(): assert isinstance(utilization.percentage, int) assert isinstance(utilization.inc_threshold, int) assert isinstance(utilization.dec_threshold, int) + + +def test_nvlink(): + for device in system.Device.get_all_devices(): + max_links = _device.NvlinkInfo.max_links + assert isinstance(max_links, int) + assert max_links > 0 + + for link in range(max_links): + with unsupported_before(device, None): + nvlink_info = device.nvlink(link) + assert isinstance(nvlink_info, _device.NvlinkInfo) + + with unsupported_before(device, None): + version = nvlink_info.version + assert isinstance(version, system.NvlinkVersion) + + with unsupported_before(device, None): + state = nvlink_info.state + assert isinstance(state, bool) + + +def test_utilization(): + for device in system.Device.get_all_devices(): + with unsupported_before(device, None): + utilization = device.utilization + assert isinstance(utilization, system.Utilization) + + gpu = utilization.gpu + assert isinstance(gpu, int) + assert 0 <= gpu <= 100 + + memory = utilization.memory + assert isinstance(memory, int) + assert 0 <= memory <= 100