Skip to content

Commit 43f2037

Browse files
mdboom and Copilot authored
cuda.core.system: Add support for getting and clearing field values (NVIDIA#1446)
* Add APIs for accessing field values
* Improve types in docstrings
* Update cuda_core/tests/system/test_system_device.py
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Update cuda_core/cuda/core/system/_device.pyx
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Fix typos in docstrings
* Add testing for clear_field_values
* Be backward-compatible with old cuda_bindings
* More refined working NVML check
* Simplify _device for backward compat again
* Fix tests
* Fix tests again
* Fix memory management bug revealed on py3.14t

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent acc78f7 commit 43f2037

6 files changed

Lines changed: 293 additions & 46 deletions

File tree

cuda_bindings/cuda/bindings/_nvml.pyx

Lines changed: 44 additions & 37 deletions
Large diffs are not rendered by default.

cuda_bindings/tests/nvml/test_nvlink.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@ def test_nvlink_get_link_count(all_devices):
1111
"""
1212
for device in all_devices:
1313
fields = nvml.FieldValue(1)
14-
fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
14+
fields[0].field_id = nvml.FieldId.DEV_NVLINK_LINK_COUNT
1515
value = nvml.device_get_field_values(device, fields)[0]
1616
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
1717
f"Unexpected return {value.nvml_return} for link count field query"
1818
)
1919

2020
# Use the alternative argument to device_get_field_values
21-
value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
21+
value = nvml.device_get_field_values(device, [nvml.FieldId.DEV_NVLINK_LINK_COUNT])[0]
2222
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
2323
f"Unexpected return {value.nvml_return} for link count field query"
2424
)

cuda_core/cuda/core/system/_device.pyx

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ from ._nvml_context cimport initialize
1515
include "_device_utils.pxi"
1616

1717

18+
# Module-level alias for the NVML bindings' ``FieldId`` enum; it is exported
# through ``__all__`` and used as the return type of ``FieldValue.field_id``.
FieldId = nvml.FieldId
19+
20+
1821
class DeviceArchitecture:
1922
"""
2023
Device architecture enumeration.
@@ -171,6 +174,141 @@ cdef class PciInfo:
171174
return self._pci_info.pci_device_id >> 16
172175

173176

177+
cdef class FieldValue:
    """
    A single field value sampled from a device.

    Wraps a one-element :class:`nvml.FieldValue` and exposes its members as
    plain Python values.

    Use :meth:`Device.get_field_values` to get multiple field values at once.
    """
    cdef object _field_value

    def __init__(self, field_value: nvml.FieldValue):
        # Only a single record may be wrapped; containers go in FieldValues.
        assert len(field_value) == 1
        self._field_value = field_value

    @property
    def field_id(self) -> FieldId:
        """
        The ID of the field this value belongs to.
        """
        raw_id = self._field_value.field_id
        return FieldId(raw_id)

    @property
    def scope_id(self) -> int:
        """
        The scope ID associated with this value.
        """
        raw_scope = self._field_value.scope_id
        # The stored member is a Numpy type, so convert to a builtin int.
        return int(raw_scope)

    @property
    def timestamp(self) -> int:
        """
        CPU timestamp of the sample, in microseconds since 1970.
        """
        raw_timestamp = self._field_value.timestamp
        # The stored member is a Numpy type, so convert to a builtin int.
        return int(raw_timestamp)

    @property
    def latency_usec(self) -> int:
        """
        Time (in usec) NVML took to update this field value.

        This may be averaged across several fields that are serviced by the
        same driver call.
        """
        raw_latency = self._field_value.latency_usec
        # The stored member is a Numpy type, so convert to a builtin int.
        return int(raw_latency)

    @property
    def value(self) -> int | float:
        """
        The field value, converted to a builtin ``int`` or ``float``.

        Raises
        ------
        :class:`cuda.core.system.NvmlError`
            If there was an error retrieving the field value.
        AssertionError
            If the stored value type is not one of the handled types.
        """
        # Surface any per-field NVML error before touching the payload.
        nvml.check_status(self._field_value.nvml_return)

        cdef int type_code = self._field_value.value_type
        payload = self._field_value.value

        kinds = nvml.ValueType

        # The only floating-point representation is handled on its own.
        if type_code == kinds.DOUBLE:
            return float(payload.d_val[0])

        # All remaining handled types are integral.  Enum members are looked
        # up lazily, one at a time and in this order, so a member is only
        # resolved when every earlier candidate has failed to match.
        for kind_name, member_name in (
            ("UNSIGNED_INT", "ui_val"),
            ("UNSIGNED_LONG", "ul_val"),
            ("UNSIGNED_LONG_LONG", "ull_val"),
            ("SIGNED_LONG_LONG", "ll_val"),
            ("SIGNED_INT", "si_val"),
            ("UNSIGNED_SHORT", "us_val"),
        ):
            if type_code == getattr(kinds, kind_name):
                return int(getattr(payload, member_name)[0])

        raise AssertionError("Unexpected value type")
256+
257+
258+
cdef class FieldValues:
    """
    Container of multiple field values.

    Supports ``len()`` and indexing; each indexed item is returned wrapped in
    a :class:`FieldValue`.
    """
    cdef object _field_values

    def __init__(self, field_values: nvml.FieldValue):
        self._field_values = field_values

    def __getitem__(self, idx: int) -> FieldValue:
        selected = self._field_values[idx]
        return FieldValue(selected)

    def __len__(self) -> int:
        return len(self._field_values)

    def validate(self) -> None:
        """
        Check every contained field value for an error status.

        Raises an exception for the first issue found, if any.

        Raises
        ------
        :class:`cuda.core.system.NvmlError`
            If any of the contained field values has an associated exception.
        """
        # TODO: This is a classic use case for an `ExceptionGroup`, but those
        # are only available in Python 3.11+.
        statuses = self._field_values.nvml_return
        if len(self._field_values) == 1:
            # A single-element container is wrapped so the loop below can
            # treat it like the multi-element case (presumably
            # ``nvml_return`` is a scalar here -- confirm against the
            # bindings).
            statuses = (statuses,)
        for status in statuses:
            nvml.check_status(status)

    def get_all_values(self) -> list[int | float]:
        """
        Get all field values as a list.

        Each element's ``value`` is read in turn, which validates it, and only
        that core value is placed in the result.

        Returns
        -------
        list[int | float]
            List of all field values.

        Raises
        ------
        :class:`cuda.core.system.NvmlError`
            If any of the contained field values has an associated exception.
        """
        collected = []
        for entry in self:
            collected.append(entry.value)
        return collected
310+
311+
174312
cdef class Device:
175313
"""
176314
Representation of a device.
@@ -313,11 +451,54 @@ cdef class Device:
313451
"""
314452
return nvml.device_get_uuid(self._handle)
315453

454+
def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues:
    """
    Query several field values from the device in one call.

    Errors are reported per field: the exception for a given field is
    raised when its ``value`` is accessed on the returned
    :class:`FieldValues` container, not by this method.

    To confirm that there are no exceptions in the entire container, call
    :meth:`FieldValues.validate`.

    Parameters
    ----------
    field_ids: list of int or tuple of (int, int)
        List of field IDs to query.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).

    Returns
    -------
    :class:`FieldValues`
        Container of field values corresponding to the requested field IDs.
    """
    raw_values = nvml.device_get_field_values(self._handle, field_ids)
    return FieldValues(raw_values)
479+
480+
def clear_field_values(self, field_ids: list[int | tuple[int, int]]) -> None:
    """
    Clear several field values on the device in one call.

    Parameters
    ----------
    field_ids: list of int or tuple of (int, int)
        List of field IDs to clear.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).
    """
    handle = self._handle
    nvml.device_clear_field_values(handle, field_ids)
493+
316494

317495
# Public names exported by this module, kept in alphabetical order.
__all__ = [
    "BAR1MemoryInfo",
    "Device",
    "DeviceArchitecture",
    "FieldId",
    "FieldValue",
    "FieldValues",
    "MemoryInfo",
    "PciInfo",
]

cuda_core/cuda/core/system/_system.pyx

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@ else:
2020

2121
if CUDA_BINDINGS_NVML_IS_COMPATIBLE:
2222
from cuda.bindings import _nvml as nvml
23+
# TODO: We need to be even more specific than version numbers for development.
24+
# This can be removed once we have a release including everything we need.
25+
for member in ["FieldId"]:
26+
if not hasattr(nvml, member):
27+
CUDA_BINDINGS_NVML_IS_COMPATIBLE = False
28+
break
29+
30+
if CUDA_BINDINGS_NVML_IS_COMPATIBLE:
2331
from ._nvml_context import initialize
2432
else:
2533
from cuda.core._utils.cuda_utils import driver, handle_return, runtime

cuda_core/docs/source/api.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ CUDA system information and NVIDIA Management Library (NVML)
8383

8484
system.Device
8585
system.DeviceArchitecture
86+
system.FieldId
87+
system.FieldValue
88+
system.FieldValues
8689
system.MemoryInfo
8790
system.BAR1MemoryInfo
8891
system.PciInfo

cuda_core/tests/system/test_system_device.py

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515

1616
import pytest
1717
from cuda.core import system
18-
from cuda.core.system import _device as system_device
1918

2019
if system.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
2120
from cuda.bindings import _nvml as nvml
21+
from cuda.core.system import _device
2222

2323

2424
@pytest.fixture(autouse=True, scope="module")
@@ -36,7 +36,7 @@ def test_device_architecture():
3636
for device in system.Device.get_all_devices():
3737
device_arch = device.architecture
3838

39-
assert isinstance(device_arch, system_device.DeviceArchitecture)
39+
assert isinstance(device_arch, system.DeviceArchitecture)
4040
if sys.version_info < (3, 12):
4141
assert device_arch.id in nvml.DeviceArch.__members__.values()
4242
else:
@@ -52,7 +52,7 @@ def test_device_bar1_memory():
5252
bar1_memory_info.used,
5353
)
5454

55-
assert isinstance(bar1_memory_info, system_device.BAR1MemoryInfo)
55+
assert isinstance(bar1_memory_info, system.BAR1MemoryInfo)
5656
assert isinstance(free, int)
5757
assert isinstance(total, int)
5858
assert isinstance(used, int)
@@ -93,7 +93,7 @@ def test_device_memory():
9393
memory_info = device.memory_info
9494
free, total, used, reserved = memory_info.free, memory_info.total, memory_info.used, memory_info.reserved
9595

96-
assert isinstance(memory_info, system_device.MemoryInfo)
96+
assert isinstance(memory_info, system.MemoryInfo)
9797
assert isinstance(free, int)
9898
assert isinstance(total, int)
9999
assert isinstance(used, int)
@@ -116,7 +116,7 @@ def test_device_name():
116116
def test_device_pci_info():
117117
for device in system.Device.get_all_devices():
118118
pci_info = device.pci_info
119-
assert isinstance(pci_info, system_device.PciInfo)
119+
assert isinstance(pci_info, system.PciInfo)
120120

121121
assert isinstance(pci_info.bus_id, str)
122122
assert re.match("[a-f0-9]{8}:[a-f0-9]{2}:[a-f0-9]{2}.[a-f0-9]", pci_info.bus_id.lower())
@@ -183,9 +183,57 @@ def test_device_uuid():
183183
],
184184
)
185185
def test_unpack_bitmask(params):
186-
assert system_device._unpack_bitmask(array.array("Q", params["input"])) == params["output"]
186+
assert _device._unpack_bitmask(array.array("Q", params["input"])) == params["output"]
187187

188188

189189
def test_unpack_bitmask_single_value():
190190
with pytest.raises(TypeError):
191-
system_device._unpack_bitmask(1)
191+
_device._unpack_bitmask(1)
192+
193+
194+
def test_field_values():
    """Exercise ``get_field_values`` and ``clear_field_values`` on every device."""
    for device in system.Device.get_all_devices():
        # TODO: Are there any fields that return doubles? It would be good to
        # test those.

        requested_ids = [
            system.FieldId.DEV_TOTAL_ENERGY_CONSUMPTION,
            system.FieldId.DEV_PCIE_COUNT_TX_BYTES,
        ]
        results = device.get_field_values(requested_ids)
        results.validate()

        # Only integer indexing is supported by the container.
        with pytest.raises(TypeError):
            results["invalid_index"]

        assert isinstance(results, system.FieldValues)
        assert len(results) == len(requested_ids)

        # The flattened list must agree element-wise with per-item access.
        raw_values = results.get_all_values()
        assert all(raw == item.value for raw, item in zip(raw_values, results))

        for expected_id, item in zip(requested_ids, results):
            assert item.field_id == expected_id
            # Exact type check on purpose: the value must be a builtin int,
            # not merely something int-like.
            assert type(item.value) is int
            assert item.latency_usec >= 0
            assert item.timestamp >= 0

        # A second query must not report an earlier sample time.
        first_timestamp = results[0].timestamp
        results = device.get_field_values(requested_ids)
        assert results[0].timestamp >= first_timestamp

        # Test only one element, because that's weirdly a special case
        single_id = [
            system.FieldId.DEV_PCIE_REPLAY_COUNTER,
        ]
        results = device.get_field_values(single_id)
        assert len(results) == 1
        results.validate()
        value_before_clear = results[0].value

        # Test clear_field_values
        device.clear_field_values(single_id)
        results = device.get_field_values(single_id)
        results.validate()
        assert len(results) == 1
        assert results[0].value <= value_before_clear

0 commit comments

Comments
 (0)