Skip to content

Commit 9cc2ae5

Browse files
mdboom and Copilot authored
cuda.core.system: add conveniences to convert between device types (#1508)
* cuda.core.system: Add conveniences to convert device types * Update cuda_core/cuda/core/_device.pyx Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Support systems with old cuda.bindings * Make uuid match between NVML and CUDA * Add documentation --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 747bf60 commit 9cc2ae5

4 files changed

Lines changed: 97 additions & 1 deletion

File tree

cuda_core/cuda/core/_device.pyx

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,6 +1034,29 @@ class Device:
10341034
total = system.get_num_devices()
10351035
return tuple(cls(device_id) for device_id in range(total))
10361036

1037+
def to_system_device(self) -> 'cuda.core.system.Device':
    """
    Return the :class:`cuda.core.system.Device` that matches this
    :class:`cuda.core.Device`.

    A :class:`cuda.core.Device` is used for CUDA access, while a
    :class:`cuda.core.system.Device` is used for NVIDIA Management Library
    (NVML) access. The two are matched by UUID.

    Returns
    -------
    cuda.core.system.Device
        The system-level device constructed from this device's UUID.

    Raises
    ------
    RuntimeError
        If the installed ``cuda_bindings`` is too old to provide the NVML
        support that ``cuda.core.system.Device`` depends on.
    """
    from cuda.core.system import _system as nvml_support

    if nvml_support.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
        # Import the system Device only once compatibility is confirmed.
        from cuda.core.system import Device as SystemDevice

        return SystemDevice(uuid=self.uuid)

    raise RuntimeError(
        "cuda.core.system.Device requires cuda_bindings 13.1.2+ or 12.9.6+"
    )
1059+
10371060
@property
10381061
def device_id(self) -> int:
10391062
"""Return device ordinal."""

cuda_core/cuda/core/system/_device.pyx

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,30 @@ cdef class Device:
722722
pci_bus_id = pci_bus_id.decode("ascii")
723723
self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)
724724

725+
def to_cuda_device(self) -> "cuda.core.Device":
    """
    Get the corresponding :class:`cuda.core.Device` (which is used for CUDA
    access) for this :class:`cuda.core.system.Device` (which is used for
    NVIDIA Management Library (NVML) access).

    The devices are mapped to one another by their UUID.

    Returns
    -------
    cuda.core.Device
        The corresponding CUDA device.

    Raises
    ------
    RuntimeError
        If none of the CUDA devices has a UUID matching this device.
    """
    from cuda.core import Device as CudaDevice

    # Reading ``self.uuid`` queries NVML, and the value does not change
    # during the search, so fetch it once instead of once per candidate.
    target_uuid = self.uuid

    # CUDA does not have an API to get a device by its UUID, so we just
    # search all the devices for one with a matching UUID.
    for cuda_device in CudaDevice.get_all_devices():
        if cuda_device.uuid == target_uuid:
            return cuda_device

    raise RuntimeError("No corresponding CUDA device found for this NVML device.")
748+
725749
@classmethod
726750
def get_device_count(cls) -> int:
727751
"""
@@ -1036,8 +1060,16 @@ cdef class Device:
10361060
Retrieves the globally unique immutable UUID associated with this
10371061
device, as a 5 part hexadecimal string, that augments the immutable,
10381062
board serial identifier.
1063+
1064+
In the upstream NVML C++ API, the UUID includes a ``gpu-`` or ``mig-``
1065+
prefix. That is not included in ``cuda.core.system``.
10391066
"""
1040-
return nvml.device_get_uuid(self._handle)
1067+
# NVML UUIDs have a `GPU-` or `MIG-` prefix. We remove that here.
1068+
1069+
# TODO: If the user cares about the prefix, we will expose that in the
1070+
# future using the MIG-related APIs in NVML.
1071+
1072+
return nvml.device_get_uuid(self._handle)[4:]
10411073

10421074
def register_events(self, events: EventType | int | list[EventType | int]) -> DeviceEvents:
10431075
"""

cuda_core/tests/system/test_system_device.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,23 @@ def test_device_count():
3333
assert system.Device.get_device_count() == system.get_num_devices()
3434

3535

36+
def test_to_cuda_device():
    """Every NVML device converts to a CUDA device with the same identity."""
    from cuda.core import Device as CudaDevice

    for nvml_device in system.Device.get_all_devices():
        converted = nvml_device.to_cuda_device()

        assert isinstance(converted, CudaDevice)
        assert converted.uuid == nvml_device.uuid

        # The PCI comparison below only makes sense for PCI devices; it is
        # an open question whether non-PCI devices need to be supported.

        # NVML reports a 4-byte PCI domain while CUDA reports a 2-byte one,
        # so drop the four extra leading characters from the NVML bus ID.
        cuda_bus_id = converted.pci_bus_id
        nvml_bus_id = nvml_device.pci_info.bus_id
        assert cuda_bus_id == nvml_bus_id[4:]
51+
52+
3653
def test_device_architecture():
3754
for device in system.Device.get_all_devices():
3855
device_arch = device.architecture

cuda_core/tests/test_device.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,30 @@ def cuda_version():
2525
return _py_major_ver, _driver_ver
2626

2727

28+
def test_to_system_device(deinit_cuda):
    """A CUDA device converts to the NVML device with the same identity."""
    from cuda.core.system import _system

    cuda_device = Device()
    nvml_supported = _system.CUDA_BINDINGS_NVML_IS_COMPATIBLE

    if not nvml_supported:
        # With old bindings the conversion must raise; after checking that,
        # the rest of the test cannot run.
        with pytest.raises(RuntimeError):
            cuda_device.to_system_device()
        pytest.skip("NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+")

    from cuda.core.system import Device as SystemDevice

    nvml_device = cuda_device.to_system_device()
    assert isinstance(nvml_device, SystemDevice)
    assert nvml_device.uuid == cuda_device.uuid

    # The PCI comparison below only makes sense for PCI devices; it is an
    # open question whether non-PCI devices need to be supported.

    # NVML reports a 4-byte PCI domain while CUDA reports a 2-byte one, so
    # drop the four extra leading characters from the NVML bus ID.
    cuda_bus_id = cuda_device.pci_bus_id
    nvml_bus_id = nvml_device.pci_info.bus_id
    assert cuda_bus_id == nvml_bus_id[4:]
50+
51+
2852
def test_device_set_current(deinit_cuda):
2953
device = Device()
3054
device.set_current()

0 commit comments

Comments
 (0)