Skip to content

Commit 088664c

Browse files
committed
Merge remote-tracking branch 'upstream/main' into cuda-core-system-cudf
2 parents 9745368 + b162f64 commit 088664c

6 files changed

Lines changed: 230 additions & 15 deletions

File tree

.gitignore

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@ cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd
2626
cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx
2727
cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd
2828
cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx
29-
cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd
30-
cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx
3129
cuda_bindings/cuda/bindings/_internal/_nvml.pyx
3230
cuda_bindings/cuda/bindings/_internal/cufile.pyx
3331
cuda_bindings/cuda/bindings/_internal/nvfatbin.pyx
@@ -42,14 +40,10 @@ cuda_bindings/cuda/bindings/cyruntime.pxd
4240
cuda_bindings/cuda/bindings/cyruntime.pyx
4341
cuda_bindings/cuda/bindings/cyruntime_functions.pxi
4442
cuda_bindings/cuda/bindings/cyruntime_types.pxi
45-
cuda_bindings/cuda/bindings/cynvrtc.pxd
46-
cuda_bindings/cuda/bindings/cynvrtc.pyx
4743
cuda_bindings/cuda/bindings/driver.pxd
4844
cuda_bindings/cuda/bindings/driver.pyx
4945
cuda_bindings/cuda/bindings/runtime.pxd
5046
cuda_bindings/cuda/bindings/runtime.pyx
51-
cuda_bindings/cuda/bindings/nvrtc.pxd
52-
cuda_bindings/cuda/bindings/nvrtc.pyx
5347
cuda_bindings/cuda/bindings/utils/_get_handle.pyx
5448

5549
# Version files from setuptools_scm

cuda_core/cuda/core/system/_device.pyx

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ include "_fan.pxi"
3232
include "_field_values.pxi"
3333
include "_inforom.pxi"
3434
include "_memory.pxi"
35+
include "_mig.pxi"
3536
include "_pci_info.pxi"
3637
include "_performance.pxi"
3738
include "_process.pxi"
@@ -133,12 +134,23 @@ cdef class Device:
133134
board serial identifier.
134135

135136
In the upstream NVML C++ API, the UUID includes a ``GPU-`` or ``MIG-``
136-
prefix. That is not included in ``cuda.core.system``.
137+
prefix. If you need a `uuid` without that prefix (for example, to
138+
interact with CUDA), use the `uuid_without_prefix` property.
137139
"""
138-
# NVML UUIDs have a `GPU-` or `MIG-` prefix. We remove that here.
140+
return nvml.device_get_uuid(self._handle)
139141

140-
# TODO: If the user cares about the prefix, we will expose that in the
141-
# future using the MIG-related APIs in NVML.
142+
@property
143+
def uuid_without_prefix(self) -> str:
144+
"""
145+
Retrieves the globally unique immutable UUID associated with this
146+
device, as a 5 part hexadecimal string, that augments the immutable,
147+
board serial identifier.
148+
149+
In the upstream NVML C++ API, the UUID includes a ``GPU-`` or ``MIG-``
150+
prefix. This property returns it without the prefix, to match the UUIDs
151+
used in CUDA. If you need the prefix, use the `uuid` property.
152+
"""
153+
# NVML UUIDs have a 4-character `GPU-` or `MIG-` prefix. We remove that here.
142154
return nvml.device_get_uuid(self._handle)[4:]
143155

144156
@property
@@ -266,7 +278,7 @@ cdef class Device:
266278
# search all the devices for one with a matching UUID.
267279

268280
for cuda_device in CudaDevice.get_all_devices():
269-
if cuda_device.uuid == self.uuid:
281+
if cuda_device.uuid == self.uuid_without_prefix:
270282
return cuda_device
271283

272284
raise RuntimeError("No corresponding CUDA device found for this NVML device.")
@@ -281,6 +293,8 @@ cdef class Device:
281293
int
282294
The number of available devices.
283295
"""
296+
initialize()
297+
284298
return nvml.device_get_count_v2()
285299

286300
@classmethod
@@ -293,6 +307,8 @@ cdef class Device:
293307
Iterator over :obj:`~Device`
294308
An iterator over available devices.
295309
"""
310+
initialize()
311+
296312
for device_id in range(nvml.device_get_count_v2()):
297313
yield cls(index=device_id)
298314

@@ -318,6 +334,18 @@ cdef class Device:
318334
"""
319335
return AddressingMode(nvml.device_get_addressing_mode(self._handle).value)
320336

337+
#########################################################################
338+
# MIG (MULTI-INSTANCE GPU) DEVICES
339+
340+
@property
341+
def mig(self) -> MigInfo:
342+
"""
343+
Get :obj:`~MigInfo` accessor for MIG (Multi-Instance GPU) information.
344+
345+
For Ampere™ or newer fully supported devices.
346+
"""
347+
return MigInfo(self)
348+
321349
#########################################################################
322350
# AFFINITY
323351

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
6+
from typing import Iterable
7+
8+
9+
cdef class MigInfo:
10+
cdef Device _device
11+
12+
def __init__(self, device: Device):
13+
self._device = device
14+
15+
@property
16+
def is_mig_device(self) -> bool:
17+
"""
18+
Whether this device is a MIG (Multi-Instance GPU) device.
19+
20+
A MIG device handle is an NVML abstraction which maps to a MIG compute
21+
instance. These overloaded references can be used (with some
22+
restrictions) interchangeably with a GPU device handle to execute
23+
queries at a per-compute instance granularity.
24+
25+
For Ampere™ or newer fully supported devices.
26+
"""
27+
return bool(nvml.device_is_mig_device_handle(self._device._handle))
28+
29+
@property
30+
def mode(self) -> bool:
31+
"""
32+
Get current MIG mode for the device.
33+
34+
For Ampere™ or newer fully supported devices.
35+
36+
Changing MIG modes may require device unbind or reset. The "pending" MIG
37+
mode refers to the target mode following the next activation trigger.
38+
39+
Returns
40+
-------
41+
bool
42+
`True` if current MIG mode is enabled.
43+
"""
44+
current, _ = nvml.device_get_mig_mode(self._device._handle)
45+
return current == nvml.EnableState.FEATURE_ENABLED
46+
47+
@mode.setter
48+
def mode(self, mode: bool):
49+
"""
50+
Set the MIG mode for the device.
51+
52+
For Ampere™ or newer fully supported devices.
53+
54+
Changing MIG modes may require device unbind or reset. The "pending" MIG
55+
mode refers to the target mode following the next activation trigger.
56+
57+
Parameters
58+
----------
59+
mode: bool
60+
`True` to enable MIG mode, `False` to disable MIG mode.
61+
"""
62+
nvml.device_set_mig_mode(
63+
self._device._handle,
64+
nvml.EnableState.FEATURE_ENABLED if mode else nvml.EnableState.FEATURE_DISABLED
65+
)
66+
67+
@property
68+
def pending_mode(self) -> bool:
69+
"""
70+
Get pending MIG mode for the device.
71+
72+
For Ampere™ or newer fully supported devices.
73+
74+
Changing MIG modes may require device unbind or reset. The "pending" MIG
75+
mode refers to the target mode following the next activation trigger.
76+
77+
If the device is not a MIG device, returns `False`.
78+
79+
Returns
80+
-------
81+
bool
82+
`True` if pending MIG mode is enabled.
83+
"""
84+
_, pending = nvml.device_get_mig_mode(self._device._handle)
85+
return pending == nvml.EnableState.FEATURE_ENABLED
86+
87+
@property
88+
def device_count(self) -> int:
89+
"""
90+
Get the maximum number of MIG devices that can exist under this device.
91+
92+
Returns zero if MIG is not supported or enabled.
93+
94+
For Ampere™ or newer fully supported devices.
95+
96+
Returns
97+
-------
98+
int
99+
The maximum number of MIG devices (compute instances) that can exist on this GPU.
100+
"""
101+
return nvml.device_get_max_mig_device_count(self._device._handle)
102+
103+
@property
104+
def parent(self) -> Device:
105+
"""
106+
For MIG devices, get the parent GPU device.
107+
108+
For Ampere™ or newer fully supported devices.
109+
110+
Returns
111+
-------
112+
Device
113+
The parent GPU device for this MIG device.
114+
"""
115+
parent_handle = nvml.device_get_device_handle_from_mig_device_handle(self._device._handle)
116+
parent_device = Device.__new__(Device)
117+
parent_device._handle = parent_handle
118+
return parent_device
119+
120+
def get_device_by_index(self, index: int) -> Device:
121+
"""
122+
Get MIG device for the given index under its parent device.
123+
124+
If the compute instance is destroyed either explicitly or by destroying,
125+
resetting or unbinding the parent GPU instance or the GPU device itself
126+
the MIG device handle would remain invalid and must be requested again
127+
using this API. Handles may be reused and their properties can change in
128+
the process.
129+
130+
For Ampere™ or newer fully supported devices.
131+
132+
Parameters
133+
----------
134+
index: int
135+
The index of the MIG device (compute instance) to retrieve. Must be
136+
between 0 and ``device_count - 1``, inclusive.
137+
138+
Returns
139+
-------
140+
Device
141+
The MIG device corresponding to the given index.
142+
"""
143+
mig_device_handle = nvml.device_get_mig_device_handle_by_index(self._device._handle, index)
144+
mig_device = Device.__new__(Device)
145+
mig_device._handle = mig_device_handle
146+
return mig_device
147+
148+
def get_all_devices(self) -> Iterable[Device]:
149+
"""
150+
Get all MIG devices under its parent device.
151+
152+
If the compute instance is destroyed either explicitly or by destroying,
153+
resetting or unbinding the parent GPU instance or the GPU device itself
154+
the MIG device handle would remain invalid and must be requested again
155+
using this API. Handles may be reused and their properties can change in
156+
the process.
157+
158+
For Ampere™ or newer fully supported devices.
159+
160+
Returns
161+
-------
162+
Iterable[Device]
163+
An iterator over all MIG devices corresponding to this GPU.
164+
"""
165+
for i in range(self.device_count):
166+
yield self.get_device_by_index(i)

cuda_core/docs/source/api_private.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ NVML
7676
system._device.GpuTopologyLevel
7777
system._device.InforomInfo
7878
system._device.MemoryInfo
79+
system._device.MigInfo
7980
system._device.PciInfo
8081
system._device.ProcessInfo
8182
system._device.RepairStatus

cuda_core/tests/system/test_system_device.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def test_to_cuda_device():
5757
cuda_device = device.to_cuda_device()
5858

5959
assert isinstance(cuda_device, CudaDevice)
60-
assert cuda_device.uuid == device.uuid
60+
assert cuda_device.uuid == device.uuid_without_prefix
6161

6262
# Technically, this test will only work with PCI devices, but are there
6363
# non-PCI devices we need to support?
@@ -227,9 +227,9 @@ def test_device_serial():
227227
assert len(serial) > 0
228228

229229

230-
def test_device_uuid():
230+
def test_device_uuid_without_prefix():
231231
for device in system.Device.get_all_devices():
232-
uuid = device.uuid
232+
uuid = device.uuid_without_prefix
233233
assert isinstance(uuid, str)
234234

235235
# Expands to GPU-8hex-4hex-4hex-4hex-12hex, where 8hex means 8 consecutive
@@ -742,3 +742,29 @@ def test_compute_running_processes():
742742
assert isinstance(proc.used_gpu_memory, int)
743743
assert isinstance(proc.gpu_instance_id, int)
744744
assert isinstance(proc.compute_instance_id, int)
745+
746+
747+
@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="MIG not supported on WSL or Windows")
748+
def test_mig():
749+
for device in system.Device.get_all_devices():
750+
with unsupported_before(device, None):
751+
mig = device.mig
752+
753+
assert isinstance(mig.is_mig_device, bool)
754+
assert isinstance(mig.mode, bool)
755+
assert isinstance(mig.pending_mode, bool)
756+
757+
device_count = mig.device_count
758+
assert isinstance(device_count, int)
759+
assert device_count >= 0
760+
761+
for mig_device in mig.get_all_devices():
762+
assert isinstance(mig_device, system.Device)
763+
764+
765+
def test_uuid():
766+
for device in system.Device.get_all_devices():
767+
uuid = device.uuid
768+
assert isinstance(uuid, str)
769+
assert uuid.startswith(("GPU-", "MIG-"))
770+
assert uuid == device.uuid

cuda_core/tests/test_device.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def test_to_system_device(deinit_cuda):
3838

3939
system_device = device.to_system_device()
4040
assert isinstance(system_device, SystemDevice)
41-
assert system_device.uuid == device.uuid
41+
assert system_device.uuid_without_prefix == device.uuid
4242

4343
# Technically, this test will only work with PCI devices, but are there
4444
# non-PCI devices we need to support?

0 commit comments

Comments
 (0)