diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 23fcf81e92..27a487bc3f 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -32,6 +32,7 @@ include "_fan.pxi" include "_field_values.pxi" include "_inforom.pxi" include "_memory.pxi" +include "_mig.pxi" include "_pci_info.pxi" include "_performance.pxi" include "_repair_status.pxi" @@ -132,12 +133,23 @@ cdef class Device: board serial identifier. In the upstream NVML C++ API, the UUID includes a ``gpu-`` or ``mig-`` - prefix. That is not included in ``cuda.core.system``. + prefix. If you need a `uuid` without that prefix (for example, to + interact with CUDA), use the `uuid_without_prefix` property. """ - # NVML UUIDs have a `GPU-` or `MIG-` prefix. We remove that here. + return nvml.device_get_uuid(self._handle) - # TODO: If the user cares about the prefix, we will expose that in the - # future using the MIG-related APIs in NVML. + @property + def uuid_without_prefix(self) -> str: + """ + Retrieves the globally unique immutable UUID associated with this + device, as a 5 part hexadecimal string, that augments the immutable, + board serial identifier. + + In the upstream NVML C++ API, the UUID includes a ``gpu-`` or ``mig-`` + prefix. This property returns it without the prefix, to match the UUIDs + used in CUDA. If you need the prefix, use the `uuid` property. + """ + # NVML UUIDs have a `gpu-` or `mig-` prefix. We remove that here. return nvml.device_get_uuid(self._handle)[4:] @property @@ -265,7 +277,7 @@ cdef class Device: # search all the devices for one with a matching UUID. for cuda_device in CudaDevice.get_all_devices(): - if cuda_device.uuid == self.uuid: + if cuda_device.uuid == self.uuid_without_prefix: return cuda_device raise RuntimeError("No corresponding CUDA device found for this NVML device.") @@ -280,6 +292,8 @@ cdef class Device: int The number of available devices. 
""" + initialize() + return nvml.device_get_count_v2() @classmethod @@ -292,6 +306,8 @@ cdef class Device: Iterator over :obj:`~Device` An iterator over available devices. """ + initialize() + for device_id in range(nvml.device_get_count_v2()): yield cls(index=device_id) @@ -317,6 +333,18 @@ cdef class Device: """ return AddressingMode(nvml.device_get_addressing_mode(self._handle).value) + ######################################################################### + # MIG (MULTI-INSTANCE GPU) DEVICES + + @property + def mig(self) -> MigInfo: + """ + Get :obj:`~MigInfo` accessor for MIG (Multi-Instance GPU) information. + + For Ampere™ or newer fully supported devices. + """ + return MigInfo(self) + ######################################################################### # AFFINITY diff --git a/cuda_core/cuda/core/system/_mig.pxi b/cuda_core/cuda/core/system/_mig.pxi new file mode 100644 index 0000000000..8fa6b9d780 --- /dev/null +++ b/cuda_core/cuda/core/system/_mig.pxi @@ -0,0 +1,166 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +from typing import Iterable + + +cdef class MigInfo: + cdef Device _device + + def __init__(self, device: Device): + self._device = device + + @property + def is_mig_device(self) -> bool: + """ + Whether this device is a MIG (Multi-Instance GPU) device. + + A MIG device handle is an NVML abstraction which maps to a MIG compute + instance. These overloaded references can be used (with some + restrictions) interchangeably with a GPU device handle to execute + queries at a per-compute instance granularity. + + For Ampere™ or newer fully supported devices. + """ + return bool(nvml.device_is_mig_device_handle(self._device._handle)) + + @property + def mode(self) -> bool: + """ + Get current MIG mode for the device. + + For Ampere™ or newer fully supported devices. + + Changing MIG modes may require device unbind or reset. 
The "pending" MIG + mode refers to the target mode following the next activation trigger. + + Returns + ------- + bool + `True` if current MIG mode is enabled. + """ + current, _ = nvml.device_get_mig_mode(self._device._handle) + return current == nvml.EnableState.FEATURE_ENABLED + + @mode.setter + def mode(self, mode: bool): + """ + Set the MIG mode for the device. + + For Ampere™ or newer fully supported devices. + + Changing MIG modes may require device unbind or reset. The "pending" MIG + mode refers to the target mode following the next activation trigger. + + Parameters + ---------- + mode: bool + `True` to enable MIG mode, `False` to disable MIG mode. + """ + nvml.device_set_mig_mode( + self._device._handle, + nvml.EnableState.FEATURE_ENABLED if mode else nvml.EnableState.FEATURE_DISABLED + ) + + @property + def pending_mode(self) -> bool: + """ + Get pending MIG mode for the device. + + For Ampere™ or newer fully supported devices. + + Changing MIG modes may require device unbind or reset. The "pending" MIG + mode refers to the target mode following the next activation trigger. + + If the device is not a MIG device, returns `False`. + + Returns + ------- + bool + `True` if pending MIG mode is enabled. + """ + _, pending = nvml.device_get_mig_mode(self._device._handle) + return pending == nvml.EnableState.FEATURE_ENABLED + + @property + def device_count(self) -> int: + """ + Get the maximum number of MIG devices that can exist under this device. + + Returns zero if MIG is not supported or not enabled. + + For Ampere™ or newer fully supported devices. + + Returns + ------- + int + The maximum number of MIG devices (compute instances) that can exist on this GPU. + """ + return nvml.device_get_max_mig_device_count(self._device._handle) + + @property + def parent(self) -> Device: + """ + For MIG devices, get the parent GPU device. + + For Ampere™ or newer fully supported devices. + + Returns + ------- + Device + The parent GPU device for this MIG device.
+ """ + parent_handle = nvml.device_get_device_handle_from_mig_device_handle(self._device._handle) + parent_device = Device.__new__(Device) + parent_device._handle = parent_handle + return parent_device + + def get_device_by_index(self, index: int) -> Device: + """ + Get MIG device for the given index under its parent device. + + If the compute instance is destroyed either explicitly or by destroying, + resetting or unbinding the parent GPU instance or the GPU device itself, + the MIG device handle becomes invalid and must be requested again + using this API. Handles may be reused and their properties can change in + the process. + + For Ampere™ or newer fully supported devices. + + Parameters + ---------- + index: int + The index of the MIG device (compute instance) to retrieve. Must be + between 0 and `device_count - 1`, inclusive. + + Returns + ------- + Device + The MIG device corresponding to the given index. + """ + mig_device_handle = nvml.device_get_mig_device_handle_by_index(self._device._handle, index) + mig_device = Device.__new__(Device) + mig_device._handle = mig_device_handle + return mig_device + + def get_all_devices(self) -> Iterable[Device]: + """ + Get all MIG devices under its parent device. + + If the compute instance is destroyed either explicitly or by destroying, + resetting or unbinding the parent GPU instance or the GPU device itself, + the MIG device handle becomes invalid and must be requested again + using this API. Handles may be reused and their properties can change in + the process. + + For Ampere™ or newer fully supported devices. + + Returns + ------- + Iterable[Device] + An iterator over all MIG devices corresponding to this GPU.
+ """ + for i in range(self.device_count): + yield self.get_device_by_index(i) diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index de3e6bf77f..604ff9120f 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -1,4 +1,4 @@ -.. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 :orphan: @@ -76,6 +76,7 @@ NVML system._device.GpuTopologyLevel system._device.InforomInfo system._device.MemoryInfo + system._device.MigInfo system._device.PciInfo system._device.RepairStatus system._device.Temperature diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 85a541018d..118b09fb9d 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -57,7 +57,7 @@ def test_to_cuda_device(): cuda_device = device.to_cuda_device() assert isinstance(cuda_device, CudaDevice) - assert cuda_device.uuid == device.uuid + assert cuda_device.uuid == device.uuid_without_prefix # Technically, this test will only work with PCI devices, but are there # non-PCI devices we need to support? 
@@ -227,9 +227,9 @@ def test_device_serial(): assert len(serial) > 0 -def test_device_uuid(): +def test_device_uuid_without_prefix(): for device in system.Device.get_all_devices(): - uuid = device.uuid + uuid = device.uuid_without_prefix assert isinstance(uuid, str) # Expands to GPU-8hex-4hex-4hex-4hex-12hex, where 8hex means 8 consecutive @@ -729,3 +729,29 @@ def test_pstates(): assert isinstance(utilization.percentage, int) assert isinstance(utilization.inc_threshold, int) assert isinstance(utilization.dec_threshold, int) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="MIG not supported on WSL or Windows") +def test_mig(): + for device in system.Device.get_all_devices(): + with unsupported_before(device, None): + mig = device.mig + + assert isinstance(mig.is_mig_device, bool) + assert isinstance(mig.mode, bool) + assert isinstance(mig.pending_mode, bool) + + device_count = mig.device_count + assert isinstance(device_count, int) + assert device_count >= 0 + + for mig_device in mig.get_all_devices(): + assert isinstance(mig_device, system.Device) + + +def test_uuid(): + for device in system.Device.get_all_devices(): + uuid = device.uuid + assert isinstance(uuid, str) + assert uuid.startswith(("GPU-", "MIG-")) + assert uuid[4:] == device.uuid_without_prefix diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index c4e1e9931f..56a97f5185 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -38,7 +38,7 @@ def test_to_system_device(deinit_cuda): system_device = device.to_system_device() assert isinstance(system_device, SystemDevice) - assert system_device.uuid == device.uuid + assert system_device.uuid_without_prefix == device.uuid # Technically, this test will only work with PCI devices, but are there # non-PCI devices we need to support?