Skip to content

Commit 2ed3f98

Browse files
authored
Fix device attribute handling for older drivers (NVIDIA#1437)
* Fix device attribute handling for older drivers

  Add explicit default value handling for device attributes that may not be supported by older CUDA drivers. When cuDeviceGetAttribute returns CUDA_ERROR_INVALID_VALUE, return a sensible default instead of raising an error.

  - Add default parameter to _get_attribute() and _get_cached_attribute()
  - Use default=0 for boolean/enablement attributes (returns False)
  - Use default=1 for mem_sync_domain_count (single domain is traditional behavior)
  - Use default=-1 for host_numa_id (indicates NUMA not supported)
  - Document that gpu_pci_device_id/gpu_pci_subsystem_id return 0 if unsupported

  Closes NVIDIA#1420

* Add exception specifiers and refine attribute defaults

  - Add `except? -2` to _get_attribute and _get_cached_attribute for proper exception propagation (-2 never clashes with valid return values)
  - Keep default parameter untyped to allow None, cast to int when used
  - Simplify gpu_pci_device_id/gpu_pci_subsystem_id to return 0 when unsupported (0 is never a valid PCI ID)
1 parent 43af7de commit 2ed3f98

1 file changed

Lines changed: 19 additions & 9 deletions

File tree

cuda_core/cuda/core/_device.pyx

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,21 +55,21 @@ cdef class DeviceProperties:
5555
self._cache = {}
5656
return self
5757

58-
cdef inline _get_attribute(self, cydriver.CUdevice_attribute attr):
58+
cdef inline int _get_attribute(self, cydriver.CUdevice_attribute attr, default=0) except? -2:
5959
"""Retrieve the attribute value directly from the driver."""
6060
cdef int val
6161
cdef cydriver.CUresult err
6262
with nogil:
6363
err = cydriver.cuDeviceGetAttribute(&val, attr, self._handle)
64-
if err == cydriver.CUresult.CUDA_ERROR_INVALID_VALUE:
65-
return 0
64+
if err == cydriver.CUresult.CUDA_ERROR_INVALID_VALUE and default is not None:
65+
return <int>default
6666
HANDLE_RETURN(err)
6767
return val
6868

69-
cdef _get_cached_attribute(self, attr):
69+
cdef inline int _get_cached_attribute(self, attr, default=0) except? -2:
7070
"""Retrieve the attribute value, using cache if applicable."""
7171
if attr not in self._cache:
72-
self._cache[attr] = self._get_attribute(attr)
72+
self._cache[attr] = self._get_attribute(attr, default)
7373
return self._cache[attr]
7474

7575
@property
@@ -787,6 +787,8 @@ cdef class DeviceProperties:
787787
"""bool: Device supports buffer sharing with dma_buf mechanism."""
788788
return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED))
789789

790+
# Start of CUDA 12 device attributes
791+
790792
@property
791793
def ipc_event_supported(self) -> bool:
792794
"""bool: Device supports IPC Events."""
@@ -795,7 +797,7 @@ cdef class DeviceProperties:
795797
@property
796798
def mem_sync_domain_count(self) -> int:
797799
"""int: Number of memory domains the device supports."""
798-
return self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT)
800+
return self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT, default=1)
799801

800802
@property
801803
def tensor_map_access_supported(self) -> bool:
@@ -824,7 +826,7 @@ cdef class DeviceProperties:
824826
@property
825827
def host_numa_id(self) -> int:
826828
"""int: NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA."""
827-
return self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID)
829+
return self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, default=-1)
828830

829831
@property
830832
def d3d12_cig_supported(self) -> bool:
@@ -848,12 +850,18 @@ cdef class DeviceProperties:
848850

849851
@property
850852
def gpu_pci_device_id(self) -> int:
851-
"""int: The combined 16-bit PCI device ID and 16-bit PCI vendor ID."""
853+
"""int: The combined 16-bit PCI device ID and 16-bit PCI vendor ID.
854+
855+
Returns 0 if the driver does not support this query.
856+
"""
852857
return self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID)
853858

854859
@property
855860
def gpu_pci_subsystem_id(self) -> int:
856-
"""int: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID."""
861+
"""int: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID.
862+
863+
Returns 0 if the driver does not support this query.
864+
"""
857865
return self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID)
858866

859867
@property
@@ -872,6 +880,8 @@ cdef class DeviceProperties:
872880
self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_NUMA_MEMORY_POOLS_SUPPORTED)
873881
)
874882

883+
# Start of CUDA 13 device attributes
884+
875885
@property
876886
def host_numa_multinode_ipc_supported(self) -> bool:
877887
"""bool: Device supports HOST_NUMA location IPC between nodes in a multi-node system."""

0 commit comments

Comments
 (0)