Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 13 additions & 16 deletions cuda_core/cuda/core/_utils/driver_cu_result_explanations.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

# To regenerate the dictionary below run:
# ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/cuda.h
# Replace the dictionary below with the output.
# Also update the CUDA Toolkit version number below.
from cuda.bindings import driver
from cuda.core._utils.enum_explanations_helpers import get_best_available_explanations

# CUDA Toolkit v13.2.0
DRIVER_CU_RESULT_EXPLANATIONS = {
# CUDA Toolkit v13.1.1
_FALLBACK_EXPLANATIONS = {
0: (
"The API call returned with no errors. In the case of query calls, this"
" also means that the operation being queried is complete (see"
Expand Down Expand Up @@ -334,15 +332,12 @@
" changes which violated constraints specific to instantiated graph update."
),
911: (
"This indicates that an error has occurred in a device outside of GPU. It can be a"
" synchronous error w.r.t. CUDA API or an asynchronous error from the external device."
" In case of asynchronous error, it means that if cuda was waiting for an external device's"
" signal before consuming shared data, the external device signaled an error indicating that"
" the data is not valid for consumption. This leaves the process in an inconsistent"
" state and any further CUDA work will return the same error. To continue using CUDA,"
" the process must be terminated and relaunched."
" In case of synchronous error, it means that one or more external devices"
" have encountered an error and cannot complete the operation."
"This indicates that an async error has occurred in a device outside of CUDA."
" If CUDA was waiting for an external device's signal before consuming shared data,"
" the external device signaled an error indicating that the data is not valid for"
" consumption. This leaves the process in an inconsistent state and any further CUDA"
" work will return the same error. To continue using CUDA, the process must be"
" terminated and relaunched."
),
912: "Indicates a kernel launch error due to cluster misconfiguration.",
913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."),
Expand All @@ -356,3 +351,5 @@
),
999: "This indicates that an unknown internal error has occurred.",
}

DRIVER_CU_RESULT_EXPLANATIONS = get_best_available_explanations(driver.CUresult, _FALLBACK_EXPLANATIONS)
121 changes: 121 additions & 0 deletions cuda_core/cuda/core/_utils/enum_explanations_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

"""Internal support for error-enum explanations.

``cuda_core`` keeps frozen 13.1.1 fallback tables for older ``cuda-bindings``
releases. Driver/runtime error enums carry usable ``__doc__`` text starting in
the 12.x backport line at ``cuda-bindings`` 12.9.6, and in the mainline 13.x
series at ``cuda-bindings`` 13.2.0. This module decides which source to use
and normalizes generated docstrings so user-facing ``CUDAError`` messages stay
presentable.

The cleanup rules here were derived while validating generated enum docstrings
in PR #1805. Keep them narrow and remove them when codegen quirks or fallback
support are no longer needed.
"""

from __future__ import annotations

import importlib.metadata
import re
from typing import Any

# Earliest cuda-bindings releases whose error enums are known to ship usable
# per-member ``__doc__`` text: 12.9.6 in the 12.x backport line, 13.2.0 in the
# 13.x mainline (see module docstring). Versions below these thresholds fall
# back to the frozen explanation tables.
_MIN_12X_BINDING_VERSION_FOR_ENUM_DOCSTRINGS = (12, 9, 6)
_MIN_13X_BINDING_VERSION_FOR_ENUM_DOCSTRINGS = (13, 2, 0)


# ``version.pyx`` cannot be reused here (circular import via ``cuda_utils``).
def _binding_version() -> tuple[int, int, int]:
"""Return the installed ``cuda-bindings`` version, or a conservative old value."""
try:
parts = importlib.metadata.version("cuda-bindings").split(".")[:3]
except importlib.metadata.PackageNotFoundError:
return (0, 0, 0) # For very old versions of cuda-python
return tuple(int(v) for v in parts)


def _binding_version_has_usable_enum_docstrings(version: tuple[int, int, int]) -> bool:
    """Whether released bindings are known to carry usable error-enum ``__doc__`` text."""
    # Mainline 13.x series: usable from 13.2.0 onward.
    if version >= _MIN_13X_BINDING_VERSION_FOR_ENUM_DOCSTRINGS:
        return True
    # 12.x backport line: usable from 12.9.6 up to (but excluding) 13.0.0.
    return _MIN_12X_BINDING_VERSION_FOR_ENUM_DOCSTRINGS <= version < (13, 0, 0)


def _fix_hyphenation_wordwrap_spacing(s: str) -> str:
"""Remove spaces around hyphens introduced by line wrapping in generated ``__doc__`` text.

This is a narrow workaround for wrapped forms such as ``non- linear`` that
would otherwise look awkward in user-facing messages.
"""
prev = None
while prev != s:
prev = s
s = re.sub(r"([a-z])- ([a-z])", r"\1-\2", s)
s = re.sub(r"([a-z]) -([a-z])", r"\1-\2", s)
return s


def clean_enum_member_docstring(doc: str | None) -> str | None:
"""Turn an enum member ``__doc__`` into plain text.

The generated enum docstrings are already close to user-facing prose, but
they may contain Sphinx inline roles, line wrapping, or a small known
codegen defect. Normalize only those differences so the text is suitable
for error messages.
"""
if doc is None:
return None
s = doc
# Known codegen bug on cudaErrorIncompatibleDriverContext. Remove once fixed
# in cuda-bindings code generation.
s = s.replace("\n:py:obj:`~.Interactions`", ' "Interactions ')
s = re.sub(
r":(?:py:)?(?:obj|func|meth|class|mod|data|const|exc):`([^`]+)`",
lambda m: re.sub(r"^~?\.", "", m.group(1)),
s,
)
s = re.sub(r"\*\*([^*]+)\*\*", r"\1", s)
s = re.sub(r"\*([^*]+)\*", r"\1", s)
s = re.sub(r"\s+", " ", s).strip()
s = _fix_hyphenation_wordwrap_spacing(s)
return s


class DocstringBackedExplanations:
    """Compatibility shim exposing enum-member ``__doc__`` text via ``dict.get``.

    Keeps the existing ``.get(int(error))`` lookup shape used by ``cuda_utils.pyx``.
    """

    __slots__ = ("_enum_type",)

    def __init__(self, enum_type: Any) -> None:
        # The error enum type (e.g. driver.CUresult) whose members carry docs.
        self._enum_type = enum_type

    def get(self, code: int, default: str | None = None) -> str | None:
        """Return the cleaned member docstring for ``code``, or ``default``."""
        try:
            member = self._enum_type(code)
        except ValueError:
            # ``code`` does not name a known member of the enum.
            return default
        if member.__doc__ is None:
            return default
        return clean_enum_member_docstring(member.__doc__)


def get_best_available_explanations(
    enum_type: Any, fallback: dict[int, str | tuple[str, ...]]
) -> DocstringBackedExplanations | dict[int, str | tuple[str, ...]]:
    """Pick one explanation source per bindings version.

    Use enum-member ``__doc__`` only for bindings versions known to expose
    usable per-member text (12.9.6+ in the 12.x backport line, 13.2.0+ in the
    13.x mainline). Otherwise keep using the frozen 13.1.1 fallback tables.
    """
    installed = _binding_version()
    if _binding_version_has_usable_enum_docstrings(installed):
        return DocstringBackedExplanations(enum_type)
    # Bindings too old for reliable enum ``__doc__`` text: frozen tables.
    return fallback
34 changes: 13 additions & 21 deletions cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

# To regenerate the dictionary below run:
# ../../../../../toolshed/reformat_cuda_enums_as_py.py /usr/local/cuda/include/driver_types.h
# Replace the dictionary below with the output.
# Also update the CUDA Toolkit version number below.
from cuda.bindings import runtime
from cuda.core._utils.enum_explanations_helpers import get_best_available_explanations

# CUDA Toolkit v13.2.0
RUNTIME_CUDA_ERROR_EXPLANATIONS = {
# CUDA Toolkit v13.1.1
_FALLBACK_EXPLANATIONS = {
0: (
"The API call returned with no errors. In the case of query calls, this"
" also means that the operation being queried is complete (see"
Expand Down Expand Up @@ -52,11 +50,6 @@
" requesting too many threads or blocks. See ::cudaDeviceProp for more"
" device limitations."
),
10: (
"This indicates that the driver is newer than the runtime version"
" and returned graph node parameter information that the runtime"
" does not understand and is unable to translate."
),
12: (
"This indicates that one or more of the pitch-related parameters passed"
" to the API call is not within the acceptable range for pitch."
Expand Down Expand Up @@ -523,15 +516,12 @@
" changes which violated constraints specific to instantiated graph update."
),
911: (
"This indicates that an error has occurred in a device outside of GPU. It can be a"
" synchronous error w.r.t. CUDA API or an asynchronous error from the external device."
" In case of asynchronous error, it means that if cuda was waiting for an external device's"
" signal before consuming shared data, the external device signaled an error indicating that"
" the data is not valid for consumption. This leaves the process in an inconsistent"
" state and any further CUDA work will return the same error. To continue using CUDA,"
" the process must be terminated and relaunched."
" In case of synchronous error, it means that one or more external devices"
" have encountered an error and cannot complete the operation."
"This indicates that an async error has occurred in a device outside of CUDA."
" If CUDA was waiting for an external device's signal before consuming shared data,"
" the external device signaled an error indicating that the data is not valid for"
" consumption. This leaves the process in an inconsistent state and any further CUDA"
" work will return the same error. To continue using CUDA, the process must be"
" terminated and relaunched."
),
912: ("This indicates that a kernel launch error has occurred due to cluster misconfiguration."),
913: ("Indiciates a function handle is not loaded when calling an API that requires a loaded function."),
Expand All @@ -549,3 +539,5 @@
" This error return is deprecated as of CUDA 4.1."
),
}

RUNTIME_CUDA_ERROR_EXPLANATIONS = get_best_available_explanations(runtime.cudaError_t, _FALLBACK_EXPLANATIONS)
86 changes: 54 additions & 32 deletions cuda_core/tests/test_cuda_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,40 +11,12 @@
from cuda.core._utils.clear_error_support import assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable


def test_driver_cu_result_explanations_health():
expl_dict = cuda_utils.DRIVER_CU_RESULT_EXPLANATIONS

# Ensure all CUresult enums are in expl_dict
known_codes = set()
for error in driver.CUresult:
code = int(error)
assert code in expl_dict
known_codes.add(code)

from cuda.core._utils.version import binding_version

if binding_version() >= (13, 0, 0):
# Ensure expl_dict has no codes not known as a CUresult enum
extra_expl = sorted(set(expl_dict.keys()) - known_codes)
assert not extra_expl


def test_runtime_cuda_error_explanations_health():
expl_dict = cuda_utils.RUNTIME_CUDA_ERROR_EXPLANATIONS

# Ensure all cudaError_t enums are in expl_dict
known_codes = set()
for error in runtime.cudaError_t:
code = int(error)
assert code in expl_dict
known_codes.add(code)

def _skip_if_bindings_pre_enum_docstrings():
    """Skip the current test unless cuda-bindings exposes usable enum ``__doc__`` text."""
    # Imported lazily to avoid import-time dependence on cuda.core internals.
    from cuda.core._utils.enum_explanations_helpers import _binding_version_has_usable_enum_docstrings
    from cuda.core._utils.version import binding_version

    if not _binding_version_has_usable_enum_docstrings(binding_version()):
        pytest.skip("cuda-bindings version does not expose usable enum __doc__ strings")


def test_check_driver_error():
Expand Down Expand Up @@ -85,6 +57,56 @@ def test_check_runtime_error():
assert num_unexpected < len(driver.CUresult) * 0.5


def test_driver_error_enum_has_non_empty_docstring():
    _skip_if_bindings_pre_enum_docstrings()

    # The docstring-backed shim relies on per-member __doc__ being present.
    docstring = driver.CUresult.CUDA_ERROR_INVALID_VALUE.__doc__
    assert docstring is not None
    assert docstring.strip()


def test_runtime_error_enum_has_non_empty_docstring():
    _skip_if_bindings_pre_enum_docstrings()

    # The docstring-backed shim relies on per-member __doc__ being present.
    docstring = runtime.cudaError_t.cudaErrorInvalidValue.__doc__
    assert docstring is not None
    assert docstring.strip()


def test_check_driver_error_attaches_explanation():
    target = driver.CUresult.CUDA_ERROR_INVALID_VALUE

    status, name = driver.cuGetErrorName(target)
    assert status == driver.CUresult.CUDA_SUCCESS
    status, desc = driver.cuGetErrorString(target)
    assert status == driver.CUresult.CUDA_SUCCESS

    explanation = cuda_utils.DRIVER_CU_RESULT_EXPLANATIONS.get(int(target))
    assert explanation is not None
    # The explanation must be richer than the short driver-provided string.
    assert explanation != desc.decode()

    with pytest.raises(cuda_utils.CUDAError) as exc_info:
        cuda_utils._check_driver_error(target)

    message = str(exc_info.value)
    assert message == f"{name.decode()}: {explanation}"
    assert message != f"{name.decode()}: {desc.decode()}"


def test_check_runtime_error_attaches_explanation():
    target = runtime.cudaError_t.cudaErrorInvalidValue

    status, name = runtime.cudaGetErrorName(target)
    assert status == runtime.cudaError_t.cudaSuccess
    status, desc = runtime.cudaGetErrorString(target)
    assert status == runtime.cudaError_t.cudaSuccess

    explanation = cuda_utils.RUNTIME_CUDA_ERROR_EXPLANATIONS.get(int(target))
    assert explanation is not None
    # The explanation must be richer than the short runtime-provided string.
    assert explanation != desc.decode()

    with pytest.raises(cuda_utils.CUDAError) as exc_info:
        cuda_utils._check_runtime_error(target)

    message = str(exc_info.value)
    assert message == f"{name.decode()}: {explanation}"
    assert message != f"{name.decode()}: {desc.decode()}"


def test_precondition():
def checker(*args, what=""):
if args[0] < 0:
Expand Down
Loading
Loading