Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions numba_cuda/numba/cuda/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,8 @@ def close():
Explicitly clears all contexts in the current thread, and destroys all
contexts if the current thread is the main thread.
"""
devices.require_resettable()

# Must clear memsys object in case it has been used already
from .memory_management import rtsys

Expand Down
18 changes: 9 additions & 9 deletions numba_cuda/numba/cuda/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,11 +182,11 @@ def __init__(
self._ptx_cache = {}
# Maps CC -> LTO-IR
self._ltoir_cache = {}
# Maps CC -> cubin
# Maps CC -> cubin bytes
self._cubin_cache = {}
# Maps CC -> linker info output for cubin
self._linkerinfo_cache = {}
# Maps Device numeric ID -> cufunc
# Maps execution-context cache key -> cufunc
self._cufunc_cache = {}

self._max_registers = max_registers
Expand Down Expand Up @@ -321,9 +321,9 @@ def _link_all(self, linker, cc, ignore_nonlto=False):
def get_cubin(self, cc=None):
cc = self._ensure_cc(cc)

cubin = self._cubin_cache.get(cc, None)
if cubin:
return cubin
cubin_code = self._cubin_cache.get(cc, None)
if cubin_code is not None:
return driver.ObjectCode.from_cubin(cubin_code, name=self._name)

if self._lto and config.DUMP_ASSEMBLY:
ptx = self.get_lto_ptx(cc=cc)
Expand All @@ -338,7 +338,7 @@ def get_cubin(self, cc=None):
self._link_all(linker, cc, ignore_nonlto=False)
cubin = linker.complete()

self._cubin_cache[cc] = cubin
self._cubin_cache[cc] = cubin.code
self._linkerinfo_cache[cc] = linker.info_log

return cubin
Expand All @@ -352,9 +352,9 @@ def get_cufunc(self):
raise RuntimeError(msg)

ctx = devices.get_context()
device = ctx.device
cache_key = ctx.cache_key

cufunc = self._cufunc_cache.get(device.id, None)
cufunc = self._cufunc_cache.get(cache_key, None)
if cufunc:
return cufunc
cubin = self.get_cubin()
Expand All @@ -366,7 +366,7 @@ def get_cufunc(self):
cufunc = module.get_function(self._entry_name)

# Populate caches
self._cufunc_cache[device.id] = cufunc
self._cufunc_cache[cache_key] = cufunc

return cufunc

Expand Down
19 changes: 18 additions & 1 deletion numba_cuda/numba/cuda/cuda_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
from contextlib import contextmanager

_env_path_tuple = namedtuple("_env_path_tuple", ["by", "info"])
_header_dir_info = namedtuple(
"_header_dir_info", ["found_via", "abs_path"]
)

SEARCH_PRIORITY = [
"Conda environment",
Expand Down Expand Up @@ -82,6 +85,20 @@ def _find_valid_path(options):
return "<unknown>", None


def _locate_nvidia_header_directory(name):
    """Look up the header directory for *name* through ``pathfinder``.

    ``pathfinder.locate_nvidia_header_directory`` is tried first and
    ``pathfinder.find_nvidia_header_directory`` second; whichever attribute
    is present (and not None) is called with *name*.

    Returns None when neither attribute is available or when the lookup
    itself returns None. A result that already exposes both ``abs_path``
    and ``found_via`` is returned unchanged; any other result is wrapped as
    ``_header_dir_info(found_via="cuda.pathfinder", abs_path=result)``.
    """
    for api_name in (
        "locate_nvidia_header_directory",
        "find_nvidia_header_directory",
    ):
        lookup = getattr(pathfinder, api_name, None)
        if lookup is not None:
            break
    else:
        # Neither entry point exists on this pathfinder module.
        return None

    result = lookup(name)
    if result is None:
        return None

    # Results that already carry both fields are passed through untouched.
    has_both_fields = all(
        hasattr(result, field) for field in ("abs_path", "found_via")
    )
    if has_both_fields:
        return result
    return _header_dir_info("cuda.pathfinder", result)
Comment on lines +88 to +99
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Did you do this because you did not want to raise the lower bound of cuda-pathfinder to the version that offers these APIs?



def _get_libdevice_path_decision():
options = _build_options(
[
Expand Down Expand Up @@ -425,7 +442,7 @@ def get_current_cuda_target_name():

def _get_include_dir():
"""Find the root include directory."""
located_header_dir = pathfinder.locate_nvidia_header_directory("cudart")
located_header_dir = _locate_nvidia_header_directory("cudart")
if located_header_dir is not None:
if not os.path.exists(
os.path.join(
Expand Down
51 changes: 42 additions & 9 deletions numba_cuda/numba/cuda/cudadrv/devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import threading
from contextlib import contextmanager

from .driver import driver
from .driver import driver, is_green_context_handle


class _DeviceList:
Expand Down Expand Up @@ -91,8 +91,8 @@ def __str__(self):
class _Runtime:
"""Emulate the CUDA runtime context management.

It owns all Devices and Contexts.
Keeps at most one Context per Device
It owns all Devices and primary Contexts, and borrows active external
contexts by handle when needed.
"""

def __init__(self):
Expand Down Expand Up @@ -130,8 +130,8 @@ def ensure_context(self):

def get_or_create_context(self, devnum):
"""Returns the primary context and push+create it if needed
for *devnum*. If *devnum* is None, use the active CUDA context (must
be primary) or create a new one with ``devnum=0``.
for *devnum*. If *devnum* is None, use the active CUDA context or
create a new one with ``devnum=0``.
"""
if devnum is None:
attached_ctx = self._get_attached_context()
Expand All @@ -155,10 +155,15 @@ def _get_or_create_context_uncached(self, devnum):
if not ac:
return self._activate_context_for(0)
else:
# Get primary context for the active device
ctx = self.gpus[ac.devnum].get_primary_context()
# Is active context the primary context?
if ctx.handle != ac.context_handle:
gpu = self.gpus[ac.devnum]
primary_ctx = gpu.get_primary_context()
if primary_ctx.handle == ac.context_handle:
ctx = primary_ctx
elif is_green_context_handle(ac.context_handle):
ctx = gpu.get_or_create_borrowed_context(
ac.context_handle
)
else:
raise RuntimeError(
"Numba cannot operate on non-primary"
f" CUDA context {int(ac.context_handle):x}"
Expand Down Expand Up @@ -190,6 +195,7 @@ def reset(self):
"""Clear all contexts in the thread. Destroy the context if and only
if we are in the main thread.
"""
self._ensure_resettable()
# Pop all active context.
while driver.pop_active_context() is not None:
pass
Expand All @@ -203,6 +209,28 @@ def _destroy_all_contexts(self):
for gpu in self.gpus:
gpu.reset()

def _ensure_resettable(self):
    """Raise ``RuntimeError`` if the CUDA subsystem must not be reset now.

    Two conditions block a reset:

    * ``self._has_borrowed_contexts()`` reports True;
    * a context is active on this thread and its handle differs from the
      handle of the primary context of the active device.

    Returns None when neither condition holds, including when no context
    is active.
    """
    borrowed_live = self._has_borrowed_contexts()
    if borrowed_live:
        raise RuntimeError(
            "Cannot reset CUDA subsystem while borrowed CUDA contexts "
            "are still live"
        )

    with driver.get_active_context() as active:
        if active:
            device = self.gpus[active.devnum]
            primary_handle = device.get_primary_context().handle
            if primary_handle != active.context_handle:
                raise RuntimeError(
                    "Cannot reset CUDA subsystem while a non-primary CUDA "
                    "context is active"
                )

def _has_borrowed_contexts(self):
    """Return True if any entry of ``self.gpus`` reports borrowed contexts.

    Each device is asked via ``has_borrowed_contexts()`` in order, and the
    scan stops at the first one that answers truthily.
    """
    for device in self.gpus:
        if device.has_borrowed_contexts():
            return True
    return False


_runtime = _Runtime()

Expand Down Expand Up @@ -245,3 +273,8 @@ def reset():

"""
_runtime.reset()


def require_resettable():
    """Raise if a destructive reset would touch externally-managed contexts.

    Delegates to ``_runtime._ensure_resettable()``; any exception raised
    there propagates unchanged. Returns None otherwise.
    """
    check = _runtime._ensure_resettable
    check()
50 changes: 44 additions & 6 deletions numba_cuda/numba/cuda/cudadrv/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,23 @@ def __bool__(self):
driver = Driver()


def is_green_context_handle(handle):
    """Report whether *handle* is accepted by ``cuGreenCtxGetId``.

    Returns False when ``binding`` lacks either ``CUgreenCtx`` or
    ``cuGreenCtxGetId``, or when the driver call fails with
    ``CUDA_ERROR_INVALID_CONTEXT``. Returns True when the call succeeds.
    Any other ``CudaAPIError`` is re-raised.
    """
    required_symbols = ("CUgreenCtx", "cuGreenCtxGetId")
    if not all(hasattr(binding, symbol) for symbol in required_symbols):
        return False

    # NOTE(review): the integer value of the incoming handle is reinterpreted
    # as a CUgreenCtx here -- confirm the driver accepts that for the handles
    # callers pass in.
    candidate = binding.CUgreenCtx(int(handle))
    try:
        driver.cuGreenCtxGetId(candidate)
    except CudaAPIError as err:
        if err.code != binding.CUresult.CUDA_ERROR_INVALID_CONTEXT:
            raise
        return False
    return True


class Device:
"""
The device object owns the CUDA contexts. This is owned by the driver
Expand Down Expand Up @@ -491,6 +508,7 @@ def __init__(self, devnum: int) -> None:
self.name = self._dev.name
self.uuid = f"GPU-{self._dev.uuid}"
self.primary_context = None
self._borrowed_contexts = weakref.WeakValueDictionary()

def get_device_identity(self):
return {
Expand Down Expand Up @@ -534,17 +552,30 @@ def get_primary_context(self):
f"{self} has compute capability < {MIN_REQUIRED_CC}"
)

self._dev.set_current()
if CUDA_CORE_GT_0_6:
ctx_handle = self._dev.context.handle
else:
ctx_handle = self._dev.context._handle
ctx_handle = driver.cuDevicePrimaryCtxRetain(self.id)
self.primary_context = ctx = Context(
weakref.proxy(self),
ctx_handle,
)
return ctx

def get_or_create_borrowed_context(self, handle):
    """Return the ``Context`` object that represents *handle* on this device.

    Lookup order:

    1. ``self.primary_context``, if set and its handle has the same integer
       value as *handle*;
    2. an existing entry in ``self._borrowed_contexts`` keyed by
       ``int(handle)``;
    3. otherwise a new ``Context`` created with ``borrowed=True``, which is
       stored in ``self._borrowed_contexts`` before being returned.
    """
    key = int(handle)

    primary = self.primary_context
    if primary is not None and int(primary.handle) == key:
        return primary

    cached = self._borrowed_contexts.get(key)
    if cached is not None:
        return cached

    borrowed = Context(weakref.proxy(self), handle, borrowed=True)
    self._borrowed_contexts[key] = borrowed
    return borrowed

def has_borrowed_contexts(self):
    """Return True when ``self._borrowed_contexts`` holds at least one entry."""
    return len(self._borrowed_contexts) > 0

def release_primary_context(self):
"""
Release reference to primary context if it has been retained.
Expand Down Expand Up @@ -1054,9 +1085,11 @@ class Context:
Contexts should not be constructed directly by user code.
"""

def __init__(self, device, handle):
def __init__(self, device, handle, borrowed=False):
self.device = device
self.handle = handle
self.borrowed = borrowed
self._generation = 0
self.allocations = utils.UniqueDict()
self.deallocations = _PendingDeallocs()
_ensure_memory_manager()
Expand All @@ -1069,6 +1102,7 @@ def reset(self):
"""
Clean up all owned resources in this context.
"""
self._generation += 1
# Free owned resources
_logger.info("reset context of device %s", self.device.id)
self.memory_manager.reset()
Expand Down Expand Up @@ -1144,6 +1178,10 @@ def prepare_for_use(self):
"""
self.memory_manager.initialize()

@property
def cache_key(self):
    """Return the ``(int(self.handle), self._generation)`` tuple for this context."""
    handle_value = int(self.handle)
    return (handle_value, self._generation)

def push(self):
"""
Pushes this context on the current CPU Thread.
Expand Down
8 changes: 5 additions & 3 deletions numba_cuda/numba/cuda/cudadrv/nvrtc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
CCSupportError,
)
from numba.cuda import config
from numba.cuda.cuda_paths import get_cuda_paths
from numba.cuda.cuda_paths import (
get_cuda_paths,
_locate_nvidia_header_directory,
)
from numba.cuda.utils import _readenv
from cuda import pathfinder
import os
import warnings
import functools
Expand Down Expand Up @@ -115,7 +117,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
elif nvrtc_ver_major == 13:
numba_include = f"{os.path.join(numba_cuda_path, 'include', '13')}"

cccl_found_header_dir = pathfinder.locate_nvidia_header_directory("cccl")
cccl_found_header_dir = _locate_nvidia_header_directory("cccl")
if cccl_found_header_dir is not None:
# TODO: Not every kernel needs cccl, so it shouldn't
# be added to the include path for every kernel.
Expand Down
Loading