Skip to content

Commit 304e64e

Browse files
committed
Fix some threading issues (some free-threading related)
This fixes a few threading issues, but we may want to discuss some details still. * The GraphNode cleanup order is an important fix. Another thread may end up with the same pointer (but new object) as soon as we clean it up. So we have to remove it from the cache before cleaning it up. * Use of atomics: I think this is needed, but for this one place an atomic seemed more reasonable. (However, hard to test and if it can fail IIUC only on ARM.) * The critical sections should be pretty safe. I am not sure they will all ensure that the object is always the _identity_ but I am pretty sure it protects from worse races. (Testing did find this for MemPool.attributes, not others yet. Testing with thread-sanitizer might flush out some...) * The split mutex: This is thread-unsafe. But I am honestly not sure if that isn't just expected, or whether the mutex is good but it should also be safe from within CUDA. * Use of `setdefault` cached pattern is largely just normalizing. Without the `return dict.setdefault` a different instance may be returned on different threads (or a cache entry replaced). For the `cyGraphMemoryResource` that triggered a test with pytest-run-parallel although that doesn't mean it is problematic as such. `cuda-pathfinder` uses functools.cache, but usually for strings; the one we may want to look at is `load_nvidia_dynamic_lib`.
1 parent 5befc7d commit 304e64e

10 files changed

Lines changed: 59 additions & 32 deletions

File tree

cuda_core/cuda/core/_device.pyx

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,12 @@ cdef class DeviceProperties:
7979

8080
cdef inline int _get_cached_attribute(self, attr, default=0) except? -2:
8181
"""Retrieve the attribute value, using cache if applicable."""
82-
if attr not in self._cache:
83-
self._cache[attr] = self._get_attribute(attr, default)
84-
return self._cache[attr]
82+
cached = self._cache.get(attr)
83+
if cached is not None:
84+
return cached
85+
cdef int value = self._get_attribute(attr, default)
86+
self._cache[attr] = value # setdefault not needed for ints
87+
return value
8588

8689
@property
8790
def max_threads_per_block(self) -> int:
@@ -1125,11 +1128,11 @@ class Device:
11251128
def compute_capability(self) -> ComputeCapability:
11261129
"""Return a named tuple with 2 fields: major and minor."""
11271130
cdef DeviceProperties prop = self.properties
1128-
if "compute_capability" in prop._cache:
1129-
return prop._cache["compute_capability"]
1131+
cached = prop._cache.get("compute_capability")
1132+
if cached is not None:
1133+
return cached
11301134
cc = ComputeCapability(prop.compute_capability_major, prop.compute_capability_minor)
1131-
prop._cache["compute_capability"] = cc
1132-
return cc
1135+
return prop._cache.setdefault("compute_capability", cc)
11331136

11341137
@property
11351138
def arch(self) -> str:

cuda_core/cuda/core/_device_resources.pxd

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
cimport cython
6+
57
from cuda.bindings cimport cydriver
68
from cuda.core._resource_handles cimport ContextHandle, GreenCtxHandle
79

@@ -15,6 +17,7 @@ cdef class SMResource:
1517
unsigned int _flags
1618
bint _is_usable
1719
object __weakref__
20+
cython.pymutex _split_mutex
1821

1922
@staticmethod
2023
cdef SMResource _from_dev_resource(cydriver.CUdevResource res, int device_id)

cuda_core/cuda/core/_device_resources.pyx

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -493,10 +493,11 @@ cdef class SMResource:
493493
)
494494
_resolve_group_count(opts)
495495
_check_green_ctx_support()
496-
if _can_use_structured_sm_split():
497-
return _split_with_general_api(self, opts, dry_run)
498-
# SplitByCount requires the same 12.4+ as green ctx support (already checked above)
499-
return _split_with_count_api(self, opts, dry_run)
496+
with self._split_mutex:
497+
if _can_use_structured_sm_split():
498+
return _split_with_general_api(self, opts, dry_run)
499+
# SplitByCount requires the same 12.4+ as green ctx support (already checked above)
500+
return _split_with_count_api(self, opts, dry_run)
500501

501502

502503
cdef class WorkqueueResource:

cuda_core/cuda/core/_memory/_buffer.pxd

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
from libc.stdint cimport uintptr_t
6+
from libcpp.atomic cimport atomic as std_atomic, memory_order_acquire, memory_order_release
67

78
from cuda.bindings cimport cydriver
89
from cuda.core._resource_handles cimport DevicePtrHandle
@@ -18,13 +19,13 @@ cdef struct _MemAttrs:
1819

1920
cdef class Buffer:
2021
cdef:
21-
DevicePtrHandle _h_ptr
22-
MemoryResource _memory_resource
23-
object _ipc_data
24-
object _owner
25-
_MemAttrs _mem_attrs
26-
bint _mem_attrs_inited
27-
object __weakref__
22+
DevicePtrHandle _h_ptr
23+
MemoryResource _memory_resource
24+
object _ipc_data
25+
object _owner
26+
_MemAttrs _mem_attrs
27+
std_atomic[bool] _mem_attrs_inited
28+
object __weakref__
2829
cdef public:
2930
# Python code in _memory/_virtual_memory_resource.py needs to update
3031
# this value, though it is technically private.

cuda_core/cuda/core/_memory/_buffer.pyx

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ cdef class Buffer:
9393
self._memory_resource = None
9494
self._ipc_data = None
9595
self._owner = None
96-
self._mem_attrs_inited = False
96+
self._mem_attrs_inited.store(False)
9797

9898
def __init__(self, *args, **kwargs):
9999
raise RuntimeError("Buffer objects cannot be instantiated directly. "
@@ -123,7 +123,7 @@ cdef class Buffer:
123123
self._memory_resource = mr
124124
self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
125125
self._owner = owner
126-
self._mem_attrs_inited = False
126+
self._mem_attrs_inited.store(False)
127127
return self
128128

129129
@staticmethod
@@ -188,6 +188,7 @@ cdef class Buffer:
188188
return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream)
189189

190190
@property
191+
@cython.critical_section
191192
def ipc_descriptor(self) -> IPCBufferDescriptor:
192193
"""Descriptor for sharing this buffer with other processes."""
193194
if self._ipc_data is None:
@@ -444,9 +445,9 @@ cdef class Buffer:
444445
# ------------------------------
445446
cdef inline void _init_mem_attrs(Buffer self):
446447
"""Initialize memory attributes by querying the pointer."""
447-
if not self._mem_attrs_inited:
448+
if not self._mem_attrs_inited.load(memory_order_acquire):
448449
_query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr))
449-
self._mem_attrs_inited = True
450+
self._mem_attrs_inited.store(True, memory_order_release)
450451

451452

452453
cdef inline int _query_memory_attrs(
@@ -588,7 +589,7 @@ cdef Buffer Buffer_from_deviceptr_handle(
588589
buf._memory_resource = mr
589590
buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None
590591
buf._owner = None
591-
buf._mem_attrs_inited = False
592+
buf._mem_attrs_inited.store(False)
592593
return buf
593594

594595

cuda_core/cuda/core/_memory/_graph_memory_resource.pyx

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ from cuda.core._resource_handles cimport (
1818
from cuda.core._stream cimport Stream_accept, Stream
1919
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
2020

21-
from functools import cache
2221

2322
__all__ = ['GraphMemoryResource']
2423

@@ -149,6 +148,8 @@ cdef class cyGraphMemoryResource(MemoryResource):
149148
return False
150149

151150

151+
cdef dict _mem_resource_cache = {}
152+
152153
class GraphMemoryResource(cyGraphMemoryResource):
153154
"""
154155
A memory resource for memory related to graphs.
@@ -173,9 +174,16 @@ class GraphMemoryResource(cyGraphMemoryResource):
173174
return cls._create(c_device_id)
174175

175176
@classmethod
176-
@cache
177177
def _create(cls, int device_id):
178-
return cyGraphMemoryResource.__new__(cls, device_id)
178+
# we use a dict currently, because functools.cache is currently less
179+
# thread-safe see also: https://github.com/python/cpython/issues/150708
180+
res = _mem_resource_cache.get(device_id)
181+
if res is not None:
182+
return res
183+
184+
# create new instance, but in case of a race may return another:
185+
new = cyGraphMemoryResource.__new__(cls, device_id)
186+
return _mem_resource_cache.setdefault(device_id, new)
179187

180188

181189
# Raise an exception if the given stream is capturing.

cuda_core/cuda/core/_memory/_memory_pool.pyx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from __future__ import annotations
66

7+
cimport cython
78
from libc.limits cimport ULLONG_MAX
89
from libc.stdint cimport uintptr_t
910
from libc.string cimport memset
@@ -164,6 +165,7 @@ cdef class _MemPool(MemoryResource):
164165
_MP_deallocate(self, <uintptr_t>ptr, size, s)
165166

166167
@property
168+
@cython.critical_section
167169
def attributes(self) -> _MemPoolAttributes:
168170
"""Memory pool attributes."""
169171
if self._attributes is None:

cuda_core/cuda/core/_memoryview.pyx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from __future__ import annotations
66

7+
cimport cython
78
from ._dlpack cimport *
89
from ._dlpack import classify_dl_device
910
from libc.stdint cimport intptr_t
@@ -78,7 +79,7 @@ cdef inline bint _is_torch_tensor(object obj):
7879
cdef str mod = tp.__module__ or ""
7980
cdef bint result = mod.startswith("torch") and hasattr(obj, "data_ptr") \
8081
and _torch_version_check()
81-
_torch_type_cache[tp] = result
82+
_torch_type_cache[tp] = result # setdefault not needed for bools
8283
return result
8384

8485

@@ -533,6 +534,7 @@ cdef class StridedMemoryView:
533534
+ f" readonly={self.readonly},\n"
534535
+ f" exporting_obj={get_simple_repr(self.exporting_obj)})")
535536

537+
@cython.critical_section
536538
cdef inline _StridedLayout get_layout(self):
537539
if self._layout is None:
538540
if self.dl_tensor:
@@ -543,6 +545,7 @@ cdef class StridedMemoryView:
543545
raise ValueError("Cannot infer layout from the exporting object")
544546
return self._layout
545547

548+
@cython.critical_section
546549
cdef inline object get_buffer(self):
547550
"""
548551
Returns Buffer instance with the underlying data.
@@ -556,6 +559,7 @@ cdef class StridedMemoryView:
556559
self._buffer = Buffer.from_handle(self.ptr, 0, owner=self.exporting_obj)
557560
return self._buffer
558561

562+
@cython.critical_section
559563
cdef inline object get_dtype(self):
560564
if self._dtype is None:
561565
if self.dl_tensor != NULL:

cuda_core/cuda/core/_module.pyx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from __future__ import annotations
66

7+
cimport cython
78
from libc.stddef cimport size_t
89

910
from collections import namedtuple
@@ -82,7 +83,7 @@ cdef class KernelAttributes:
8283
cdef int result
8384
with nogil:
8485
HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(self._h_kernel), device_id))
85-
self._cache[cache_key] = result
86+
self._cache[cache_key] = result # setdefault not needed for ints
8687
return result
8788

8889
def __getitem__(self, device) -> KernelAttributes:
@@ -451,6 +452,7 @@ cdef class Kernel:
451452
return ker
452453

453454
@property
455+
@cython.critical_section
454456
def attributes(self) -> KernelAttributes:
455457
"""Get the read-only attributes of this kernel."""
456458
if self._attributes is None:
@@ -498,6 +500,7 @@ cdef class Kernel:
498500
return param_info
499501

500502
@property
503+
@cython.critical_section
501504
def occupancy(self) -> KernelOccupancy:
502505
"""Get the occupancy information for launching this kernel."""
503506
if self._occupancy is None:
@@ -729,6 +732,7 @@ cdef class ObjectCode:
729732

730733
# TODO: do we want to unload in a finalizer? Probably not..
731734

735+
@cython.critical_section
732736
cdef int _lazy_load_module(self) except -1:
733737
if self._h_library:
734738
return 0

cuda_core/cuda/core/graph/_graph_node.pyx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,7 @@ _node_registry = weakref.WeakValueDictionary()
7272

7373

7474
cdef inline GraphNode _registered(GraphNode n):
75-
_node_registry[<uintptr_t>n._h_node.get()] = n
76-
return n
75+
return _node_registry.setdefault(<uintptr_t>n._h_node.get(), n)
7776

7877

7978
cdef class GraphNode:
@@ -156,10 +155,11 @@ cdef class GraphNode:
156155
cdef cydriver.CUgraphNode node = as_cu(self._h_node)
157156
if node == NULL:
158157
return
159-
with nogil:
160-
HANDLE_RETURN(cydriver.cuGraphDestroyNode(node))
158+
161159
_node_registry.pop(<uintptr_t>self._h_node.get(), None)
162160
invalidate_graph_node(self._h_node)
161+
with nogil:
162+
HANDLE_RETURN(cydriver.cuGraphDestroyNode(node))
163163

164164
@property
165165
def pred(self):

0 commit comments

Comments
 (0)