Raise exceptions when an EvalContext is active in multiple threads (#6221)

rostan-t · web-flow · commit 5ffdbce7ded6 · 2026-02-24T12:52:59.000+01:00
* Disallow EvalContext from being active in multiple threads
* Update the threading guide to reflect the fact that exceptions are raised
* Skip checking if the eval context is active if we're in the background thread of async execution
---------

Signed-off-by: Rostan Tabet <rtabet@nvidia.com>
diff --git a/dali/python/nvidia/dali/experimental/dynamic/_eval_context.py b/dali/python/nvidia/dali/experimental/dynamic/_eval_context.py
@@ -13,16 +13,17 @@
 # limitations under the License.
 
 import copy
+import sys
+import threading
 import weakref
-from threading import current_thread, local
 
 import nvidia.dali.backend_impl as _b
 
 from . import _device, _stream
 from ._async import _AsyncExecutor
 
 
-class _ThreadLocalStorage(local):
+class _ThreadLocalStorage(threading.local):
     def __init__(self):
         super().__init__()
         self.default = {}  # per-device default context
@@ -35,7 +36,6 @@ def __init__(self):
 def _default_num_threads():
     """Gets the default number of threads used in DALI dynamic mode."""
     import os
-    import sys
     from functools import wraps
 
     mod = sys.modules[__name__]
@@ -164,11 +164,14 @@ def __init__(self, *, num_threads=None, device_id=None, cuda_stream=None):
         self._instance_cache = {}
 
         # The thread pool needs to be thread-local because of eager execution
-        self._tls = local()
+        self._tls = threading.local()
 
         self._async_executor = _AsyncExecutor()
         weakref.finalize(self, self._async_executor.shutdown)
 
+        # Used to disallow the EvalContext to be active in two threads simultaneously
+        self._lock = threading.RLock()
+
     def _purge_operator_cache(self):
         """Empties the operator instance cache"""
         self._instance_cache = {}
@@ -207,18 +210,35 @@ def _is_current(self) -> bool:
         return self is _tls.default.get(current_device_id)
 
     def __enter__(self):
-        _tls.stack.append(self)
-        if self._device:
-            self._device.__enter__()
+        skip_lock = self._is_in_background_thread()
+        if not skip_lock and not self._lock.acquire(blocking=False):
+            raise RuntimeError("An EvalContext cannot be active in two threads simultaneously.")
+        try:
+            _tls.stack.append(self)
+            if self._device:
+                self._device.__enter__()
+        except Exception:
+            if not skip_lock:
+                self._lock.release()
+            raise
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        assert _tls.stack[-1] is self
-        if len(_tls.stack) < 2 or (_tls.stack[-2] is not self):
-            self.evaluate_all()
-        _tls.stack.pop()
-        if self._device:
-            self._device.__exit__(exc_type, exc_value, traceback)
+        try:
+            # During interpreter shutdown, finalizers of objects created in background threads
+            # can be called from the main thread.
+            if _tls.stack:
+                assert _tls.stack[-1] is self
+                if len(_tls.stack) < 2 or (_tls.stack[-2] is not self):
+                    self.evaluate_all()
+                _tls.stack.pop()
+            else:
+                assert sys.is_finalizing()
+            if self._device:
+                self._device.__exit__(exc_type, exc_value, traceback)
+        finally:
+            if not self._is_in_background_thread():
+                self._lock.release()
 
     def evaluate_all(self):
         """Evaluates all pending invocations."""
@@ -312,7 +332,7 @@ def _snapshot(self):
         return ctx
 
     def _is_in_background_thread(self):
-        return current_thread() is self._async_executor._thread
+        return threading.current_thread() is self._async_executor._thread
 
 
 __all__ = [
diff --git a/dali/test/python/experimental_mode/test_multithreading.py b/dali/test/python/experimental_mode/test_multithreading.py
@@ -13,52 +13,31 @@
 # limitations under the License.
 
 
-import functools
 import os
-import sys
 import threading
 from collections.abc import Callable
 from typing import TypeVar
 
 import numpy as np
 import nvidia.dali.experimental.dynamic as ndd
 from nose2.tools import cartesian_params, params
-from nose_utils import SkipTest
-
-
-def allow_nogil_failure(exc_type: type[Exception]):
-    """
-    Skip the test on free-threaded Python if a specific exception is raised.
-    This is useful until https://github.com/python/cpython/pull/133305 is backported.
-    """
-
-    def decorator(test_func):
-        if getattr(sys, "_is_gil_enabled", lambda: True)():
-            return test_func
-
-        @functools.wraps(test_func)
-        def wrapper(*args, **kwargs):
-            try:
-                return test_func(*args, **kwargs)
-            except exc_type:
-                raise SkipTest(f"{exc_type.__name__} allowed for this test with the GIL disabled")
-
-        return wrapper
-
-    return decorator
-
+from nose_utils import assert_raises
 
 T = TypeVar("T")
 
 
-def run_parallel(function: Callable[[int], T], num_threads: int | None = None) -> dict[int, T]:
+def get_num_threads(num_threads: int | None = None):
     if num_threads is None:
         try:
             num_threads = len(os.sched_getaffinity(0))
         except AttributeError:
             num_threads = os.cpu_count() or 4
 
-    num_threads = min(32, num_threads)
+    return min(32, num_threads)
+
+
+def run_parallel(function: Callable[[int], T], num_threads: int | None = None) -> dict[int, T]:
+    num_threads = get_num_threads(num_threads)
 
     barrier = threading.Barrier(num_threads)
     results = {}
@@ -89,7 +68,6 @@ def wrapper(thread_id: int):
     return results
 
 
-@allow_nogil_failure(KeyError)
 @params("cpu", "gpu")
 def test_parallel_eval_contexts(device):
     def worker(thread_id: int):
@@ -109,7 +87,6 @@ def worker(thread_id: int):
         np.testing.assert_equal(actual.cpu(), expected)
 
 
-@allow_nogil_failure(KeyError)
 @params("cpu", "gpu")
 def test_parallel_creation(device):
     def worker(thread_id: int):
@@ -135,7 +112,6 @@ def worker(thread_id: int):
             np.testing.assert_array_equal(actual.cpu(), expected)
 
 
-@allow_nogil_failure(KeyError)
 def test_parallel_different_devices():
     def worker(thread_id: int):
         device = "cpu" if thread_id % 2 == 0 else "gpu"
@@ -154,7 +130,6 @@ def worker(thread_id: int):
         np.testing.assert_equal(result.cpu(), expected)
 
 
-@allow_nogil_failure(KeyError)
 @cartesian_params(("cpu", "gpu"), ndd.EvalMode)
 def test_parallel_eval_modes(device, eval_mode):
     def worker(thread_id: int):
@@ -173,7 +148,6 @@ def worker(thread_id: int):
         np.testing.assert_array_almost_equal(actual.cpu(), expected)
 
 
-@allow_nogil_failure(KeyError)
 @params("cpu", "gpu")
 def test_parallel_mixed_eval_modes(device):
     eval_modes = tuple(ndd.EvalMode)
@@ -200,7 +174,6 @@ def worker(thread_id: int):
         np.testing.assert_array_almost_equal(data["result"].cpu(), data["expected"])
 
 
-@allow_nogil_failure(KeyError)
 @params("cpu", "gpu")
 def test_parallel_indexing(device):
     tensor = ndd.tensor([[1, 2, 3], [4, 5, 6]], device=device)
@@ -220,7 +193,6 @@ def worker(thread_id: int):
         assert result == tensor.cpu()[slice].item()
 
 
-@allow_nogil_failure(KeyError)
 @params("cpu", "gpu")
 def test_thread_local_rng_determinism(device):
     def worker(_):
@@ -242,7 +214,6 @@ def worker(_):
         np.testing.assert_array_equal(data["normal"].cpu(), reference["normal"].cpu())
 
 
-@allow_nogil_failure(KeyError)
 @params("cpu", "gpu")
 def test_chained_threads(device):
     source = ndd.tensor([1, 2, 3, 4], dtype=ndd.float32, device=device).evaluate()
@@ -266,3 +237,18 @@ def worker2(tensor: ndd.Tensor):
 
     assert result is not None
     np.testing.assert_array_almost_equal(result.cpu(), source.cpu())
+
+
+def test_error_parallel_eval_contexts():
+    def worker(_):
+        with ctx:
+            try:
+                barrier.wait(0.1)
+            except threading.BrokenBarrierError:
+                pass
+
+    barrier = threading.Barrier(get_num_threads())
+    ctx = ndd.EvalContext()
+
+    with assert_raises(RuntimeError, glob="EvalContext"):
+        run_parallel(worker)
diff --git a/docs/dali_dynamic/threading.rst b/docs/dali_dynamic/threading.rst
@@ -19,15 +19,15 @@ threads.
 :octicon:`alert-fill;1.2em;align-text-bottom text-warning` Multiple threads using the same :class:`EvalContext`:
 
 .. code-block:: python
-   :emphasize-lines: 4
+   :emphasize-lines: 7
 
    import threading
    import nvidia.dali.experimental.dynamic as ndd
 
    ctx = ndd.EvalContext(num_threads=4)
 
    def worker():
-       with ctx:  # Bad: using the same EvalContext in multiple threads simultaneously
+       with ctx:  # Raises an exception
            img = ndd.random.uniform(shape=(100, 100, 3), range=(0, 255), dtype=ndd.uint8)
            flipped = ndd.flip(img, horizontal=True)
            ...
@@ -39,10 +39,12 @@ threads.
        t.join()
 
 Here, the code should either create an instance of the evaluation context per thread, or use
-:func:`set_num_threads`. Here's a corrected version:
+:func:`set_num_threads`.
+
+:octicon:`check-circle-fill;1.2em;align-text-bottom text-success` Correct code using
+:func:`set_num_threads`:
 
 .. code-block:: python
-   :emphasize-lines: 4
 
    import threading
    import nvidia.dali.experimental.dynamic as ndd