From 2d49363d8dd859807571469f9cd499558bac1e64 Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Mon, 22 Dec 2025 16:19:07 +0100
Subject: [PATCH 01/19] Move to nose2 only

Remove the WARs used to keep nose alive.

nose2 supports the yield-style test discovery by default.
@attr has a different filtering syntax (`-A`) and just checks for
the presence of a truthy test_foo.attribute_name. A decorator uses this
mechanism for backward compatibility.

nose2 splits with_setup(setup, teardown) into two separate decorators;
a backward-compatible decorator is added.

nottest sets the special `__test__ = False` attribute on the function.

SkipTest from unittest is recommended to be used directly (with the same
functionality).

Test scripts are adjusted with minimal changes to run through nose2.
Followup cleanup can be used for renaming.

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 dali/test/python/nose_utils.py                | 250 ++++++++----------
 qa/TL0_FW_iterators/test_paddle.sh            |   2 +-
 qa/TL0_FW_iterators/test_pytorch.sh           |   2 +-
 qa/TL0_cpu_only/test_nofw.sh                  |   4 +-
 qa/TL0_cpu_only/test_pytorch.sh               |   2 +-
 qa/TL0_cpu_only/test_tf.sh                    |   4 +-
 qa/TL0_multigpu/test_body.sh                  |   4 +-
 qa/TL0_plugin_manager/test.sh                 |   2 +-
 qa/TL0_python-self-test-core/test_body.sh     |   8 +-
 .../test_body.sh                              |   2 +-
 qa/TL0_python-self-test_tegra/test_body.sh    |  11 +-
 .../test_cupy.sh                              |  12 +-
 .../test_pytorch.sh                           |  26 +-
 qa/TL0_self_test_Ampere/test.sh               |   2 +-
 qa/TL0_tensorflow_plugin/test.sh              |  20 +-
 qa/TL0_tensorflow_plugin_conda/test.sh        |   8 +-
 qa/TL0_video_plugin/test.sh                   |   2 +-
 qa/TL0_videoreader_test/test.sh               |   4 +-
 qa/TL1_naive_histogram/test.sh                |   2 +-
 qa/TL1_python-self-test-slow/test.sh          |   2 +-
 qa/TL1_python-self-test_conda/test_body.sh    |   2 +-
 qa/TL1_tensorflow_dataset/test_impl.sh        |  14 +-
 qa/TL1_tensorflow_plugin/test.sh              |   6 +-
 qa/nose_wrapper/__main__.py                   |  19 --
 qa/test_template_impl.sh                      |  11 +-
 25 files changed, 186 insertions(+), 235 deletions(-)
 delete mode 100644 qa/nose_wrapper/__main__.py

diff --git a/dali/test/python/nose_utils.py b/dali/test/python/nose_utils.py
index 059b82becd8..be0cd88af9e 100644
--- a/dali/test/python/nose_utils.py
+++ b/dali/test/python/nose_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,132 +11,57 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-import collections
-
-if sys.version_info >= (3, 12):
-    # to make sure we can import anything from nose
-    from importlib import machinery, util
-    from importlib._bootstrap import _exec, _load
-    import modulefinder
-    import types
-    import unittest
-
-    # the below are based on https://github.com/python/cpython/blob/3.11/Lib/imp.py
-    # based on PSF license
-    def find_module(name, path):
-        return modulefinder.ModuleFinder(path).find_module(name, path)
-
-    def load_module(name, file, filename, details):
-        PY_SOURCE = 1
-        PY_COMPILED = 2
-
-        class _HackedGetData:
-            """Compatibility support for 'file' arguments of various load_*()
-            functions."""
-
-            def __init__(self, fullname, path, file=None):
-                super().__init__(fullname, path)
-                self.file = file
-
-            def get_data(self, path):
-                """Gross hack to contort loader to deal w/ load_*()'s bad API."""
-                if self.file and path == self.path:
-                    # The contract of get_data() requires us to return bytes. Reopen the
-                    # file in binary mode if needed.
-                    file = None
-                    if not self.file.closed:
-                        file = self.file
-                        if "b" not in file.mode:
-                            file.close()
-                    if self.file.closed:
-                        self.file = file = open(self.path, "rb")
-
-                    with file:
-                        return file.read()
-                else:
-                    return super().get_data(path)
-
-        class _LoadSourceCompatibility(_HackedGetData, machinery.SourceFileLoader):
-            """Compatibility support for implementing load_source()."""
-
-        _, mode, type_ = details
-        if mode and (not mode.startswith("r") or "+" in mode):
-            raise ValueError("invalid file open mode {!r}".format(mode))
-        elif file is None and type_ in {PY_SOURCE, PY_COMPILED}:
-            msg = "file object required for import (type code {})".format(type_)
-            raise ValueError(msg)
-        assert type_ == PY_SOURCE, "load_module replacement supports only PY_SOURCE file type"
-        loader = _LoadSourceCompatibility(name, filename, file)
-        spec = util.spec_from_file_location(name, filename, loader=loader)
-        if name in sys.modules:
-            module = _exec(spec, sys.modules[name])
-        else:
-            module = _load(spec)
-        # To allow reloading to potentially work, use a non-hacked loader which
-        # won't rely on a now-closed file object.
-        module.__loader__ = machinery.SourceFileLoader(name, filename)
-        module.__spec__.loader = module.__loader__
-        return module
-
-    def acquire_lock():
-        pass
 
-    def release_lock():
-        pass
-
-    context = {
-        "find_module": find_module,
-        "load_module": load_module,
-        "acquire_lock": acquire_lock,
-        "release_lock": release_lock,
-    }
-    imp_module = types.ModuleType("imp", "Mimics old imp module")
-    imp_module.__dict__.update(context)
-    sys.modules["imp"] = imp_module
-    unittest._TextTestResult = unittest.TextTestResult
-
-# Handle pkg_resources deprecation/removal
-try:
-    import pkg_resources  # noqa: F401
-except ImportError:
-    from importlib import metadata
-    import types
-
-    def iter_entry_points(group, name=None):
-        """Mimics pkg_resources.iter_entry_points using importlib.metadata."""
-        eps = metadata.entry_points()
-        selected = eps.select(group=group)
-
-        if name is not None:
-            selected = [ep for ep in selected if ep.name == name]
-
-        return selected
-
-    pkg_resources_module = types.ModuleType("pkg_resources", "Mimics pkg_resources module")
-    pkg_resources_module.iter_entry_points = iter_entry_points
-    sys.modules["pkg_resources"] = pkg_resources_module
-
-import nose.case
-import nose.inspector
-import nose.loader
-import nose.suite
-import nose.plugins.attrib
-from nose import SkipTest, with_setup  # noqa: F401
-from nose.plugins.attrib import attr  # noqa: F401
-from nose.tools import nottest  # noqa: F401
-
-if sys.version_info >= (3, 10) and not hasattr(collections, "Callable"):
-    nose.case.collections = collections.abc
-    nose.inspector.collections = collections.abc
-    nose.loader.collections = collections.abc
-    nose.suite.collections = collections.abc
-    nose.plugins.attrib.collections = collections.abc
-
-import nose.tools as tools
+from nose2.tools.decorators import with_setup as _nose2_with_setup, with_teardown as _nose2_with_teardown
+from unittest import SkipTest  # noqa: F401
+import unittest
 import re
 import fnmatch
-import unittest
+import functools
+import warnings
+
+
+def with_setup(setup=None, teardown=None):
+    """
+    Attach an optional setup and/or teardown callable to a test function.
+    Keeps the call signature of nose's with_setup(setup, teardown).
+
+    Usage:
+        @with_setup(setup_func)
+        @with_setup(setup_func, teardown_func)
+        @with_setup(teardown=teardown_func)
+    """
+    def apply(test_func):
+        hooks = ((setup, _nose2_with_setup), (teardown, _nose2_with_teardown))
+        for hook, nose2_decorator in hooks:
+            if hook is not None:
+                test_func = nose2_decorator(hook)(test_func)
+        return test_func
+    return apply
+
+
+def with_teardown(teardown):
+    """Decorator attaching ``teardown`` to a test function via nose2's with_teardown."""
+    return _nose2_with_teardown(teardown)
+
+
+def attr(*tags, **kwargs):
+    """Set test attributes for nose2 filtering with -A flag (same signature as nose's attr).
+
+    Usage: @attr("pytorch", "slow") sets each tag to True; @attr(speed="slow") sets a value.
+    Filtering: nose2 -A 'pytorch' or nose2 -A '!slow' or nose2 -A 'speed=slow'
+    """
+    def decorator(func):
+        for name, value in {**dict.fromkeys(tags, True), **kwargs}.items():
+            setattr(func, name, value)
+        return func
+    return decorator
+
+
+def nottest(func):
+    """Mark ``func`` as not a test by setting ``func.__test__ = False``; returns ``func``."""
+    func.__test__ = False
+    return func
 
 
 class empty_case(unittest.TestCase):
@@ -187,28 +112,51 @@ def get_pattern(glob=None, regex=None, match_case=None):
 
 def assert_raises(exception, *args, glob=None, regex=None, match_case=None, **kwargs):
     """
-    Wrapper combining `nose.tools.assert_raises` and `nose.tools.assert_raises_regex`.
+    Wrapper combining unittest's `assertRaises` and `assertRaisesRegex`.
     Specify ``regex=pattern`` or ``glob=pattern`` to check error message of expected exception
     against the pattern.
     Value for `glob` must be a string, `regex` can be either a literal or compiled regex pattern.
     By default, the check will ignore case, if called with `glob` or a literal for `regex`.
     To enforce case sensitive check pass ``match_case=True``.
     Don't specify `match_case` if passing already compiled regex pattern.
+
+    Can be used as context manager or with callable:
+        with assert_raises(Exception):
+            raise Exception()
+
+        assert_raises(Exception, callable, arg1, arg2, kwarg=value)
+
+    Positional and keyword arguments are forwarded unchanged to unittest, which selects
+    between the two forms itself.
     """
+    # Throwaway instance, only used to reach unittest's assert* methods.
+    tc = unittest.TestCase()
 
     if glob is None and regex is None:
-        return tools.assert_raises(exception, *args, **kwargs)
+        return tc.assertRaises(exception, *args, **kwargs)
 
     pattern = get_pattern(glob, regex, match_case)
-    return tools.assert_raises_regex(exception, pattern, *args, **kwargs)
+    return tc.assertRaisesRegex(exception, pattern, *args, **kwargs)
 
 
 def assert_warns(exception=Warning, *args, glob=None, regex=None, match_case=None, **kwargs):
+    """
+    Wrapper combining unittest's `assertWarns` and `assertWarnsRegex`.
+    The `glob`, `regex` and `match_case` arguments are turned into a pattern by `get_pattern`.
+
+    Can be used as context manager or with callable:
+        with assert_warns(UserWarning):
+            warnings.warn("test", UserWarning)
+
+        assert_warns(UserWarning, callable, arg1, arg2, kwarg=value)
+    """
+    tc = unittest.TestCase()
+
     if glob is None and regex is None:
-        return tools.assert_warns(exception, *args, **kwargs)
+        return tc.assertWarns(exception, *args, **kwargs)
 
     pattern = get_pattern(glob, regex, match_case)
-    return tools.assert_warns_regex(exception, pattern, *args, **kwargs)
+    return tc.assertWarnsRegex(exception, pattern, *args, **kwargs)
 
 
 def raises(exception, glob=None, regex=None, match_case=None):
@@ -235,10 +216,11 @@ def test():
     """
 
     def decorator(func):
+        @functools.wraps(func)
         def new_func(*args, **kwargs):
             with assert_raises(exception, glob=glob, regex=regex, match_case=match_case):
                 return func(*args, **kwargs)
 
-        return tools.make_decorator(func)(new_func)
+        return new_func
 
     return decorator
diff --git a/qa/TL0_FW_iterators/test_paddle.sh b/qa/TL0_FW_iterators/test_paddle.sh
index 8be13feede1..5408c2aba92 100755
--- a/qa/TL0_FW_iterators/test_paddle.sh
+++ b/qa/TL0_FW_iterators/test_paddle.sh
@@ -18,7 +18,7 @@ test_body() {
             python test_RN50_data_fw_iterators.py --framework ${fw} --gpus ${NUM_GPUS} -b 13 \
                 --workers 3 --prefetch 2 -i 2 --epochs 2 --fp16
         done
-        ${python_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*paddle*' test_fw_iterators_detection.py
+        ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*paddle*' test_fw_iterators_detection
     fi
     ${python_new_invoke_test} -A 'paddle' test_fw_iterators
 }
diff --git a/qa/TL0_FW_iterators/test_pytorch.sh b/qa/TL0_FW_iterators/test_pytorch.sh
index 6fb9e241f60..fc7957dec46 100755
--- a/qa/TL0_FW_iterators/test_pytorch.sh
+++ b/qa/TL0_FW_iterators/test_pytorch.sh
@@ -19,7 +19,7 @@ test_body() {
                 --workers 3 --prefetch 2 -i 2 --epochs 2 --fp16
         done
     fi
-    ${python_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*pytorch*' test_fw_iterators_detection.py
+    ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*pytorch*' test_fw_iterators_detection
     ${python_new_invoke_test} -A 'pytorch' test_fw_iterators
 }
 
diff --git a/qa/TL0_cpu_only/test_nofw.sh b/qa/TL0_cpu_only/test_nofw.sh
index a7bc1051b1f..ad4ba417c89 100755
--- a/qa/TL0_cpu_only/test_nofw.sh
+++ b/qa/TL0_cpu_only/test_nofw.sh
@@ -33,9 +33,9 @@ test_body() {
     "$FULLPATH" --gtest_filter="*CpuOnly*:*CApi*/0.*-*0.UseCopyKernel:*ForceNoCopyFail:*daliOutputCopySamples"
   done
   if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
-    ${python_invoke_test} --attr '!pytorch' test_dali_cpu_only.py
+    ${python_new_invoke_test} -A '!pytorch' test_dali_cpu_only
   else
-    ${python_invoke_test} --attr '!pytorch,!numba' test_dali_cpu_only.py
+    ${python_new_invoke_test} -A '!pytorch,!numba' test_dali_cpu_only
   fi
 }
 
diff --git a/qa/TL0_cpu_only/test_pytorch.sh b/qa/TL0_cpu_only/test_pytorch.sh
index 7fea55ceac6..adff0a6944e 100755
--- a/qa/TL0_cpu_only/test_pytorch.sh
+++ b/qa/TL0_cpu_only/test_pytorch.sh
@@ -8,7 +8,7 @@ test_body() {
   # CPU only test, remove CUDA from the search path just in case
   export LD_LIBRARY_PATH=""
   export PATH=${PATH/cuda/}
-  ${python_invoke_test} --attr 'pytorch' test_dali_cpu_only.py
+  ${python_new_invoke_test} -A 'pytorch' test_dali_cpu_only
 }
 
 pushd ../..
diff --git a/qa/TL0_cpu_only/test_tf.sh b/qa/TL0_cpu_only/test_tf.sh
index 75c7721ac0d..a0abd099d92 100755
--- a/qa/TL0_cpu_only/test_tf.sh
+++ b/qa/TL0_cpu_only/test_tf.sh
@@ -11,8 +11,8 @@ test_body() {
     # CPU only test, remove CUDA from the search path just in case
     export LD_LIBRARY_PATH=""
     export PATH=${PATH/cuda/}
-    ${python_invoke_test} test_dali_tf_plugin_cpu_only.py
-    ${python_invoke_test} test_dali_tf_plugin_cpu_only_dataset.py
+    ${python_new_invoke_test} test_dali_tf_plugin_cpu_only
+    ${python_new_invoke_test} test_dali_tf_plugin_cpu_only_dataset
   fi
 }
 
diff --git a/qa/TL0_multigpu/test_body.sh b/qa/TL0_multigpu/test_body.sh
index 0c1942d13d4..27609a9ed0e 100644
--- a/qa/TL0_multigpu/test_body.sh
+++ b/qa/TL0_multigpu/test_body.sh
@@ -37,12 +37,12 @@ test_gtest() {
 }
 
 test_cupy() {
-    ${python_invoke_test} --attr 'multigpu' test_external_source_cupy.py
+    ${python_new_invoke_test} -A 'multigpu' test_external_source_cupy
 }
 
 
 test_pytorch() {
-    ${python_invoke_test} --attr 'multigpu' test_external_source_pytorch_gpu.py
+    ${python_new_invoke_test} -A 'multigpu' test_external_source_pytorch_gpu
     ${python_new_invoke_test} -A 'pytorch,multi_gpu' -s experimental_mode
 }
 
diff --git a/qa/TL0_plugin_manager/test.sh b/qa/TL0_plugin_manager/test.sh
index 49b9c426fde..57bfe75af06 100755
--- a/qa/TL0_plugin_manager/test.sh
+++ b/qa/TL0_plugin_manager/test.sh
@@ -12,7 +12,7 @@ if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
 fi
 
 test_body() {
-    ${python_invoke_test} test_plugin_manager.py
+    ${python_new_invoke_test} test_plugin_manager
 }
 
 pushd ../..
diff --git a/qa/TL0_python-self-test-core/test_body.sh b/qa/TL0_python-self-test-core/test_body.sh
index 7bea77882f9..3c8b7807afe 100644
--- a/qa/TL0_python-self-test-core/test_body.sh
+++ b/qa/TL0_python-self-test-core/test_body.sh
@@ -7,7 +7,7 @@ test_different_numpy_versions() {
     for test_script in $(ls test_pipeline.py \
                             test_pipeline_decorator.py \
                             test_pipeline_segmentation.py); do
-        ${python_invoke_test} ${test_script}
+        ${python_new_invoke_test} ${test_script%.py}
     done
 }
 
@@ -23,9 +23,9 @@ test_py_with_framework() {
                             test_functional_api.py \
                             test_external_source_impl_utils.py); do
         if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
-            ${python_invoke_test} --attr "!slow,!pytorch,!mxnet,!cupy" ${test_script}
+            ${python_new_invoke_test} -A "!slow,!pytorch,!mxnet,!cupy" ${test_script%.py}
         else
-            ${python_invoke_test} --attr "!slow,!pytorch,!mxnet,!cupy,!numba" ${test_script}
+            ${python_new_invoke_test} -A "!slow,!pytorch,!mxnet,!cupy,!numba" ${test_script%.py}
         fi
     done
 
@@ -74,7 +74,7 @@ test_dynamic_mode_torch() {
 }
 
 test_pytorch() {
-    ${python_invoke_test} --attr '!slow,pytorch' test_dali_variable_batch_size.py
+    ${python_new_invoke_test} -A '!slow,pytorch' test_dali_variable_batch_size
     test_dynamic_mode_torch
     if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
         ${python_new_invoke_test} -A 'pytorch' -s type_annotations
diff --git a/qa/TL0_python-self-test-readers-decoders/test_body.sh b/qa/TL0_python-self-test-readers-decoders/test_body.sh
index f36881f8b03..1c23b32c04e 100644
--- a/qa/TL0_python-self-test-readers-decoders/test_body.sh
+++ b/qa/TL0_python-self-test-readers-decoders/test_body.sh
@@ -14,7 +14,7 @@ test_py_with_framework() {
       test_pool.py test_external_source_parallel.py test_external_source_parallel_shared_batch.py \
       test_external_source_parallel_large_sample.py \
       | sed "/$FILTER_PATTERN/d"); do
-        ${python_invoke_test} --attr '!slow,!pytorch,!mxnet,!cupy,!numba' ${test_script}
+        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' ${test_script%.py}
     done
 
 
diff --git a/qa/TL0_python-self-test_tegra/test_body.sh b/qa/TL0_python-self-test_tegra/test_body.sh
index 0b7500e1ff2..3bbe3e710e3 100644
--- a/qa/TL0_python-self-test_tegra/test_body.sh
+++ b/qa/TL0_python-self-test_tegra/test_body.sh
@@ -19,16 +19,11 @@ test_py_with_framework() {
         for exclude in "${EXCLUDE_PACKAGES[@]}"; do
             grep -qiE ${exclude} ${test_script} && status=$((status+1))
         done
-        # if nose2 is used isnide the test use it
-        if grep -qiE "nose2" ${test_script}; then
-            PYTHON_TEST_CMD=${python_new_invoke_test}
-            test_script=${test_script/.py/}
-        else
-            PYTHON_TEST_CMD=${python_invoke_test}
-        fi
+        # All tests now use nose2 - strip .py extension
+        test_script=${test_script%.py}
         # execute only when no matches are found
         if [ ${status} -eq 0 ]; then
-            ${PYTHON_TEST_CMD} --attr '!slow,!pytorch,!mxnet,!cupy,!numba,!scipy' ${test_script}
+            ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba,!scipy' ${test_script}
         fi
     done
 
diff --git a/qa/TL0_python_self_test_frameworks/test_cupy.sh b/qa/TL0_python_self_test_frameworks/test_cupy.sh
index 9d818fba357..7fd2c36d57d 100755
--- a/qa/TL0_python_self_test_frameworks/test_cupy.sh
+++ b/qa/TL0_python_self_test_frameworks/test_cupy.sh
@@ -4,12 +4,12 @@ pip_packages='${python_test_runner_package} numpy cupy pycuda'
 target_dir=./dali/test/python
 
 test_body() {
-    ${python_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*cupy' test_dltensor_operator.py
-    ${python_invoke_test} test_gpu_python_function_operator.py
-    ${python_invoke_test} test_backend_impl_gpu.py
-    ${python_invoke_test} test_external_source_cupy.py
-    ${python_invoke_test} --attr 'cupy' test_external_source_impl_utils.py
-    ${python_invoke_test} --attr 'cupy' test_pipeline_debug.py
+    ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*cupy' test_dltensor_operator
+    ${python_new_invoke_test} test_gpu_python_function_operator
+    ${python_new_invoke_test} test_backend_impl_gpu
+    ${python_new_invoke_test} test_external_source_cupy
+    ${python_new_invoke_test} -A 'cupy' test_external_source_impl_utils
+    ${python_new_invoke_test} -A 'cupy' test_pipeline_debug
     ${python_new_invoke_test} -A '!slow,cupy' checkpointing.test_dali_checkpointing
     ${python_new_invoke_test} -A '!slow,cupy' checkpointing.test_dali_stateless_operators
 }
diff --git a/qa/TL0_python_self_test_frameworks/test_pytorch.sh b/qa/TL0_python_self_test_frameworks/test_pytorch.sh
index 24c8368925e..d3bac5032c0 100755
--- a/qa/TL0_python_self_test_frameworks/test_pytorch.sh
+++ b/qa/TL0_python_self_test_frameworks/test_pytorch.sh
@@ -4,19 +4,19 @@ pip_packages='${python_test_runner_package} numpy librosa torch psutil torchvisi
 target_dir=./dali/test/python
 
 test_body() {
-    ${python_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*pytorch' test_pytorch_operator.py
-    ${python_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*pytorch' test_dltensor_operator.py
-    ${python_invoke_test} test_torch_pipeline_rnnt.py
-    ${python_invoke_test} test_external_source_pytorch_cpu.py
-    ${python_invoke_test} test_external_source_pytorch_gpu.py
-    ${python_invoke_test} test_external_source_pytorch_dlpack.py
-    ${python_invoke_test} test_external_source_parallel_pytorch.py
-    ${python_invoke_test} test_backend_impl_torch_dlpack.py
-    ${python_invoke_test} test_dali_fork_torch.py
-    ${python_invoke_test} test_copy_to_external_torch.py
-    ${python_invoke_test} --attr 'pytorch' test_external_source_impl_utils.py
-    ${python_invoke_test} --attr 'pytorch' test_pipeline_debug.py
-    ${python_invoke_test} --attr 'pytorch' test_functional_api.py
+    ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*pytorch' test_pytorch_operator
+    ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*pytorch' test_dltensor_operator
+    ${python_new_invoke_test} test_torch_pipeline_rnnt
+    ${python_new_invoke_test} test_external_source_pytorch_cpu
+    ${python_new_invoke_test} test_external_source_pytorch_gpu
+    ${python_new_invoke_test} test_external_source_pytorch_dlpack
+    ${python_new_invoke_test} test_external_source_parallel_pytorch
+    ${python_new_invoke_test} test_backend_impl_torch_dlpack
+    ${python_new_invoke_test} test_dali_fork_torch
+    ${python_new_invoke_test} test_copy_to_external_torch
+    ${python_new_invoke_test} -A 'pytorch' test_external_source_impl_utils
+    ${python_new_invoke_test} -A 'pytorch' test_pipeline_debug
+    ${python_new_invoke_test} -A 'pytorch' test_functional_api
     ${python_new_invoke_test} -s . test_dali_proxy
 }
 
diff --git a/qa/TL0_self_test_Ampere/test.sh b/qa/TL0_self_test_Ampere/test.sh
index 9c83f546a47..338df9489fa 100644
--- a/qa/TL0_self_test_Ampere/test.sh
+++ b/qa/TL0_self_test_Ampere/test.sh
@@ -35,7 +35,7 @@ test_body() {
   # test Optical Flow
   ${python_new_invoke_test} -s operator_1 test_optical_flow
   ${python_new_invoke_test} -s checkpointing test_dali_stateless_operators.test_optical_flow_stateless
-  ${python_invoke_test} test_dali_variable_batch_size.py:test_optical_flow
+  ${python_new_invoke_test} test_dali_variable_batch_size.test_optical_flow
 }
 
 pushd ../..
diff --git a/qa/TL0_tensorflow_plugin/test.sh b/qa/TL0_tensorflow_plugin/test.sh
index 5cfa62aac89..2bc7890ecfa 100755
--- a/qa/TL0_tensorflow_plugin/test.sh
+++ b/qa/TL0_tensorflow_plugin/test.sh
@@ -19,11 +19,11 @@ test_body() {
     pip uninstall -y `pip list | grep nvidia-dali-tf-plugin | cut -d " " -f1` || true
 
     # No plugin installed, should fail
-    ${python_invoke_test} test_dali_tf_plugin.py:TestDaliTfPluginLoadFail
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadFail
 
     # Installing "current" dali tf (built against installed TF)
     pip install ../../../nvidia_dali_tf_plugin*.tar.gz --no-build-isolation
-    ${python_invoke_test} test_dali_tf_plugin.py:TestDaliTfPluginLoadOk
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadOk
 
     # Installing "current" dali tf (built against installed TF) - force rebuild without DALI using internal stubs
     # and then install DALI again
@@ -31,23 +31,23 @@ test_body() {
     pip uninstall -y `pip list | grep nvidia-dali | cut -d " " -f1` || true
     DALI_TF_ALWAYS_BUILD=1 pip install --no-deps ../../../nvidia_dali_tf_plugin*.tar.gz --no-build-isolation
     pip install ../../../nvidia_dali_*.whl
-    ${python_invoke_test} test_dali_tf_plugin.py:TestDaliTfPluginLoadOk
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadOk
 
     # DALI TF run
-    ${python_invoke_test} test_dali_tf_plugin_run.py
+    ${python_new_invoke_test} test_dali_tf_plugin_run
 
     # DALI TF DATASET run
-    ${python_invoke_test} test_dali_tf_dataset.py
-    ${python_invoke_test} test_dali_tf_conditionals.py
+    ${python_new_invoke_test} test_dali_tf_dataset
+    ${python_new_invoke_test} test_dali_tf_conditionals
     ${python_new_invoke_test} checkpointing.test_dali_checkpointing_tf_plugin
     if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
-        ${python_invoke_test} test_dali_tf_dataset_shape.py
-        ${python_invoke_test} test_dali_tf_dataset_eager.py
-        ${python_invoke_test} test_dali_tf_dataset_graph.py
+        ${python_new_invoke_test} test_dali_tf_dataset_shape
+        ${python_new_invoke_test} test_dali_tf_dataset_eager
+        ${python_new_invoke_test} test_dali_tf_dataset_graph
     fi
 
     # DALI TF + dynamic executor
-    ${python_invoke_test} test_dali_tf_exec2.py
+    ${python_new_invoke_test} test_dali_tf_exec2
 }
 
 pushd ../..
diff --git a/qa/TL0_tensorflow_plugin_conda/test.sh b/qa/TL0_tensorflow_plugin_conda/test.sh
index a683786d110..990e184ad03 100755
--- a/qa/TL0_tensorflow_plugin_conda/test.sh
+++ b/qa/TL0_tensorflow_plugin_conda/test.sh
@@ -10,15 +10,15 @@ prolog=(enable_conda)
 epilog=(disable_conda)
 
 test_body() {
-    ${python_invoke_test} test_dali_tf_plugin.py:TestDaliTfPluginLoadOk
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadOk
 
     # DALI TF run
-    ${python_invoke_test} test_dali_tf_plugin_run.py
+    ${python_new_invoke_test} test_dali_tf_plugin_run
 
     # DALI TF DATASET run
-    ${python_invoke_test} test_dali_tf_dataset.py
+    ${python_new_invoke_test} test_dali_tf_dataset
 
-    ${python_invoke_test} test_dali_tf_dataset_shape.py
+    ${python_new_invoke_test} test_dali_tf_dataset_shape
 }
 
 pushd ../..
diff --git a/qa/TL0_video_plugin/test.sh b/qa/TL0_video_plugin/test.sh
index 8b16451f26b..31d0f38d927 100755
--- a/qa/TL0_video_plugin/test.sh
+++ b/qa/TL0_video_plugin/test.sh
@@ -28,7 +28,7 @@ test_body() {
         pip install -v ../../../nvidia_dali_video*.tar.gz --no-build-isolation
 
     # Check that the plugin can be loaded
-    ${python_invoke_test} test_dali_video_plugin.py:TestDaliVideoPluginLoadOk
+    ${python_new_invoke_test} test_dali_video_plugin.TestDaliVideoPluginLoadOk
 
     ${python_new_invoke_test} -s . test_dali_video_plugin_decoder
 }
diff --git a/qa/TL0_videoreader_test/test.sh b/qa/TL0_videoreader_test/test.sh
index 3969cb31ef5..6b473202d9c 100755
--- a/qa/TL0_videoreader_test/test.sh
+++ b/qa/TL0_videoreader_test/test.sh
@@ -47,8 +47,8 @@ test_body() {
     python video_label_example.py
 
     echo $(pwd)
-    ${python_invoke_test} ../../../../dali/test/python/test_video_pipeline.py
-    ${python_invoke_test} ../../../../dali/test/python/test_video_reader_resize.py
+    ${python_new_invoke_test} -s ../../../../dali/test/python test_video_pipeline
+    ${python_new_invoke_test} -s ../../../../dali/test/python test_video_reader_resize
 
     cd ../../../../dali/test/python/
     ${python_new_invoke_test} test_video_reader
diff --git a/qa/TL1_naive_histogram/test.sh b/qa/TL1_naive_histogram/test.sh
index 727155a97b9..1d230506185 100755
--- a/qa/TL1_naive_histogram/test.sh
+++ b/qa/TL1_naive_histogram/test.sh
@@ -12,7 +12,7 @@ do_once() {
 test_body() {
     pushd $(pwd)/docs/examples/custom_operations/custom_operator/naive_histogram
     (mkdir build && cd build && cmake .. && make -j"$(grep ^processor /proc/cpuinfo | wc -l)")
-    ${python_invoke_test} test_naive_histogram.py
+    ${python_new_invoke_test} test_naive_histogram
     popd
 }
 
diff --git a/qa/TL1_python-self-test-slow/test.sh b/qa/TL1_python-self-test-slow/test.sh
index 5fcc7129a30..5b0def1e846 100755
--- a/qa/TL1_python-self-test-slow/test.sh
+++ b/qa/TL1_python-self-test-slow/test.sh
@@ -7,7 +7,7 @@ test_body() {
     for test_script in $(ls test_pipeline.py test_pipeline_debug.py test_pipeline_debug_resnet50.py \
                             test_pipeline_decorator.py test_pipeline_multichannel.py test_pipeline_segmentation.py \
                             test_functional_api.py); do
-        ${python_invoke_test} --attr 'slow' ${test_script}
+        ${python_new_invoke_test} -A 'slow' ${test_script%.py}
     done
 
     ${python_new_invoke_test} -A "slow" test_backend_impl
diff --git a/qa/TL1_python-self-test_conda/test_body.sh b/qa/TL1_python-self-test_conda/test_body.sh
index 495d55fc32e..9ca4fc9b95b 100644
--- a/qa/TL1_python-self-test_conda/test_body.sh
+++ b/qa/TL1_python-self-test_conda/test_body.sh
@@ -2,7 +2,7 @@
 
 test_py_with_framework() {
     for test_script in $(ls test_pipeline*.py test_external_source_dali.py test_external_source_numpy.py test_external_source_parallel_garbage_collection_order.py test_functional_api.py); do
-        ${python_invoke_test} --attr '!slow,!pytorch,!mxnet,!cupy,!numba' ${test_script}
+        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' ${test_script%.py}
     done
 
     ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy' test_backend_impl
diff --git a/qa/TL1_tensorflow_dataset/test_impl.sh b/qa/TL1_tensorflow_dataset/test_impl.sh
index 0b01d50ed9f..5828092b3ec 100755
--- a/qa/TL1_tensorflow_dataset/test_impl.sh
+++ b/qa/TL1_tensorflow_dataset/test_impl.sh
@@ -13,13 +13,13 @@ test_body() {
     is_compatible=$(python -c 'import nvidia.dali.plugin.tf as dali_tf; print(dali_tf.dataset_compatible_tensorflow())')
     if [ $is_compatible = 'True' ]; then
         # DALI TF DATASET run
-        ${python_invoke_test} test_dali_tf_dataset_graph.py:_test_tf_dataset_other_gpu
-        ${python_invoke_test} test_dali_tf_dataset_graph.py:_test_tf_dataset_multigpu_manual_placement
-        ${python_invoke_test} test_dali_tf_dataset_eager.py:_test_tf_dataset_other_gpu
-        ${python_invoke_test} test_dali_tf_dataset_eager.py:_test_tf_dataset_multigpu_manual_placement
-        ${python_invoke_test} test_dali_tf_dataset_eager.py:_test_tf_dataset_multigpu_mirrored_strategy
-        ${python_invoke_test} test_dali_tf_dataset_mnist_eager.py
-        ${python_invoke_test} test_dali_tf_dataset_mnist_graph.py
+        ${python_new_invoke_test} test_dali_tf_dataset_graph:_test_tf_dataset_other_gpu
+        ${python_new_invoke_test} test_dali_tf_dataset_graph:_test_tf_dataset_multigpu_manual_placement
+        ${python_new_invoke_test} test_dali_tf_dataset_eager:_test_tf_dataset_other_gpu
+        ${python_new_invoke_test} test_dali_tf_dataset_eager:_test_tf_dataset_multigpu_manual_placement
+        ${python_new_invoke_test} test_dali_tf_dataset_eager:_test_tf_dataset_multigpu_mirrored_strategy
+        ${python_new_invoke_test} test_dali_tf_dataset_mnist_eager
+        ${python_new_invoke_test} test_dali_tf_dataset_mnist_graph
 
         # DALI TF Notebooks run
         pushd ../../../docs/examples/frameworks/tensorflow/
diff --git a/qa/TL1_tensorflow_plugin/test.sh b/qa/TL1_tensorflow_plugin/test.sh
index 60cdeba6636..40f7ad805f4 100755
--- a/qa/TL1_tensorflow_plugin/test.sh
+++ b/qa/TL1_tensorflow_plugin/test.sh
@@ -9,16 +9,16 @@ test_body() {
 
 
     # No plugin installed, should fail
-    ${python_invoke_test} test_dali_tf_plugin.py:TestDaliTfPluginLoadFail
+    ${python_new_invoke_test} test_dali_tf_plugin:TestDaliTfPluginLoadFail
 
     # Remove the old and installing "current" dali tf (built against installed TF)
     pip uninstall -y `pip list | grep nvidia-dali-tf-plugin | cut -d " " -f1` || true
 
     pip install --upgrade ../../../nvidia_dali_tf_plugin*.tar.gz --no-build-isolation
-    ${python_invoke_test} test_dali_tf_plugin.py:TestDaliTfPluginLoadOk
+    ${python_new_invoke_test} test_dali_tf_plugin:TestDaliTfPluginLoadOk
 
     # DALI TF run
-    ${python_invoke_test} test_dali_tf_plugin_run.py
+    ${python_new_invoke_test} test_dali_tf_plugin_run
 }
 
 pushd ../..
diff --git a/qa/nose_wrapper/__main__.py b/qa/nose_wrapper/__main__.py
deleted file mode 100644
index 1d5eb1977f0..00000000000
--- a/qa/nose_wrapper/__main__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import sys
-
-# before running the test we add dali/test/python to the python path
-import nose_utils  # noqa:F401  - for Python 3.10
-from nose.core import run_exit
-import inspect
-
-if sys.version_info >= (3, 11):
-
-    def legacy_getargspec(fun):
-        args, varargs, varkw, defaults, *_ = inspect.getfullargspec(fun)
-        return (args, varargs, varkw, defaults)
-
-    inspect.getargspec = legacy_getargspec
-
-if sys.argv[0].endswith("__main__.py"):
-    sys.argv[0] = "%s -m nose_wrapper" % sys.executable
-
-run_exit()
diff --git a/qa/test_template_impl.sh b/qa/test_template_impl.sh
index 5229870f9cd..a7f7c1d71b1 100755
--- a/qa/test_template_impl.sh
+++ b/qa/test_template_impl.sh
@@ -17,15 +17,8 @@ source $topdir/qa/setup_test_common.sh
 
 # Set runner for python tests
 export PYTHONPATH=${PYTHONPATH}:$topdir/qa:$topdir/dali/test/python
-python_test_runner_package="nose nose2 nose-timer nose2-test-timer"
-# use DALI nose wrapper to patch nose to support Python 3.10
-python_test_runner="python -m nose_wrapper"
-python_test_args="--verbose --with-timer --timer-top-n 20 -s"
-python_invoke_test="${python_test_runner} ${python_test_args}"
-
-# New framework for Python Tests
-# During the transition we run both
-# When all tests are ported old will be removed
+python_test_runner_package="nose2 nose2-test-timer"
+# Python test runner (nose2)
 python_new_test_runner="python -m nose2"
 python_new_test_args="--verbose --plugin=nose2_test_timer.plugin --with-timer --timer-color --timer-top-n 20"
 python_new_invoke_test="${python_new_test_runner} ${python_new_test_args}"

From 116bb822f3a852cd61453ae924eed2bf71c444e8 Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Mon, 22 Dec 2025 16:57:23 +0100
Subject: [PATCH 02/19] Replace unsupported -m regex by attributes

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 dali/test/python/test_dltensor_operator.py         |  6 +++++-
 dali/test/python/test_fw_iterators_detection.py    | 10 ++++++++--
 dali/test/python/test_pytorch_operator.py          |  5 ++++-
 qa/TL0_FW_iterators/test_paddle.sh                 |  2 +-
 qa/TL0_FW_iterators/test_pytorch.sh                |  2 +-
 qa/TL0_python_self_test_frameworks/test_cupy.sh    |  2 +-
 qa/TL0_python_self_test_frameworks/test_pytorch.sh |  4 ++--
 7 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/dali/test/python/test_dltensor_operator.py b/dali/test/python/test_dltensor_operator.py
index f96fdc11a5e..13ebffbdbd0 100644
--- a/dali/test/python/test_dltensor_operator.py
+++ b/dali/test/python/test_dltensor_operator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2019, 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 from nvidia.dali.pipeline import Pipeline
 from nvidia.dali import fn, pipeline_def
 from nvidia.dali.python_function_plugin import current_dali_stream
+from nose_utils import attr
 
 test_data_root = os.environ["DALI_EXTRA_PATH"]
 images_dir = os.path.join(test_data_root, "db", "single", "jpeg")
@@ -182,6 +183,7 @@ def pytorch_red_channel_op(in1, in2):
     return [t.narrow(2, 0, 1).squeeze() for t in in1], [t.narrow(2, 0, 1).squeeze() for t in in2]
 
 
+@attr('pytorch')
 def test_pytorch():
     setup_pytorch()
     for testcase in [simple_pytorch_op, pytorch_red_channel_op]:
@@ -327,6 +329,7 @@ def cupy_kernel_gray_scale(in1, in2, stream=None):
     return out1, out2
 
 
+@attr('cupy')
 def test_cupy():
     setup_cupy()
     print(cupy)
@@ -335,6 +338,7 @@ def test_cupy():
     yield from _cupy_flip_with_negative_strides_suite()
 
 
+@attr('cupy')
 def test_cupy_kernel_gray_scale():
     setup_cupy()
     cupy_case(cupy_kernel_gray_scale, synchronize=False)
diff --git a/dali/test/python/test_fw_iterators_detection.py b/dali/test/python/test_fw_iterators_detection.py
index ce783812e1f..90f523ffdc8 100644
--- a/dali/test/python/test_fw_iterators_detection.py
+++ b/dali/test/python/test_fw_iterators_detection.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 from nvidia.dali.pipeline import Pipeline
 
 from test_utils import get_dali_extra_path
-from nose_utils import assert_raises
+from nose_utils import assert_raises, attr
 
 DALI_EXTRA_PATH = get_dali_extra_path()
 EPOCH_SIZE = 32
@@ -74,6 +74,7 @@ def test_mxnet_pipeline_dynamic_shape():
         assert data is not None
 
 
+@attr('pytorch')
 def test_pytorch_pipeline_dynamic_shape():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
 
@@ -86,6 +87,7 @@ def test_pytorch_pipeline_dynamic_shape():
         assert data is not None
 
 
+@attr('paddle')
 def test_paddle_pipeline_dynamic_shape():
     from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator
 
@@ -98,6 +100,7 @@ def test_paddle_pipeline_dynamic_shape():
         assert data is not None
 
 
+@attr('pytorch')
 def test_api_fw_check1_pytorch():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
 
@@ -117,6 +120,7 @@ def test_api_fw_check1_mxnet():
     )
 
 
+@attr('paddle')
 def test_api_fw_check1_paddle():
     from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator
 
@@ -172,12 +176,14 @@ def test_api_fw_check2_mxnet():
     )
 
 
+@attr('pytorch')
 def test_api_fw_check2_pytorch():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
 
     yield from test_api_fw_check2(PyTorchIterator, ["data", "bboxes", "label"])
 
 
+@attr('paddle')
 def test_api_fw_check2_paddle():
     from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator
 
diff --git a/dali/test/python/test_pytorch_operator.py b/dali/test/python/test_pytorch_operator.py
index e1d7b7eb6e5..386801515c3 100644
--- a/dali/test/python/test_pytorch_operator.py
+++ b/dali/test/python/test_pytorch_operator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 from nvidia.dali.pipeline import Pipeline
 
 from test_utils import get_dali_extra_path
+from nose_utils import attr
 
 test_data_root = get_dali_extra_path()
 images_dir = os.path.join(test_data_root, "db", "single", "jpeg")
@@ -113,6 +114,7 @@ def check_pytorch_operator(device):
             assert numpy.allclose(res2, exp2_t.numpy())
 
 
+@attr('pytorch')
 def test_pytorch_operator():
     for device in {"cpu", "gpu"}:
         yield check_pytorch_operator, device
@@ -136,6 +138,7 @@ def check_pytorch_operator_batch_processing(device):
             assert numpy.allclose(res2, exp2[i].numpy())
 
 
+@attr('pytorch')
 def test_pytorch_operator_batch_processing():
     for device in {"cpu", "gpu"}:
         yield check_pytorch_operator_batch_processing, device
diff --git a/qa/TL0_FW_iterators/test_paddle.sh b/qa/TL0_FW_iterators/test_paddle.sh
index 5408c2aba92..473f40a0d7f 100755
--- a/qa/TL0_FW_iterators/test_paddle.sh
+++ b/qa/TL0_FW_iterators/test_paddle.sh
@@ -18,7 +18,7 @@ test_body() {
             python test_RN50_data_fw_iterators.py --framework ${fw} --gpus ${NUM_GPUS} -b 13 \
                 --workers 3 --prefetch 2 -i 2 --epochs 2 --fp16
         done
-        ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*paddle*' test_fw_iterators_detection
+        ${python_new_invoke_test} -A 'paddle' test_fw_iterators_detection
     fi
     ${python_new_invoke_test} -A 'paddle' test_fw_iterators
 }
diff --git a/qa/TL0_FW_iterators/test_pytorch.sh b/qa/TL0_FW_iterators/test_pytorch.sh
index fc7957dec46..9100ebf23ca 100755
--- a/qa/TL0_FW_iterators/test_pytorch.sh
+++ b/qa/TL0_FW_iterators/test_pytorch.sh
@@ -19,7 +19,7 @@ test_body() {
                 --workers 3 --prefetch 2 -i 2 --epochs 2 --fp16
         done
     fi
-    ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*pytorch*' test_fw_iterators_detection
+    ${python_new_invoke_test} -A 'pytorch' test_fw_iterators_detection
     ${python_new_invoke_test} -A 'pytorch' test_fw_iterators
 }
 
diff --git a/qa/TL0_python_self_test_frameworks/test_cupy.sh b/qa/TL0_python_self_test_frameworks/test_cupy.sh
index 7fd2c36d57d..fea0ba52859 100755
--- a/qa/TL0_python_self_test_frameworks/test_cupy.sh
+++ b/qa/TL0_python_self_test_frameworks/test_cupy.sh
@@ -4,7 +4,7 @@ pip_packages='${python_test_runner_package} numpy cupy pycuda'
 target_dir=./dali/test/python
 
 test_body() {
-    ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*cupy' test_dltensor_operator
+    ${python_new_invoke_test} -A 'cupy' test_dltensor_operator
     ${python_new_invoke_test} test_gpu_python_function_operator
     ${python_new_invoke_test} test_backend_impl_gpu
     ${python_new_invoke_test} test_external_source_cupy
diff --git a/qa/TL0_python_self_test_frameworks/test_pytorch.sh b/qa/TL0_python_self_test_frameworks/test_pytorch.sh
index d3bac5032c0..95b42abcdd3 100755
--- a/qa/TL0_python_self_test_frameworks/test_pytorch.sh
+++ b/qa/TL0_python_self_test_frameworks/test_pytorch.sh
@@ -4,8 +4,8 @@ pip_packages='${python_test_runner_package} numpy librosa torch psutil torchvisi
 target_dir=./dali/test/python
 
 test_body() {
-    ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*pytorch' test_pytorch_operator
-    ${python_new_invoke_test} -m '(?:^|[\b_\./-])[Tt]est.*pytorch' test_dltensor_operator
+    ${python_new_invoke_test} -A 'pytorch' test_pytorch_operator
+    ${python_new_invoke_test} -A 'pytorch' test_dltensor_operator
     ${python_new_invoke_test} test_torch_pipeline_rnnt
     ${python_new_invoke_test} test_external_source_pytorch_cpu
     ${python_new_invoke_test} test_external_source_pytorch_gpu

From b34940e2032df209205db37a4b1f81a392293cfb Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Mon, 22 Dec 2025 17:04:21 +0100
Subject: [PATCH 03/19] Fix : -> .

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 qa/TL0_python-self-test-core/test_body.sh             |  2 +-
 qa/TL0_python-self-test-readers-decoders/test_body.sh |  4 ++--
 qa/TL0_self_test_Ampere/test.sh                       |  2 +-
 qa/TL0_tensorflow_plugin/test.sh                      |  6 +++---
 qa/TL0_tensorflow_plugin_conda/test.sh                |  2 +-
 qa/TL0_video_plugin/test.sh                           |  2 +-
 qa/TL1_python-self-test_conda/test_body.sh            |  2 +-
 qa/TL1_tensorflow_dataset/test_impl.sh                | 10 +++++-----
 qa/TL1_tensorflow_plugin/test.sh                      |  4 ++--
 9 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/qa/TL0_python-self-test-core/test_body.sh b/qa/TL0_python-self-test-core/test_body.sh
index 3c8b7807afe..7e450425e9b 100644
--- a/qa/TL0_python-self-test-core/test_body.sh
+++ b/qa/TL0_python-self-test-core/test_body.sh
@@ -93,7 +93,7 @@ test_checkpointing() {
         ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba,!sanitizer_skip' checkpointing.test_dali_checkpointing
 
         # External source tests are slow and Python-side mostly, but let's run just one of them
-        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' checkpointing.test_dali_checkpointing.test_external_source_checkpointing:1
+        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' checkpointing.test_dali_checkpointing.test_external_source_checkpointing.1
     fi
 }
 
diff --git a/qa/TL0_python-self-test-readers-decoders/test_body.sh b/qa/TL0_python-self-test-readers-decoders/test_body.sh
index 1c23b32c04e..f1d39109504 100644
--- a/qa/TL0_python-self-test-readers-decoders/test_body.sh
+++ b/qa/TL0_python-self-test-readers-decoders/test_body.sh
@@ -52,10 +52,10 @@ test_jpeg_scan_limit() {
       # test various broken cases with smaller limit to make the test faster
       DALI_MAX_JPEG_SCANS=30 ${python_new_invoke_test} -s decoder test_jpeg_scan_limit
       # test default limit for one case
-      ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit:1
+      ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit.1
     else
       # let's check if error handling does not lead to leaks
-      DALI_MAX_JPEG_SCANS=30 ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit:1
+      DALI_MAX_JPEG_SCANS=30 ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit.1
     fi
 }
 
diff --git a/qa/TL0_self_test_Ampere/test.sh b/qa/TL0_self_test_Ampere/test.sh
index 338df9489fa..71275d75a91 100644
--- a/qa/TL0_self_test_Ampere/test.sh
+++ b/qa/TL0_self_test_Ampere/test.sh
@@ -35,7 +35,7 @@ test_body() {
   # test Optical Flow
   ${python_new_invoke_test} -s operator_1 test_optical_flow
   ${python_new_invoke_test} -s checkpointing test_dali_stateless_operators.test_optical_flow_stateless
-  ${python_new_invoke_test} test_dali_variable_batch_size:test_optical_flow
+  ${python_new_invoke_test} test_dali_variable_batch_size.test_optical_flow
 }
 
 pushd ../..
diff --git a/qa/TL0_tensorflow_plugin/test.sh b/qa/TL0_tensorflow_plugin/test.sh
index 2bc7890ecfa..1e2e080cba1 100755
--- a/qa/TL0_tensorflow_plugin/test.sh
+++ b/qa/TL0_tensorflow_plugin/test.sh
@@ -19,11 +19,11 @@ test_body() {
     pip uninstall -y `pip list | grep nvidia-dali-tf-plugin | cut -d " " -f1` || true
 
     # No plugin installed, should fail
-    ${python_new_invoke_test} test_dali_tf_plugin:TestDaliTfPluginLoadFail
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadFail
 
     # Installing "current" dali tf (built against installed TF)
     pip install ../../../nvidia_dali_tf_plugin*.tar.gz --no-build-isolation
-    ${python_new_invoke_test} test_dali_tf_plugin:TestDaliTfPluginLoadOk
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadOk
 
     # Installing "current" dali tf (built against installed TF) - force rebuild without DALI using internal stubs
     # and then install DALI again
@@ -31,7 +31,7 @@ test_body() {
     pip uninstall -y `pip list | grep nvidia-dali | cut -d " " -f1` || true
     DALI_TF_ALWAYS_BUILD=1 pip install --no-deps ../../../nvidia_dali_tf_plugin*.tar.gz --no-build-isolation
     pip install ../../../nvidia_dali_*.whl
-    ${python_new_invoke_test} test_dali_tf_plugin:TestDaliTfPluginLoadOk
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadOk
 
     # DALI TF run
     ${python_new_invoke_test} test_dali_tf_plugin_run
diff --git a/qa/TL0_tensorflow_plugin_conda/test.sh b/qa/TL0_tensorflow_plugin_conda/test.sh
index 990e184ad03..2e11d50c5a5 100755
--- a/qa/TL0_tensorflow_plugin_conda/test.sh
+++ b/qa/TL0_tensorflow_plugin_conda/test.sh
@@ -10,7 +10,7 @@ prolog=(enable_conda)
 epilog=(disable_conda)
 
 test_body() {
-    ${python_new_invoke_test} test_dali_tf_plugin:TestDaliTfPluginLoadOk
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadOk
 
     # DALI TF run
     ${python_new_invoke_test} test_dali_tf_plugin_run
diff --git a/qa/TL0_video_plugin/test.sh b/qa/TL0_video_plugin/test.sh
index 31d0f38d927..1562aa40081 100755
--- a/qa/TL0_video_plugin/test.sh
+++ b/qa/TL0_video_plugin/test.sh
@@ -28,7 +28,7 @@ test_body() {
         pip install -v ../../../nvidia_dali_video*.tar.gz --no-build-isolation
 
     # Check that the plugin can be loaded
-    ${python_new_invoke_test} test_dali_video_plugin:TestDaliVideoPluginLoadOk
+    ${python_new_invoke_test} test_dali_video_plugin.TestDaliVideoPluginLoadOk
 
     ${python_new_invoke_test} -s . test_dali_video_plugin_decoder
 }
diff --git a/qa/TL1_python-self-test_conda/test_body.sh b/qa/TL1_python-self-test_conda/test_body.sh
index 9ca4fc9b95b..3b3a7d2fe87 100644
--- a/qa/TL1_python-self-test_conda/test_body.sh
+++ b/qa/TL1_python-self-test_conda/test_body.sh
@@ -17,7 +17,7 @@ test_jpeg_scan_limit() {
     # test various broken cases with smaller limit for speed
     DALI_MAX_JPEG_SCANS=30 ${python_new_invoke_test} -s decoder test_jpeg_scan_limit
     # test default limit for one case
-    ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit:1
+    ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit.1
 }
 
 test_py() {
diff --git a/qa/TL1_tensorflow_dataset/test_impl.sh b/qa/TL1_tensorflow_dataset/test_impl.sh
index 5828092b3ec..c8d3c90f652 100755
--- a/qa/TL1_tensorflow_dataset/test_impl.sh
+++ b/qa/TL1_tensorflow_dataset/test_impl.sh
@@ -13,11 +13,11 @@ test_body() {
     is_compatible=$(python -c 'import nvidia.dali.plugin.tf as dali_tf; print(dali_tf.dataset_compatible_tensorflow())')
     if [ $is_compatible = 'True' ]; then
         # DALI TF DATASET run
-        ${python_new_invoke_test} test_dali_tf_dataset_graph:_test_tf_dataset_other_gpu
-        ${python_new_invoke_test} test_dali_tf_dataset_graph:_test_tf_dataset_multigpu_manual_placement
-        ${python_new_invoke_test} test_dali_tf_dataset_eager:_test_tf_dataset_other_gpu
-        ${python_new_invoke_test} test_dali_tf_dataset_eager:_test_tf_dataset_multigpu_manual_placement
-        ${python_new_invoke_test} test_dali_tf_dataset_eager:_test_tf_dataset_multigpu_mirrored_strategy
+        ${python_new_invoke_test} test_dali_tf_dataset_graph._test_tf_dataset_other_gpu
+        ${python_new_invoke_test} test_dali_tf_dataset_graph._test_tf_dataset_multigpu_manual_placement
+        ${python_new_invoke_test} test_dali_tf_dataset_eager._test_tf_dataset_other_gpu
+        ${python_new_invoke_test} test_dali_tf_dataset_eager._test_tf_dataset_multigpu_manual_placement
+        ${python_new_invoke_test} test_dali_tf_dataset_eager._test_tf_dataset_multigpu_mirrored_strategy
         ${python_new_invoke_test} test_dali_tf_dataset_mnist_eager
         ${python_new_invoke_test} test_dali_tf_dataset_mnist_graph
 
diff --git a/qa/TL1_tensorflow_plugin/test.sh b/qa/TL1_tensorflow_plugin/test.sh
index 40f7ad805f4..dbab1909960 100755
--- a/qa/TL1_tensorflow_plugin/test.sh
+++ b/qa/TL1_tensorflow_plugin/test.sh
@@ -9,13 +9,13 @@ test_body() {
 
 
     # No plugin installed, should fail
-    ${python_new_invoke_test} test_dali_tf_plugin:TestDaliTfPluginLoadFail
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadFail
 
     # Remove the old and installing "current" dali tf (built against installed TF)
     pip uninstall -y `pip list | grep nvidia-dali-tf-plugin | cut -d " " -f1` || true
 
     pip install --upgrade ../../../nvidia_dali_tf_plugin*.tar.gz --no-build-isolation
-    ${python_new_invoke_test} test_dali_tf_plugin:TestDaliTfPluginLoadOk
+    ${python_new_invoke_test} test_dali_tf_plugin.TestDaliTfPluginLoadOk
 
     # DALI TF run
     ${python_new_invoke_test} test_dali_tf_plugin_run

From 16191d85646cc8a51fd325bdfeab0641d4c54272 Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Mon, 22 Dec 2025 17:05:47 +0100
Subject: [PATCH 04/19] Greptile suggestion - use one TC instance

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 dali/test/python/nose_utils.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/dali/test/python/nose_utils.py b/dali/test/python/nose_utils.py
index be0cd88af9e..c96306cd40f 100644
--- a/dali/test/python/nose_utils.py
+++ b/dali/test/python/nose_utils.py
@@ -69,6 +69,11 @@ def nop():
         pass
 
 
+# Module-level TestCase instance for assertions
+_test_case = unittest.TestCase()
+_test_case.maxDiff = None  # Show full diff on assertion failures
+
+
 def assert_equals(x, y):
     foo = empty_case()
     foo.assertEqual(x, y)
@@ -126,20 +131,17 @@ def assert_raises(exception, *args, glob=None, regex=None, match_case=None, **kw
 
         assert_raises(Exception, callable, arg1, arg2, kwarg=value)
     """
-    tc = unittest.TestCase()
-    tc.maxDiff = None  # Show full diff on assertion failures
-
     if glob is None and regex is None:
         # Use unittest's assertRaises
         if args:
             # Called with callable: assert_raises(Exception, callable, *args, **kwargs)
             callable_func = args[0]
             callable_args = args[1:]
-            with tc.assertRaises(exception):
+            with _test_case.assertRaises(exception):
                 callable_func(*callable_args, **kwargs)
         else:
             # Used as context manager
-            return tc.assertRaises(exception)
+            return _test_case.assertRaises(exception)
     else:
         pattern = get_pattern(glob, regex, match_case)
         # Use unittest's assertRaisesRegex
@@ -147,11 +149,11 @@ def assert_raises(exception, *args, glob=None, regex=None, match_case=None, **kw
             # Called with callable
             callable_func = args[0]
             callable_args = args[1:]
-            with tc.assertRaisesRegex(exception, pattern):
+            with _test_case.assertRaisesRegex(exception, pattern):
                 callable_func(*callable_args, **kwargs)
         else:
             # Used as context manager
-            return tc.assertRaisesRegex(exception, pattern)
+            return _test_case.assertRaisesRegex(exception, pattern)
 
 
 def assert_warns(exception=Warning, *args, glob=None, regex=None, match_case=None, **kwargs):
@@ -164,20 +166,17 @@ def assert_warns(exception=Warning, *args, glob=None, regex=None, match_case=Non
 
         assert_warns(UserWarning, callable, arg1, arg2, kwarg=value)
     """
-    tc = unittest.TestCase()
-    tc.maxDiff = None
-
     if glob is None and regex is None:
         # Use unittest's assertWarns
         if args:
             # Called with callable
             callable_func = args[0]
             callable_args = args[1:]
-            with tc.assertWarns(exception):
+            with _test_case.assertWarns(exception):
                 callable_func(*callable_args, **kwargs)
         else:
             # Used as context manager
-            return tc.assertWarns(exception)
+            return _test_case.assertWarns(exception)
     else:
         pattern = get_pattern(glob, regex, match_case)
         # Use unittest's assertWarnsRegex
@@ -185,11 +184,11 @@ def assert_warns(exception=Warning, *args, glob=None, regex=None, match_case=Non
             # Called with callable
             callable_func = args[0]
             callable_args = args[1:]
-            with tc.assertWarnsRegex(exception, pattern):
+            with _test_case.assertWarnsRegex(exception, pattern):
                 callable_func(*callable_args, **kwargs)
         else:
             # Used as context manager
-            return tc.assertWarnsRegex(exception, pattern)
+            return _test_case.assertWarnsRegex(exception, pattern)
 
 
 def raises(exception, glob=None, regex=None, match_case=None):

From a0e63e7cf3c10c83aaa215afc743f96b4536dd9f Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Mon, 22 Dec 2025 17:06:50 +0100
Subject: [PATCH 05/19] Remove remaining nose

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 dali/test/python/test_functional_api.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/dali/test/python/test_functional_api.py b/dali/test/python/test_functional_api.py
index e8708b35194..90b70f5d6d3 100644
--- a/dali/test/python/test_functional_api.py
+++ b/dali/test/python/test_functional_api.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,7 +20,6 @@
 from nose_utils import assert_raises, attr
 import sys
 import inspect
-import nose
 
 
 def _test_fn_rotate(device):
@@ -185,7 +184,7 @@ def _test_schema_name_for_module(module_name, base_name=""):
             # Check if we can reconstruct the name of the op from provided schema
             assert hasattr(member, "_schema_name")
             full_name = ops._op_name(member._schema_name)
-            nose.tools.eq_(base_name + "." + full_name, module_name + "." + member_name)
+            assert base_name + "." + full_name == module_name + "." + member_name
         elif inspect.ismodule(member) and (module_name + "." + member_name) in sys.modules.keys():
             # Recurse on DALI submodule (filter out non-DALI reexported modules like `sys`)
             _test_schema_name_for_module(module_name + "." + member_name, base_name)

From 8239ace52646e5da8a968eaeccfbf57da5a78176 Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Mon, 22 Dec 2025 17:36:07 +0100
Subject: [PATCH 06/19] Lint

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 dali/test/python/nose_utils.py                  | 10 ++++++++--
 dali/test/python/test_dltensor_operator.py      |  6 +++---
 dali/test/python/test_fw_iterators_detection.py | 12 ++++++------
 dali/test/python/test_pytorch_operator.py       |  4 ++--
 4 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/dali/test/python/nose_utils.py b/dali/test/python/nose_utils.py
index c96306cd40f..809ce0f9863 100644
--- a/dali/test/python/nose_utils.py
+++ b/dali/test/python/nose_utils.py
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from nose2.tools.decorators import with_setup as _nose2_with_setup, with_teardown as _nose2_with_teardown
+from nose2.tools.decorators import (
+    with_setup as _nose2_with_setup,
+    with_teardown as _nose2_with_teardown,
+)
 from unittest import SkipTest  # noqa: F401
 import unittest
 import re
 import fnmatch
 import functools
-import warnings
 
 
 def with_setup(setup=None, teardown=None):
@@ -31,12 +33,14 @@ def with_setup(setup=None, teardown=None):
         @with_setup(setup_func, teardown_func)
         @with_setup(teardown=teardown_func)
     """
+
     def decorator(func):
         if setup is not None:
             func = _nose2_with_setup(setup)(func)
         if teardown is not None:
             func = _nose2_with_teardown(teardown)(func)
         return func
+
     return decorator
 
 
@@ -51,10 +55,12 @@ def attr(*tags):
     Usage: @attr("pytorch", "slow")
     Filtering: nose2 -A 'pytorch' or nose2 -A '!slow'
     """
+
     def decorator(func):
         for tag in tags:
             setattr(func, tag, True)
         return func
+
     return decorator
 
 
diff --git a/dali/test/python/test_dltensor_operator.py b/dali/test/python/test_dltensor_operator.py
index 13ebffbdbd0..eff45ea4710 100644
--- a/dali/test/python/test_dltensor_operator.py
+++ b/dali/test/python/test_dltensor_operator.py
@@ -183,7 +183,7 @@ def pytorch_red_channel_op(in1, in2):
     return [t.narrow(2, 0, 1).squeeze() for t in in1], [t.narrow(2, 0, 1).squeeze() for t in in2]
 
 
-@attr('pytorch')
+@attr("pytorch")
 def test_pytorch():
     setup_pytorch()
     for testcase in [simple_pytorch_op, pytorch_red_channel_op]:
@@ -329,7 +329,7 @@ def cupy_kernel_gray_scale(in1, in2, stream=None):
     return out1, out2
 
 
-@attr('cupy')
+@attr("cupy")
 def test_cupy():
     setup_cupy()
     print(cupy)
@@ -338,7 +338,7 @@ def test_cupy():
     yield from _cupy_flip_with_negative_strides_suite()
 
 
-@attr('cupy')
+@attr("cupy")
 def test_cupy_kernel_gray_scale():
     setup_cupy()
     cupy_case(cupy_kernel_gray_scale, synchronize=False)
diff --git a/dali/test/python/test_fw_iterators_detection.py b/dali/test/python/test_fw_iterators_detection.py
index 90f523ffdc8..bca12f6222d 100644
--- a/dali/test/python/test_fw_iterators_detection.py
+++ b/dali/test/python/test_fw_iterators_detection.py
@@ -74,7 +74,7 @@ def test_mxnet_pipeline_dynamic_shape():
         assert data is not None
 
 
-@attr('pytorch')
+@attr("pytorch")
 def test_pytorch_pipeline_dynamic_shape():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
 
@@ -87,7 +87,7 @@ def test_pytorch_pipeline_dynamic_shape():
         assert data is not None
 
 
-@attr('paddle')
+@attr("paddle")
 def test_paddle_pipeline_dynamic_shape():
     from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator
 
@@ -100,7 +100,7 @@ def test_paddle_pipeline_dynamic_shape():
         assert data is not None
 
 
-@attr('pytorch')
+@attr("pytorch")
 def test_api_fw_check1_pytorch():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
 
@@ -120,7 +120,7 @@ def test_api_fw_check1_mxnet():
     )
 
 
-@attr('paddle')
+@attr("paddle")
 def test_api_fw_check1_paddle():
     from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator
 
@@ -176,14 +176,14 @@ def test_api_fw_check2_mxnet():
     )
 
 
-@attr('pytorch')
+@attr("pytorch")
 def test_api_fw_check2_pytorch():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
 
     yield from test_api_fw_check2(PyTorchIterator, ["data", "bboxes", "label"])
 
 
-@attr('paddle')
+@attr("paddle")
 def test_api_fw_check2_paddle():
     from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator
 
diff --git a/dali/test/python/test_pytorch_operator.py b/dali/test/python/test_pytorch_operator.py
index 386801515c3..6e3c7f8427a 100644
--- a/dali/test/python/test_pytorch_operator.py
+++ b/dali/test/python/test_pytorch_operator.py
@@ -114,7 +114,7 @@ def check_pytorch_operator(device):
             assert numpy.allclose(res2, exp2_t.numpy())
 
 
-@attr('pytorch')
+@attr("pytorch")
 def test_pytorch_operator():
     for device in {"cpu", "gpu"}:
         yield check_pytorch_operator, device
@@ -138,7 +138,7 @@ def check_pytorch_operator_batch_processing(device):
             assert numpy.allclose(res2, exp2[i].numpy())
 
 
-@attr('pytorch')
+@attr("pytorch")
 def test_pytorch_operator_batch_processing():
     for device in {"cpu", "gpu"}:
         yield check_pytorch_operator_batch_processing, device

From cd8e0f74859ea80e07030790cd7a0e4a31c9ce9c Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Tue, 30 Dec 2025 16:14:28 +0100
Subject: [PATCH 07/19] Patch attrib plugin to work with generators

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 dali/test/python/nose2_attrib_generators.py   | 134 ++++++++++++++++++
 .../python/test_fw_iterators_detection.py     |  37 +----
 dali/test/python/unittest.cfg                 |   3 +-
 dali/test/python/unittest_failure.cfg         |   3 +-
 dali/test/python/unittest_slow.cfg            |   3 +-
 5 files changed, 143 insertions(+), 37 deletions(-)
 create mode 100644 dali/test/python/nose2_attrib_generators.py

diff --git a/dali/test/python/nose2_attrib_generators.py b/dali/test/python/nose2_attrib_generators.py
new file mode 100644
index 00000000000..8cafb68f7e7
--- /dev/null
+++ b/dali/test/python/nose2_attrib_generators.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Custom nose2 plugin to filter generator test functions by attributes
+before they are called (preventing imports of optional dependencies or other code execution).
+
+This plugin monkey-patches the Generators plugin's _testsFromGeneratorFunc
+method to check attributes before calling generator functions.
+"""
+from nose2.events import Plugin
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class AttributeGeneratorFilter(Plugin):
+    """Filter generator functions by attributes before calling them."""
+
+    configSection = "attrib-generators"
+    alwaysOn = True
+
+    def __init__(self):
+        super().__init__()
+        self._patched = False
+
+    def _get_attrib_plugin(self):
+        """Get the attrib plugin from the session."""
+        for plugin in self.session.plugins:
+            if plugin.__class__.__name__ == "AttributeSelector":
+                return plugin
+        return None
+
+    def _build_attribs_list(self, attrib_plugin):
+        """Build the attribs list from the attrib plugin's -A configuration.
+
+        This replicates the logic from AttributeSelector.moduleLoadedSuite
+        for -A filters only (not -E eval filters).
+        """
+        attribs = []
+
+        # Handle -A (attribute) filters
+        for attr in attrib_plugin.attribs:
+            attr_group = []
+            for attrib in attr.strip().split(","):
+                if not attrib:
+                    continue
+                items = attrib.split("=", 1)
+                if len(items) > 1:
+                    # "name=value"
+                    key, value = items
+                else:
+                    key = items[0]
+                    if key[0] == "!":
+                        # "!name"
+                        key = key[1:]
+                        value = False
+                    else:
+                        # "name"
+                        value = True
+                attr_group.append((key, value))
+            attribs.append(attr_group)
+
+        return attribs
+
+    def _matches_attrib_filter(self, test_func, attrib_plugin):
+        """Check if test_func matches the attribute filter from attrib plugin."""
+        if not attrib_plugin:
+            return True
+
+        if not attrib_plugin.attribs:
+            return True
+
+        # Build attribs list using attrib plugin's logic
+        attribs = self._build_attribs_list(attrib_plugin)
+
+        if not attribs:
+            return True
+
+        # Use the plugin's validateAttrib method
+        return attrib_plugin.validateAttrib(test_func, attribs)
+
+    def _patch_generator_plugin(self):
+        """Monkey-patch the Generators plugin to check attributes first."""
+        if self._patched:
+            return
+
+        # Find the Generators plugin
+        gen_plugin = None
+        for plugin in self.session.plugins:
+            if plugin.__class__.__name__ == "Generators":
+                gen_plugin = plugin
+                break
+
+        if not gen_plugin:
+            log.warning("Could not find Generators plugin to patch")
+            return
+
+        # Save original method
+        original_tests_from_gen = gen_plugin._testsFromGeneratorFunc
+        attrib_filter_self = self
+
+        # Create patched method
+        def patched_tests_from_gen(event, obj):
+            """Check attributes before calling generator function."""
+            attrib_plugin = attrib_filter_self._get_attrib_plugin()
+
+            # Check if generator function matches attribute filter
+            if not attrib_filter_self._matches_attrib_filter(obj, attrib_plugin):
+                log.debug(f"Skipping generator {obj.__name__} due to attribute filter")
+                return []  # Return empty list
+
+            # Call original method
+            return original_tests_from_gen(event, obj)
+
+        # Monkey-patch it
+        gen_plugin._testsFromGeneratorFunc = patched_tests_from_gen
+        self._patched = True
+        log.debug("Patched Generators plugin to check attributes")
+
+    def handleArgs(self, event):
+        """Patch right after argument handling, before test discovery."""
+        self._patch_generator_plugin()
diff --git a/dali/test/python/test_fw_iterators_detection.py b/dali/test/python/test_fw_iterators_detection.py
index bca12f6222d..f72d8718676 100644
--- a/dali/test/python/test_fw_iterators_detection.py
+++ b/dali/test/python/test_fw_iterators_detection.py
@@ -17,7 +17,7 @@
 from nvidia.dali.pipeline import Pipeline
 
 from test_utils import get_dali_extra_path
-from nose_utils import assert_raises, attr
+from nose_utils import assert_raises, attr, nottest
 
 DALI_EXTRA_PATH = get_dali_extra_path()
 EPOCH_SIZE = 32
@@ -54,26 +54,6 @@ def data_paths():
 ##############
 
 
-def test_mxnet_pipeline_dynamic_shape():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    root, annotations = data_paths()
-    pipeline = DetectionPipeline(BATCH_SIZE, 0, root, annotations)
-    train_loader = MXNetIterator(
-        [pipeline],
-        [
-            ("data", MXNetIterator.DATA_TAG),
-            ("bboxes", MXNetIterator.LABEL_TAG),
-            ("label", MXNetIterator.LABEL_TAG),
-        ],
-        EPOCH_SIZE,
-        auto_reset=False,
-        dynamic_shape=True,
-    )
-    for data in train_loader:
-        assert data is not None
-
-
 @attr("pytorch")
 def test_pytorch_pipeline_dynamic_shape():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
@@ -127,6 +107,7 @@ def test_api_fw_check1_paddle():
     yield from test_api_fw_check1(PaddleIterator, ["data", "bboxes", "label"])
 
 
+@nottest
 def test_api_fw_check1(iter_type, data_definition):
     root, annotations = data_paths()
     pipe = DetectionPipeline(BATCH_SIZE, 0, root, annotations)
@@ -163,19 +144,6 @@ def test_api_fw_check1(iter_type, data_definition):
     yield check, iter_type
 
 
-def test_api_fw_check2_mxnet():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    yield from test_api_fw_check2(
-        MXNetIterator,
-        [
-            ("data", MXNetIterator.DATA_TAG),
-            ("bboxes", MXNetIterator.LABEL_TAG),
-            ("label", MXNetIterator.LABEL_TAG),
-        ],
-    )
-
-
 @attr("pytorch")
 def test_api_fw_check2_pytorch():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
@@ -190,6 +158,7 @@ def test_api_fw_check2_paddle():
     yield from test_api_fw_check2(PaddleIterator, ["data", "bboxes", "label"])
 
 
+@nottest
 def test_api_fw_check2(iter_type, data_definition):
     root, annotations = data_paths()
 
diff --git a/dali/test/python/unittest.cfg b/dali/test/python/unittest.cfg
index 14aca7ebfff..d480344c6c0 100644
--- a/dali/test/python/unittest.cfg
+++ b/dali/test/python/unittest.cfg
@@ -1,5 +1,6 @@
 [unittest]
-plugins = nose2.plugins.attrib
+plugins = nose2_attrib_generators
+          nose2.plugins.attrib
           nose2.plugins.collect
           nose2.plugins.printhooks
 
diff --git a/dali/test/python/unittest_failure.cfg b/dali/test/python/unittest_failure.cfg
index 2a4c17eeed3..df8d2c528f1 100644
--- a/dali/test/python/unittest_failure.cfg
+++ b/dali/test/python/unittest_failure.cfg
@@ -1,5 +1,6 @@
 [unittest]
-plugins = nose2.plugins.attrib
+plugins = nose2_attrib_generators
+          nose2.plugins.attrib
           nose2.plugins.collect
           nose2.plugins.printhooks
 
diff --git a/dali/test/python/unittest_slow.cfg b/dali/test/python/unittest_slow.cfg
index 05fcca59ec6..476556cc7aa 100644
--- a/dali/test/python/unittest_slow.cfg
+++ b/dali/test/python/unittest_slow.cfg
@@ -1,5 +1,6 @@
 [unittest]
-plugins = nose2.plugins.attrib
+plugins = nose2_attrib_generators
+          nose2.plugins.attrib
           nose2.plugins.collect
           nose2.plugins.printhooks
 

From d222d9b82617ed6b3b1c891a909b08f6c9e121f9 Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Wed, 31 Dec 2025 14:23:31 +0100
Subject: [PATCH 08/19] Fix test discovery for nose2

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 qa/TL0_videoreader_test/test.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/qa/TL0_videoreader_test/test.sh b/qa/TL0_videoreader_test/test.sh
index 6b473202d9c..71d8e733374 100755
--- a/qa/TL0_videoreader_test/test.sh
+++ b/qa/TL0_videoreader_test/test.sh
@@ -47,11 +47,13 @@ test_body() {
     python video_label_example.py
 
     echo $(pwd)
-    ${python_new_invoke_test} ../../../../dali/test/python/test_video_pipeline
-    ${python_new_invoke_test} ../../../../dali/test/python/test_video_reader_resize
+    pushd ../../../../dali/test/python/
 
-    cd ../../../../dali/test/python/
+    ${python_new_invoke_test} test_video_pipeline
+    ${python_new_invoke_test} test_video_reader_resize
     ${python_new_invoke_test} test_video_reader
+
+    popd
 }
 
 pushd ../..

From 532ef3f682423c81a4a149eac61c7f35f9f66d74 Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Fri, 16 Jan 2026 22:13:29 +0100
Subject: [PATCH 09/19] Fixes

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 dali/test/python/nose2_attrib_generators.py   |    1 +
 dali/test/python/nose_utils.py                |   30 -
 dali/test/python/operator_1/test_constant.py  |    9 -
 .../test/python/operator_1/test_numba_func.py |  485 +++---
 .../python/test_RN50_data_fw_iterators.py     |    7 -
 dali/test/python/test_dali_tf_conditionals.py |   68 +-
 .../test/python/test_dali_tf_dataset_eager.py |  287 ++--
 .../test/python/test_dali_tf_dataset_graph.py |  136 +-
 .../test_dali_tf_dataset_mnist_eager.py       |   78 +-
 .../test_dali_tf_dataset_mnist_graph.py       |  134 +-
 dali/test/python/test_dali_tf_exec2.py        |   79 +-
 dali/test/python/test_dltensor_operator.py    |   50 -
 .../python/test_external_source_impl_utils.py |   15 -
 .../python/test_external_source_parallel.py   | 1439 +++++++++--------
 ...al_source_parallel_custom_serialization.py |    5 +-
 ...t_external_source_parallel_large_sample.py |   80 +-
 .../test_external_source_parallel_mxnet.py    |   60 -
 .../test_external_source_parallel_pytorch.py  |   37 +-
 .../test_external_source_parallel_utils.py    |    7 +-
 dali/test/python/test_fw_iterators.py         | 1143 -------------
 .../python/test_fw_iterators_detection.py     |   13 -
 dali/test/python/test_pool.py                 |  441 ++---
 qa/TL0_multigpu/test_body.sh                  |    2 +-
 qa/TL0_python-self-test-core/test_body.sh     |   22 +-
 .../test_body.sh                              |   11 +-
 qa/TL0_python-self-test_tegra/test_body.sh    |    9 +-
 qa/TL1_python-self-test_conda/test_body.sh    |   14 +-
 qa/TL1_tensorflow_dataset/test_impl.sh        |    2 +-
 qa/test_template_impl.sh                      |    2 +-
 29 files changed, 1808 insertions(+), 2858 deletions(-)
 delete mode 100644 dali/test/python/test_external_source_parallel_mxnet.py

diff --git a/dali/test/python/nose2_attrib_generators.py b/dali/test/python/nose2_attrib_generators.py
index 8cafb68f7e7..389fd63a50d 100644
--- a/dali/test/python/nose2_attrib_generators.py
+++ b/dali/test/python/nose2_attrib_generators.py
@@ -19,6 +19,7 @@
 This plugin monkey-patches the Generators plugin's _testsFromGeneratorFunc
 method to check attributes before calling generator functions.
 """
+
 from nose2.events import Plugin
 import logging
 
diff --git a/dali/test/python/nose_utils.py b/dali/test/python/nose_utils.py
index 809ce0f9863..3580cc011cc 100644
--- a/dali/test/python/nose_utils.py
+++ b/dali/test/python/nose_utils.py
@@ -12,10 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from nose2.tools.decorators import (
-    with_setup as _nose2_with_setup,
-    with_teardown as _nose2_with_teardown,
-)
 from unittest import SkipTest  # noqa: F401
 import unittest
 import re
@@ -23,32 +19,6 @@
 import functools
 
 
-def with_setup(setup=None, teardown=None):
-    """
-    Decorator to add setup and/or teardown functions to a test function.
-    Compatible with nose's with_setup(setup, teardown) signature.
-
-    Usage:
-        @with_setup(setup_func)
-        @with_setup(setup_func, teardown_func)
-        @with_setup(teardown=teardown_func)
-    """
-
-    def decorator(func):
-        if setup is not None:
-            func = _nose2_with_setup(setup)(func)
-        if teardown is not None:
-            func = _nose2_with_teardown(teardown)(func)
-        return func
-
-    return decorator
-
-
-def with_teardown(teardown):
-    """Decorator to add teardown function to a test function."""
-    return _nose2_with_teardown(teardown)
-
-
 def attr(*tags):
     """Set test attributes for nose2 filtering with -A flag.
 
diff --git a/dali/test/python/operator_1/test_constant.py b/dali/test/python/operator_1/test_constant.py
index a3484e638f9..130c73296c2 100644
--- a/dali/test/python/operator_1/test_constant.py
+++ b/dali/test/python/operator_1/test_constant.py
@@ -36,15 +36,6 @@
     print("ConstantOp: PyTorch support disabled")
     pass
 
-try:
-    import mxnet
-
-    array_interfaces.append((mxnet.ndarray.array, None))
-    print("ConstantOp: MXNet support enabled")
-except ModuleNotFoundError:
-    print("ConstantOp: MXNet support disabled")
-    pass
-
 
 class ConstantPipeline(Pipeline):
     def __init__(self, device):
diff --git a/dali/test/python/operator_1/test_numba_func.py b/dali/test/python/operator_1/test_numba_func.py
index 8e6a1f855df..ab95c9f3ce7 100644
--- a/dali/test/python/operator_1/test_numba_func.py
+++ b/dali/test/python/operator_1/test_numba_func.py
@@ -19,7 +19,8 @@
 import nvidia.dali as dali
 import nvidia.dali.fn as fn
 import nvidia.dali.types as dali_types
-from nose_utils import with_setup, attr
+from nose2.tools import params
+from nose_utils import attr
 from test_utils import (
     get_dali_extra_path,
     to_array,
@@ -202,13 +203,12 @@ def numba_func_pipe(
             assert np.array_equal(out_arr, expected_out[i])
 
 
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_cpu)
-def test_numba_func():
-    # shape, dtype, run_fn, out_types,
-    # in_types, out_ndim, in_ndim, setup_fn, batch_processing,
-    # expected_out
-    args = [
+class TestNumbaFuncCPU:
+    def setUp(self):
+        check_numba_compatibility_cpu()
+
+    @attr("sanitizer_skip")
+    @params(
         (
             [(10, 10, 10)],
             np.bool_,
@@ -293,10 +293,9 @@ def test_numba_func():
             None,
             [np.full((20, 30, 10), 42, dtype=np.int32), np.full((10, 30, 20), 42, dtype=np.int32)],
         ),
-    ]
-
-    device = "cpu"
-    for (
+    )
+    def test_numba_func(
+        self,
         shape,
         dtype,
         run_fn,
@@ -307,9 +306,9 @@ def test_numba_func():
         setup_fn,
         batch_processing,
         expected_out,
-    ) in args:
-        yield (
-            _testimpl_numba_func,
+    ):
+        device = "cpu"
+        _testimpl_numba_func(
             device,
             shape,
             dtype,
@@ -323,56 +322,49 @@ def test_numba_func():
             expected_out,
         )
 
+    def test_numba_func_with_cond(self):
+        # When the function is not converted, the numba still works with no issues.
+        # AG conversion or using a complex enough decorator would break this.
+        # TODO(klecki): Can we add any additional safeguards?
+        _testimpl_numba_func(
+            device="cpu",
+            shapes=[(10, 10, 10)],
+            dtype=np.uint8,
+            run_fn=set_all_values_to_255_batch,
+            out_types=[dali_types.UINT8],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[3],
+            ins_ndim=[3],
+            setup_fn=None,
+            batch_processing=True,
+            expected_out=[np.full((10, 10, 10), 255, dtype=np.uint8)],
+            enable_conditionals=True,
+        )
 
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_cpu)
-def test_numba_func_with_cond():
-    # When the function is not converted, the numba still works with no issues.
-    # AG conversion or using a complex enough decorator would break this.
-    # TODO(klecki): Can we add any additional safeguards?
-    _testimpl_numba_func(
-        device="cpu",
-        shapes=[(10, 10, 10)],
-        dtype=np.uint8,
-        run_fn=set_all_values_to_255_batch,
-        out_types=[dali_types.UINT8],
-        in_types=[dali_types.UINT8],
-        outs_ndim=[3],
-        ins_ndim=[3],
-        setup_fn=None,
-        batch_processing=True,
-        expected_out=[np.full((10, 10, 10), 255, dtype=np.uint8)],
-        enable_conditionals=True,
-    )
+    def test_numba_func_with_cond_do_not_convert(self):
+        # Test if do_not_convert decorated functions still work.
+        _testimpl_numba_func(
+            device="cpu",
+            shapes=[(10, 10, 10)],
+            dtype=np.uint8,
+            run_fn=do_not_convert(set_all_values_to_255_batch),
+            out_types=[dali_types.UINT8],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[3],
+            ins_ndim=[3],
+            setup_fn=None,
+            batch_processing=True,
+            expected_out=[np.full((10, 10, 10), 255, dtype=np.uint8)],
+            enable_conditionals=True,
+        )
 
 
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_cpu)
-def test_numba_func_with_cond_do_not_convert():
-    # Test if do_not_convert decorated functions still work.
-    _testimpl_numba_func(
-        device="cpu",
-        shapes=[(10, 10, 10)],
-        dtype=np.uint8,
-        run_fn=do_not_convert(set_all_values_to_255_batch),
-        out_types=[dali_types.UINT8],
-        in_types=[dali_types.UINT8],
-        outs_ndim=[3],
-        ins_ndim=[3],
-        setup_fn=None,
-        batch_processing=True,
-        expected_out=[np.full((10, 10, 10), 255, dtype=np.uint8)],
-        enable_conditionals=True,
-    )
-
+class TestNumbaFuncGPU:
+    def setUp(self):
+        check_numba_compatibility_gpu()
 
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_gpu)
-def test_numba_func_gpu():
-    # shape, dtype, run_fn, out_types,
-    # in_types, out_ndim, in_ndim, setup_fn, batch_processing,
-    # expected_out
-    args = [
+    @attr("sanitizer_skip")
+    @params(
         (
             [(10, 10, 10)],
             np.bool_,
@@ -436,12 +428,9 @@ def test_numba_func_gpu():
             None,
             [change_dim_expected_out(20), change_dim_expected_out(30)],
         ),
-    ]
-
-    device = "gpu"
-    blocks = [32, 32, 1]
-    threads_per_block = [32, 16, 1]
-    for (
+    )
+    def test_numba_func_gpu(
+        self,
         shape,
         dtype,
         run_fn,
@@ -452,9 +441,11 @@ def test_numba_func_gpu():
         setup_fn,
         batch_processing,
         expected_out,
-    ) in args:
-        yield (
-            _testimpl_numba_func,
+    ):
+        device = "gpu"
+        blocks = [32, 32, 1]
+        threads_per_block = [32, 16, 1]
+        _testimpl_numba_func(
             device,
             shape,
             dtype,
@@ -590,10 +581,12 @@ def rot_image_setup(outs, ins):
         out0[sample_id][2] = in0[sample_id][2]
 
 
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_cpu)
-def test_numba_func_image():
-    args = [
+class TestNumbaFuncImageCPU:
+    def setUp(self):
+        check_numba_compatibility_cpu()
+
+    @attr("sanitizer_skip")
+    @params(
         (
             reverse_col_batch,
             [dali_types.UINT8],
@@ -634,9 +627,9 @@ def test_numba_func_image():
             None,
             lambda x: np.rot90(x),
         ),
-    ]
-    device = "cpu"
-    for (
+    )
+    def test_numba_func_image(
+        self,
         run_fn,
         out_types,
         in_types,
@@ -645,9 +638,9 @@ def test_numba_func_image():
         setup_fn,
         batch_processing,
         transform,
-    ) in args:
-        yield (
-            _testimpl_numba_func_image,
+    ):
+        device = "cpu"
+        _testimpl_numba_func_image(
             device,
             run_fn,
             out_types,
@@ -660,10 +653,12 @@ def test_numba_func_image():
         )
 
 
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_gpu)
-def test_numba_func_image_gpu():
-    args = [
+class TestNumbaFuncImageGPU:
+    def setUp(self):
+        check_numba_compatibility_gpu()
+
+    @attr("sanitizer_skip")
+    @params(
         (
             reverse_col_sample_gpu,
             [dali_types.UINT8],
@@ -684,11 +679,9 @@ def test_numba_func_image_gpu():
             None,
             np.rot90,
         ),
-    ]
-    device = "gpu"
-    blocks = [32, 32, 1]
-    threads_per_block = [32, 8, 1]
-    for (
+    )
+    def test_numba_func_image_gpu(
+        self,
         run_fn,
         out_types,
         in_types,
@@ -697,9 +690,11 @@ def test_numba_func_image_gpu():
         setup_fn,
         batch_processing,
         transform,
-    ) in args:
-        yield (
-            _testimpl_numba_func_image,
+    ):
+        device = "gpu"
+        blocks = [32, 32, 1]
+        threads_per_block = [32, 8, 1]
+        _testimpl_numba_func_image(
             device,
             run_fn,
             out_types,
@@ -778,53 +773,61 @@ def numba_func_split_image_pipe(
     return images_in, out0, out1, out2
 
 
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_cpu)
-def test_split_images_col():
-    pipe = numba_func_split_image_pipe(
-        batch_size=8,
-        num_threads=1,
-        device_id=0,
-        run_fn=split_images_col_sample,
-        setup_fn=setup_split_images_col,
-        out_types=[dali_types.UINT8 for i in range(3)],
-        in_types=[dali_types.UINT8],
-        outs_ndim=[2, 2, 2],
-        ins_ndim=[3],
-        device="cpu",
-    )
-    for _ in range(3):
-        images_in, R, G, B = pipe.run()
-        for i in range(len(images_in)):
-            assert np.array_equal(images_in.at(i), np.stack([R.at(i), G.at(i), B.at(i)], axis=2))
-
-
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_gpu)
-def test_split_images_col_gpu():
-    blocks = [32, 32, 1]
-    threads_per_block = [32, 8, 1]
-    pipe = numba_func_split_image_pipe(
-        batch_size=8,
-        num_threads=1,
-        device_id=0,
-        run_fn=split_images_col_sample_gpu,
-        setup_fn=setup_split_images_col,
-        out_types=[dali_types.UINT8 for i in range(3)],
-        in_types=[dali_types.UINT8],
-        outs_ndim=[2, 2, 2],
-        ins_ndim=[3],
-        device="gpu",
-        blocks=blocks,
-        threads_per_block=threads_per_block,
-    )
-    for _ in range(3):
-        images_in, R, G, B = pipe.run()
-        for i in range(len(images_in)):
-            assert np.array_equal(
-                to_array(images_in[i]),
-                np.stack([to_array(R[i]), to_array(G[i]), to_array(B[i])], axis=2),
-            )
+class TestSplitImagesCol:
+    def setUp(self):
+        check_numba_compatibility_cpu()
+
+    @attr("sanitizer_skip")
+    def test_split_images_col(self):
+        pipe = numba_func_split_image_pipe(
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=split_images_col_sample,
+            setup_fn=setup_split_images_col,
+            out_types=[dali_types.UINT8 for i in range(3)],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[2, 2, 2],
+            ins_ndim=[3],
+            device="cpu",
+        )
+        for _ in range(3):
+            images_in, R, G, B = pipe.run()
+            for i in range(len(images_in)):
+                assert np.array_equal(
+                    images_in.at(i), np.stack([R.at(i), G.at(i), B.at(i)], axis=2)
+                )
+
+
+class TestSplitImagesColGPU:
+    def setUp(self):
+        check_numba_compatibility_gpu()
+
+    @attr("sanitizer_skip")
+    def test_split_images_col_gpu(self):
+        blocks = [32, 32, 1]
+        threads_per_block = [32, 8, 1]
+        pipe = numba_func_split_image_pipe(
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=split_images_col_sample_gpu,
+            setup_fn=setup_split_images_col,
+            out_types=[dali_types.UINT8 for i in range(3)],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[2, 2, 2],
+            ins_ndim=[3],
+            device="gpu",
+            blocks=blocks,
+            threads_per_block=threads_per_block,
+        )
+        for _ in range(3):
+            images_in, R, G, B = pipe.run()
+            for i in range(len(images_in)):
+                assert np.array_equal(
+                    to_array(images_in[i]),
+                    np.stack([to_array(R[i]), to_array(G[i]), to_array(B[i])], axis=2),
+                )
 
 
 def multiple_ins_setup(outs, ins):
@@ -891,54 +894,60 @@ def numba_multiple_ins_pipe(
     )
 
 
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_cpu)
-def test_multiple_ins():
-    pipe = numba_multiple_ins_pipe(
-        shapes=[(10, 10)],
-        dtype=np.uint8,
-        batch_size=8,
-        num_threads=1,
-        device_id=0,
-        run_fn=multiple_ins_run,
-        setup_fn=multiple_ins_setup,
-        out_types=[dali_types.UINT8],
-        in_types=[dali_types.UINT8 for i in range(3)],
-        outs_ndim=[3],
-        ins_ndim=[2, 2, 2],
-        device="cpu",
-    )
-    for _ in range(3):
-        outs = pipe.run()
-        out_arr = np.array(outs[0][0])
-        assert np.array_equal(out_arr, np.zeros((10, 10, 3), dtype=np.uint8))
-
-
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_gpu)
-def test_multiple_ins_gpu():
-    blocks = [32, 32, 1]
-    threads_per_block = [32, 8, 1]
-    pipe = numba_multiple_ins_pipe(
-        shapes=[(10, 10)],
-        dtype=np.uint8,
-        batch_size=8,
-        num_threads=1,
-        device_id=0,
-        run_fn=multiple_ins_run_gpu,
-        setup_fn=multiple_ins_setup,
-        out_types=[dali_types.UINT8],
-        in_types=[dali_types.UINT8 for i in range(3)],
-        outs_ndim=[3],
-        ins_ndim=[2, 2, 2],
-        device="gpu",
-        blocks=blocks,
-        threads_per_block=threads_per_block,
-    )
-    for _ in range(3):
-        outs = pipe.run()
-        out_arr = to_array(outs[0][0])
-        assert np.array_equal(out_arr, np.zeros((10, 10, 3), dtype=np.uint8))
+class TestMultipleIns:
+    def setUp(self):
+        check_numba_compatibility_cpu()
+
+    @attr("sanitizer_skip")
+    def test_multiple_ins(self):
+        pipe = numba_multiple_ins_pipe(
+            shapes=[(10, 10)],
+            dtype=np.uint8,
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=multiple_ins_run,
+            setup_fn=multiple_ins_setup,
+            out_types=[dali_types.UINT8],
+            in_types=[dali_types.UINT8 for i in range(3)],
+            outs_ndim=[3],
+            ins_ndim=[2, 2, 2],
+            device="cpu",
+        )
+        for _ in range(3):
+            outs = pipe.run()
+            out_arr = np.array(outs[0][0])
+            assert np.array_equal(out_arr, np.zeros((10, 10, 3), dtype=np.uint8))
+
+
+class TestMultipleInsGPU:
+    def setUp(self):
+        check_numba_compatibility_gpu()
+
+    @attr("sanitizer_skip")
+    def test_multiple_ins_gpu(self):
+        blocks = [32, 32, 1]
+        threads_per_block = [32, 8, 1]
+        pipe = numba_multiple_ins_pipe(
+            shapes=[(10, 10)],
+            dtype=np.uint8,
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=multiple_ins_run_gpu,
+            setup_fn=multiple_ins_setup,
+            out_types=[dali_types.UINT8],
+            in_types=[dali_types.UINT8 for i in range(3)],
+            outs_ndim=[3],
+            ins_ndim=[2, 2, 2],
+            device="gpu",
+            blocks=blocks,
+            threads_per_block=threads_per_block,
+        )
+        for _ in range(3):
+            outs = pipe.run()
+            out_arr = to_array(outs[0][0])
+            assert np.array_equal(out_arr, np.zeros((10, 10, 3), dtype=np.uint8))
 
 
 def nonuniform_types_setup(outs, ins):
@@ -1002,52 +1011,58 @@ def nonuniform_types_pipe(
     return images_in, out_img, out_shape
 
 
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_cpu)
-def test_nonuniform_types_cpu():
-    pipe = nonuniform_types_pipe(
-        batch_size=8,
-        num_threads=1,
-        device_id=0,
-        run_fn=nonuniform_types_run_cpu,
-        out_types=[dali_types.UINT8, dali_types.INT64],
-        in_types=[dali_types.UINT8],
-        outs_ndim=[3, 1],
-        ins_ndim=[3],
-        device="cpu",
-    )
-    for _ in range(3):
-        images_in, images_out, img_shape = pipe.run()
-        for i in range(len(images_in)):
-            assert np.array_equal(255 - images_in.at(i), images_out.at(i))
-            assert np.array_equal(images_out.at(i).shape, img_shape.at(i))
-
-
-@attr("sanitizer_skip")
-@with_setup(check_numba_compatibility_gpu)
-def test_nonuniform_types_gpu():
-    blocks = [16, 16, 1]
-    threads_per_block = [32, 16, 1]
-    pipe = nonuniform_types_pipe(
-        batch_size=8,
-        num_threads=1,
-        device_id=0,
-        run_fn=nonuniform_types_run_gpu,
-        out_types=[dali_types.UINT8, dali_types.INT64],
-        in_types=[dali_types.UINT8],
-        outs_ndim=[3, 1],
-        ins_ndim=[3],
-        device="gpu",
-        blocks=blocks,
-        threads_per_block=threads_per_block,
-    )
-    for _ in range(3):
-        images_in, images_out, img_shape = pipe.run()
-        images_in, images_out, img_shape = (
-            images_in.as_cpu(),
-            images_out.as_cpu(),
-            img_shape.as_cpu(),
+class TestNonuniformTypes:
+    def setUp(self):
+        check_numba_compatibility_cpu()
+
+    @attr("sanitizer_skip")
+    def test_nonuniform_types_cpu(self):
+        pipe = nonuniform_types_pipe(
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=nonuniform_types_run_cpu,
+            out_types=[dali_types.UINT8, dali_types.INT64],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[3, 1],
+            ins_ndim=[3],
+            device="cpu",
         )
-        for i in range(len(images_in)):
-            assert np.array_equal(255 - images_in.at(i), images_out.at(i))
-            assert np.array_equal(images_out.at(i).shape, img_shape.at(i))
+        for _ in range(3):
+            images_in, images_out, img_shape = pipe.run()
+            for i in range(len(images_in)):
+                assert np.array_equal(255 - images_in.at(i), images_out.at(i))
+                assert np.array_equal(images_out.at(i).shape, img_shape.at(i))
+
+
+class TestNonuniformTypesGPU:
+    def setUp(self):
+        check_numba_compatibility_gpu()
+
+    @attr("sanitizer_skip")
+    def test_nonuniform_types_gpu(self):
+        blocks = [16, 16, 1]
+        threads_per_block = [32, 16, 1]
+        pipe = nonuniform_types_pipe(
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=nonuniform_types_run_gpu,
+            out_types=[dali_types.UINT8, dali_types.INT64],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[3, 1],
+            ins_ndim=[3],
+            device="gpu",
+            blocks=blocks,
+            threads_per_block=threads_per_block,
+        )
+        for _ in range(3):
+            images_in, images_out, img_shape = pipe.run()
+            images_in, images_out, img_shape = (
+                images_in.as_cpu(),
+                images_out.as_cpu(),
+                img_shape.as_cpu(),
+            )
+            for i in range(len(images_in)):
+                assert np.array_equal(255 - images_in.at(i), images_out.at(i))
+                assert np.array_equal(images_out.at(i).shape, img_shape.at(i))
diff --git a/dali/test/python/test_RN50_data_fw_iterators.py b/dali/test/python/test_RN50_data_fw_iterators.py
index b8fdd3b724f..3658c9f9026 100644
--- a/dali/test/python/test_RN50_data_fw_iterators.py
+++ b/dali/test/python/test_RN50_data_fw_iterators.py
@@ -231,12 +231,6 @@ def test_fw_iter(IteratorClass, args):
                     break
 
 
-def import_mxnet():
-    from nvidia.dali.plugin.mxnet import DALIClassificationIterator as MXNetIterator
-
-    return MXNetIterator
-
-
 def import_pytorch():
     from nvidia.dali.plugin.pytorch import DALIClassificationIterator as PyTorchIterator
 
@@ -275,7 +269,6 @@ def import_tf():
 
 
 Iterators = {
-    "mxnet": [import_mxnet],
     "pytorch": [import_pytorch],
     "tf": [import_tf],
     "paddle": [import_paddle],
diff --git a/dali/test/python/test_dali_tf_conditionals.py b/dali/test/python/test_dali_tf_conditionals.py
index 6f8fa40fbcc..ea54d7f8ffe 100644
--- a/dali/test/python/test_dali_tf_conditionals.py
+++ b/dali/test/python/test_dali_tf_conditionals.py
@@ -18,42 +18,44 @@
 import nvidia.dali.fn as fn
 import nvidia.dali.types as types
 import nvidia.dali.plugin.tf as dali_tf
-from nose_utils import with_setup
 from test_utils_tensorflow import skip_inputs_for_incompatible_tf
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_both_tf_and_dali_conditionals():
-    @pipeline_def(enable_conditionals=True, batch_size=5, num_threads=4, device_id=0)
-    def dali_conditional_pipeline():
-        iter_id = fn.external_source(source=lambda x: np.array(x.iteration), batch=False)
-        if iter_id & 1 == 0:
-            output = types.Constant(np.array(-1), device="cpu")
-        else:
-            output = types.Constant(np.array(1), device="cpu")
-        return output
+class TestBothTFAndDALIConditionals:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
 
-    with tf.device("/cpu:0"):
-        dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
-            pipeline=dali_conditional_pipeline(),
-            batch_size=5,
-            output_shapes=(5,),
-            output_dtypes=(tf.int32),
-            num_threads=4,
-            device_id=0,
-        )
+    def test_both_tf_and_dali_conditionals(self):
+        @pipeline_def(enable_conditionals=True, batch_size=5, num_threads=4, device_id=0)
+        def dali_conditional_pipeline():
+            iter_id = fn.external_source(source=lambda x: np.array(x.iteration), batch=False)
+            if iter_id & 1 == 0:
+                output = types.Constant(np.array(-1), device="cpu")
+            else:
+                output = types.Constant(np.array(1), device="cpu")
+            return output
 
-        @tf.function
-        def tf_function_with_conditionals(dali_dataset):
-            negative = tf.constant(0)
-            positive = tf.constant(0)
-            for input in dali_dataset:
-                if tf.reduce_sum(input) < 0:
-                    negative = negative + 1
-                else:
-                    positive = positive + 1
-            return negative, positive
+        with tf.device("/cpu:0"):
+            dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
+                pipeline=dali_conditional_pipeline(),
+                batch_size=5,
+                output_shapes=(5,),
+                output_dtypes=(tf.int32),
+                num_threads=4,
+                device_id=0,
+            )
 
-        pos, neg = tf_function_with_conditionals(dali_dataset.take(5))
-        assert pos == 3
-        assert neg == 2
+            @tf.function
+            def tf_function_with_conditionals(dali_dataset):
+                negative = tf.constant(0)
+                positive = tf.constant(0)
+                for input in dali_dataset:
+                    if tf.reduce_sum(input) < 0:
+                        negative = negative + 1
+                    else:
+                        positive = positive + 1
+                return negative, positive
+
+            pos, neg = tf_function_with_conditionals(dali_dataset.take(5))
+            assert pos == 3
+            assert neg == 2
diff --git a/dali/test/python/test_dali_tf_dataset_eager.py b/dali/test/python/test_dali_tf_dataset_eager.py
index a8e64e4422b..8ab53534ec1 100644
--- a/dali/test/python/test_dali_tf_dataset_eager.py
+++ b/dali/test/python/test_dali_tf_dataset_eager.py
@@ -18,7 +18,8 @@
 import nvidia.dali.plugin.tf as dali_tf
 from nvidia.dali.plugin.tf.experimental import Input
 from nvidia.dali import fn
-from nose_utils import with_setup, raises
+from nose2.tools import params
+from nose_utils import raises
 from test_dali_tf_dataset_pipelines import (
     FixedSampleIterator,
     RandomSampleIterator,
@@ -80,14 +81,25 @@ def run_tf_dataset_with_constant_input(dev, shape, value, dtype, batch):
     run_tf_dataset_eager_mode(dev, get_pipeline_desc=get_pipeline_desc, to_dataset=to_dataset)
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_constant_input():
+def _generate_tf_dataset_with_constant_input_test_cases():
+    rng = random.Random(42)
+    cases = []
     for dev in ["cpu", "gpu"]:
         for shape in [(7, 42), (64, 64, 3), (3, 40, 40, 4)]:
             for dtype in [np.uint8, np.int32, np.float32]:
                 for batch in ["dataset", True, False, None]:
-                    value = random.choice([42, 255])
-                    yield run_tf_dataset_with_constant_input, dev, shape, value, dtype, batch
+                    value = rng.choice([42, 255])
+                    cases.append((dev, shape, value, dtype, batch))
+    return cases
+
+
+class TestTFDatasetWithInputs:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @params(*_generate_tf_dataset_with_constant_input_test_cases())
+    def test_tf_dataset_with_constant_input(self, dev, shape, value, dtype, batch):
+        run_tf_dataset_with_constant_input(dev, shape, value, dtype, batch)
 
 
 def run_tf_dataset_with_random_input(dev, max_shape, dtype, batch="dataset"):
@@ -100,13 +112,22 @@ def run_tf_dataset_with_random_input(dev, max_shape, dtype, batch="dataset"):
     run_tf_dataset_eager_mode(dev, get_pipeline_desc=get_pipeline_desc, to_dataset=to_dataset)
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_random_input():
-    for dev in ["cpu", "gpu"]:
-        for max_shape in [(10, 20), (120, 120, 3), (3, 40, 40, 4)]:
-            for dtype in [np.uint8, np.int32, np.float32]:
-                for batch in ["dataset", False, True, None]:
-                    yield run_tf_dataset_with_random_input, dev, max_shape, dtype, batch
+_tf_dataset_with_random_input_test_cases = [
+    (dev, max_shape, dtype, batch)
+    for dev in ["cpu", "gpu"]
+    for max_shape in [(10, 20), (120, 120, 3), (3, 40, 40, 4)]
+    for dtype in [np.uint8, np.int32, np.float32]
+    for batch in ["dataset", False, True, None]
+]
+
+
+class TestTFDatasetWithRandomInput:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @params(*_tf_dataset_with_random_input_test_cases)
+    def test_tf_dataset_with_random_input(self, dev, max_shape, dtype, batch):
+        run_tf_dataset_with_random_input(dev, max_shape, dtype, batch)
 
 
 # Run with everything on GPU (External Source op as well)
@@ -120,12 +141,21 @@ def run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch):
     run_tf_dataset_eager_mode("gpu", get_pipeline_desc=get_pipeline_desc, to_dataset=to_dataset)
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_random_input_gpu():
-    for max_shape in [(10, 20), (120, 120, 3), (3, 40, 40, 4)]:
-        for dtype in [np.uint8, np.int32, np.float32]:
-            for batch in ["dataset", False, True, None]:
-                yield run_tf_dataset_with_random_input_gpu, max_shape, dtype, batch
+_tf_dataset_with_random_input_gpu_test_cases = [
+    (max_shape, dtype, batch)
+    for max_shape in [(10, 20), (120, 120, 3), (3, 40, 40, 4)]
+    for dtype in [np.uint8, np.int32, np.float32]
+    for batch in ["dataset", False, True, None]
+]
+
+
+class TestTFDatasetWithRandomInputGPU:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @params(*_tf_dataset_with_random_input_gpu_test_cases)
+    def test_tf_dataset_with_random_input_gpu(self, max_shape, dtype, batch):
+        run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch)
 
 
 def run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy):
@@ -139,15 +169,25 @@ def run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy):
 
 
 # Check if setting no_copy flags in all placement scenarios is ok as we override it internally
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_no_copy():
+def _generate_tf_dataset_with_no_copy_test_cases():
+    cases = []
     for max_shape in [(10, 20), (120, 120, 3)]:
         for dataset_dev in ["cpu", "gpu"]:
             for es_dev in ["cpu", "gpu"]:
                 if dataset_dev == "cpu" and es_dev == "gpu":
                     continue  # GPU op in CPU dataset not supported
                 for no_copy in [True, False, None]:
-                    yield run_tf_dataset_no_copy, max_shape, np.uint8, dataset_dev, es_dev, no_copy
+                    cases.append((max_shape, np.uint8, dataset_dev, es_dev, no_copy))
+    return cases
+
+
+class TestTFDatasetWithNoCopy:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @params(*_generate_tf_dataset_with_no_copy_test_cases())
+    def test_tf_dataset_with_no_copy(self, max_shape, dtype, dataset_dev, es_dev, no_copy):
+        run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy)
 
 
 def run_tf_dataset_with_stop_iter(dev, max_shape, dtype, stop_samples):
@@ -161,20 +201,22 @@ def run_tf_dataset_with_stop_iter(dev, max_shape, dtype, stop_samples):
     )
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_stop_iter():
-    batch_size = 12
-    for dev in ["cpu", "gpu"]:
-        for max_shape in [(10, 20), (120, 120, 3), (3, 40, 40, 4)]:
-            for dtype in [np.uint8, np.int32, np.float32]:
-                for iters in [1, 2, 3, 4, 5]:
-                    yield (
-                        run_tf_dataset_with_stop_iter,
-                        dev,
-                        max_shape,
-                        dtype,
-                        iters * batch_size - 3,
-                    )
+class TestTFDatasetWithStopIter:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    # One case per combination, so one failing configuration does not hide the others.
+    @params(
+        *itertools.product(
+            ["cpu", "gpu"],
+            [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
+            [np.uint8, np.int32, np.float32],
+            [1, 2, 3, 4, 5],
+        )
+    )
+    def test_tf_dataset_with_stop_iter(self, dev, max_shape, dtype, iters):
+        batch_size = 12
+        run_tf_dataset_with_stop_iter(dev, max_shape, dtype, iters * batch_size - 3)
 
 
 def run_tf_dataset_multi_input(dev, start_values, input_names, batches):
@@ -198,13 +240,23 @@ def run_tf_dataset_multi_input(dev, start_values, input_names, batches):
 input_names = [["input_{}".format(i) for i, _ in enumerate(vals)] for vals in start_values]
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_multi_input():
+def _generate_tf_dataset_multi_input_test_cases():
+    cases = []
     for dev in ["cpu", "gpu"]:
         for starts, names in zip(start_values, input_names):
-            yield run_tf_dataset_multi_input, dev, starts, names, ["dataset" for _ in input_names]
+            cases.append((dev, starts, names, ["dataset" for _ in input_names]))
             for batches in list(itertools.product([True, False], repeat=len(input_names))):
-                yield run_tf_dataset_multi_input, dev, starts, names, batches
+                cases.append((dev, starts, names, batches))
+    return cases
+
+
+class TestTFDatasetMultiInput:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @params(*_generate_tf_dataset_multi_input_test_cases())
+    def test_tf_dataset_multi_input(self, dev, starts, names, batches):
+        run_tf_dataset_multi_input(dev, starts, names, batches)
 
 
 @raises(tf.errors.InternalError, glob="TF device and DALI device mismatch")
@@ -263,40 +315,47 @@ def check_tf_dataset_wrong_input_type(wrong_input_datasets):
     check_basic_dataset_build(wrong_input_datasets)
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_wrong_input_type():
-    input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
-    # wrong `input_datasets` type (no dictionary)
-    for wrong_input_dataset in ["a", input_dataset, [input_dataset]]:
-        yield check_tf_dataset_wrong_input_type, wrong_input_dataset
-    # wrong values in dictionary
-    for wrong_input_dataset in ["str", [input_dataset]]:
-        yield check_tf_dataset_wrong_input_type, {
-            "a": wrong_input_dataset,
-            "b": wrong_input_dataset,
-        }
-    # wrong keys in dictionary
-    for wrong_input_name in [42, ("a", "b")]:
-        yield check_tf_dataset_wrong_input_type, {wrong_input_name: input_dataset}
-
-
-@raises(
-    ValueError,
-    glob="Found External Source nodes in the Pipeline, that were not assigned any inputs.",
-)
-@with_setup(skip_for_incompatible_tf)
-def test_input_not_provided():
-    input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
-    check_basic_dataset_build({"a": input_dataset})
-
+class TestTFDatasetInputValidation:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    def test_tf_dataset_wrong_input_type(self):
+        input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
+        # wrong `input_datasets` type (no dictionary)
+        for wrong_input_dataset in ["a", input_dataset, [input_dataset]]:
+            check_tf_dataset_wrong_input_type(wrong_input_dataset)
+        # wrong values in dictionary
+        for wrong_input_dataset in ["str", [input_dataset]]:
+            check_tf_dataset_wrong_input_type(
+                {
+                    "a": wrong_input_dataset,
+                    "b": wrong_input_dataset,
+                }
+            )
+        # wrong keys in dictionary
+        for wrong_input_name in [42, ("a", "b")]:
+            check_tf_dataset_wrong_input_type({wrong_input_name: input_dataset})
+
+
+class TestTFDatasetExternalSourceValidation:
+    def setUp(self):
+        skip_for_incompatible_tf()
+
+    @raises(
+        ValueError,
+        glob="Found External Source nodes in the Pipeline, that were not assigned any inputs.",
+    )
+    def test_input_not_provided(self):
+        input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
+        check_basic_dataset_build({"a": input_dataset})
 
-@raises(
-    ValueError, glob="Did not find an External Source placeholder node * in the provided pipeline"
-)
-@with_setup(skip_for_incompatible_tf)
-def test_missing_es_node():
-    input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
-    check_basic_dataset_build({"a": input_dataset, "b": input_dataset, "c": input_dataset})
+    @raises(
+        ValueError,
+        glob="Did not find an External Source placeholder node * in the provided pipeline",
+    )
+    def test_missing_es_node(self):
+        input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
+        check_basic_dataset_build({"a": input_dataset, "b": input_dataset, "c": input_dataset})
 
 
 @pipeline_def(batch_size=10, num_threads=4, device_id=0)
@@ -320,31 +379,32 @@ def check_single_es_pipeline(kwargs, input_datasets):
         return dali_dataset
 
 
-@raises(
-    ValueError, glob="Did not find an External Source placeholder node * in the provided pipeline"
-)
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_es_with_source():
-    in_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
-    check_single_es_pipeline({"name": "a", "source": []}, {"a": in_dataset})
-
+class TestTFDatasetESParameters:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
 
-@raises(
-    ValueError,
-    glob="The parameter ``num_outputs`` is only valid when using ``source`` to provide data.",
-)
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_es_num_outputs_provided():
-    in_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
-    check_single_es_pipeline({"name": "a", "num_outputs": 1}, {"a": in_dataset})
+    @raises(
+        ValueError,
+        glob="Did not find an External Source placeholder node * in the provided pipeline",
+    )
+    def test_tf_dataset_es_with_source(self):
+        in_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
+        check_single_es_pipeline({"name": "a", "source": []}, {"a": in_dataset})
 
+    @raises(
+        ValueError,
+        glob="The parameter ``num_outputs`` is only valid when using ``source`` to provide data.",
+    )
+    def test_tf_dataset_es_num_outputs_provided(self):
+        in_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
+        check_single_es_pipeline({"name": "a", "num_outputs": 1}, {"a": in_dataset})
 
-@raises(
-    ValueError, glob="Found placeholder External Source node * in the Pipeline that was not named"
-)
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_disallowed_es():
-    check_single_es_pipeline({}, {})
+    @raises(
+        ValueError,
+        glob="Found placeholder External Source node * in the Pipeline that was not named",
+    )
+    def test_tf_dataset_disallowed_es(self):
+        check_single_es_pipeline({}, {})
 
 
 def check_layout(kwargs, input_datasets, layout):
@@ -377,21 +437,25 @@ def run_tf_with_dali_external_source(dev, es_args, ed_dev, dtype, *_):
     )
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_with_dali_external_source():
-    yield from gen_tf_with_dali_external_source(run_tf_with_dali_external_source)
+class TestTFWithDALIExternalSource:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
 
+    @params(*gen_tf_with_dali_external_source(run_tf_with_dali_external_source))
+    def test_tf_with_dali_external_source(
+        self, test_run, dev, es_args, es_dev, dtype, iter_limit, dense
+    ):
+        test_run(dev, es_args, es_dev, dtype, iter_limit, dense)
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_layouts():
-    for shape, layout in [((2, 3), "XY"), ((10, 20, 3), "HWC"), ((4, 128, 64, 3), "FHWC")]:
-        in_dataset = tf.data.Dataset.from_tensors(np.full(shape, 42)).repeat()
-        # Captured from pipeline
-        yield check_layout, {"layout": layout, "name": "in"}, {"in": in_dataset}, layout
-        # Captured from pipeline
-        yield check_layout, {"layout": layout, "name": "in"}, {"in": Input(in_dataset)}, layout
-        # Set via experimental.Input, not specified in external source
-        yield check_layout, {"name": "in"}, {"in": Input(in_dataset, layout=layout)}, layout
+    def test_tf_dataset_layouts(self):
+        for shape, layout in [((2, 3), "XY"), ((10, 20, 3), "HWC"), ((4, 128, 64, 3), "FHWC")]:
+            in_dataset = tf.data.Dataset.from_tensors(np.full(shape, 42)).repeat()
+            # Captured from pipeline
+            check_layout({"layout": layout, "name": "in"}, {"in": in_dataset}, layout)
+            # Captured from pipeline
+            check_layout({"layout": layout, "name": "in"}, {"in": Input(in_dataset)}, layout)
+            # Set via experimental.Input, not specified in external source
+            check_layout({"name": "in"}, {"in": Input(in_dataset, layout=layout)}, layout)
 
 
 # Test if the TypeError is raised for unsupported arguments for regular DALIDataset
@@ -426,6 +490,9 @@ def _test_tf_dataset_multigpu_manual_placement():
 
 
 # This test should be private (name starts with _) as it is called separately in L1
-@with_setup(skip_for_incompatible_tf)
-def _test_tf_dataset_multigpu_mirrored_strategy():
-    run_tf_dataset_multigpu_eager_mirrored_strategy()
+class TestTFDatasetMultiGPU:
+    def setUp(self):
+        skip_for_incompatible_tf()
+
+    def _test_tf_dataset_multigpu_mirrored_strategy(self):
+        run_tf_dataset_multigpu_eager_mirrored_strategy()
diff --git a/dali/test/python/test_dali_tf_dataset_graph.py b/dali/test/python/test_dali_tf_dataset_graph.py
index 32702536a8f..d35a9800e69 100644
--- a/dali/test/python/test_dali_tf_dataset_graph.py
+++ b/dali/test/python/test_dali_tf_dataset_graph.py
@@ -16,7 +16,8 @@
 import numpy as np
 import random as random
 import tensorflow as tf
-from nose_utils import with_setup, raises
+from nose2.tools import params, cartesian_params
+from nose_utils import raises
 from test_dali_tf_dataset_pipelines import (
     FixedSampleIterator,
     external_source_tester,
@@ -63,14 +64,25 @@ def run_tf_dataset_with_constant_input(dev, shape, value, dtype, batch):
     )
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_constant_input():
+def _generate_tf_dataset_with_constant_input_test_cases():
+    rng = random.Random(42)
+    cases = []
     for dev in ["cpu", "gpu"]:
         for shape in [(7, 42), (64, 64, 3), (3, 40, 40, 4)]:
             for dtype in [np.uint8, np.int32, np.float32]:
                 for batch in ["dataset", True, False, None]:
-                    value = random.choice([42, 255])
-                    yield run_tf_dataset_with_constant_input, dev, shape, value, dtype, batch
+                    value = rng.choice([42, 255])
+                    cases.append((dev, shape, value, dtype, batch))
+    return cases
+
+
+class TestTFDatasetWithConstantInput:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @params(*_generate_tf_dataset_with_constant_input_test_cases())
+    def test_tf_dataset_with_constant_input(self, dev, shape, value, dtype, batch):
+        run_tf_dataset_with_constant_input(dev, shape, value, dtype, batch)
 
 
 def run_tf_dataset_with_random_input(dev, max_shape, dtype, batch):
@@ -85,13 +97,18 @@ def run_tf_dataset_with_random_input(dev, max_shape, dtype, batch):
     )
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_random_input():
-    for dev in ["cpu", "gpu"]:
-        for max_shape in [(10, 20), (120, 120, 3), (3, 40, 40, 4)]:
-            for dtype in [np.uint8, np.int32, np.float32]:
-                for batch in ["dataset", True, False, None]:
-                    yield run_tf_dataset_with_random_input, dev, max_shape, dtype, batch
+class TestTFDatasetWithRandomInput:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @cartesian_params(
+        ["cpu", "gpu"],
+        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
+        [np.uint8, np.int32, np.float32],
+        ["dataset", True, False, None],
+    )
+    def test_tf_dataset_with_random_input(self, dev, max_shape, dtype, batch):
+        run_tf_dataset_with_random_input(dev, max_shape, dtype, batch)
 
 
 # Run with everything on GPU (External Source op as well)
@@ -107,12 +124,17 @@ def run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch):
     )
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_random_input_gpu():
-    for max_shape in [(10, 20), (120, 120, 3), (3, 40, 40, 4)]:
-        for dtype in [np.uint8, np.int32, np.float32]:
-            for batch in ["dataset", True, False, None]:
-                yield run_tf_dataset_with_random_input_gpu, max_shape, dtype, batch
+class TestTFDatasetWithRandomInputGPU:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @cartesian_params(
+        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
+        [np.uint8, np.int32, np.float32],
+        ["dataset", True, False, None],
+    )
+    def test_tf_dataset_with_random_input_gpu(self, max_shape, dtype, batch):
+        run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch)
 
 
 def run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy):
@@ -126,15 +148,25 @@ def run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy):
 
 
 # Check if setting no_copy flags in all placement scenarios is ok as we override it internally
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_no_copy():
+def _generate_tf_dataset_with_no_copy_test_cases():
+    cases = []
     for max_shape in [(10, 20), (120, 120, 3)]:
         for dataset_dev in ["cpu", "gpu"]:
             for es_dev in ["cpu", "gpu"]:
                 if dataset_dev == "cpu" and es_dev == "gpu":
                     continue  # GPU op in CPU dataset not supported
                 for no_copy in [True, False, None]:
-                    yield run_tf_dataset_no_copy, max_shape, np.uint8, dataset_dev, es_dev, no_copy
+                    cases.append((max_shape, np.uint8, dataset_dev, es_dev, no_copy))
+    return cases
+
+
+class TestTFDatasetWithNoCopy:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @params(*_generate_tf_dataset_with_no_copy_test_cases())
+    def test_tf_dataset_with_no_copy(self, max_shape, dtype, dataset_dev, es_dev, no_copy):
+        run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy)
 
 
 def run_tf_dataset_with_stop_iter(dev, max_shape, dtype, stop_samples):
@@ -150,20 +182,24 @@ def run_tf_dataset_with_stop_iter(dev, max_shape, dtype, stop_samples):
     )
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_with_stop_iter():
-    batch_size = 12
-    for dev in ["cpu", "gpu"]:
-        for max_shape in [(10, 20), (120, 120, 3), (3, 40, 40, 4)]:
-            for dtype in [np.uint8, np.int32, np.float32]:
-                for iters in [1, 2, 3, 4, 5]:
-                    yield (
-                        run_tf_dataset_with_stop_iter,
-                        dev,
-                        max_shape,
-                        dtype,
-                        iters * batch_size - 3,
-                    )
+class TestTFDatasetWithStopIter:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    # cartesian_params expands the four lists below into one test case per
+    # (dev, max_shape, dtype, iters) combination.
+    @cartesian_params(
+        ["cpu", "gpu"],
+        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
+        [np.uint8, np.int32, np.float32],
+        [1, 2, 3, 4, 5],
+    )
+    def test_tf_dataset_with_stop_iter(self, dev, max_shape, dtype, iters):
+        # The sample limit handed to the runner is 3 short of ``iters``
+        # multiples of 12.
+        samples_per_iter = 12
+        stop_samples = iters * samples_per_iter - 3
+        run_tf_dataset_with_stop_iter(dev, max_shape, dtype, stop_samples)
 
 
 def run_tf_dataset_multi_input(dev, start_values, input_names, batches):
@@ -187,13 +223,23 @@ def run_tf_dataset_multi_input(dev, start_values, input_names, batches):
 input_names = [["input_{}".format(i) for i, _ in enumerate(vals)] for vals in start_values]
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_multi_input():
+def _generate_tf_dataset_multi_input_test_cases():
+    cases = []
     for dev in ["cpu", "gpu"]:
         for starts, names in zip(start_values, input_names):
-            yield run_tf_dataset_multi_input, dev, starts, names, ["dataset" for _ in input_names]
+            cases.append((dev, starts, names, ["dataset" for _ in input_names]))
             for batches in list(itertools.product([True, False], repeat=len(input_names))):
-                yield run_tf_dataset_multi_input, dev, starts, names, batches
+                cases.append((dev, starts, names, batches))
+    return cases
+
+
+class TestTFDatasetMultiInput:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @params(*_generate_tf_dataset_multi_input_test_cases())
+    def test_tf_dataset_multi_input(self, dev, starts, names, batches):
+        run_tf_dataset_multi_input(dev, starts, names, batches)
 
 
 def run_tf_with_dali_external_source(dev, es_args, ed_dev, dtype, *_):
@@ -205,9 +251,15 @@ def run_tf_with_dali_external_source(dev, es_args, ed_dev, dtype, *_):
     )
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_with_dali_external_source():
-    yield from gen_tf_with_dali_external_source(run_tf_with_dali_external_source)
+class TestTFWithDALIExternalSource:
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    @params(*gen_tf_with_dali_external_source(run_tf_with_dali_external_source))
+    def test_tf_with_dali_external_source(
+        self, test_run, dev, es_args, es_dev, dtype, iter_limit, dense
+    ):
+        test_run(dev, es_args, es_dev, dtype, iter_limit, dense)
 
 
 tf_dataset_wrong_placement_error_msg = (
diff --git a/dali/test/python/test_dali_tf_dataset_mnist_eager.py b/dali/test/python/test_dali_tf_dataset_mnist_eager.py
index dd742c314de..e26600e2e69 100644
--- a/dali/test/python/test_dali_tf_dataset_mnist_eager.py
+++ b/dali/test/python/test_dali_tf_dataset_mnist_eager.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import tensorflow as tf
-from nose_utils import with_setup, raises, SkipTest
+from nose_utils import raises, SkipTest
 
 import test_dali_tf_dataset_mnist as mnist
 from test_utils_tensorflow import skip_for_incompatible_tf, available_gpus
@@ -34,54 +34,60 @@ def test_keras_single_cpu():
     mnist.run_keras_single_device("cpu", 0)
 
 
-@with_setup(skip_for_incompatible_tf)
-@raises(tf.errors.OpError, "TF device and DALI device mismatch")
-def test_keras_wrong_placement_gpu():
-    with tf.device("cpu:0"):
-        model = mnist.keras_model()
-        train_dataset = mnist.get_dataset("gpu", 0)
+class TestKerasWrongPlacement:
+    def setUp(self):
+        skip_for_incompatible_tf()
 
-        model.fit(train_dataset, epochs=mnist.EPOCHS, steps_per_epoch=mnist.ITERATIONS)
+    @raises(tf.errors.OpError, "TF device and DALI device mismatch")
+    def test_keras_wrong_placement_gpu(self):
+        with tf.device("cpu:0"):
+            model = mnist.keras_model()
+            train_dataset = mnist.get_dataset("gpu", 0)
 
+            model.fit(train_dataset, epochs=mnist.EPOCHS, steps_per_epoch=mnist.ITERATIONS)
 
-@with_setup(skip_for_incompatible_tf)
-@raises(tf.errors.OpError, "TF device and DALI device mismatch")
-def test_keras_wrong_placement_cpu():
-    with tf.device("gpu:0"):
-        model = mnist.keras_model()
-        train_dataset = mnist.get_dataset("cpu", 0)
+    @raises(tf.errors.OpError, "TF device and DALI device mismatch")
+    def test_keras_wrong_placement_cpu(self):
+        with tf.device("gpu:0"):
+            model = mnist.keras_model()
+            train_dataset = mnist.get_dataset("cpu", 0)
 
-        model.fit(train_dataset, epochs=mnist.EPOCHS, steps_per_epoch=mnist.ITERATIONS)
+            model.fit(train_dataset, epochs=mnist.EPOCHS, steps_per_epoch=mnist.ITERATIONS)
 
 
-@with_setup(skip_for_incompatible_tf)
-def test_keras_multi_gpu_mirrored_strategy():
-    # due to compatibility problems between the driver, cuda version and
-    # TensorFlow 2.12 test_keras_multi_gpu_mirrored_strategy doesn't work.
-    if Version(tf.__version__) >= Version("2.12.0"):
-        raise SkipTest("This test is not supported for TensorFlow 2.12")
-    strategy = tf.distribute.MirroredStrategy(devices=available_gpus())
+class TestKerasMultiGPUMirroredStrategy:
+    def setUp(self):
+        skip_for_incompatible_tf()
 
-    with strategy.scope():
-        model = mnist.keras_model()
+    def test_keras_multi_gpu_mirrored_strategy(self):
+        # due to compatibility problems between the driver, cuda version and
+        # TensorFlow 2.12 test_keras_multi_gpu_mirrored_strategy doesn't work.
+        if Version(tf.__version__) >= Version("2.12.0"):
+            raise SkipTest("This test is not supported for TensorFlow 2.12")
+        strategy = tf.distribute.MirroredStrategy(devices=available_gpus())
 
-    train_dataset = mnist.get_dataset_multi_gpu(strategy)
+        with strategy.scope():
+            model = mnist.keras_model()
 
-    model.fit(train_dataset, epochs=mnist.EPOCHS, steps_per_epoch=mnist.ITERATIONS)
+        train_dataset = mnist.get_dataset_multi_gpu(strategy)
+
+        model.fit(train_dataset, epochs=mnist.EPOCHS, steps_per_epoch=mnist.ITERATIONS)
 
-    assert model.evaluate(train_dataset, steps=mnist.ITERATIONS)[1] > mnist.TARGET
+        assert model.evaluate(train_dataset, steps=mnist.ITERATIONS)[1] > mnist.TARGET
 
 
-@with_setup(mnist.clear_checkpoints, mnist.clear_checkpoints)
-def test_estimators_single_gpu():
-    mnist.run_estimators_single_device("gpu", 0)
+class TestEstimators:
+    def setUp(self):
+        mnist.clear_checkpoints()
 
+    def tearDown(self):
+        mnist.clear_checkpoints()
 
-@with_setup(mnist.clear_checkpoints, mnist.clear_checkpoints)
-def test_estimators_single_other_gpu():
-    mnist.run_estimators_single_device("gpu", 1)
+    def test_estimators_single_gpu(self):
+        mnist.run_estimators_single_device("gpu", 0)
 
+    def test_estimators_single_other_gpu(self):
+        mnist.run_estimators_single_device("gpu", 1)
 
-@with_setup(mnist.clear_checkpoints, mnist.clear_checkpoints)
-def test_estimators_single_cpu():
-    mnist.run_estimators_single_device("cpu", 0)
+    def test_estimators_single_cpu(self):
+        mnist.run_estimators_single_device("cpu", 0)
diff --git a/dali/test/python/test_dali_tf_dataset_mnist_graph.py b/dali/test/python/test_dali_tf_dataset_mnist_graph.py
index 0a1aba5441c..d679b82f1af 100644
--- a/dali/test/python/test_dali_tf_dataset_mnist_graph.py
+++ b/dali/test/python/test_dali_tf_dataset_mnist_graph.py
@@ -14,32 +14,23 @@
 
 import tensorflow as tf
 import tensorflow.compat.v1 as tf_v1
-from nose_utils import with_setup, SkipTest, raises
+from nose2.tools import params
+from nose_utils import SkipTest, raises
 import test_dali_tf_dataset_mnist as mnist
 from packaging.version import Version
 
 mnist.tf.compat.v1.disable_eager_execution()
 
 
-@with_setup(tf.keras.backend.clear_session)
-def test_keras_single_gpu():
-    if Version(tf.__version__) >= Version("2.16"):
-        raise SkipTest("TF < 2.16 is required for this test")
-    mnist.run_keras_single_device("gpu", 0)
-
-
-@with_setup(tf.keras.backend.clear_session)
-def test_keras_single_other_gpu():
-    if Version(tf.__version__) >= Version("2.16"):
-        raise SkipTest("TF < 2.16 is required for this test")
-    mnist.run_keras_single_device("gpu", 1)
+class TestKeras:
+    def setUp(self):
+        tf.keras.backend.clear_session()
 
-
-@with_setup(tf.keras.backend.clear_session)
-def test_keras_single_cpu():
-    if Version(tf.__version__) >= Version("2.16"):
-        raise SkipTest("TF < 2.16 is required for this test")
-    mnist.run_keras_single_device("cpu", 0)
+    @params(("gpu", 0), ("gpu", 1), ("cpu", 0))
+    def test_keras_single_device(self, device, device_id):
+        if Version(tf.__version__) >= Version("2.16"):
+            raise SkipTest("TF < 2.16 is required for this test")
+        mnist.run_keras_single_device(device, device_id)
 
 
 @raises(tf.errors.OpError, "TF device and DALI device mismatch. TF*: CPU, DALI*: GPU for output")
@@ -64,19 +55,13 @@ def test_keras_wrong_placement_cpu():
         model.fit(train_dataset, epochs=mnist.EPOCHS, steps_per_epoch=mnist.ITERATIONS)
 
 
-@with_setup(tf.compat.v1.reset_default_graph)
-def test_graph_single_gpu():
-    mnist.run_graph_single_device("gpu", 0)
-
-
-@with_setup(tf.compat.v1.reset_default_graph)
-def test_graph_single_cpu():
-    mnist.run_graph_single_device("cpu", 0)
-
+class TestGraph:
+    def setUp(self):
+        tf.compat.v1.reset_default_graph()
 
-@with_setup(tf.compat.v1.reset_default_graph)
-def test_graph_single_other_gpu():
-    mnist.run_graph_single_device("gpu", 1)
+    @params(("gpu", 0), ("cpu", 0), ("gpu", 1))
+    def test_graph_single_device(self, device, device_id):
+        mnist.run_graph_single_device(device, device_id)
 
 
 # This function is copied form:
@@ -107,61 +92,64 @@ def average_gradients(tower_grads):
     return average_grads
 
 
-@with_setup(tf_v1.reset_default_graph)
-def test_graph_multi_gpu():
-    iterator_initializers = []
+class TestGraphMultiGPU:
+    def setUp(self):
+        tf_v1.reset_default_graph()
 
-    with tf.device("/cpu:0"):
-        tower_grads = []
+    def test_graph_multi_gpu(self):
+        iterator_initializers = []
 
-        for i in range(mnist.num_available_gpus()):
-            with tf.device("/gpu:{}".format(i)):
-                daliset = mnist.get_dataset("gpu", i, i, mnist.num_available_gpus())
+        with tf.device("/cpu:0"):
+            tower_grads = []
 
-                iterator = tf_v1.data.make_initializable_iterator(daliset)
-                iterator_initializers.append(iterator.initializer)
-                images, labels = iterator.get_next()
+            for i in range(mnist.num_available_gpus()):
+                with tf.device("/gpu:{}".format(i)):
+                    daliset = mnist.get_dataset("gpu", i, i, mnist.num_available_gpus())
 
-                images = tf_v1.reshape(
-                    images, [mnist.BATCH_SIZE, mnist.IMAGE_SIZE * mnist.IMAGE_SIZE]
-                )
-                labels = tf_v1.reshape(
-                    tf_v1.one_hot(labels, mnist.NUM_CLASSES), [mnist.BATCH_SIZE, mnist.NUM_CLASSES]
-                )
+                    iterator = tf_v1.data.make_initializable_iterator(daliset)
+                    iterator_initializers.append(iterator.initializer)
+                    images, labels = iterator.get_next()
 
-                logits_train = mnist.graph_model(images, reuse=(i != 0), is_training=True)
-                logits_test = mnist.graph_model(images, reuse=True, is_training=False)
+                    images = tf_v1.reshape(
+                        images, [mnist.BATCH_SIZE, mnist.IMAGE_SIZE * mnist.IMAGE_SIZE]
+                    )
+                    labels = tf_v1.reshape(
+                        tf_v1.one_hot(labels, mnist.NUM_CLASSES),
+                        [mnist.BATCH_SIZE, mnist.NUM_CLASSES],
+                    )
 
-                loss_op = tf_v1.reduce_mean(
-                    tf_v1.nn.softmax_cross_entropy_with_logits(logits=logits_train, labels=labels)
-                )
-                optimizer = tf_v1.train.AdamOptimizer()
-                grads = optimizer.compute_gradients(loss_op)
+                    logits_train = mnist.graph_model(images, reuse=(i != 0), is_training=True)
+                    logits_test = mnist.graph_model(images, reuse=True, is_training=False)
 
-                if i == 0:
-                    correct_pred = tf_v1.equal(
-                        tf_v1.argmax(logits_test, 1), tf_v1.argmax(labels, 1)
+                    loss_op = tf_v1.reduce_mean(
+                        tf_v1.nn.softmax_cross_entropy_with_logits(
+                            logits=logits_train, labels=labels
+                        )
                     )
-                    accuracy = tf_v1.reduce_mean(tf_v1.cast(correct_pred, tf_v1.float32))
-
-                tower_grads.append(grads)
+                    optimizer = tf_v1.train.AdamOptimizer()
+                    grads = optimizer.compute_gradients(loss_op)
 
-        tower_grads = average_gradients(tower_grads)
-        train_step = optimizer.apply_gradients(tower_grads)
+                    if i == 0:
+                        correct_pred = tf_v1.equal(
+                            tf_v1.argmax(logits_test, 1), tf_v1.argmax(labels, 1)
+                        )
+                        accuracy = tf_v1.reduce_mean(tf_v1.cast(correct_pred, tf_v1.float32))
 
-    mnist.train_graph(iterator_initializers, train_step, accuracy)
+                    tower_grads.append(grads)
 
+            tower_grads = average_gradients(tower_grads)
+            train_step = optimizer.apply_gradients(tower_grads)
 
-@with_setup(mnist.clear_checkpoints, mnist.clear_checkpoints)
-def test_estimators_single_gpu():
-    mnist.run_estimators_single_device("gpu", 0)
+        mnist.train_graph(iterator_initializers, train_step, accuracy)
 
 
-@with_setup(mnist.clear_checkpoints, mnist.clear_checkpoints)
-def test_estimators_single_other_gpu():
-    mnist.run_estimators_single_device("gpu", 1)
+class TestEstimators:
+    def setUp(self):
+        mnist.clear_checkpoints()
 
+    def tearDown(self):
+        mnist.clear_checkpoints()
 
-@with_setup(mnist.clear_checkpoints, mnist.clear_checkpoints)
-def test_estimators_single_cpu():
-    mnist.run_estimators_single_device("cpu", 0)
+    @params(("gpu", 0), ("gpu", 1), ("cpu", 0))
+    def test_estimators_single_device(self, device, device_id):
+        mnist.run_estimators_single_device(device, device_id)
diff --git a/dali/test/python/test_dali_tf_exec2.py b/dali/test/python/test_dali_tf_exec2.py
index 4593bdc3e98..0aa7ad0ebb7 100644
--- a/dali/test/python/test_dali_tf_exec2.py
+++ b/dali/test/python/test_dali_tf_exec2.py
@@ -19,9 +19,9 @@
 import nvidia.dali.fn as fn
 import nvidia.dali.types as types
 import nvidia.dali.plugin.tf as dali_tf
-from nose_utils import with_setup
 from test_utils_tensorflow import skip_inputs_for_incompatible_tf
 from test_utils import get_dali_extra_path
+import unittest
 
 test_data_root = get_dali_extra_path()
 lmdb_folder = os.path.join(test_data_root, "db", "lmdb")
@@ -43,34 +43,55 @@ def dali_exec2_pipeline():
     return output.cpu()
 
 
-@with_setup(skip_inputs_for_incompatible_tf)
-def test_tf_dataset_exec2():
-    """Test that exec_dynamic is propagated to DALI pipeline from dali_tf.DALIDatasetWithInputs"""
-    # From Tensorflow's perspective, this is a CPU pipeline
-    with tf.device("/cpu:0"):
-        dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
-            pipeline=dali_exec2_pipeline(),
-            batch_size=5,
-            output_shapes=(5,),
-            output_dtypes=(tf.int32),
-            num_threads=4,
-            device_id=0,
-        )
-
-        @tf.function
-        def tf_function_with_conditionals(dali_dataset):
-            negative = tf.constant(0)
-            positive = tf.constant(0)
-            for input in dali_dataset:
-                if tf.reduce_sum(input) < 0:
-                    negative = negative + 1
-                else:
-                    positive = positive + 1
-            return negative, positive
-
-        pos, neg = tf_function_with_conditionals(dali_dataset.take(5))
-        assert pos == 3
-        assert neg == 2
+class TestTFDatasetExec2(unittest.TestCase):
+    def setUp(self):
+        skip_inputs_for_incompatible_tf()
+
+    def test_tf_dataset_exec2(self):
+        """Test that exec_dynamic is propagated to DALI pipeline
+        by dali_tf.DALIDatasetWithInputs"""
+        # From Tensorflow's perspective, this is a CPU pipeline
+        with tf.device("/cpu:0"):
+            dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
+                pipeline=dali_exec2_pipeline(),
+                batch_size=5,
+                output_shapes=(5,),
+                output_dtypes=(tf.int32),
+                num_threads=4,
+                device_id=0,
+            )
+
+            @tf.function
+            def tf_function_with_conditionals(dali_dataset):
+                negative = tf.constant(0)
+                positive = tf.constant(0)
+                for input in dali_dataset:
+                    if tf.reduce_sum(input) < 0:
+                        negative = negative + 1
+                    else:
+                        positive = positive + 1
+                return negative, positive
+
+            pos, neg = tf_function_with_conditionals(dali_dataset.take(5))
+            # Eager mode: integers, graph mode: tensors, need to fetch value if it's Tensor
+            if (
+                tf.executing_eagerly() is False
+                or getattr(tf.compat.v1, "_eager_context", None) is not None
+            ):
+                # get concrete function and run in session for static graph mode
+                # fallback for session-based TF execution (e.g. when other test turned eager off)
+                try:
+                    from tensorflow.compat.v1 import Session
+                except ImportError:
+                    # Older TF versions don't have compat.v1 layer
+                    from tensorflow import Session
+
+                with Session() as sess:
+                    pos_val, neg_val = sess.run([pos, neg])
+            else:
+                pos_val, neg_val = pos, neg
+            assert pos_val == 3
+            assert neg_val == 2
 
 
 @pipeline_def(num_threads=4, exec_dynamic=True)
diff --git a/dali/test/python/test_dltensor_operator.py b/dali/test/python/test_dltensor_operator.py
index eff45ea4710..ec7cfe52767 100644
--- a/dali/test/python/test_dltensor_operator.py
+++ b/dali/test/python/test_dltensor_operator.py
@@ -37,11 +37,6 @@ def setup_pytorch():
     torch_stream = torch.cuda.Stream()
 
 
-def setup_mxnet():
-    global mxnd
-    from mxnet import ndarray as mxnd
-
-
 def setup_cupy():
     global cupy
     global cupy_stream
@@ -194,51 +189,6 @@ def test_pytorch():
     yield from _gpu_permuted_extents_torch_suite()
 
 
-def mxnet_adapter(fun, in1, in2):
-    tin1 = [mxnd.from_dlpack(dltensor) for dltensor in in1]
-    tin2 = [mxnd.from_dlpack(dltensor) for dltensor in in2]
-    tout1, tout2 = fun(tin1, tin2)
-    return [mxnd.to_dlpack_for_read(tout) for tout in tout1], [
-        mxnd.to_dlpack_for_read(tout) for tout in tout2
-    ]
-
-
-def mxnet_wrapper(fun):
-    return lambda in1, in2: mxnet_adapter(fun, in1, in2)
-
-
-def mxnet_compare(fun, pre1, pre2, post1, post2):
-    mxnet_pre1 = [mxnd.array(pre1.at(i)) for i in range(BATCH_SIZE)]
-    mxnet_pre2 = [mxnd.array(pre2.at(i)) for i in range(BATCH_SIZE)]
-    mxnet_post1, mxnet_post2 = fun(mxnet_pre1, mxnet_pre2)
-    for i in range(BATCH_SIZE):
-        assert numpy.array_equal(post1.at(i), mxnet_post1[i].asnumpy())
-        assert numpy.array_equal(post2.at(i), mxnet_post2[i].asnumpy())
-
-
-def mxnet_case(fun, device):
-    setup_mxnet()
-    common_case(mxnet_wrapper(fun), device, partial(mxnet_compare, fun))
-
-
-def mxnet_flatten(in1, in2):
-    return [mxnd.flatten(t) for t in in1], [mxnd.flatten(t) for t in in2]
-
-
-def mxnet_slice(in1, in2):
-    return [t[:, :, 1] for t in in1], [t[:, :, 2] for t in in2]
-
-
-def mxnet_cast(in1, in2):
-    return [mxnd.cast(t, dtype="float32") for t in in1], [mxnd.cast(t, dtype="int64") for t in in2]
-
-
-def test_mxnet():
-    for testcase in [mxnet_flatten, mxnet_slice, mxnet_cast]:
-        for device in ["cpu", "gpu"]:
-            yield mxnet_case, testcase, device
-
-
 def cupy_adapter_sync(fun, in1, in2):
     with cupy_stream:
         tin1 = [cupy.fromDlpack(dltensor) for dltensor in in1]
diff --git a/dali/test/python/test_external_source_impl_utils.py b/dali/test/python/test_external_source_impl_utils.py
index 1617e306a6b..6048b36c703 100644
--- a/dali/test/python/test_external_source_impl_utils.py
+++ b/dali/test/python/test_external_source_impl_utils.py
@@ -96,21 +96,6 @@ def test_pytorch_containers():
     yield from run_checks(samples_cpu, batches_cpu, disallowed_samples, [])
 
 
-@attr("mxnet")
-def test_mxnet_containers():
-    import mxnet as mx
-
-    samples_cpu = [
-        (mx.nd.array(test_array), test_array),
-    ]
-    batches_cpu = [
-        ([mx.nd.array(test_array)], [test_array]),
-        ([mx.nd.array(test_array)] * 4, [test_array] * 4),
-    ]
-    disallowed_samples = [mx.nd.array(test_array, ctx=mx.gpu(0))]
-    yield from run_checks(samples_cpu, batches_cpu, disallowed_samples, [])
-
-
 @attr("cupy")
 def test_cupy_containers():
     import cupy as cp
diff --git a/dali/test/python/test_external_source_parallel.py b/dali/test/python/test_external_source_parallel.py
index 2aa6bd31ccd..3bf04e290fe 100644
--- a/dali/test/python/test_external_source_parallel.py
+++ b/dali/test/python/test_external_source_parallel.py
@@ -16,7 +16,12 @@
 import nvidia.dali as dali
 from nvidia.dali.types import SampleInfo, BatchInfo
 import test_external_source_parallel_utils as utils
-from nose_utils import raises, with_setup
+from test_pool_utils import setup_function, teardown_function, capture_processes
+from nose2.tools import params, cartesian_params
+from nose_utils import raises
+import unittest
+import functools
+import nvidia.dali.backend as _b
 
 
 def no_arg_fun():
@@ -128,45 +133,108 @@ def test_wrong_source():
         (generator_fun(), (TypeError, batch_required_msg.format("an iterable"))),
     ]
     for source, (error_type, error_msg) in disallowed_sources:
-        yield raises(error_type, error_msg)(check_source_build), source
+        raises(error_type, error_msg)(check_source_build)(source)
 
 
 # Test that we can launch several CPU-only pipelines by fork as we don't touch CUDA context.
-@with_setup(utils.setup_function, utils.teardown_function)
-def test_parallel_fork_cpu_only():
-    pipeline_pairs = 4
-    batch_size = 10
-    iters = 40
-    callback = utils.ExtCallback((4, 5), iters * batch_size, np.int32)
-    parallel_pipes = [
-        (
-            utils.create_pipe(
-                callback,
-                "cpu",
-                batch_size,
-                py_num_workers=4,
-                py_start_method="fork",
-                parallel=True,
-                device_id=None,
-            ),
-            utils.create_pipe(
-                callback,
-                "cpu",
+class TestParallelFork(unittest.TestCase):
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    # call it explicitly as it needs no GPU context in the process
+    def _test_parallel_fork_cpu_only(self):
+        pipeline_pairs = 4
+        batch_size = 10
+        iters = 40
+        callback = utils.ExtCallback((4, 5), iters * batch_size, np.int32)
+        parallel_pipes = [
+            (
+                utils.create_pipe(
+                    callback,
+                    "cpu",
+                    batch_size,
+                    py_num_workers=4,
+                    py_start_method="fork",
+                    parallel=True,
+                    device_id=None,
+                ),
+                utils.create_pipe(
+                    callback,
+                    "cpu",
+                    batch_size,
+                    py_num_workers=4,
+                    py_start_method="fork",
+                    parallel=True,
+                    device_id=None,
+                ),
+            )
+            for i in range(pipeline_pairs)
+        ]
+        for pipe0, pipe1 in parallel_pipes:
+            pipe0.build()
+            pipe1.build()
+            capture_processes(pipe0._py_pool)
+            capture_processes(pipe1._py_pool)
+            utils.compare_pipelines(pipe0, pipe1, batch_size, iters)
+
+    def test_parallel_fork(self):
+        epoch_size = 250
+        callback = utils.ExtCallback((4, 5), epoch_size, np.int32)
+        # if context is already initialized, use spawn to avoid fork which will fail immediately
+        init_method = "fork" if not _b.IsDriverInitialized() else "spawn"
+        pipes = [
+            (
+                utils.create_pipe(
+                    callback,
+                    "cpu",
+                    batch_size,
+                    py_num_workers=num_workers,
+                    py_start_method=init_method,
+                    parallel=True,
+                ),
+                utils.create_pipe(callback, "cpu", batch_size, parallel=False),
+                dtype,
                 batch_size,
-                py_num_workers=4,
-                py_start_method="fork",
-                parallel=True,
-                device_id=None,
-            ),
+            )
+            for dtype in [np.float32, np.int16]
+            for num_workers in [1, 3, 4]
+            for batch_size in [1, 16, 150, 250]
+        ]
+        pipes.append(
+            (
+                utils.create_pipe(
+                    Iterable(32, (4, 5), dtype=np.int16),
+                    "cpu",
+                    32,
+                    py_num_workers=1,
+                    py_start_method=init_method,
+                    parallel=True,
+                    batch=True,
+                ),
+                utils.create_pipe(
+                    Iterable(32, (4, 5), dtype=np.int16), "cpu", 32, parallel=False, batch=True
+                ),
+                np.int16,
+                32,
+            )
         )
-        for i in range(pipeline_pairs)
-    ]
-    for pipe0, pipe1 in parallel_pipes:
-        pipe0.build()
-        pipe1.build()
-        utils.capture_processes(pipe0._py_pool)
-        utils.capture_processes(pipe1._py_pool)
-        utils.compare_pipelines(pipe0, pipe1, batch_size, iters)
+
+        for parallel_pipe, _, _, _ in pipes:
+            parallel_pipe.start_py_workers()
+        for parallel_pipe, pipe, dtype, batch_size in pipes:
+            utils.check_callback(parallel_pipe, pipe, epoch_size, batch_size, dtype)
+            parallel_pipe._py_pool.close()
+        # test that another pipeline with forking initialization fails
+        # as a CUDA context is already initialized
+        parallel_pipe = utils.create_pipe(
+            callback, "cpu", 16, py_num_workers=4, py_start_method="fork", parallel=True
+        )
+        raises(
+            RuntimeError, "Cannot fork a process when the CUDA has been initialized in the process."
+        )(utils.build_and_run_pipeline)(parallel_pipe, 1)
 
 
 @raises(
@@ -189,117 +257,67 @@ def test_parallel_no_workers():
     parallel_pipe.build()
 
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def test_parallel_fork():
-    epoch_size = 250
-    callback = utils.ExtCallback((4, 5), epoch_size, np.int32)
-    pipes = [
-        (
-            utils.create_pipe(
-                callback,
-                "cpu",
-                batch_size,
-                py_num_workers=num_workers,
-                py_start_method="fork",
-                parallel=True,
-            ),
-            utils.create_pipe(callback, "cpu", batch_size, parallel=False),
-            dtype,
-            batch_size,
-        )
-        for dtype in [np.float32, np.int16]
-        for num_workers in [1, 3, 4]
-        for batch_size in [1, 16, 150, 250]
-    ]
-    pipes.append(
-        (
-            utils.create_pipe(
-                Iterable(32, (4, 5), dtype=np.int16),
-                "cpu",
-                32,
-                py_num_workers=1,
-                py_start_method="fork",
-                parallel=True,
-                batch=True,
-            ),
-            utils.create_pipe(
-                Iterable(32, (4, 5), dtype=np.int16), "cpu", 32, parallel=False, batch=True
-            ),
-            np.int16,
-            32,
-        )
-    )
-    for parallel_pipe, _, _, _ in pipes:
-        parallel_pipe.start_py_workers()
-    for parallel_pipe, pipe, dtype, batch_size in pipes:
-        yield utils.check_callback, parallel_pipe, pipe, epoch_size, batch_size, dtype
-        # explicitly call py_pool close
-        # as nose might still reference parallel_pipe from the yield above
-        parallel_pipe._py_pool.close()
-    # test that another pipeline with forking initialization fails
-    # as there is CUDA contexts already initialized
-    parallel_pipe = utils.create_pipe(
-        callback, "cpu", 16, py_num_workers=4, py_start_method="fork", parallel=True
-    )
-    yield raises(
-        RuntimeError, "Cannot fork a process when the CUDA has been initialized in the process."
-    )(utils.build_and_run_pipeline), parallel_pipe, 1
-
-
-def test_dtypes():
-    yield from utils.check_spawn_with_callback(utils.ExtCallback)
+class TestSimpleCallbacks:
+    def setUp(self):
+        setup_function()
 
+    def tearDown(self):
+        teardown_function()
 
-def test_random_data():
-    yield from utils.check_spawn_with_callback(
-        utils.ExtCallback, shapes=[(100, 40, 3), (8, 64, 64, 3)], random_data=True
-    )
+    def test_dtypes(self):
+        utils.check_spawn_with_callback(utils.ExtCallback)
 
+    def test_random_data(self):
+        utils.check_spawn_with_callback(
+            utils.ExtCallback, shapes=[(100, 40, 3), (8, 64, 64, 3)], random_data=True
+        )
 
-def test_randomly_shaped_data():
-    yield from utils.check_spawn_with_callback(
-        utils.ExtCallback,
-        shapes=[(100, 40, 3), (8, 64, 64, 3)],
-        random_data=True,
-        random_shape=True,
-    )
+    def test_randomly_shaped_data(self):
+        utils.check_spawn_with_callback(
+            utils.ExtCallback,
+            shapes=[(100, 40, 3), (8, 64, 64, 3)],
+            random_data=True,
+            random_shape=True,
+        )
 
+    def test_num_outputs(self):
+        utils.check_spawn_with_callback(
+            utils.ExtCallbackMultipleOutputs,
+            utils.ExtCallbackMultipleOutputs,
+            num_outputs=2,
+            dtypes=[np.uint8, float],
+        )
 
-def test_num_outputs():
-    yield from utils.check_spawn_with_callback(
-        utils.ExtCallbackMultipleOutputs,
-        utils.ExtCallbackMultipleOutputs,
-        num_outputs=2,
-        dtypes=[np.uint8, float],
-    )
+    def test_tensor_cpu(self):
+        utils.check_spawn_with_callback(utils.ExtCallbackTensorCPU)
 
 
-def test_tensor_cpu():
-    yield from utils.check_spawn_with_callback(utils.ExtCallbackTensorCPU)
+class TestExceptionPropagation:
+    def setUp(self):
+        setup_function()
 
+    def tearDown(self):
+        teardown_function()
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_exception_propagation(callback, batch_size, num_workers, expected):
-    pipe = utils.create_pipe(
-        callback,
-        "cpu",
-        batch_size,
-        py_num_workers=num_workers,
-        py_start_method="spawn",
-        parallel=True,
+    @cartesian_params(
+        [(StopIteration, StopIteration), (utils.CustomException, Exception)],
+        [1, 4],  # num_workers
+        [1, 15, 150],  # batch_size
     )
-    raises(expected)(utils.build_and_run_pipeline)(pipe, None)
-
-
-def test_exception_propagation():
-    for raised, expected in [(StopIteration, StopIteration), (utils.CustomException, Exception)]:
+    def test_exception_propagation(self, exceptions, num_workers, batch_size):
+        raised, expected = exceptions
         callback = utils.ExtCallback((4, 4), 250, np.int32, exception_class=raised)
-        for num_workers in [1, 4]:
-            for batch_size in [1, 15, 150]:
-                yield _test_exception_propagation, callback, batch_size, num_workers, expected
+        pipe = utils.create_pipe(
+            callback,
+            "cpu",
+            batch_size,
+            py_num_workers=num_workers,
+            py_start_method="spawn",
+            parallel=True,
+        )
+        raises(expected)(utils.build_and_run_pipeline)(pipe, None)
 
 
-@with_setup(utils.setup_function, utils.teardown_function)
 def _test_stop_iteration_resume(callback, batch_size, layout, num_workers):
     pipe = utils.create_pipe(
         callback,
@@ -313,34 +331,57 @@ def _test_stop_iteration_resume(callback, batch_size, layout, num_workers):
     utils.check_stop_iteration_resume(pipe, batch_size, layout)
 
 
-def test_stop_iteration_resume():
-    callback = utils.ExtCallback((4, 4), 250, "int32")
-    layout = "XY"
-    for num_workers in [1, 4]:
-        for batch_size in [1, 15, 150]:
-            yield _test_stop_iteration_resume, callback, batch_size, layout, num_workers
+class TestStopIterationResume:
+    def setUp(self):
+        setup_function()
 
+    def tearDown(self):
+        teardown_function()
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_layout(callback, batch_size, layout, num_workers):
-    pipe = utils.create_pipe(
-        callback,
-        "cpu",
-        batch_size,
-        layout=layout,
-        py_num_workers=num_workers,
-        py_start_method="spawn",
-        parallel=True,
+    @cartesian_params(
+        [1, 4],
+        [1, 15, 150],
     )
-    utils.check_layout(pipe, layout)
+    def test_stop_iteration_resume(self, num_workers, batch_size):
+        callback = utils.ExtCallback((4, 4), 250, "int32")
+        layout = "XY"
+        pipe = utils.create_pipe(
+            callback,
+            "cpu",
+            batch_size,
+            layout=layout,
+            py_num_workers=num_workers,
+            py_start_method="spawn",
+            parallel=True,
+        )
+        utils.check_stop_iteration_resume(pipe, batch_size, layout)
+
 
+class TestLayout:
+    def setUp(self):
+        setup_function()
 
-def test_layout():
-    for layout, dims in zip(["X", "XY", "XYZ"], ((4,), (4, 4), (4, 4, 4))):
+    def tearDown(self):
+        teardown_function()
+
+    @cartesian_params(
+        [((4,), "X"), ((4, 4), "XY"), ((4, 4, 4), "XYZ")],
+        [1, 4],  # num_workers
+        [1, 256, 600],  # batch_size
+    )
+    def test_layout(self, inputs_description, num_workers, batch_size):
+        dims, layout = inputs_description
         callback = utils.ExtCallback(dims, 1024, "int32")
-        for num_workers in [1, 4]:
-            for batch_size in [1, 256, 600]:
-                yield _test_layout, callback, batch_size, layout, num_workers
+        pipe = utils.create_pipe(
+            callback,
+            "cpu",
+            batch_size,
+            layout=layout,
+            py_num_workers=num_workers,
+            py_start_method="spawn",
+            parallel=True,
+        )
+        utils.check_layout(pipe, layout)
 
 
 class ext_cb:
@@ -352,38 +393,6 @@ def __call__(self, sinfo):
         return np.full(self.shape, sinfo.idx_in_epoch, dtype=np.int32)
 
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_vs_non_parallel(batch_size, cb_parallel, cb_seq, batch, py_num_workers):
-    pipe = dali.Pipeline(
-        batch_size=batch_size,
-        device_id=None,
-        num_threads=5,
-        py_num_workers=py_num_workers,
-        py_start_method="spawn",
-    )
-    with pipe:
-        ext_seq = dali.fn.external_source(cb_parallel, batch=batch, parallel=False)
-        ext_par = dali.fn.external_source(cb_seq, batch=batch, parallel=True)
-        pipe.set_outputs(ext_seq, ext_par)
-    pipe.build()
-    utils.capture_processes(pipe._py_pool)
-    for i in range(10):
-        seq, par = pipe.run()
-        for j in range(batch_size):
-            s = seq.at(j)
-            p = par.at(j)
-            assert np.array_equal(s, p)
-
-
-def test_vs_non_parallel():
-    for shape in [[], [10], [100, 100, 100]]:
-        for batch_size, cb_parallel, cb_seq, batch, py_num_workers in [
-            (50, ext_cb("cb 1", shape), ext_cb("cb 2", shape), False, 14),
-            (50, Iterable(50, shape), Iterable(50, shape), True, 1),
-        ]:
-            yield _test_vs_non_parallel, batch_size, cb_parallel, cb_seq, batch, py_num_workers
-
-
 def generator_shape_empty():
     count = 0
     while True:
@@ -402,54 +411,65 @@ def generator_shape_100x3():
         yield [np.full([10, 10, 10], count + i) for i in range(50)]
 
 
-def test_generator_vs_non_parallel():
-    for cb in [generator_shape_empty, generator_shape_10, generator_shape_100x3]:
-        yield _test_vs_non_parallel, 50, cb, cb, True, 1
+class TestVsNonParallel:
+    def setUp(self):
+        setup_function()
 
+    def tearDown(self):
+        teardown_function()
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_cycle_raise(cb, is_gen_fun, batch_size, epoch_size, reader_queue_size):
-    pipe = utils.create_pipe(
-        cb,
-        "cpu",
-        batch_size=batch_size,
-        py_num_workers=1,
-        py_start_method="spawn",
-        parallel=True,
-        device_id=None,
-        batch=True,
-        num_threads=5,
-        cycle="raise",
-        reader_queue_depth=reader_queue_size,
+    @cartesian_params(
+        [[], [10], [100, 100, 100]],
+        [
+            (50, functools.partial(ext_cb, "cb 1"), functools.partial(ext_cb, "cb 2"), False, 14),
+            (50, functools.partial(Iterable, 50), functools.partial(Iterable, 50), True, 1),
+        ],
     )
-    pipe.build()
-    utils.capture_processes(pipe._py_pool)
-    if is_gen_fun:
-        refer_iter = cb()
-    else:
-        refer_iter = cb
-    for _ in range(3):
-        i = 0
-        while True:
-            try:
-                (batch,) = pipe.run()
-                expected_batch = next(refer_iter)
-                assert len(batch) == len(
-                    expected_batch
-                ), f"Batch length mismatch: expected {len(expected_batch)}, got {len(batch)}"
-                for sample, expected_sample in zip(batch, expected_batch):
-                    np.testing.assert_equal(sample, expected_sample)
-                i += 1
-            except StopIteration:
-                pipe.reset()
-                if is_gen_fun:
-                    refer_iter = cb()
-                else:
-                    refer_iter = iter(cb)
-                assert (
-                    i == epoch_size
-                ), f"Number of iterations mismatch: expected {epoch_size}, got {i}"
-                break
+    def test_vs_non_parallel(self, shape, pipe_description):
+        batch_size, cb_parallel, cb_seq, batch, py_num_workers = pipe_description
+        cb_parallel = cb_parallel(shape)
+        cb_seq = cb_seq(shape)
+        pipe = dali.Pipeline(
+            batch_size=batch_size,
+            device_id=None,
+            num_threads=5,
+            py_num_workers=py_num_workers,
+            py_start_method="spawn",
+        )
+        with pipe:
+            ext_seq = dali.fn.external_source(cb_parallel, batch=batch, parallel=False)
+            ext_par = dali.fn.external_source(cb_seq, batch=batch, parallel=True)
+            pipe.set_outputs(ext_seq, ext_par)
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        for i in range(10):
+            seq, par = pipe.run()
+            for j in range(batch_size):
+                s = seq.at(j)
+                p = par.at(j)
+                assert np.array_equal(s, p)
+
+    @params(generator_shape_empty, generator_shape_10, generator_shape_100x3)
+    def test_generator_vs_non_parallel(self, cb):
+        pipe = dali.Pipeline(
+            batch_size=50,
+            device_id=None,
+            num_threads=5,
+            py_num_workers=1,
+            py_start_method="spawn",
+        )
+        with pipe:
+            ext_seq = dali.fn.external_source(cb, batch=True, parallel=False)
+            ext_par = dali.fn.external_source(cb, batch=True, parallel=True)
+            pipe.set_outputs(ext_seq, ext_par)
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        for i in range(10):
+            seq, par = pipe.run()
+            for j in range(50):
+                s = seq.at(j)
+                p = par.at(j)
+                assert np.array_equal(s, p)
 
 
 def generator_epoch_size_1():
@@ -461,205 +481,203 @@ def generator_epoch_size_4():
         yield [np.full((4, 5), j + i) for i in range(20)]
 
 
-def test_cycle_raise():
-    batch_size = 20
-    for epoch_size, cb, is_gen_fun in [
-        (1, Iterable(batch_size, (4, 5), epoch_size=1), False),
-        (4, Iterable(batch_size, (4, 5), epoch_size=4), False),
-        (1, generator_epoch_size_1, True),
-        (4, generator_epoch_size_4, True),
-    ]:
-        for reader_queue_size in (1, 2, 6):
-            yield _test_cycle_raise, cb, is_gen_fun, batch_size, epoch_size, reader_queue_size
+class TestCycleRaise:
+    def setUp(self):
+        setup_function()
 
+    def tearDown(self):
+        teardown_function()
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_cycle_quiet(cb, is_gen_fun, batch_size, epoch_size, reader_queue_size):
-    pipe = utils.create_pipe(
-        cb,
-        "cpu",
-        batch_size=batch_size,
-        py_num_workers=1,
-        py_start_method="spawn",
-        parallel=True,
-        device_id=None,
-        batch=True,
-        num_threads=5,
-        cycle="quiet",
-        reader_queue_depth=reader_queue_size,
+    BATCH_SIZE = 20
+
+    @cartesian_params(
+        [
+            (1, Iterable(BATCH_SIZE, (4, 5), epoch_size=1), False),
+            (4, Iterable(BATCH_SIZE, (4, 5), epoch_size=4), False),
+            (1, generator_epoch_size_1, True),
+            (4, generator_epoch_size_4, True),
+        ],
+        (1, 2, 6),
     )
-    pipe.build()
-    utils.capture_processes(pipe._py_pool)
-    refer_iter = cb
-    for i in range(3 * epoch_size + 1):
-        if i % epoch_size == 0:
-            if is_gen_fun:
-                refer_iter = cb()
-            else:
-                refer_iter = iter(cb)
-        (batch,) = pipe.run()
-        expected_batch = next(refer_iter)
-        assert len(batch) == len(
-            expected_batch
-        ), f"Batch length mismatch: expected {len(expected_batch)}, got {len(batch)}"
-        for sample, expected_sample in zip(batch, expected_batch):
-            np.testing.assert_equal(sample, expected_sample)
-
-
-def test_cycle_quiet():
-    batch_size = 20
-    for epoch_size, cb, is_gen_fun in [
-        (1, Iterable(batch_size, (4, 5), epoch_size=1), False),
-        (4, Iterable(batch_size, (4, 5), epoch_size=4), False),
-        (1, generator_epoch_size_1, True),
-        (4, generator_epoch_size_4, True),
-    ]:
-        for reader_queue_size in (1, 2, 6):
-            yield _test_cycle_quiet, cb, is_gen_fun, batch_size, epoch_size, reader_queue_size
-
-
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_cycle_quiet_non_resetable(iterable, reader_queue_size, batch_size, epoch_size):
-    pipe = utils.create_pipe(
-        iterable,
-        "cpu",
-        batch_size=batch_size,
-        py_num_workers=1,
-        py_start_method="spawn",
-        parallel=True,
-        device_id=None,
-        batch=True,
-        num_threads=5,
-        cycle="quiet",
-        reader_queue_depth=reader_queue_size,
+    def test_cycle_raise(self, case_description, reader_queue_size):
+        epoch_size, cb, is_gen_fun = case_description
+        pipe = utils.create_pipe(
+            cb,
+            "cpu",
+            batch_size=self.BATCH_SIZE,
+            py_num_workers=1,
+            py_start_method="spawn",
+            parallel=True,
+            device_id=None,
+            batch=True,
+            num_threads=5,
+            cycle="raise",
+            reader_queue_depth=reader_queue_size,
+        )
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        if is_gen_fun:
+            refer_iter = cb()
+        else:
+            refer_iter = cb
+        for _ in range(3):
+            i = 0
+            while True:
+                try:
+                    (batch,) = pipe.run()
+                    expected_batch = next(refer_iter)
+                    assert len(batch) == len(
+                        expected_batch
+                    ), f"Batch length mismatch: expected {len(expected_batch)}, got {len(batch)}"
+                    for sample, expected_sample in zip(batch, expected_batch):
+                        np.testing.assert_equal(sample, expected_sample)
+                    i += 1
+                except StopIteration:
+                    pipe.reset()
+                    if is_gen_fun:
+                        refer_iter = cb()
+                    else:
+                        refer_iter = iter(cb)
+                    assert (
+                        i == epoch_size
+                    ), f"Number of iterations mismatch: expected {epoch_size}, got {i}"
+                    break
+
+
+class TestCycleQuiet:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    BATCH_SIZE = 20
+    EPOCH_SIZE = 3
+
+    @cartesian_params(
+        [
+            (1, Iterable(BATCH_SIZE, (4, 5), epoch_size=1), False),
+            (4, Iterable(BATCH_SIZE, (4, 5), epoch_size=4), False),
+            (1, generator_epoch_size_1, True),
+            (4, generator_epoch_size_4, True),
+        ],
+        (1, 2, 6),
     )
-    pipe.build()
-    utils.capture_processes(pipe._py_pool)
-    for _ in range(epoch_size):
-        pipe.run()
-    try:
-        pipe.run()
-    except StopIteration:
-        pipe.reset()
+    def test_cycle_quiet(self, case_description, reader_queue_size):
+        epoch_size, cb, is_gen_fun = case_description
+        pipe = utils.create_pipe(
+            cb,
+            "cpu",
+            batch_size=self.BATCH_SIZE,
+            py_num_workers=1,
+            py_start_method="spawn",
+            parallel=True,
+            device_id=None,
+            batch=True,
+            num_threads=5,
+            cycle="quiet",
+            reader_queue_depth=reader_queue_size,
+        )
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        refer_iter = cb
+        for i in range(3 * epoch_size + 1):
+            if i % epoch_size == 0:
+                if is_gen_fun:
+                    refer_iter = cb()
+                else:
+                    refer_iter = iter(cb)
+            (batch,) = pipe.run()
+            expected_batch = next(refer_iter)
+            assert len(batch) == len(
+                expected_batch
+            ), f"Batch length mismatch: expected {len(expected_batch)}, got {len(batch)}"
+            for sample, expected_sample in zip(batch, expected_batch):
+                np.testing.assert_equal(sample, expected_sample)
+
+    @params(1, 3, 6)
+    def test_cycle_quiet_non_resetable(self, reader_queue_size):
+        iterable = FaultyResetIterable(self.BATCH_SIZE, (5, 4), epoch_size=self.EPOCH_SIZE)
+        pipe = utils.create_pipe(
+            iterable,
+            "cpu",
+            batch_size=self.BATCH_SIZE,
+            py_num_workers=1,
+            py_start_method="spawn",
+            parallel=True,
+            device_id=None,
+            batch=True,
+            num_threads=5,
+            cycle="quiet",
+            reader_queue_depth=reader_queue_size,
+        )
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        for _ in range(self.EPOCH_SIZE):
+            pipe.run()
         try:
             pipe.run()
         except StopIteration:
-            pass
+            pipe.reset()
+            try:
+                pipe.run()
+            except StopIteration:
+                pass
+            else:
+                assert False, "Expected stop iteration"
         else:
-            assert False, "Expected stop iteration"
-    else:
-        assert False, "Expected stop iteration at the end of the epoch"
+            assert False, "Expected stop iteration at the end of the epoch"
 
 
-def test_cycle_quiet_non_resetable():
-    epoch_size = 3
-    batch_size = 20
-    iterable = FaultyResetIterable(batch_size, (5, 4), epoch_size=epoch_size)
-    for reader_queue_size in (1, 3, 6):
-        yield _test_cycle_quiet_non_resetable, iterable, reader_queue_size, batch_size, epoch_size
+class TestCycleNoResetting(unittest.TestCase):
+    def setUp(self):
+        setup_function()
 
+    def tearDown(self):
+        teardown_function()
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_cycle_no_resetting(cb, batch_size, epoch_size, reader_queue_size):
-    pipe = utils.create_pipe(
-        cb,
-        "cpu",
-        batch_size=batch_size,
-        py_num_workers=1,
-        py_start_method="spawn",
-        parallel=True,
-        device_id=None,
-        batch=True,
-        num_threads=5,
-        cycle=None,
-        reader_queue_depth=reader_queue_size,
-    )
-    pipe.build()
-    utils.capture_processes(pipe._py_pool)
-    for _ in range(epoch_size):
-        pipe.run()
-    try:
-        pipe.run()
-    except StopIteration:
-        pipe.reset()
-    else:
-        assert False, "Expected stop iteration"
-    pipe.run()
-
-
-def test_cycle_no_resetting():
-    batch_size = 20
-    for epoch_size, cb in [
-        (1, Iterable(batch_size, (4, 5), epoch_size=1)),
-        (4, Iterable(batch_size, (4, 5), epoch_size=4)),
-        (1, generator_epoch_size_1),
-        (4, generator_epoch_size_4),
-    ]:
-        for reader_queue_size in (1, 2, 6):
-            yield raises(StopIteration)(
-                _test_cycle_no_resetting
-            ), cb, batch_size, epoch_size, reader_queue_size
-
-
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_all_kinds_parallel(
-    sample_cb, batch_cb, iter_cb, batch_size, py_num_workers, reader_queue_sizes, num_iters
-):
-    @dali.pipeline_def(
-        batch_size=batch_size,
-        num_threads=4,
-        device_id=None,
-        py_num_workers=py_num_workers,
-        py_start_method="spawn",
-    )
-    def pipeline():
-        queue_size_1, queue_size_2, queue_size_3 = reader_queue_sizes
-        sample_out = dali.fn.external_source(
-            source=sample_cb, parallel=True, batch=False, prefetch_queue_depth=queue_size_1
-        )
-        batch_out = dali.fn.external_source(
-            source=batch_cb,
-            parallel=True,
-            batch=True,
-            prefetch_queue_depth=queue_size_2,
-            batch_info=True,
-        )
-        iter_out = dali.fn.external_source(
-            source=iter_cb,
-            parallel=True,
-            batch=True,
-            prefetch_queue_depth=queue_size_3,
-            cycle="raise",
-        )
-        return (sample_out, batch_out, iter_out)
+    BATCH_SIZE = 20
 
-    pipe = pipeline()
-    pipe.build()
-    utils.capture_processes(pipe._py_pool)
-    for _ in range(3):
-        i = 0
-        while True:
+    @cartesian_params(
+        [
+            (1, Iterable(BATCH_SIZE, (4, 5), epoch_size=1)),
+            (4, Iterable(BATCH_SIZE, (4, 5), epoch_size=4)),
+            (1, generator_epoch_size_1),
+            (4, generator_epoch_size_4),
+        ],
+        (1, 2, 6),
+    )
+    def test_cycle_no_resetting(self, es_description, reader_queue_size):
+        with self.assertRaises(StopIteration):
+            epoch_size, cb = es_description
+            pipe = utils.create_pipe(
+                cb,
+                "cpu",
+                batch_size=self.BATCH_SIZE,
+                py_num_workers=1,
+                py_start_method="spawn",
+                parallel=True,
+                device_id=None,
+                batch=True,
+                num_threads=5,
+                cycle=None,
+                reader_queue_depth=reader_queue_size,
+            )
+            pipe.build()
+            capture_processes(pipe._py_pool)
+            for _ in range(epoch_size):
+                pipe.run()
             try:
-                sample_outs, batch_outs, iter_outs = pipe.run()
-                assert len(sample_outs) == len(
-                    batch_outs
-                ), f"Batch length mismatch: sample: {len(sample_outs)}, batch: {len(batch_outs)}"
-                assert len(batch_outs) == len(
-                    iter_outs
-                ), f"Batch length mismatch: batch: {len(batch_outs)}, iter: {len(iter_outs)}"
-                for sample_out, batch_out, iter_out in zip(sample_outs, batch_outs, iter_outs):
-                    np.testing.assert_equal(np.array(sample_out), np.array(batch_out))
-                    np.testing.assert_equal(np.array(batch_out), np.array(iter_out))
-                i += 1
+                pipe.run()
             except StopIteration:
                 pipe.reset()
-                assert (
-                    i == num_iters
-                ), f"Number of iterations mismatch: expected {num_iters}, got {i}"
-                break
+            else:
+                assert False, "Expected stop iteration"
+            pipe.run()
 
 
-def test_all_kinds_parallel():
+def _make_all_kinds_parallel_cases():
+    cases = []
     for batch_size in (1, 17):
         for num_iters in (1, 3, 31):
             for trailing in (0, 30):
@@ -678,16 +696,92 @@ def test_all_kinds_parallel():
                     (1, 1, 3),
                 ):
                     for num_workers in (1, 7):
-                        yield (
-                            _test_all_kinds_parallel,
-                            sample_cb,
-                            batch_cb,
-                            iterator_cb,
-                            batch_size,
-                            num_workers,
-                            reader_queue_sizes,
-                            num_iters,
+                        cases.append(
+                            (
+                                sample_cb,
+                                batch_cb,
+                                iterator_cb,
+                                batch_size,
+                                num_workers,
+                                reader_queue_sizes,
+                                num_iters,
+                            )
                         )
+    return cases
+
+
+class TestAllKindsParallel:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    @params(*_make_all_kinds_parallel_cases())
+    def test_all_kinds_parallel(
+        self,
+        sample_cb,
+        batch_cb,
+        iterator_cb,
+        batch_size,
+        num_workers,
+        reader_queue_sizes,
+        num_iters,
+    ):
+        @dali.pipeline_def(
+            batch_size=batch_size,
+            num_threads=4,
+            device_id=None,
+            py_num_workers=num_workers,
+            py_start_method="spawn",
+        )
+        def pipeline():
+            queue_size_1, queue_size_2, queue_size_3 = reader_queue_sizes
+            sample_out = dali.fn.external_source(
+                source=sample_cb, parallel=True, batch=False, prefetch_queue_depth=queue_size_1
+            )
+            batch_out = dali.fn.external_source(
+                source=batch_cb,
+                parallel=True,
+                batch=True,
+                prefetch_queue_depth=queue_size_2,
+                batch_info=True,
+            )
+            iter_out = dali.fn.external_source(
+                source=iterator_cb,
+                parallel=True,
+                batch=True,
+                prefetch_queue_depth=queue_size_3,
+                cycle="raise",
+            )
+            return (sample_out, batch_out, iter_out)
+
+        pipe = pipeline()
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        for _ in range(3):
+            i = 0
+            while True:
+                try:
+                    sample_outs, batch_outs, iter_outs = pipe.run()
+                    assert len(sample_outs) == len(batch_outs), (
+                        f"Batch length mismatch: sample: {len(sample_outs)}, "
+                        f"batch: {len(batch_outs)}"
+                    )
+                    assert len(batch_outs) == len(iter_outs), (
+                        f"Batch length mismatch: batch: {len(batch_outs)}, "
+                        f"iter: {len(iter_outs)}"
+                    )
+                    for sample_out, batch_out, iter_out in zip(sample_outs, batch_outs, iter_outs):
+                        np.testing.assert_equal(np.array(sample_out), np.array(batch_out))
+                        np.testing.assert_equal(np.array(batch_out), np.array(iter_out))
+                    i += 1
+                except StopIteration:
+                    pipe.reset()
+                    assert (
+                        i == num_iters
+                    ), f"Number of iterations mismatch: expected {num_iters}, got {i}"
+                    break
 
 
 def collect_iterations(pipe, num_iters):
@@ -702,122 +796,123 @@ def collect_iterations(pipe, num_iters):
     return outs
 
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_cycle_multiple_iterators(
-    batch_size, iters_num, py_num_workers, reader_queue_sizes, cycle_policies, epoch_sizes
-):
-    @dali.pipeline_def(
-        batch_size=batch_size,
-        num_threads=4,
-        device_id=None,
-        py_num_workers=py_num_workers,
-        py_start_method="spawn",
-    )
-    def pipeline(sample_cb, iter_1, iter_2, parallel):
-        if parallel:
-            queue_size_0, queue_size_1, queue_size_2 = reader_queue_sizes
-        else:
-            queue_size_0, queue_size_1, queue_size_2 = None, None, None
-        cycle_1, cycle_2 = cycle_policies
-        sample_out = dali.fn.external_source(
-            source=sample_cb, parallel=parallel, batch=False, prefetch_queue_depth=queue_size_0
-        )
-        iter1_out = dali.fn.external_source(
-            source=iter_1,
-            parallel=parallel,
-            batch=True,
-            prefetch_queue_depth=queue_size_1,
-            cycle=cycle_1,
-        )
-        iter2_out = dali.fn.external_source(
-            source=iter_2,
-            parallel=parallel,
-            batch=True,
-            prefetch_queue_depth=queue_size_2,
-            cycle=cycle_2,
-        )
-        return (sample_out, iter1_out, iter2_out)
-
-    shape = (2, 3)
-    sample_epoch_size, iter_1_epoch_size, iter_2_epoch_size = epoch_sizes
-    sample_cb = utils.ExtCallback((4, 5), sample_epoch_size * batch_size, np.int32)
-    iter_1 = Iterable(batch_size, shape, epoch_size=iter_1_epoch_size, dtype=np.int32)
-    iter_2 = Iterable(batch_size, shape, epoch_size=iter_2_epoch_size, dtype=np.int32)
-    pipe_parallel = pipeline(sample_cb, iter_1, iter_2, parallel=True)
-    pipe_seq = pipeline(sample_cb, iter_1, iter_2, parallel=False)
-    pipe_parallel.build()
-    utils.capture_processes(pipe_parallel._py_pool)
-    pipe_seq.build()
-    parallel_outs = collect_iterations(pipe_parallel, iters_num)
-    seq_outs = collect_iterations(pipe_seq, iters_num)
-    assert len(parallel_outs) == len(seq_outs)
-    for parallel_out, seq_out in zip(parallel_outs, seq_outs):
-        if parallel_out == StopIteration or seq_out == StopIteration:
-            assert parallel_out == seq_out
-            continue
-        assert len(parallel_out) == len(seq_out) == 3
-        for batch_parallel, batch_seq in zip(parallel_out, seq_out):
-            assert len(batch_parallel) == len(batch_seq) == batch_size
-            for sample_parallel, sample_seq in zip(batch_parallel, batch_seq):
-                np.testing.assert_equal(np.array(sample_parallel), np.array(sample_seq))
-
-
-def test_cycle_multiple_iterators():
-    batch_size = 50
-    iters_num = 17
-    num_workers = 4
-    for prefetch_queue_depths in ((3, 1, 1), (1, 3, 1), (1, 1, 3), (1, 1, 1), (3, 3, 3)):
-        for cycle_policies in (
+class TestCycleMultipleIterators:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    @cartesian_params(
+        ((3, 1, 1), (1, 3, 1), (1, 1, 3), (1, 1, 1), (3, 3, 3)),
+        (
             ("raise", "raise"),
             ("quiet", "raise"),
             ("raise", "quiet"),
             ("quiet", "quiet"),
             (True, True),
-        ):
-            for epoch_sizes in ((8, 4, 6), (8, 6, 4), (4, 6, 8), (1, 1, 1)):
-                yield (
-                    _test_cycle_multiple_iterators,
-                    batch_size,
-                    iters_num,
-                    num_workers,
-                    prefetch_queue_depths,
-                    cycle_policies,
-                    epoch_sizes,
-                )
+        ),
+        ((8, 4, 6), (8, 6, 4), (4, 6, 8), (1, 1, 1)),
+    )
+    def test_cycle_multiple_iterators(self, prefetch_queue_depths, cycle_policies, epoch_sizes):
+        batch_size = 50
+        iters_num = 17
+        num_workers = 4
+
+        @dali.pipeline_def(
+            batch_size=batch_size,
+            num_threads=4,
+            device_id=None,
+            py_num_workers=num_workers,
+            py_start_method="spawn",
+        )
+        def pipeline(sample_cb, iter_1, iter_2, parallel):
+            if parallel:
+                queue_size_0, queue_size_1, queue_size_2 = prefetch_queue_depths
+            else:
+                queue_size_0, queue_size_1, queue_size_2 = None, None, None
+            cycle_1, cycle_2 = cycle_policies
+            sample_out = dali.fn.external_source(
+                source=sample_cb, parallel=parallel, batch=False, prefetch_queue_depth=queue_size_0
+            )
+            iter1_out = dali.fn.external_source(
+                source=iter_1,
+                parallel=parallel,
+                batch=True,
+                prefetch_queue_depth=queue_size_1,
+                cycle=cycle_1,
+            )
+            iter2_out = dali.fn.external_source(
+                source=iter_2,
+                parallel=parallel,
+                batch=True,
+                prefetch_queue_depth=queue_size_2,
+                cycle=cycle_2,
+            )
+            return (sample_out, iter1_out, iter2_out)
+
+        shape = (2, 3)
+        sample_epoch_size, iter_1_epoch_size, iter_2_epoch_size = epoch_sizes
+        sample_cb = utils.ExtCallback((4, 5), sample_epoch_size * batch_size, np.int32)
+        iter_1 = Iterable(batch_size, shape, epoch_size=iter_1_epoch_size, dtype=np.int32)
+        iter_2 = Iterable(batch_size, shape, epoch_size=iter_2_epoch_size, dtype=np.int32)
+        pipe_parallel = pipeline(sample_cb, iter_1, iter_2, parallel=True)
+        pipe_seq = pipeline(sample_cb, iter_1, iter_2, parallel=False)
+        pipe_parallel.build()
+        capture_processes(pipe_parallel._py_pool)
+        pipe_seq.build()
+        parallel_outs = collect_iterations(pipe_parallel, iters_num)
+        seq_outs = collect_iterations(pipe_seq, iters_num)
+        assert len(parallel_outs) == len(seq_outs)
+        for parallel_out, seq_out in zip(parallel_outs, seq_outs):
+            if parallel_out == StopIteration or seq_out == StopIteration:
+                assert parallel_out == seq_out
+                continue
+            assert len(parallel_out) == len(seq_out) == 3
+            for batch_parallel, batch_seq in zip(parallel_out, seq_out):
+                assert len(batch_parallel) == len(batch_seq) == batch_size
+                for sample_parallel, sample_seq in zip(batch_parallel, batch_seq):
+                    np.testing.assert_equal(np.array(sample_parallel), np.array(sample_seq))
 
 
 def ext_cb2(sinfo):
     return np.array([sinfo.idx_in_epoch, sinfo.idx_in_batch, sinfo.iteration], dtype=np.int32)
 
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def test_discard():
-    bs = 5
-    pipe = dali.Pipeline(
-        batch_size=bs, device_id=None, num_threads=5, py_num_workers=4, py_start_method="spawn"
-    )
-    with pipe:
-        ext1 = dali.fn.external_source([[np.float32(i) for i in range(bs)]] * 3, cycle="raise")
-        ext2 = dali.fn.external_source(ext_cb2, batch=False, parallel=True)
-        ext3 = dali.fn.external_source(ext_cb2, batch=False, parallel=False)
-        pipe.set_outputs(ext1, ext2, ext3)
-    pipe.build()
-    utils.capture_processes(pipe._py_pool)
-    sample_in_epoch = 0
-    iteration = 0
-    for i in range(10):
-        try:
-            e1, e2, e3 = pipe.run()
-            for i in range(bs):
-                assert e1.at(i) == i
-                assert np.array_equal(e2.at(i), np.array([sample_in_epoch, i, iteration]))
-                assert np.array_equal(e3.at(i), np.array([sample_in_epoch, i, iteration]))
-                sample_in_epoch += 1
-            iteration += 1
-        except StopIteration:
-            sample_in_epoch = 0
-            iteration = 0
-            pipe.reset()
+class TestDiscard:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    def test_discard(self):
+        bs = 5
+        pipe = dali.Pipeline(
+            batch_size=bs, device_id=None, num_threads=5, py_num_workers=4, py_start_method="spawn"
+        )
+        with pipe:
+            ext1 = dali.fn.external_source([[np.float32(i) for i in range(bs)]] * 3, cycle="raise")
+            ext2 = dali.fn.external_source(ext_cb2, batch=False, parallel=True)
+            ext3 = dali.fn.external_source(ext_cb2, batch=False, parallel=False)
+            pipe.set_outputs(ext1, ext2, ext3)
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        sample_in_epoch = 0
+        iteration = 0
+        for i in range(10):
+            try:
+                e1, e2, e3 = pipe.run()
+                for i in range(bs):
+                    assert e1.at(i) == i
+                    assert np.array_equal(e2.at(i), np.array([sample_in_epoch, i, iteration]))
+                    assert np.array_equal(e3.at(i), np.array([sample_in_epoch, i, iteration]))
+                    sample_in_epoch += 1
+                iteration += 1
+            except StopIteration:
+                sample_in_epoch = 0
+                iteration = 0
+                pipe.reset()
 
 
 class SampleCb:
@@ -839,99 +934,112 @@ def __call__(self, sample_info):
         )
 
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_epoch_idx(
-    batch_size,
-    epoch_size,
-    cb,
-    py_num_workers,
-    prefetch_queue_depth,
-    reader_queue_depth,
-    batch_mode,
-    batch_info,
-):
-    num_epochs = 3
-    pipe = utils.create_pipe(
-        cb,
-        "cpu",
-        batch_size=batch_size,
-        py_num_workers=py_num_workers,
-        py_start_method="spawn",
-        parallel=True,
-        device_id=0,
-        batch=batch_mode,
-        num_threads=1,
-        cycle=None,
-        batch_info=batch_info,
-        prefetch_queue_depth=prefetch_queue_depth,
-        reader_queue_depth=reader_queue_depth,
-    )
-    pipe.build()
-    utils.capture_processes(pipe._py_pool)
-    for epoch_idx in range(num_epochs):
-        for iteration in range(epoch_size):
-            (batch,) = pipe.run()
-            assert len(batch) == batch_size
-            for sample_i, sample in enumerate(batch):
-                expected = np.array(
-                    [
-                        iteration * batch_size + sample_i,
-                        sample_i,
-                        iteration,
-                        epoch_idx if not batch_mode or batch_info else 0,
-                    ]
-                )
-                np.testing.assert_array_equal(sample, expected)
-        try:
-            pipe.run()
-        except StopIteration:
-            pipe.reset()
-        else:
-            assert False, "expected StopIteration"
-
-
-def test_epoch_idx():
+def _generate_epoch_idx_test_cases():
     num_workers = 4
     prefetch_queue_depth = 2
+    cases = []
     for batch_size in (1, 50):
         for epoch_size in (1, 3, 7):
             for reader_queue_depth in (1, 5):
                 sample_cb = SampleCb(batch_size, epoch_size)
-                yield (
-                    _test_epoch_idx,
-                    batch_size,
-                    epoch_size,
-                    sample_cb,
-                    num_workers,
-                    prefetch_queue_depth,
-                    reader_queue_depth,
-                    False,
-                    None,
+                cases.append(
+                    (
+                        batch_size,
+                        epoch_size,
+                        sample_cb,
+                        num_workers,
+                        prefetch_queue_depth,
+                        reader_queue_depth,
+                        False,
+                        None,
+                    )
                 )
-                batch_cb = SampleCallbackBatched(sample_cb, batch_size, True)
-                yield (
-                    _test_epoch_idx,
-                    batch_size,
-                    epoch_size,
-                    batch_cb,
-                    num_workers,
-                    prefetch_queue_depth,
-                    reader_queue_depth,
-                    True,
-                    True,
+                batch_cb_true = SampleCallbackBatched(sample_cb, batch_size, True)
+                cases.append(
+                    (
+                        batch_size,
+                        epoch_size,
+                        batch_cb_true,
+                        num_workers,
+                        prefetch_queue_depth,
+                        reader_queue_depth,
+                        True,
+                        True,
+                    )
                 )
-                batch_cb = SampleCallbackBatched(sample_cb, batch_size, False)
-                yield (
-                    _test_epoch_idx,
-                    batch_size,
-                    epoch_size,
-                    batch_cb,
-                    num_workers,
-                    prefetch_queue_depth,
-                    reader_queue_depth,
-                    True,
-                    False,
+                batch_cb_false = SampleCallbackBatched(sample_cb, batch_size, False)
+                cases.append(
+                    (
+                        batch_size,
+                        epoch_size,
+                        batch_cb_false,
+                        num_workers,
+                        prefetch_queue_depth,
+                        reader_queue_depth,
+                        True,
+                        False,
+                    )
                 )
+    return cases
+
+
+class TestEpochIdx:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    @params(*_generate_epoch_idx_test_cases())
+    def test_epoch_idx(
+        self,
+        batch_size,
+        epoch_size,
+        cb,
+        py_num_workers,
+        prefetch_queue_depth,
+        reader_queue_depth,
+        batch_mode,
+        batch_info,
+    ):
+        num_epochs = 3
+        pipe = utils.create_pipe(
+            cb,
+            "cpu",
+            batch_size=batch_size,
+            py_num_workers=py_num_workers,
+            py_start_method="spawn",
+            parallel=True,
+            device_id=0,
+            batch=batch_mode,
+            num_threads=1,
+            cycle=None,
+            batch_info=batch_info,
+            prefetch_queue_depth=prefetch_queue_depth,
+            reader_queue_depth=reader_queue_depth,
+        )
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        for epoch_idx in range(num_epochs):
+            for iteration in range(epoch_size):
+                (batch,) = pipe.run()
+                assert len(batch) == batch_size
+                for sample_i, sample in enumerate(batch):
+                    expected = np.array(
+                        [
+                            iteration * batch_size + sample_i,
+                            sample_i,
+                            iteration,
+                            epoch_idx if not batch_mode or batch_info else 0,
+                        ]
+                    )
+                    np.testing.assert_array_equal(sample, expected)
+            try:
+                pipe.run()
+            except StopIteration:
+                pipe.reset()
+            else:
+                assert False, "expected StopIteration"
 
 
 class PermutableSampleCb:
@@ -956,66 +1064,61 @@ def __call__(self, sample_info):
         return np.array([self.perm[sample_info.idx_in_epoch]], dtype=np.int32)
 
 
-@with_setup(utils.setup_function, utils.teardown_function)
-def _test_permute_dataset(
-    batch_size,
-    epoch_size,
-    trailing_samples,
-    cb,
-    py_num_workers,
-    prefetch_queue_depth,
-    reader_queue_depth,
-):
-    num_epochs = 3
-    pipe = utils.create_pipe(
-        cb,
-        "cpu",
-        batch_size=batch_size,
-        py_num_workers=py_num_workers,
-        py_start_method="spawn",
-        parallel=True,
-        device_id=0,
-        batch=False,
-        num_threads=1,
-        cycle=None,
-        prefetch_queue_depth=prefetch_queue_depth,
-        reader_queue_depth=reader_queue_depth,
-    )
-    pipe.build()
-    utils.capture_processes(pipe._py_pool)
-    for epoch_idx in range(num_epochs):
-        epoch_data = [False for _ in range(epoch_size * batch_size + trailing_samples)]
-        for _ in range(epoch_size):
-            (batch,) = pipe.run()
-            assert len(batch) == batch_size
-            for sample in batch:
-                epoch_data[np.array(sample)[0]] = True
-        assert (
-            sum(epoch_data) == epoch_size * batch_size
-        ), "Epoch number {} did not contain some samples from data set".format(epoch_idx)
-        try:
-            pipe.run()
-        except StopIteration:
-            pipe.reset()
-        else:
-            assert False, "expected StopIteration"
+class TestPermuteDataset:
+    def setUp(self):
+        setup_function()
 
+    def tearDown(self):
+        teardown_function()
 
-def test_permute_dataset():
-    for batch_size, trailing_samples in ((4, 0), (100, 0), (100, 99)):
-        for epoch_size in (3, 7):
-            cb = PermutableSampleCb(batch_size, epoch_size, trailing_samples=trailing_samples)
-            for reader_queue_depth in (1, 5):
-                yield (
-                    _test_permute_dataset,
-                    batch_size,
-                    epoch_size,
-                    trailing_samples,
-                    cb,
-                    4,
-                    1,
-                    reader_queue_depth,
-                )
+    @cartesian_params(
+        ((4, 0), (100, 0), (100, 99)),
+        (3, 7),
+        (1, 5),
+    )
+    def test_permute_dataset(
+        self,
+        samples_data,
+        epoch_size,
+        reader_queue_depth,
+    ):
+        batch_size, trailing_samples = samples_data
+        num_epochs = 3
+        py_num_workers = 4
+        prefetch_queue_depth = 1
+        cb = PermutableSampleCb(batch_size, epoch_size, trailing_samples=trailing_samples)
+        pipe = utils.create_pipe(
+            cb,
+            "cpu",
+            batch_size=batch_size,
+            py_num_workers=py_num_workers,
+            py_start_method="spawn",
+            parallel=True,
+            device_id=0,
+            batch=False,
+            num_threads=1,
+            cycle=None,
+            prefetch_queue_depth=prefetch_queue_depth,
+            reader_queue_depth=reader_queue_depth,
+        )
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        for epoch_idx in range(num_epochs):
+            epoch_data = [False for _ in range(epoch_size * batch_size + trailing_samples)]
+            for _ in range(epoch_size):
+                (batch,) = pipe.run()
+                assert len(batch) == batch_size
+                for sample in batch:
+                    epoch_data[np.array(sample)[0]] = True
+            assert (
+                sum(epoch_data) == epoch_size * batch_size
+            ), "Epoch number {} did not contain some samples from data set".format(epoch_idx)
+            try:
+                pipe.run()
+            except StopIteration:
+                pipe.reset()
+            else:
+                assert False, "expected StopIteration"
 
 
 class PerIterShapeSource:
diff --git a/dali/test/python/test_external_source_parallel_custom_serialization.py b/dali/test/python/test_external_source_parallel_custom_serialization.py
index 550213e8f9e..0fc5917f683 100644
--- a/dali/test/python/test_external_source_parallel_custom_serialization.py
+++ b/dali/test/python/test_external_source_parallel_custom_serialization.py
@@ -317,8 +317,9 @@ def _create_and_compare_simple_pipelines(
         _run_and_compare_outputs(batch_size, parallel_pipeline, serial_pipeline)
 
 
-# It uses fork method to start so need to be run as the first test
-def test_no_pickling_in_forking_mode():
+# Kept private and run explicitly because it uses the fork start method,
+# so it needs to run as the first test
+def _test_no_pickling_in_forking_mode():
     # modify callback name so that an attempt to pickle it in spawn mode would fail
     _simple_callback.__name__ = _simple_callback.__qualname__ = "simple_callback"
     _create_and_compare_simple_pipelines(
diff --git a/dali/test/python/test_external_source_parallel_large_sample.py b/dali/test/python/test_external_source_parallel_large_sample.py
index e88b4fabf86..602bebfcc09 100644
--- a/dali/test/python/test_external_source_parallel_large_sample.py
+++ b/dali/test/python/test_external_source_parallel_large_sample.py
@@ -13,54 +13,56 @@
 # limitations under the License.
 
 import numpy as np
-from nose_utils import with_setup
+from nose2.tools import params
 from nvidia.dali import pipeline_def
 import nvidia.dali.fn as fn
-from test_external_source_parallel_utils import setup_function, teardown_function, capture_processes
+from test_pool_utils import setup_function, teardown_function, capture_processes
 
 
 def large_sample_cb(sample_info):
     return np.full((512, 1024, 1024), sample_info.idx_in_epoch, dtype=np.int32)
 
 
-@with_setup(setup_function, teardown_function)
-def _test_large_sample(start_method):
-    batch_size = 2
+class TestLargeSample:
+    def setUp(self):
+        setup_function()
 
-    @pipeline_def
-    def create_pipeline():
-        large = fn.external_source(
-            large_sample_cb, batch=False, parallel=True, prefetch_queue_depth=1
-        )
-        # iteration over array in Python is too slow, so reduce the number of elements
-        # to iterate over
-        reduced = fn.reductions.sum(large, axes=(1, 2))
-        return reduced
+    def tearDown(self):
+        teardown_function()
 
-    pipe = create_pipeline(
-        batch_size=batch_size,
-        py_num_workers=2,
-        py_start_method=start_method,
-        prefetch_queue_depth=1,
-        num_threads=2,
-        device_id=0,
-    )
-    pipe.build()
-    capture_processes(pipe._py_pool)
-    for batch_idx in range(8):
-        (out,) = pipe.run()
-        for idx_in_batch in range(batch_size):
-            idx_in_epoch = batch_size * batch_idx + idx_in_batch
-            expected_val = idx_in_epoch * 1024 * 1024
-            a = np.array(out[idx_in_batch])
-            assert a.shape == (512,), "Expected shape (512,) but got {}".format(a.shape)
-            for val in a.flat:
-                assert val == expected_val, (
-                    f"Unexpected value in batch: got {val}, expected {expected_val}, "
-                    f"for batch {batch_idx}, sample {idx_in_batch}"
-                )
+    @params("fork", "spawn")
+    def test_large_sample(self, start_method):
+        batch_size = 2
 
+        @pipeline_def
+        def create_pipeline():
+            large = fn.external_source(
+                large_sample_cb, batch=False, parallel=True, prefetch_queue_depth=1
+            )
+            # iteration over array in Python is too slow, so reduce the number of elements
+            # to iterate over
+            reduced = fn.reductions.sum(large, axes=(1, 2))
+            return reduced
 
-def test_large_sample():
-    for start_method in ("fork", "spawn"):
-        yield _test_large_sample, start_method
+        pipe = create_pipeline(
+            batch_size=batch_size,
+            py_num_workers=2,
+            py_start_method=start_method,
+            prefetch_queue_depth=1,
+            num_threads=2,
+            device_id=0,
+        )
+        pipe.build()
+        capture_processes(pipe._py_pool)
+        for batch_idx in range(8):
+            (out,) = pipe.run()
+            for idx_in_batch in range(batch_size):
+                idx_in_epoch = batch_size * batch_idx + idx_in_batch
+                expected_val = idx_in_epoch * 1024 * 1024
+                a = np.array(out[idx_in_batch])
+                assert a.shape == (512,), "Expected shape (512,) but got {}".format(a.shape)
+                for val in a.flat:
+                    assert val == expected_val, (
+                        f"Unexpected value in batch: got {val}, expected {expected_val}, "
+                        f"for batch {batch_idx}, sample {idx_in_batch}"
+                    )
diff --git a/dali/test/python/test_external_source_parallel_mxnet.py b/dali/test/python/test_external_source_parallel_mxnet.py
deleted file mode 100644
index 7ea2abe965c..00000000000
--- a/dali/test/python/test_external_source_parallel_mxnet.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# it is enough to just import all functions from test_internals_operator_external_source
-# nose will query for the methods available and will run them
-# the test_internals_operator_external_source is 99% the same for cupy and numpy tests
-# so it is better to store everything in one file and just call `use_cupy`
-# to switch between the default numpy and cupy
-
-import mxnet as mx
-from nose_utils import raises, with_setup
-
-from test_pool_utils import setup_function
-from test_external_source_parallel_utils import (
-    ExtCallback,
-    check_spawn_with_callback,
-    create_pipe,
-    build_and_run_pipeline,
-)
-import numpy as np
-
-
-class ExtCallbackMX(ExtCallback):
-    def __call__(self, sample_info):
-        a = super().__call__(sample_info)
-        return mx.nd.array(a, dtype=a.dtype)
-
-
-def test_mxnet():
-    yield from check_spawn_with_callback(ExtCallbackMX)
-
-
-class ExtCallbackMXCuda(ExtCallback):
-    def __call__(self, sample_info):
-        a = super().__call__(sample_info)
-        return mx.nd.array(a, dtype=a.dtype, ctx=mx.gpu(0))
-
-
-@raises(
-    Exception,
-    "Exception traceback received from worker thread*"
-    "TypeError: Unsupported callback return type. GPU tensors*not supported*"
-    "Got*MXNet GPU tensor.",
-)
-@with_setup(setup_function)
-def test_mxnet_cuda():
-    callback = ExtCallbackMXCuda((4, 5), 10, np.int32)
-    pipe = create_pipe(callback, "cpu", 5, py_num_workers=6, py_start_method="spawn", parallel=True)
-    build_and_run_pipeline(pipe)
diff --git a/dali/test/python/test_external_source_parallel_pytorch.py b/dali/test/python/test_external_source_parallel_pytorch.py
index 0624e76f849..511bd5248ba 100644
--- a/dali/test/python/test_external_source_parallel_pytorch.py
+++ b/dali/test/python/test_external_source_parallel_pytorch.py
@@ -22,6 +22,7 @@
 import torch
 
 import test_external_source_parallel_utils as utils
+from test_pool_utils import setup_function, teardown_function
 from nose_utils import raises
 
 
@@ -47,24 +48,30 @@ def test_pytorch_cuda_context():
     pipe.start_py_workers()
 
 
-def test_pytorch():
-    yield from utils.check_spawn_with_callback(ExtCallbackTorch)
-
-
 class ExtCallbackTorchCuda(utils.ExtCallback):
     def __call__(self, sample_info):
         return torch.tensor(super().__call__(sample_info), device=torch.device("cuda:0"))
 
 
-@raises(
-    Exception,
-    "Exception traceback received from worker thread*"
-    "TypeError: Unsupported callback return type. GPU tensors*not supported*"
-    "Got*PyTorch GPU tensor",
-)
-def test_pytorch_cuda():
-    callback = ExtCallbackTorchCuda((4, 5), 10, np.int32)
-    pipe = utils.create_pipe(
-        callback, "cpu", 5, py_num_workers=6, py_start_method="spawn", parallel=True
+class TestExtCallbackTorch:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    def test_pytorch(self):
+        utils.check_spawn_with_callback(ExtCallbackTorch)
+
+    @raises(
+        Exception,
+        "Exception traceback received from worker thread*"
+        "TypeError: Unsupported callback return type. GPU tensors*not supported*"
+        "Got*PyTorch GPU tensor",
     )
-    utils.build_and_run_pipeline(pipe)
+    def test_pytorch_cuda(self):
+        callback = ExtCallbackTorchCuda((4, 5), 10, np.int32)
+        pipe = utils.create_pipe(
+            callback, "cpu", 5, py_num_workers=6, py_start_method="spawn", parallel=True
+        )
+        utils.build_and_run_pipeline(pipe)
diff --git a/dali/test/python/test_external_source_parallel_utils.py b/dali/test/python/test_external_source_parallel_utils.py
index 700937cdbee..b9823f49810 100644
--- a/dali/test/python/test_external_source_parallel_utils.py
+++ b/dali/test/python/test_external_source_parallel_utils.py
@@ -14,8 +14,7 @@
 
 import numpy as np
 import nvidia.dali as dali
-from nose_utils import with_setup
-from test_pool_utils import capture_processes, teardown_function, setup_function
+from test_pool_utils import capture_processes
 from test_utils import (
     compare_pipelines,
     check_batch,
@@ -136,7 +135,6 @@ def check_callback(parallel_pipe, pipe, epoch_size, batch_size, dtype=None):
     compare_pipelines(parallel_pipe, pipe, batch_size, iters_no)
 
 
-@with_setup(setup_function, teardown_function)
 def _check_spawn_with_callback(
     callback, callback_ref, batch_size, num_outputs, layout, workers_num, epoch_size, dtype
 ):
@@ -177,8 +175,7 @@ def check_spawn_with_callback(
             )
             for workers_num in [1, 4]:
                 for batch_size in [1, 16, 150]:
-                    yield (
-                        _check_spawn_with_callback,
+                    _check_spawn_with_callback(
                         callback,
                         callback_ref,
                         batch_size,
diff --git a/dali/test/python/test_fw_iterators.py b/dali/test/python/test_fw_iterators.py
index d8104b7ba5a..b11dc4ba98f 100644
--- a/dali/test/python/test_fw_iterators.py
+++ b/dali/test/python/test_fw_iterators.py
@@ -107,260 +107,6 @@ def create_pipeline(creator, batch_size, num_gpus):
     return pipes, iters
 
 
-@attr("mxnet")
-def test_mxnet_iterator_model_fit():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-    import mxnet as mx
-
-    num_gpus = 1
-    batch_size = 1
-
-    def create_test_pipeline(batch_size, num_threads, device_id, num_gpus, data_paths):
-        pipe = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id)
-        with pipe:
-            _, labels = fn.readers.file(
-                file_root=data_paths, shard_id=device_id, num_shards=num_gpus, name="Reader"
-            )
-        pipe.set_outputs(labels)
-        return pipe
-
-    pipes, _ = create_pipeline(
-        lambda gpu: create_test_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            device_id=gpu,
-            num_gpus=num_gpus,
-            data_paths=image_data_set,
-        ),
-        batch_size,
-        num_gpus,
-    )
-    pipe = pipes[0]
-
-    class MXNetIteratorWrapper(MXNetIterator):
-        def __init__(self, iter):
-            self.iter = iter
-
-        def __getattr__(self, attr):
-            return getattr(self.iter, attr)
-
-        def __next__(self):
-            ret = self.iter.__next__()[0]
-            return ret
-
-    dali_train_iter = MXNetIterator(
-        pipe, [("labels", MXNetIterator.LABEL_TAG)], size=pipe.epoch_size("Reader")
-    )
-    data = mx.symbol.Variable("labels")
-
-    # create a dummy model
-    _ = mx.model.FeedForward.create(
-        data, X=MXNetIteratorWrapper(dali_train_iter), num_epoch=1, learning_rate=0.01
-    )
-
-
-@attr("mxnet")
-def test_mxnet_iterator_last_batch_no_pad_last_batch():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    num_gpus = 1
-    batch_size = 100
-
-    pipes, data_size = create_pipeline(
-        lambda gpu: create_coco_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            shard_id=gpu,
-            num_gpus=num_gpus,
-            data_paths=data_sets[0],
-            random_shuffle=True,
-            stick_to_shard=False,
-            shuffle_after_epoch=False,
-            pad_last_batch=False,
-        ),
-        batch_size,
-        num_gpus,
-    )
-
-    dali_train_iter = MXNetIterator(
-        pipes,
-        [("ids", MXNetIterator.DATA_TAG)],
-        size=pipes[0].epoch_size("Reader"),
-        last_batch_policy=LastBatchPolicy.FILL,
-    )
-
-    img_ids_list, img_ids_list_set, mirrored_data, _, _ = gather_ids(
-        dali_train_iter, lambda x: x.data[0].squeeze(-1).asnumpy(), lambda x: x.pad, data_size
-    )
-
-    assert len(img_ids_list) > data_size
-    assert len(img_ids_list_set) == data_size
-    assert len(set(mirrored_data)) != 1
-
-
-@attr("mxnet")
-def test_mxnet_iterator_empty_array():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-    import mxnet as mx
-
-    batch_size = 4
-    size = 5
-
-    all_np_types = [
-        np.bool_,
-        np.int_,
-        np.intc,
-        np.intp,
-        np.int8,
-        np.int16,
-        np.int32,
-        np.int64,
-        np.uint8,
-        np.uint16,
-        np.uint32,
-        np.uint64,
-        np.float32,
-        np.float16,
-        np.short,
-        int,
-        np.longlong,
-        np.ushort,
-        np.ulonglong,
-    ]
-    np_types = []
-    # store in np_types only types supported by MXNet
-    for t in all_np_types:
-        try:
-            mx.nd.zeros([2, 2, 2], ctx=None, dtype=t)
-            np_types.append(t)
-        except mx.base.MXNetError:
-            pass
-
-    test_data_shape = [1, 3, 0, 4]
-
-    def get_data():
-        # create batch of [type_a, type_a, type_b, type_b, ...]
-        out = [[np.empty(test_data_shape, dtype=t)] * batch_size for t in np_types]
-        out = [val for pair in zip(out, out) for val in pair]
-        return out
-
-    pipe = Pipeline(batch_size=batch_size, num_threads=3, device_id=0)
-    outs = fn.external_source(source=get_data, num_outputs=len(np_types) * 2)
-    pipe.set_outputs(*outs)
-
-    # create map of [(data, type_a), (label, type_a), ...]
-    data_map = [("data_{}".format(i), MXNetIterator.DATA_TAG) for i, t in enumerate(np_types)]
-    label_map = [("label_{}".format(i), MXNetIterator.LABEL_TAG) for i, t in enumerate(np_types)]
-    out_map = [val for pair in zip(data_map, label_map) for val in pair]
-
-    iterator = MXNetIterator(pipe, output_map=out_map, size=size, dynamic_shape=True)
-
-    for batch in iterator:
-        for d, t in zip(batch[0].data, np_types):
-            shape = d.asnumpy().shape
-            assert shape[0] == batch_size
-            assert np.array_equal(shape[1:], test_data_shape)
-            assert d.asnumpy().dtype == t
-
-
-@attr("mxnet")
-def test_mxnet_iterator_last_batch_pad_last_batch():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    num_gpus = 1
-    batch_size = 100
-
-    pipes, data_size = create_pipeline(
-        lambda gpu: create_coco_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            shard_id=gpu,
-            num_gpus=num_gpus,
-            data_paths=data_sets[0],
-            random_shuffle=True,
-            stick_to_shard=False,
-            shuffle_after_epoch=False,
-            pad_last_batch=True,
-        ),
-        batch_size,
-        num_gpus,
-    )
-
-    dali_train_iter = MXNetIterator(
-        pipes,
-        [("ids", MXNetIterator.DATA_TAG)],
-        size=pipes[0].epoch_size("Reader"),
-        last_batch_policy=LastBatchPolicy.FILL,
-    )
-
-    img_ids_list, img_ids_list_set, mirrored_data, _, _ = gather_ids(
-        dali_train_iter, lambda x: x.data[0].squeeze(-1).asnumpy(), lambda x: x.pad, data_size
-    )
-
-    assert len(img_ids_list) > data_size
-    assert len(img_ids_list_set) == data_size
-    assert len(set(mirrored_data)) == 1
-
-    dali_train_iter.reset()
-    next_img_ids_list, next_img_ids_list_set, next_mirrored_data, _, _ = gather_ids(
-        dali_train_iter, lambda x: x.data[0].squeeze(-1).asnumpy(), lambda x: x.pad, data_size
-    )
-
-    assert len(next_img_ids_list) > data_size
-    assert len(next_img_ids_list_set) == data_size
-    assert len(set(next_mirrored_data)) == 1
-
-
-@attr("mxnet")
-def test_mxnet_iterator_not_fill_last_batch_pad_last_batch():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    num_gpus = 1
-    batch_size = 100
-
-    pipes, data_size = create_pipeline(
-        lambda gpu: create_coco_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            shard_id=gpu,
-            num_gpus=num_gpus,
-            data_paths=data_sets[0],
-            random_shuffle=True,
-            stick_to_shard=False,
-            shuffle_after_epoch=False,
-            pad_last_batch=True,
-        ),
-        batch_size,
-        num_gpus,
-    )
-
-    dali_train_iter = MXNetIterator(
-        pipes,
-        [("ids", MXNetIterator.DATA_TAG)],
-        size=pipes[0].epoch_size("Reader"),
-        last_batch_policy=LastBatchPolicy.PARTIAL,
-    )
-
-    img_ids_list, img_ids_list_set, mirrored_data, pad, remainder = gather_ids(
-        dali_train_iter, lambda x: x.data[0].squeeze(-1).asnumpy(), lambda x: x.pad, data_size
-    )
-
-    assert pad == remainder
-    assert len(img_ids_list) - pad == data_size
-    assert len(img_ids_list_set) == data_size
-    assert len(set(mirrored_data)) == 1
-
-    dali_train_iter.reset()
-    next_img_ids_list, next_img_ids_list_set, next_mirrored_data, pad, remainder = gather_ids(
-        dali_train_iter, lambda x: x.data[0].squeeze(-1).asnumpy(), lambda x: x.pad, data_size
-    )
-
-    assert pad == remainder
-    assert len(next_img_ids_list) - pad == data_size
-    assert len(next_img_ids_list_set) == data_size
-    assert len(set(next_mirrored_data)) == 1
-
-
 def check_iterator_results(
     pad,
     pipes_number,
@@ -449,456 +195,6 @@ def check_iterator_results(
     return (ids, sample_counter, per_gpu_counter, epoch_counter, rounded_shard_size)
 
 
-@attr("mxnet")
-def check_mxnet_iterator_pass_reader_name(
-    shards_num,
-    pipes_number,
-    batch_size,
-    stick_to_shard,
-    pad,
-    iters,
-    last_batch_policy,
-    auto_reset=False,
-):
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    pipes = [
-        create_coco_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            shard_id=id,
-            num_gpus=shards_num,
-            data_paths=data_sets[0],
-            random_shuffle=False,
-            stick_to_shard=stick_to_shard,
-            shuffle_after_epoch=False,
-            pad_last_batch=pad,
-        )
-        for id in range(pipes_number)
-    ]
-
-    data_set_size = pipes[0].reader_meta("Reader")["epoch_size"]
-    rounded_shard_size = math.ceil(math.ceil(data_set_size / shards_num) / batch_size) * batch_size
-    ids = [pipe.reader_meta("Reader")["shard_id"] for pipe in pipes]
-    per_gpu_counter = [0] * shards_num
-    epoch_counter = 0
-    sample_counter = 0
-
-    if batch_size > data_set_size // shards_num and last_batch_policy == LastBatchPolicy.DROP:
-        assert_raises(
-            RuntimeError,
-            MXNetIterator,
-            pipes,
-            [("ids", MXNetIterator.DATA_TAG)],
-            reader_name="Reader",
-            last_batch_policy=last_batch_policy,
-            glob="It seems that there is no data in the pipeline*last_batch_policy*",
-        )
-        return
-    else:
-        dali_train_iter = MXNetIterator(
-            pipes,
-            [("ids", MXNetIterator.DATA_TAG)],
-            reader_name="Reader",
-            last_batch_policy=last_batch_policy,
-            auto_reset=auto_reset,
-        )
-
-    for _ in range(iters):
-        out_set = []
-        img_ids_list = [[] for _ in range(pipes_number)]
-        orig_length = length = len(dali_train_iter)
-        for it in iter(dali_train_iter):
-            for id in range(pipes_number):
-                tmp = it[id].data[0].squeeze(-1).asnumpy().copy()
-                if it[id].pad:
-                    tmp = tmp[0 : -it[id].pad]
-                img_ids_list[id].append(tmp)
-            sample_counter += batch_size
-            length -= 1
-
-        assert length == 0, (
-            f"The iterator has reported the length of {orig_length} "
-            f"but provided {orig_length - length} iterations."
-        )
-        if not auto_reset:
-            dali_train_iter.reset()
-        for id in range(pipes_number):
-            img_ids_list[id] = np.concatenate(img_ids_list[id])
-            out_set.append(set(img_ids_list[id]))
-
-        ret = check_iterator_results(
-            pad,
-            pipes_number,
-            shards_num,
-            out_set,
-            last_batch_policy,
-            img_ids_list,
-            ids,
-            data_set_size,
-            sample_counter,
-            per_gpu_counter,
-            stick_to_shard,
-            epoch_counter,
-            rounded_shard_size,
-        )
-        ids, sample_counter, per_gpu_counter, epoch_counter, rounded_shard_size = ret
-
-
-@attr("mxnet")
-def test_mxnet_iterator_pass_reader_name():
-    for shards_num in [3, 5, 17]:
-        for batch_size in [3, 5, 7]:
-            for stick_to_shard in [False, True]:
-                for pad in [True, False]:
-                    for last_batch_policy in [
-                        LastBatchPolicy.PARTIAL,
-                        LastBatchPolicy.FILL,
-                        LastBatchPolicy.DROP,
-                    ]:
-                        for iters in [1, 2, 3, 2 * shards_num]:
-                            for pipes_number in [1, shards_num]:
-                                yield (
-                                    check_mxnet_iterator_pass_reader_name,
-                                    shards_num,
-                                    pipes_number,
-                                    batch_size,
-                                    stick_to_shard,
-                                    pad,
-                                    iters,
-                                    last_batch_policy,
-                                    False,
-                                )
-
-
-@attr("mxnet")
-def test_mxnet_iterator_pass_reader_name_autoreset():
-    for auto_reset in [True, False]:
-        yield (
-            check_mxnet_iterator_pass_reader_name,
-            3,
-            1,
-            3,
-            False,
-            True,
-            3,
-            LastBatchPolicy.DROP,
-            auto_reset,
-        )
-
-
-@attr("gluon")
-def test_gluon_iterator_last_batch_no_pad_last_batch():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    num_gpus = 1
-    batch_size = 100
-
-    pipes, data_size = create_pipeline(
-        lambda gpu: create_coco_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            shard_id=gpu,
-            num_gpus=num_gpus,
-            data_paths=data_sets[0],
-            random_shuffle=True,
-            stick_to_shard=False,
-            shuffle_after_epoch=False,
-            pad_last_batch=False,
-        ),
-        batch_size,
-        num_gpus,
-    )
-
-    dali_train_iter = GluonIterator(
-        pipes, size=pipes[0].epoch_size("Reader"), last_batch_policy=LastBatchPolicy.FILL
-    )
-
-    img_ids_list, img_ids_list_set, mirrored_data, _, _ = gather_ids(
-        dali_train_iter, lambda x: x[0].squeeze(-1).asnumpy(), lambda x: 0, data_size
-    )
-
-    assert len(img_ids_list) > data_size
-    assert len(img_ids_list_set) == data_size
-    assert len(set(mirrored_data)) != 1
-
-
-@attr("gluon")
-def test_gluon_iterator_last_batch_pad_last_batch():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    num_gpus = 1
-    batch_size = 100
-
-    pipes, data_size = create_pipeline(
-        lambda gpu: create_coco_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            shard_id=gpu,
-            num_gpus=num_gpus,
-            data_paths=data_sets[0],
-            random_shuffle=True,
-            stick_to_shard=False,
-            shuffle_after_epoch=False,
-            pad_last_batch=True,
-        ),
-        batch_size,
-        num_gpus,
-    )
-
-    dali_train_iter = GluonIterator(
-        pipes, size=pipes[0].epoch_size("Reader"), last_batch_policy=LastBatchPolicy.FILL
-    )
-
-    img_ids_list, img_ids_list_set, mirrored_data, _, _ = gather_ids(
-        dali_train_iter, lambda x: x[0].squeeze(-1).asnumpy(), lambda x: 0, data_size
-    )
-
-    assert len(img_ids_list) > data_size
-    assert len(img_ids_list_set) == data_size
-    assert len(set(mirrored_data)) == 1
-
-    dali_train_iter.reset()
-    next_img_ids_list, next_img_ids_list_set, next_mirrored_data, _, _ = gather_ids(
-        dali_train_iter, lambda x: x[0].squeeze(-1).asnumpy(), lambda x: 0, data_size
-    )
-
-    assert len(next_img_ids_list) > data_size
-    assert len(next_img_ids_list_set) == data_size
-    assert len(set(next_mirrored_data)) == 1
-
-
-@attr("gluon")
-def test_gluon_iterator_not_fill_last_batch_pad_last_batch():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    num_gpus = 1
-    batch_size = 100
-
-    pipes, data_size = create_pipeline(
-        lambda gpu: create_coco_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            shard_id=gpu,
-            num_gpus=num_gpus,
-            data_paths=data_sets[0],
-            random_shuffle=False,
-            stick_to_shard=False,
-            shuffle_after_epoch=False,
-            pad_last_batch=True,
-        ),
-        batch_size,
-        num_gpus,
-    )
-
-    dali_train_iter = GluonIterator(
-        pipes, size=pipes[0].epoch_size("Reader"), last_batch_policy=LastBatchPolicy.PARTIAL
-    )
-
-    img_ids_list, img_ids_list_set, mirrored_data, _, _ = gather_ids(
-        dali_train_iter, lambda x: x[0].squeeze(-1).asnumpy(), lambda x: 0, data_size
-    )
-
-    assert len(img_ids_list) == data_size
-    assert len(img_ids_list_set) == data_size
-    assert len(set(mirrored_data)) != 1
-
-    dali_train_iter.reset()
-    next_img_ids_list, next_img_ids_list_set, next_mirrored_data, pad, remainder = gather_ids(
-        dali_train_iter, lambda x: x[0].squeeze(-1).asnumpy(), lambda x: 0, data_size
-    )
-
-    assert len(next_img_ids_list) == data_size
-    assert len(next_img_ids_list_set) == data_size
-    assert len(set(next_mirrored_data)) != 1
-
-
-@attr("gluon")
-def test_gluon_iterator_sparse_batch():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-    from mxnet.ndarray.ndarray import NDArray
-
-    num_gpus = 1
-    batch_size = 16
-
-    pipes, _ = create_pipeline(
-        lambda gpu: create_coco_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            shard_id=gpu,
-            num_gpus=num_gpus,
-            data_paths=data_sets[0],
-            random_shuffle=True,
-            stick_to_shard=False,
-            shuffle_after_epoch=False,
-            pad_last_batch=True,
-            return_labels=True,
-        ),
-        batch_size,
-        num_gpus,
-    )
-
-    dali_train_iter = GluonIterator(
-        pipes,
-        pipes[0].epoch_size("Reader"),
-        output_types=[GluonIterator.SPARSE_TAG, GluonIterator.DENSE_TAG],
-        last_batch_policy=LastBatchPolicy.FILL,
-    )
-
-    for it in dali_train_iter:
-        labels, ids = it[0]  # gpu 0
-        # labels should be a sparse batch: a list of per-sample NDArray's
-        # ids should be a dense batch: a single NDarray representing the batch
-        assert isinstance(labels, (tuple, list))
-        assert len(labels) == batch_size
-        assert isinstance(labels[0], NDArray)
-        assert isinstance(ids, NDArray)
-
-
-@attr("gluon")
-def check_gluon_iterator_pass_reader_name(
-    shards_num,
-    pipes_number,
-    batch_size,
-    stick_to_shard,
-    pad,
-    iters,
-    last_batch_policy,
-    auto_reset=False,
-):
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    pipes = [
-        create_coco_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            shard_id=id,
-            num_gpus=shards_num,
-            data_paths=data_sets[0],
-            random_shuffle=False,
-            stick_to_shard=stick_to_shard,
-            shuffle_after_epoch=False,
-            pad_last_batch=pad,
-        )
-        for id in range(pipes_number)
-    ]
-
-    data_set_size = pipes[0].reader_meta("Reader")["epoch_size"]
-    rounded_shard_size = math.ceil(math.ceil(data_set_size / shards_num) / batch_size) * batch_size
-    ids = [pipe.reader_meta("Reader")["shard_id"] for pipe in pipes]
-    per_gpu_counter = [0] * shards_num
-    epoch_counter = 0
-    sample_counter = 0
-
-    if batch_size > data_set_size // shards_num and last_batch_policy == LastBatchPolicy.DROP:
-        assert_raises(
-            RuntimeError,
-            GluonIterator,
-            pipes,
-            reader_name="Reader",
-            last_batch_policy=last_batch_policy,
-            glob="It seems that there is no data in the pipeline. This may happen "
-            "if `last_batch_policy` is set to PARTIAL and the requested "
-            "batch size is greater than the shard size.",
-        )
-        return
-    else:
-        dali_train_iter = GluonIterator(
-            pipes, reader_name="Reader", last_batch_policy=last_batch_policy, auto_reset=auto_reset
-        )
-
-    for _ in range(iters):
-        out_set = []
-        img_ids_list = [[] for _ in range(pipes_number)]
-        orig_length = length = len(dali_train_iter)
-        for it in iter(dali_train_iter):
-            for id in range(pipes_number):
-                if len(it[id][0]):
-                    tmp = it[id][0].squeeze(-1).asnumpy().copy()
-                else:
-                    tmp = np.empty([0])
-                img_ids_list[id].append(tmp)
-            sample_counter += batch_size
-            length -= 1
-
-        assert length == 0, (
-            f"The iterator has reported the length of {orig_length} "
-            f"but provided {orig_length - length} iterations."
-        )
-        if not auto_reset:
-            dali_train_iter.reset()
-        for id in range(pipes_number):
-            assert (
-                batch_size > data_set_size // shards_num
-                and last_batch_policy == LastBatchPolicy.DROP
-            ) or len(img_ids_list[id])
-            if len(img_ids_list[id]):
-                img_ids_list[id] = np.concatenate(img_ids_list[id])
-                out_set.append(set(img_ids_list[id]))
-
-        if len(out_set) == 0 and last_batch_policy == LastBatchPolicy.DROP:
-            return
-
-        ret = check_iterator_results(
-            pad,
-            pipes_number,
-            shards_num,
-            out_set,
-            last_batch_policy,
-            img_ids_list,
-            ids,
-            data_set_size,
-            sample_counter,
-            per_gpu_counter,
-            stick_to_shard,
-            epoch_counter,
-            rounded_shard_size,
-        )
-        ids, sample_counter, per_gpu_counter, epoch_counter, rounded_shard_size = ret
-
-
-@attr("gluon")
-def test_gluon_iterator_pass_reader_name():
-    for shards_num in [3, 5, 17]:
-        for batch_size in [3, 5, 7]:
-            for stick_to_shard in [False, True]:
-                for pad in [True, False]:
-                    for last_batch_policy in [
-                        LastBatchPolicy.PARTIAL,
-                        LastBatchPolicy.FILL,
-                        LastBatchPolicy.DROP,
-                    ]:
-                        for iters in [1, 2, 3, 2 * shards_num]:
-                            for pipes_number in [1, shards_num]:
-                                yield (
-                                    check_gluon_iterator_pass_reader_name,
-                                    shards_num,
-                                    pipes_number,
-                                    batch_size,
-                                    stick_to_shard,
-                                    pad,
-                                    iters,
-                                    last_batch_policy,
-                                    False,
-                                )
-
-
-@attr("gluon")
-def test_gluon_iterator_pass_reader_name_autoreset():
-    for auto_reset in [True, False]:
-        yield (
-            check_gluon_iterator_pass_reader_name,
-            3,
-            1,
-            3,
-            False,
-            True,
-            3,
-            LastBatchPolicy.DROP,
-            auto_reset,
-        )
-
-
 @attr("pytorch")
 def test_pytorch_iterator_last_batch_no_pad_last_batch():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
@@ -1309,69 +605,6 @@ def test_ragged_iterator_sparse_list_batch(exec_dynamic):
         assert ids.is_sparse is False
 
 
-@attr("mxnet")
-def test_mxnet_iterator_feed_ndarray():
-    from nvidia.dali.plugin.mxnet import feed_ndarray as feed_ndarray
-    import mxnet as mx
-
-    num_gpus = 1
-    batch_size = 100
-    pipes, _ = create_pipeline(
-        lambda gpu: create_custom_pipeline(
-            batch_size=batch_size,
-            num_threads=4,
-            device_id=gpu,
-            num_gpus=num_gpus,
-            data_paths=image_data_set,
-        ),
-        batch_size,
-        num_gpus,
-    )
-    for gpu_id in range(num_gpus):
-        pipe = pipes[gpu_id]
-        outs = pipe.run()
-        out_data = outs[0].as_tensor()
-        with mx.Context(mx.gpu(gpu_id)):
-            arr = mx.nd.zeros(out_data.shape(), dtype=np.float32)
-            mx.base._LIB.MXNDArrayWaitToWrite(arr.handle)
-            # Using DALI's internal stream
-            feed_ndarray(out_data, arr, cuda_stream=None)
-            np.testing.assert_equal(arr.asnumpy(), outs[0].as_cpu().as_array())
-
-            arr2 = mx.nd.zeros(out_data.shape(), dtype=np.float32)
-            mx.base._LIB.MXNDArrayWaitToWrite(arr2.handle)
-            feed_ndarray(out_data, arr2, cuda_stream=0)  # Using default stream
-            np.testing.assert_equal(arr2.asnumpy(), outs[0].as_cpu().as_array())
-
-
-@attr("mxnet")
-def check_mxnet_iterator_feed_ndarray_types(data_type):
-    from nvidia.dali.plugin.mxnet import feed_ndarray as feed_ndarray
-    import mxnet as mx
-
-    shape = [3, 9]
-    if np.issubdtype(data_type, np.integer):
-        arr = np.random.randint(
-            np.iinfo(data_type).min, high=np.iinfo(data_type).max, size=shape, dtype=data_type
-        )
-    elif data_type == np.bool_:
-        arr = np.random.randint(0, high=2, size=shape, dtype=data_type)
-    else:
-        arr = np.random.randn(*shape).astype(data_type)
-    tensor = TensorCPU(arr)
-    mnt = mx.nd.empty(shape, dtype=data_type)
-    feed_ndarray(tensor, mnt)
-    assert np.all(mnt.asnumpy() == arr)
-
-
-@attr("mxnet")
-def test_mxnet_iterator_feed_ndarray_types():
-    # MXNet doesn't support int16
-    types = [np.float32, np.float64, np.float16, np.uint8, np.int8, np.bool_, np.int32, np.int64]
-    for data_type in types:
-        yield check_mxnet_iterator_feed_ndarray_types, data_type
-
-
 @attr("paddle")
 def test_paddle_iterator_feed_ndarray():
     from nvidia.dali.plugin.paddle import feed_ndarray as feed_ndarray
@@ -2100,276 +1333,6 @@ def get_data():
     assert counter == iter_limit * runs
 
 
-# MXNet
-
-
-@attr("mxnet")
-def test_stop_iteration_mxnet():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    def fw_iter(pipe, size, auto_reset):
-        return MXNetIterator(
-            pipe, [("data", MXNetIterator.DATA_TAG)], size=size, auto_reset=auto_reset
-        )
-
-    iter_name = "MXNetIterator"
-    for (
-        batch_size,
-        epochs,
-        iter_num,
-        total_iter_num,
-        auto_reset,
-        infinite,
-    ) in stop_iteration_case_generator():
-        check_stop_iter(
-            fw_iter, iter_name, batch_size, epochs, iter_num, total_iter_num, auto_reset, infinite
-        )
-
-
-@attr("mxnet")
-def test_stop_iteration_mxnet_fail_multi():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    def fw_iter(pipe, size, auto_reset):
-        return MXNetIterator(
-            pipe, [("data", MXNetIterator.DATA_TAG)], size=size, auto_reset=auto_reset
-        )
-
-    check_stop_iter_fail_multi(fw_iter)
-
-
-@attr("mxnet")
-def test_stop_iteration_mxnet_fail_single():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    def fw_iter(pipe, size, auto_reset):
-        return MXNetIterator(
-            pipe, [("data", MXNetIterator.DATA_TAG)], size=size, auto_reset=auto_reset
-        )
-
-    check_stop_iter_fail_single(fw_iter)
-
-
-@attr("mxnet")
-def test_mxnet_iterator_wrapper_first_iteration():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    check_iterator_wrapper_first_iteration(
-        MXNetIterator, [("data", MXNetIterator.DATA_TAG)], size=100
-    )
-
-
-@attr("mxnet")
-def test_mxnet_external_source_autoreset():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    check_external_source_autoreset(
-        MXNetIterator, [("data", MXNetIterator.DATA_TAG)], to_np=lambda x: x[0].data[0].asnumpy()
-    )
-
-
-@attr("mxnet")
-def test_mxnet_external_source_do_not_prepare():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    check_external_source_autoreset(
-        MXNetIterator,
-        [("data", MXNetIterator.DATA_TAG)],
-        to_np=lambda x: x[0].data[0].asnumpy(),
-        prepare_first_batch=False,
-    )
-
-
-@attr("mxnet")
-def check_mxnet_iterator_properties(prepare_ahead):
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    def data_to_np(x):
-        return x.data[0].asnumpy()
-
-    def label_to_np(x):
-        return x.label[0].asnumpy()
-
-    max_batch_size = 4
-    iter_limit = 4
-    runs = 3
-    test_data_shape = [2, 3, 4]
-    test_label_shape = [2, 7, 5]
-    i = 0
-    dataset = [
-        [
-            [
-                np.random.randint(0, 255, size=test_data_shape, dtype=np.uint8)
-                for _ in range(max_batch_size)
-            ],
-            [
-                np.random.randint(0, 255, size=test_label_shape, dtype=np.uint8)
-                for _ in range(max_batch_size)
-            ],
-        ]
-        for _ in range(iter_limit)
-    ]
-
-    def get_data():
-        nonlocal i
-        if i == iter_limit:
-            i = 0
-            raise StopIteration
-        out = dataset[i]
-        i += 1
-        return out
-
-    pipe = Pipeline(batch_size=max_batch_size, num_threads=1, device_id=0)
-    with pipe:
-        outs = fn.external_source(source=get_data, num_outputs=2)
-    pipe.set_outputs(*outs)
-
-    it = MXNetIterator(
-        [pipe],
-        [("data", MXNetIterator.DATA_TAG), ("label", MXNetIterator.LABEL_TAG)],
-        auto_reset=True,
-        prepare_first_batch=prepare_ahead,
-    )
-    counter = 0
-    assert getattr(it, "provide_data")[0].shape == tuple([max_batch_size] + test_data_shape)
-    assert getattr(it, "provide_label")[0].shape == tuple([max_batch_size] + test_label_shape)
-    for _ in range(runs):
-        for j, data in enumerate(it):
-            assert (data_to_np(data[0]) == np.stack(dataset[j][0])).all()
-            assert (label_to_np(data[0]) == np.stack(dataset[j][1])).all()
-            assert getattr(it, "provide_data")[0].shape == tuple([max_batch_size] + test_data_shape)
-            assert getattr(it, "provide_label")[0].shape == tuple(
-                [max_batch_size] + test_label_shape
-            )
-            counter += 1
-    assert counter == iter_limit * runs
-
-
-@attr("mxnet")
-def test_mxnet_iterator_properties():
-    for prep in [True, False]:
-        yield check_mxnet_iterator_properties, prep
-
-
-@attr("mxnet")
-def test_mxnet_external_source_variable_size_pass():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    check_external_source_variable_size(
-        MXNetIterator,
-        [("data", MXNetIterator.DATA_TAG)],
-        to_np=lambda x: x.data[0].asnumpy(),
-        dynamic_shape=True,
-    )
-
-
-@attr("mxnet")
-def test_mxnet_external_source_variable_size_fail():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    assert_raises(
-        AssertionError,
-        check_external_source_variable_size,
-        MXNetIterator,
-        [("data", MXNetIterator.DATA_TAG)],
-        to_np=lambda x: x.data[0].asnumpy(),
-        iter_size=5,
-        dynamic_shape=True,
-    )
-
-
-# Gluon
-
-
-@attr("gluon")
-@params(*stop_iteration_case_generator())
-def test_stop_iteration_gluon(batch_size, epochs, iter_num, total_iter_num, auto_reset, infinite):
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    def fw_iter(pipe, size, auto_reset):
-        return GluonIterator(
-            pipe, size, output_types=[GluonIterator.DENSE_TAG], auto_reset=auto_reset
-        )
-
-    iter_name = "GluonIterator"
-    check_stop_iter(
-        fw_iter, iter_name, batch_size, epochs, iter_num, total_iter_num, auto_reset, infinite
-    )
-
-
-@attr("gluon")
-def test_stop_iteration_gluon_fail_multi():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    def fw_iter(pipe, size, auto_reset):
-        return GluonIterator(pipe, size, auto_reset=auto_reset)
-
-    check_stop_iter_fail_multi(fw_iter)
-
-
-@attr("gluon")
-def test_stop_iteration_gluon_fail_single():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    def fw_iter(pipe, size, auto_reset):
-        return GluonIterator(pipe, size=size, auto_reset=auto_reset)
-
-    check_stop_iter_fail_single(fw_iter)
-
-
-@attr("gluon")
-def test_gluon_iterator_wrapper_first_iteration():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    check_iterator_wrapper_first_iteration(
-        GluonIterator, output_types=[GluonIterator.DENSE_TAG], size=100
-    )
-
-
-@attr("gluon")
-def test_gluon_external_source_autoreset():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    check_external_source_autoreset(
-        GluonIterator, output_types=[GluonIterator.DENSE_TAG], to_np=lambda x: x[0][0].asnumpy()
-    )
-
-
-@attr("gluon")
-def test_gluon_external_source_do_not_prepare():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    check_external_source_autoreset(
-        GluonIterator,
-        output_types=[GluonIterator.DENSE_TAG],
-        to_np=lambda x: x[0][0].asnumpy(),
-        prepare_first_batch=False,
-    )
-
-
-@attr("gluon")
-def test_gluon_external_source_variable_size_pass():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    check_external_source_variable_size(
-        GluonIterator, output_types=[GluonIterator.DENSE_TAG], to_np=lambda x: x[0].asnumpy()
-    )
-
-
-@attr("gluon")
-def test_gluon_external_source_variable_size_fail():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    assert_raises(
-        AssertionError,
-        check_external_source_variable_size,
-        GluonIterator,
-        output_types=[GluonIterator.DENSE_TAG],
-        to_np=lambda x: x[0].asnumpy(),
-        iter_size=5,
-    )
-
-
 # PyTorch
 
 
@@ -2686,27 +1649,6 @@ def get_data():
     assert counter == iter_limit * runs
 
 
-@attr("mxnet")
-def test_mxnet_prepare_first_batch():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    check_prepare_first_batch(
-        MXNetIterator,
-        [("data", MXNetIterator.DATA_TAG)],
-        to_np=lambda x: x.data[0].asnumpy(),
-        dynamic_shape=True,
-    )
-
-
-@attr("gluon")
-def test_gluon_prepare_first_batch():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    check_prepare_first_batch(
-        GluonIterator, output_types=[GluonIterator.DENSE_TAG], to_np=lambda x: x[0].asnumpy()
-    )
-
-
 @attr("pytorch")
 def test_pytorch_prepare_first_batch():
     from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator
@@ -2737,24 +1679,6 @@ def feed_ndarray_test_pipeline():
     return np.array([1], dtype=float)
 
 
-@attr("mxnet")
-def test_mxnet_feed_ndarray():
-    from nvidia.dali.plugin.mxnet import feed_ndarray
-    import mxnet
-
-    pipe = feed_ndarray_test_pipeline(batch_size=1, num_threads=1, device_id=0)
-    out = pipe.run()[0]
-    mxnet_tensor = mxnet.nd.empty([1], None, np.int8)
-    assert_raises(
-        AssertionError,
-        feed_ndarray,
-        out,
-        mxnet_tensor,
-        glob="The element type of DALI Tensor/TensorList doesn't match "
-        "the element type of the target MXNet NDArray",
-    )
-
-
 @attr("pytorch")
 def test_pytorch_feed_ndarray():
     from nvidia.dali.plugin.pytorch import feed_ndarray
@@ -2824,32 +1748,6 @@ def test_paddle_wrong_last_batch_policy_type():
     )
 
 
-@attr("mxnet")
-def test_mxnet_wrong_last_batch_policy_type():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    check_iterator_build_error(
-        ValueError,
-        MXNetIterator,
-        glob="Wrong type for `last_batch_policy`.",
-        output_map=[("data", MXNetIterator.DATA_TAG)],
-        last_batch_policy="FILL",
-    )
-
-
-@attr("gluon")
-def test_gluon_wrong_last_batch_policy_type():
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    check_iterator_build_error(
-        ValueError,
-        GluonIterator,
-        glob="Wrong type for `last_batch_policy`.",
-        output_types=[GluonIterator.DENSE_TAG],
-        last_batch_policy="FILL",
-    )
-
-
 @attr("jax")
 def test_jax_wrong_last_batch_policy_type():
     from nvidia.dali.plugin.jax import DALIGenericIterator as JaxIterator
@@ -2921,47 +1819,6 @@ def autoreset_iter_params():
             yield auto_reset_op, policy
 
 
-@attr("mxnet")
-@params(*autoreset_iter_params())
-def test_mxnet_autoreset_iter(auto_reset_op, policy):
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    def fw_iterator(pipeline, reader_name, auto_reset, last_batch_policy):
-        return MXNetIterator(
-            pipeline,
-            [("data", MXNetIterator.DATA_TAG)],
-            reader_name=reader_name,
-            auto_reset=auto_reset,
-            last_batch_policy=last_batch_policy,
-        )
-
-    def extract_data(x):
-        data = x.data[0].asnumpy()
-        data = data[0 : -x.pad]
-        return data
-
-    check_autoreset_iter(fw_iterator, extract_data, auto_reset_op, policy)
-
-
-@attr("gluon")
-@params(*autoreset_iter_params())
-def test_gluon_autoreset_iter(auto_reset_op, policy):
-    from nvidia.dali.plugin.mxnet import DALIGluonIterator as GluonIterator
-
-    def fw_iterator(pipeline, reader_name, auto_reset, last_batch_policy):
-        return GluonIterator(
-            pipeline,
-            reader_name=reader_name,
-            auto_reset=auto_reset,
-            last_batch_policy=last_batch_policy,
-        )
-
-    def extract_data(x):
-        return x[0].asnumpy()
-
-    check_autoreset_iter(fw_iterator, extract_data, auto_reset_op, policy)
-
-
 @attr("pytorch")
 @params(*autoreset_iter_params())
 def test_pytorch_autoreset_iter(auto_reset_op, policy):
diff --git a/dali/test/python/test_fw_iterators_detection.py b/dali/test/python/test_fw_iterators_detection.py
index f72d8718676..a7b71ab2b54 100644
--- a/dali/test/python/test_fw_iterators_detection.py
+++ b/dali/test/python/test_fw_iterators_detection.py
@@ -87,19 +87,6 @@ def test_api_fw_check1_pytorch():
     yield from test_api_fw_check1(PyTorchIterator, ["data", "bboxes", "label"])
 
 
-def test_api_fw_check1_mxnet():
-    from nvidia.dali.plugin.mxnet import DALIGenericIterator as MXNetIterator
-
-    yield from test_api_fw_check1(
-        MXNetIterator,
-        [
-            ("data", MXNetIterator.DATA_TAG),
-            ("bboxes", MXNetIterator.LABEL_TAG),
-            ("label", MXNetIterator.LABEL_TAG),
-        ],
-    )
-
-
 @attr("paddle")
 def test_api_fw_check1_paddle():
     from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator
diff --git a/dali/test/python/test_pool.py b/dali/test/python/test_pool.py
index 6f37d98c72f..d9f3ac80c00 100644
--- a/dali/test/python/test_pool.py
+++ b/dali/test/python/test_pool.py
@@ -17,10 +17,10 @@
 from contextlib import closing
 from nvidia.dali._utils.external_source_impl import get_callback_from_source
 from nvidia.dali.types import SampleInfo
-from functools import wraps
 import numpy as np
 import os
-from nose_utils import raises, with_setup
+from nose2.tools import params
+from nose_utils import raises
 
 from test_pool_utils import capture_processes, setup_function, teardown_function
 
@@ -94,28 +94,20 @@ def assert_scheduled_num(context, num_tasks):
 
 start_methods = ["fork", "spawn"]
 
-# Invoke the `fn` with all start methods. Call setup and teardown before and after the test.
+# Run each test below with all start methods. Call setUp and tearDown before and after the test.
 #
 # We do this to not repeat the pattern of:
 #
-# def check_something(start_method):
-#    ...
+# class TestPoolOneCallback:
+#   def setUp(self):
+#     setup_function()
 #
-# @with_setup(setup_function, teardown_function)
-# def test_something():
-#   for start_method in start_methods:
-#      yield check_something, start_method
-
-
-def check_pool(fn):
-    @wraps(fn)
-    def wrapper():
-        for start_method in start_methods:
-            setup_function()
-            yield fn, start_method
-            teardown_function()
-
-    return wrapper
+#   def tearDown(self):
+#     teardown_function()
+#
+#   @params(*start_methods)
+#   def test_something(self, start_method):
+#     ...
 
 
 # ################################################################################################ #
@@ -123,61 +115,68 @@ def wrapper():
 # ################################################################################################ #
 
 
-@check_pool
-def test_pool_one_task(start_method):
-    groups = [MockGroup.from_callback(simple_callback)]
-    with create_pool(
-        groups, keep_alive_queue_size=1, num_workers=1, start_method=start_method
-    ) as pool:
-        pids = get_pids(pool)
-        pid = pids[0]
-        tasks = [(SampleInfo(0, 0, 0, 0),)]
-        work_batch = TaskArgs.make_sample(SampleRange(0, 1, 0, 0))
-        pool.schedule_batch(context_i=0, work_batch=work_batch)
-        batch = pool.receive_batch(context_i=0)
-        for task, sample in zip(tasks, batch):
-            np.testing.assert_array_equal(answer(pid, *task), sample)
-
-
-@check_pool
-def test_pool_multi_task(start_method):
-    groups = [MockGroup.from_callback(simple_callback)]
-    with create_pool(
-        groups, keep_alive_queue_size=1, num_workers=1, start_method=start_method
-    ) as pool:
-        pids = get_pids(pool)
-        pid = pids[0]
-        tasks = [(SampleInfo(i, i, 0, 0),) for i in range(10)]
-        work_batch = TaskArgs.make_sample(SampleRange(0, 10, 0, 0))
-        pool.schedule_batch(context_i=0, work_batch=work_batch)
-        batch = pool.receive_batch(context_i=0)
-        for task, sample in zip(tasks, batch):
-            np.testing.assert_array_equal(answer(pid, *task), sample)
-
-
-# Test that we can safely hold as many results as the keep_alive_queue_size
-@check_pool
-def test_pool_no_overwrite_batch(start_method):
-    groups = [MockGroup.from_callback(simple_callback, prefetch_queue_depth=0)]
-    for depth in [1, 2, 4, 8]:
+class TestPoolOneCallback:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    @params(*start_methods)
+    def test_pool_one_task(self, start_method):
+        groups = [MockGroup.from_callback(simple_callback)]
         with create_pool(
-            groups, keep_alive_queue_size=depth, num_workers=1, start_method=start_method
+            groups, keep_alive_queue_size=1, num_workers=1, start_method=start_method
         ) as pool:
             pids = get_pids(pool)
             pid = pids[0]
-            work_batches = [TaskArgs.make_sample(SampleRange(i, i + 1, i, 0)) for i in range(depth)]
-            task_list = [[(SampleInfo(i, 0, i, 0),)] for i in range(depth)]
-            for i, work_batch in enumerate(work_batches):
-                pool.schedule_batch(context_i=0, work_batch=work_batch)
-            assert_scheduled_num(pool.contexts[0], depth)
-            batches = []
-            for i in range(depth):
-                batches.append(pool.receive_batch(context_i=0))
-                assert_scheduled_num(pool.contexts[0], depth - 1 - i)
-            tasks_batches = zip(task_list, batches)
-            for tasks, batch in tasks_batches:
-                for task, sample in zip(tasks, batch):
-                    np.testing.assert_array_equal(answer(pid, *task), sample)
+            tasks = [(SampleInfo(0, 0, 0, 0),)]
+            work_batch = TaskArgs.make_sample(SampleRange(0, 1, 0, 0))
+            pool.schedule_batch(context_i=0, work_batch=work_batch)
+            batch = pool.receive_batch(context_i=0)
+            for task, sample in zip(tasks, batch):
+                np.testing.assert_array_equal(answer(pid, *task), sample)
+
+    @params(*start_methods)
+    def test_pool_multi_task(self, start_method):
+        groups = [MockGroup.from_callback(simple_callback)]
+        with create_pool(
+            groups, keep_alive_queue_size=1, num_workers=1, start_method=start_method
+        ) as pool:
+            pids = get_pids(pool)
+            pid = pids[0]
+            tasks = [(SampleInfo(i, i, 0, 0),) for i in range(10)]
+            work_batch = TaskArgs.make_sample(SampleRange(0, 10, 0, 0))
+            pool.schedule_batch(context_i=0, work_batch=work_batch)
+            batch = pool.receive_batch(context_i=0)
+            for task, sample in zip(tasks, batch):
+                np.testing.assert_array_equal(answer(pid, *task), sample)
+
+    # Test that we can safely hold as many results as the keep_alive_queue_size
+    @params(*start_methods)
+    def test_pool_no_overwrite_batch(self, start_method):
+        groups = [MockGroup.from_callback(simple_callback, prefetch_queue_depth=0)]
+        for depth in [1, 2, 4, 8]:
+            with create_pool(
+                groups, keep_alive_queue_size=depth, num_workers=1, start_method=start_method
+            ) as pool:
+                pids = get_pids(pool)
+                pid = pids[0]
+                work_batches = [
+                    TaskArgs.make_sample(SampleRange(i, i + 1, i, 0)) for i in range(depth)
+                ]
+                task_list = [[(SampleInfo(i, 0, i, 0),)] for i in range(depth)]
+                for i, work_batch in enumerate(work_batches):
+                    pool.schedule_batch(context_i=0, work_batch=work_batch)
+                assert_scheduled_num(pool.contexts[0], depth)
+                batches = []
+                for i in range(depth):
+                    batches.append(pool.receive_batch(context_i=0))
+                    assert_scheduled_num(pool.contexts[0], depth - 1 - i)
+                tasks_batches = zip(task_list, batches)
+                for tasks, batch in tasks_batches:
+                    for task, sample in zip(tasks, batch):
+                        np.testing.assert_array_equal(answer(pid, *task), sample)
 
 
 # ################################################################################################ #
@@ -185,21 +184,28 @@ def test_pool_no_overwrite_batch(start_method):
 # ################################################################################################ #
 
 
-@check_pool
-def test_pool_work_split_multiple_tasks(start_method):
-    callbacks = [MockGroup.from_callback(simple_callback)]
-    with create_pool(
-        callbacks, keep_alive_queue_size=1, num_workers=2, start_method=start_method
-    ) as pool:
-        num_tasks = 16
-        pids = get_pids(pool)
-        assert len(pids) == 2
-        work_batch = TaskArgs.make_sample(SampleRange(0, num_tasks, 0, 0))
-        tasks = [(SampleInfo(i, i, 0, 0),) for i in range(num_tasks)]
-        pool.schedule_batch(context_i=0, work_batch=work_batch)
-        batch = pool.receive_batch(context_i=0)
-        for task, sample in zip(tasks, batch):
-            np.testing.assert_array_equal(answer(-1, *task)[1:], sample[1:])
+class TestPoolMultipleWorkers:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    @params(*start_methods)
+    def test_pool_work_split_multiple_tasks(self, start_method):
+        callbacks = [MockGroup.from_callback(simple_callback)]
+        with create_pool(
+            callbacks, keep_alive_queue_size=1, num_workers=2, start_method=start_method
+        ) as pool:
+            num_tasks = 16
+            pids = get_pids(pool)
+            assert len(pids) == 2
+            work_batch = TaskArgs.make_sample(SampleRange(0, num_tasks, 0, 0))
+            tasks = [(SampleInfo(i, i, 0, 0),) for i in range(num_tasks)]
+            pool.schedule_batch(context_i=0, work_batch=work_batch)
+            batch = pool.receive_batch(context_i=0)
+            for task, sample in zip(tasks, batch):
+                np.testing.assert_array_equal(answer(-1, *task)[1:], sample[1:])
 
 
 # ################################################################################################ #
@@ -207,128 +213,125 @@ def test_pool_work_split_multiple_tasks(start_method):
 # ################################################################################################ #
 
 
-@check_pool
-def test_pool_iterator_dedicated_worker(start_method):
-    groups = [
-        MockGroup.from_callback(simple_callback, prefetch_queue_depth=3),
-        MockGroup.from_callback(IteratorCb(), prefetch_queue_depth=3, batch=True),
-    ]
-    with create_pool(
-        groups, keep_alive_queue_size=1, num_workers=4, start_method=start_method
-    ) as pool:
-        pids = get_pids(pool)
-        assert len(pids) == 4
-        tasks_list = []
-        samples_count = 0
-        for i in range(4):
-            tasks = [(SampleInfo(samples_count + j, j, i, 0),) for j in range(i + 1)]
-            tasks_list.append(tasks)
-            work_batch = TaskArgs.make_sample(
-                SampleRange(samples_count, samples_count + i + 1, i, 0)
-            )
-            samples_count += len(tasks)
+class TestPoolMultipleCallbacks:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    @params(*start_methods)
+    def test_pool_iterator_dedicated_worker(self, start_method):
+        groups = [
+            MockGroup.from_callback(simple_callback, prefetch_queue_depth=3),
+            MockGroup.from_callback(IteratorCb(), prefetch_queue_depth=3, batch=True),
+        ]
+        with create_pool(
+            groups, keep_alive_queue_size=1, num_workers=4, start_method=start_method
+        ) as pool:
+            pids = get_pids(pool)
+            assert len(pids) == 4
+            tasks_list = []
+            samples_count = 0
+            for i in range(4):
+                tasks = [(SampleInfo(samples_count + j, j, i, 0),) for j in range(i + 1)]
+                tasks_list.append(tasks)
+                work_batch = TaskArgs.make_sample(
+                    SampleRange(samples_count, samples_count + i + 1, i, 0)
+                )
+                samples_count += len(tasks)
+                pool.schedule_batch(context_i=0, work_batch=work_batch)
+                pool.schedule_batch(context_i=1, work_batch=TaskArgs.make_batch((i,)))
+            assert pool.contexts[0].dedicated_worker_id is None
+            iter_worker_num = pool.contexts[1].dedicated_worker_id
+            iter_worker_pid = pool.pool._processes[iter_worker_num].pid
+            for i in range(4):
+                batch_0 = pool.receive_batch(context_i=0)
+                batch_1 = pool.receive_batch(context_i=1)
+                tasks = tasks_list[i]
+                assert len(batch_0) == len(tasks)
+                assert len(batch_1) == len(tasks)
+                for task, sample in zip(tasks, batch_0):
+                    np.testing.assert_array_equal(answer(-1, *task)[1:], sample[1:])
+                for sample in batch_1:
+                    np.testing.assert_array_equal(np.array([iter_worker_pid, i + 1]), sample)
+
+    @params(*start_methods)
+    def test_pool_many_ctxs(self, start_method):
+        callbacks = [simple_callback, another_callback]
+        groups = [MockGroup.from_callback(cb) for cb in callbacks]
+        with create_pool(
+            groups, keep_alive_queue_size=1, num_workers=1, start_method=start_method
+        ) as pool:
+            pids = get_pids(pool)
+            tasks = [(SampleInfo(0, 0, 0, 0),)]
+            work_batch = TaskArgs.make_sample(SampleRange(0, 1, 0, 0))
             pool.schedule_batch(context_i=0, work_batch=work_batch)
-            pool.schedule_batch(context_i=1, work_batch=TaskArgs.make_batch((i,)))
-        assert pool.contexts[0].dedicated_worker_id is None
-        iter_worker_num = pool.contexts[1].dedicated_worker_id
-        iter_worker_pid = pool.pool._processes[iter_worker_num].pid
-        for i in range(4):
+            pool.schedule_batch(context_i=1, work_batch=work_batch)
+            batch_0 = pool.receive_batch(context_i=0)
+            batch_1 = pool.receive_batch(context_i=1)
+            for task, sample, pid in zip(tasks, batch_0, pids):
+                np.testing.assert_array_equal(answer(pid, *task), sample)
+            for task, sample, pid in zip(tasks, batch_1, pids):
+                np.testing.assert_array_equal(answer(pid, *task) + 100, sample)
+
+    @params(*start_methods)
+    def test_pool_context_sync(self, start_method):
+        callbacks = [simple_callback, another_callback]
+        groups = [MockGroup.from_callback(cb, prefetch_queue_depth=3) for cb in callbacks]
+        with create_pool(
+            groups, keep_alive_queue_size=1, num_workers=4, start_method=start_method
+        ) as pool:
+            capture_processes(pool)
+            for i in range(4):
+                work_batch = TaskArgs.make_sample(SampleRange(0, 10 * (i + 1), 0, 0))
+                pool.schedule_batch(context_i=0, work_batch=work_batch)
+                pool.schedule_batch(context_i=1, work_batch=work_batch)
+            assert_scheduled_num(pool.contexts[0], 4)
+            assert_scheduled_num(pool.contexts[1], 4)
+            # pool after a reset should discard all previously scheduled tasks
+            # (and sync workers to avoid race on writing to results buffer)
+            pool.reset()
+            tasks = [(SampleInfo(1000 + j, j, 0, 1),) for j in range(5)]
+            work_batch = TaskArgs.make_sample(SampleRange(1000, 1005, 0, 1))
+            pool.schedule_batch(context_i=0, work_batch=work_batch)
+            pool.schedule_batch(context_i=1, work_batch=work_batch)
+            assert_scheduled_num(pool.contexts[0], 1)
+            assert_scheduled_num(pool.contexts[1], 1)
             batch_0 = pool.receive_batch(context_i=0)
             batch_1 = pool.receive_batch(context_i=1)
-            tasks = tasks_list[i]
             assert len(batch_0) == len(tasks)
             assert len(batch_1) == len(tasks)
             for task, sample in zip(tasks, batch_0):
                 np.testing.assert_array_equal(answer(-1, *task)[1:], sample[1:])
-            for sample in batch_1:
-                np.testing.assert_array_equal(np.array([iter_worker_pid, i + 1]), sample)
-
-
-@check_pool
-def test_pool_many_ctxs(start_method):
-    callbacks = [simple_callback, another_callback]
-    groups = [MockGroup.from_callback(cb) for cb in callbacks]
-    with create_pool(
-        groups, keep_alive_queue_size=1, num_workers=1, start_method=start_method
-    ) as pool:
-        pids = get_pids(pool)
-        pid = pids[0]
-        tasks = [(SampleInfo(0, 0, 0, 0),)]
-        work_batch = TaskArgs.make_sample(SampleRange(0, 1, 0, 0))
-        pool.schedule_batch(context_i=0, work_batch=work_batch)
-        pool.schedule_batch(context_i=1, work_batch=work_batch)
-        batch_0 = pool.receive_batch(context_i=0)
-        batch_1 = pool.receive_batch(context_i=1)
-        for task, sample, pid in zip(tasks, batch_0, pids):
-            np.testing.assert_array_equal(answer(pid, *task), sample)
-        for task, sample, pid in zip(tasks, batch_1, pids):
-            np.testing.assert_array_equal(answer(pid, *task) + 100, sample)
-
-
-@check_pool
-def test_pool_context_sync(start_method):
-    callbacks = [simple_callback, another_callback]
-    groups = [MockGroup.from_callback(cb, prefetch_queue_depth=3) for cb in callbacks]
-    with create_pool(
-        groups, keep_alive_queue_size=1, num_workers=4, start_method=start_method
-    ) as pool:
-        capture_processes(pool)
-        for i in range(4):
-            tasks = [(SampleInfo(j, 0, 0, 0),) for j in range(10 * (i + 1))]
-            work_batch = TaskArgs.make_sample(SampleRange(0, 10 * (i + 1), 0, 0))
-            pool.schedule_batch(context_i=0, work_batch=work_batch)
-            pool.schedule_batch(context_i=1, work_batch=work_batch)
-        assert_scheduled_num(pool.contexts[0], 4)
-        assert_scheduled_num(pool.contexts[1], 4)
-        # pool after a reset should discard all previously scheduled tasks
-        # (and sync workers to avoid race on writing to results buffer)
-        pool.reset()
-        tasks = [(SampleInfo(1000 + j, j, 0, 1),) for j in range(5)]
-        work_batch = TaskArgs.make_sample(SampleRange(1000, 1005, 0, 1))
-        pool.schedule_batch(context_i=0, work_batch=work_batch)
-        pool.schedule_batch(context_i=1, work_batch=work_batch)
-        assert_scheduled_num(pool.contexts[0], 1)
-        assert_scheduled_num(pool.contexts[1], 1)
-        batch_0 = pool.receive_batch(context_i=0)
-        batch_1 = pool.receive_batch(context_i=1)
-        assert len(batch_0) == len(tasks)
-        assert len(batch_1) == len(tasks)
-        for task, sample in zip(tasks, batch_0):
-            np.testing.assert_array_equal(answer(-1, *task)[1:], sample[1:])
-        for task, sample in zip(tasks, batch_1):
-            np.testing.assert_array_equal(answer(-1, *task)[1:] + 100, sample[1:])
-
-
-@with_setup(setup_function, teardown_function)
-def _test_multiple_stateful_sources_single_worker(num_workers):
-    groups = [
-        MockGroup.from_callback(IteratorCb(), batch=True),
-        MockGroup.from_callback(IteratorCb(), batch=True),
-    ]
-    with create_pool(
-        groups, keep_alive_queue_size=1, num_workers=num_workers, start_method="spawn"
-    ) as pool:
-        pids = get_pids(pool)
-        assert len(pids) == min(num_workers, len(groups))
-        pool.schedule_batch(context_i=0, work_batch=TaskArgs.make_batch((0,)))
-        pool.schedule_batch(context_i=1, work_batch=TaskArgs.make_batch((0,)))
-        iter_worker_num_0 = pool.contexts[0].dedicated_worker_id
-        iter_worker_num_1 = pool.contexts[1].dedicated_worker_id
-        iter_worker_pid_0 = pool.pool._processes[iter_worker_num_0].pid
-        iter_worker_pid_1 = pool.pool._processes[iter_worker_num_1].pid
-        batch_0 = pool.receive_batch(context_i=0)
-        batch_1 = pool.receive_batch(context_i=1)
-        np.testing.assert_array_equal(np.array([iter_worker_pid_0, 1]), batch_0[0])
-        np.testing.assert_array_equal(np.array([iter_worker_pid_1, 1]), batch_1[0])
-        if num_workers == 1:
-            assert iter_worker_pid_0 == iter_worker_pid_1
-        else:
-            assert iter_worker_pid_0 != iter_worker_pid_1
-
-
-def test_multiple_stateful_sources_single_worker():
-    for num_workers in (1, 4):
-        yield _test_multiple_stateful_sources_single_worker, num_workers
+            for task, sample in zip(tasks, batch_1):
+                np.testing.assert_array_equal(answer(-1, *task)[1:] + 100, sample[1:])
+
+    @params(1, 4)
+    def test_multiple_stateful_sources_single_worker(self, num_workers):
+        groups = [
+            MockGroup.from_callback(IteratorCb(), batch=True),
+            MockGroup.from_callback(IteratorCb(), batch=True),
+        ]
+        with create_pool(
+            groups, keep_alive_queue_size=1, num_workers=num_workers, start_method="spawn"
+        ) as pool:
+            pids = get_pids(pool)
+            assert len(pids) == min(num_workers, len(groups))
+            pool.schedule_batch(context_i=0, work_batch=TaskArgs.make_batch((0,)))
+            pool.schedule_batch(context_i=1, work_batch=TaskArgs.make_batch((0,)))
+            iter_worker_num_0 = pool.contexts[0].dedicated_worker_id
+            iter_worker_num_1 = pool.contexts[1].dedicated_worker_id
+            iter_worker_pid_0 = pool.pool._processes[iter_worker_num_0].pid
+            iter_worker_pid_1 = pool.pool._processes[iter_worker_num_1].pid
+            batch_0 = pool.receive_batch(context_i=0)
+            batch_1 = pool.receive_batch(context_i=1)
+            np.testing.assert_array_equal(np.array([iter_worker_pid_0, 1]), batch_0[0])
+            np.testing.assert_array_equal(np.array([iter_worker_pid_1, 1]), batch_1[0])
+            if num_workers == 1:
+                assert iter_worker_pid_0 == iter_worker_pid_1
+            else:
+                assert iter_worker_pid_0 != iter_worker_pid_1
 
 
 # ################################################################################################ #
@@ -340,18 +343,24 @@ def invalid_callback(i):
     return "42"
 
 
-@raises(
-    Exception,
-    glob="Unsupported callback return type. Expected NumPy array, PyTorch or "
-    "MXNet cpu tensors, DALI TensorCPU, or list or tuple of them representing sample. Got",
-)
-@with_setup(setup_function, teardown_function)
-def test_pool_invalid_return():
-    callbacks = [MockGroup.from_callback(invalid_callback)]
-    with create_pool(
-        callbacks, keep_alive_queue_size=1, num_workers=1, start_method="spawn"
-    ) as pool:
-        _ = get_pids(pool)
-        work_batch = TaskArgs.make_sample(SampleRange(0, 1, 0, 0))
-        pool.schedule_batch(context_i=0, work_batch=work_batch)
-        pool.receive_batch(context_i=0)
+class TestPoolInvalidReturn:
+    def setUp(self):
+        setup_function()
+
+    def tearDown(self):
+        teardown_function()
+
+    @raises(
+        Exception,
+        glob="Unsupported callback return type. Expected NumPy array, PyTorch or "
+        "MXNet cpu tensors, DALI TensorCPU, or list or tuple of them representing sample. Got",
+    )
+    def test_pool_invalid_return(self):
+        callbacks = [MockGroup.from_callback(invalid_callback)]
+        with create_pool(
+            callbacks, keep_alive_queue_size=1, num_workers=1, start_method="spawn"
+        ) as pool:
+            _ = get_pids(pool)
+            work_batch = TaskArgs.make_sample(SampleRange(0, 1, 0, 0))
+            pool.schedule_batch(context_i=0, work_batch=work_batch)
+            pool.receive_batch(context_i=0)
diff --git a/qa/TL0_multigpu/test_body.sh b/qa/TL0_multigpu/test_body.sh
index 27609a9ed0e..3ac4c10fe82 100644
--- a/qa/TL0_multigpu/test_body.sh
+++ b/qa/TL0_multigpu/test_body.sh
@@ -7,7 +7,7 @@ test_py_with_framework() {
 
 test_py() {
   ${python_new_invoke_test} -s decoder -A 'multi_gpu'
-  ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba,multi_gpu' -s experimental_mode
+  ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba,multi_gpu' -s experimental_mode
 }
 
 test_gtest() {
diff --git a/qa/TL0_python-self-test-core/test_body.sh b/qa/TL0_python-self-test-core/test_body.sh
index 7e450425e9b..bb16187a154 100644
--- a/qa/TL0_python-self-test-core/test_body.sh
+++ b/qa/TL0_python-self-test-core/test_body.sh
@@ -23,9 +23,9 @@ test_py_with_framework() {
                             test_functional_api.py \
                             test_external_source_impl_utils.py); do
         if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
-            ${python_new_invoke_test} -A "!slow,!pytorch,!mxnet,!cupy" ${test_script%.py}
+            ${python_new_invoke_test} -A "!slow,!pytorch,!cupy" ${test_script%.py}
         else
-            ${python_new_invoke_test} -A "!slow,!pytorch,!mxnet,!cupy,!numba" ${test_script%.py}
+            ${python_new_invoke_test} -A "!slow,!pytorch,!cupy,!numba" ${test_script%.py}
         fi
     done
 
@@ -35,7 +35,7 @@ test_py_with_framework() {
         ${python_new_invoke_test} -A "!slow,!pytorch,!mxnet,!cupy,!numba" test_dali_variable_batch_size
     fi
 
-    ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy' test_backend_impl
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy' test_backend_impl
 
     if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
         ${python_new_invoke_test} -A 'numba' -s type_annotations
@@ -74,7 +74,7 @@ test_dynamic_mode_torch() {
 }
 
 test_pytorch() {
-    ${python_new_invoke_test} --attr '!slow,pytorch' test_dali_variable_batch_size.py
+    ${python_new_invoke_test} -A '!slow,pytorch' test_dali_variable_batch_size
     test_dynamic_mode_torch
     if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
         ${python_new_invoke_test} -A 'pytorch' -s type_annotations
@@ -87,20 +87,20 @@ test_pytorch() {
 
 test_checkpointing() {
     if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
-        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' checkpointing.test_dali_checkpointing
-        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' checkpointing.test_dali_stateless_operators
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' checkpointing.test_dali_checkpointing
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' checkpointing.test_dali_stateless_operators
     else
-        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba,!sanitizer_skip' checkpointing.test_dali_checkpointing
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba,!sanitizer_skip' checkpointing.test_dali_checkpointing
 
         # External source tests are slow and Python-side mostly, but let's run just one of them
-        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' checkpointing.test_dali_checkpointing.test_external_source_checkpointing.1
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' checkpointing.test_dali_checkpointing.test_external_source_checkpointing.1
     fi
 }
 
 test_dynamic_mode() {
-    ${python_new_invoke_test}  -A '!slow,!pytorch,!mxnet,!cupy,!numba' -s experimental_mode
-    CUDA_VISIBLE_DEVICES= ${python_new_invoke_test}  -A 'cpu_only,!slow,!pytorch,!mxnet,!cupy,!numba' -s experimental_mode
-    ${python_new_invoke_test}  -A '!slow,!pytorch,!mxnet,!cupy,!numba' -s ndd_vs_fn
+    ${python_new_invoke_test}  -A '!slow,!pytorch,!cupy,!numba' -s experimental_mode
+    CUDA_VISIBLE_DEVICES= ${python_new_invoke_test}  -A 'cpu_only,!slow,!pytorch,!cupy,!numba' -s experimental_mode
+    ${python_new_invoke_test}  -A '!slow,!pytorch,!cupy,!numba' -s ndd_vs_fn
 }
 
 
diff --git a/qa/TL0_python-self-test-readers-decoders/test_body.sh b/qa/TL0_python-self-test-readers-decoders/test_body.sh
index f1d39109504..5af20221c18 100644
--- a/qa/TL0_python-self-test-readers-decoders/test_body.sh
+++ b/qa/TL0_python-self-test-readers-decoders/test_body.sh
@@ -14,8 +14,13 @@ test_py_with_framework() {
       test_pool.py test_external_source_parallel.py test_external_source_parallel_shared_batch.py \
       test_external_source_parallel_large_sample.py \
       | sed "/$FILTER_PATTERN/d"); do
-        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' ${test_script%.py}
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' ${test_script%.py}
     done
+    # run this test explicitly as it needs no GPU context in the process
+    if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' test_external_source_parallel.TestParallelFork._test_parallel_fork_cpu_only
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' test_external_source_parallel_custom_serialization._test_no_pickling_in_forking_mode
+    fi
 
 
     if [ -n "$DALI_ENABLE_SANITIZERS" ]; then
@@ -52,10 +57,10 @@ test_jpeg_scan_limit() {
       # test various broken cases with smaller limit to make the test faster
       DALI_MAX_JPEG_SCANS=30 ${python_new_invoke_test} -s decoder test_jpeg_scan_limit
       # test default limit for one case
-      ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit.1
+      ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit:1
     else
       # let's check if error handling does not lead to leaks
-      DALI_MAX_JPEG_SCANS=30 ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit.1
+      DALI_MAX_JPEG_SCANS=30 ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit:1
     fi
 }
 
diff --git a/qa/TL0_python-self-test_tegra/test_body.sh b/qa/TL0_python-self-test_tegra/test_body.sh
index 3bbe3e710e3..2dfe1ed8af0 100644
--- a/qa/TL0_python-self-test_tegra/test_body.sh
+++ b/qa/TL0_python-self-test_tegra/test_body.sh
@@ -23,10 +23,11 @@ test_py_with_framework() {
         test_script=${test_script%.py}
         # execute only when no matches are found
         if [ ${status} -eq 0 ]; then
-            ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba,!scipy' ${test_script}
+            ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba,!scipy' ${test_script}
         fi
     done
 
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' test_external_source_parallel.TestParallelFork._test_parallel_fork_cpu_only
 
     XAVIER_OPERATOR_1_TESTS=""
     for test_script in $(ls operator_1/test_*.py); do
@@ -70,9 +71,9 @@ test_py_with_framework() {
         fi
     done
 
-    ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba,!scipy' -s operator_1 $XAVIER_OPERATOR_1_TESTS
-    ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba,!scipy' -s operator_2 $XAVIER_OPERATOR_2_TESTS
-    ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba,!scipy' -s reader $XAVIER_READER_TESTS
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba,!scipy' -s operator_1 $XAVIER_OPERATOR_1_TESTS
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba,!scipy' -s operator_2 $XAVIER_OPERATOR_2_TESTS
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba,!scipy' -s reader $XAVIER_READER_TESTS
 }
 
 test_py() {
diff --git a/qa/TL1_python-self-test_conda/test_body.sh b/qa/TL1_python-self-test_conda/test_body.sh
index 3b3a7d2fe87..e926bce7b89 100644
--- a/qa/TL1_python-self-test_conda/test_body.sh
+++ b/qa/TL1_python-self-test_conda/test_body.sh
@@ -2,22 +2,22 @@
 
 test_py_with_framework() {
     for test_script in $(ls test_pipeline*.py test_external_source_dali.py test_external_source_numpy.py test_external_source_parallel_garbage_collection_order.py test_functional_api.py); do
-        ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' ${test_script%.py}
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' ${test_script%.py}
     done
 
-    ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy' test_backend_impl
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy' test_backend_impl
 
-    ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' -s operator_1
-    ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' -s operator_2
-    ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba' -s reader
-    ${python_new_invoke_test} -A '!slow,!pytorch,!mxnet,!cupy,!numba,!jpeg_scans_limit' -s decoder
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' -s operator_1
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' -s operator_2
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' -s reader
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba,!jpeg_scans_limit' -s decoder
 }
 
 test_jpeg_scan_limit() {
     # test various broken cases with smaller limit for speed
     DALI_MAX_JPEG_SCANS=30 ${python_new_invoke_test} -s decoder test_jpeg_scan_limit
     # test default limit for one case
-    ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit.1
+    ${python_new_invoke_test} -s decoder test_jpeg_scan_limit.ProgressiveJpeg.test_scans_limit:1
 }
 
 test_py() {
diff --git a/qa/TL1_tensorflow_dataset/test_impl.sh b/qa/TL1_tensorflow_dataset/test_impl.sh
index c8d3c90f652..4180e1261b5 100755
--- a/qa/TL1_tensorflow_dataset/test_impl.sh
+++ b/qa/TL1_tensorflow_dataset/test_impl.sh
@@ -17,7 +17,7 @@ test_body() {
         ${python_new_invoke_test} test_dali_tf_dataset_graph._test_tf_dataset_multigpu_manual_placement
         ${python_new_invoke_test} test_dali_tf_dataset_eager._test_tf_dataset_other_gpu
         ${python_new_invoke_test} test_dali_tf_dataset_eager._test_tf_dataset_multigpu_manual_placement
-        ${python_new_invoke_test} test_dali_tf_dataset_eager._test_tf_dataset_multigpu_mirrored_strategy
+        ${python_new_invoke_test} test_dali_tf_dataset_eager.TestTFDatasetMultiGPU._test_tf_dataset_multigpu_mirrored_strategy
         ${python_new_invoke_test} test_dali_tf_dataset_mnist_eager
         ${python_new_invoke_test} test_dali_tf_dataset_mnist_graph
 
diff --git a/qa/test_template_impl.sh b/qa/test_template_impl.sh
index a7f7c1d71b1..4047c44e6e2 100755
--- a/qa/test_template_impl.sh
+++ b/qa/test_template_impl.sh
@@ -23,7 +23,7 @@ python_new_test_runner="python -m nose2"
 python_new_test_args="--verbose --plugin=nose2_test_timer.plugin --with-timer --timer-color --timer-top-n 20"
 python_new_invoke_test="${python_new_test_runner} ${python_new_test_args}"
 
-# Set proper CUDA version for packages, like MXNet, requiring it
+# Set the proper CUDA version for packages that require it
 pip_packages=$(eval "echo \"${pip_packages}\"" | sed "s/##CUDA_VERSION##/${CUDA_VERSION}/")
 last_config_index=$($topdir/qa/setup_packages.py -n -u $pip_packages --cuda ${CUDA_VERSION})
 

From 512cc6c841a2abd5d9c30dd698b545b00a0da6d4 Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Tue, 10 Mar 2026 07:14:08 +0100
Subject: [PATCH 10/19] Fix: mark numba conditional tests with sanitizer_skip

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 dali/test/python/operator_1/test_numba_func.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dali/test/python/operator_1/test_numba_func.py b/dali/test/python/operator_1/test_numba_func.py
index ab95c9f3ce7..41644044f0d 100644
--- a/dali/test/python/operator_1/test_numba_func.py
+++ b/dali/test/python/operator_1/test_numba_func.py
@@ -322,6 +322,7 @@ def test_numba_func(
             expected_out,
         )
 
+    @attr("sanitizer_skip")
     def test_numba_func_with_cond(self):
         # When the function is not converted, the numba still works with no issues.
         # AG conversion or using a complex enough decorator would break this.
@@ -341,6 +342,7 @@ def test_numba_func_with_cond(self):
             enable_conditionals=True,
         )
 
+    @attr("sanitizer_skip")
     def test_numba_func_with_cond_do_not_convert(self):
         # Test if do_not_convert decorated functions still work.
         _testimpl_numba_func(

From 80847755ebc7980c5a672a3d4d24b6bfc314ec16 Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Thu, 12 Mar 2026 17:31:24 +0100
Subject: [PATCH 11/19] Review fix: merge numba test classes into TestNumbaFuncCPU/GPU

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 .../test/python/operator_1/test_numba_func.py | 1030 ++++++++---------
 1 file changed, 488 insertions(+), 542 deletions(-)

diff --git a/dali/test/python/operator_1/test_numba_func.py b/dali/test/python/operator_1/test_numba_func.py
index 41644044f0d..e464ee69406 100644
--- a/dali/test/python/operator_1/test_numba_func.py
+++ b/dali/test/python/operator_1/test_numba_func.py
@@ -203,267 +203,6 @@ def numba_func_pipe(
             assert np.array_equal(out_arr, expected_out[i])
 
 
-class TestNumbaFuncCPU:
-    def setUp(self):
-        check_numba_compatibility_cpu()
-
-    @attr("sanitizer_skip")
-    @params(
-        (
-            [(10, 10, 10)],
-            np.bool_,
-            set_all_values_to_1_batch,
-            [dali_types.BOOL],
-            [dali_types.BOOL],
-            [3],
-            [3],
-            None,
-            True,
-            [np.full((10, 10, 10), 1, dtype=np.bool_)],
-        ),
-        (
-            [(10, 10, 10)],
-            np.uint8,
-            set_all_values_to_255_batch,
-            [dali_types.UINT8],
-            [dali_types.UINT8],
-            [3],
-            [3],
-            None,
-            True,
-            [np.full((10, 10, 10), 255, dtype=np.uint8)],
-        ),
-        (
-            [(10, 10, 10)],
-            np.uint8,
-            set_all_values_to_255_sample,
-            [dali_types.UINT8],
-            [dali_types.UINT8],
-            [3],
-            [3],
-            None,
-            None,
-            [np.full((10, 10, 10), 255, dtype=np.uint8)],
-        ),
-        (
-            [(10, 10, 10)],
-            np.float32,
-            set_all_values_to_float_batch,
-            [dali_types.FLOAT],
-            [dali_types.FLOAT],
-            [3],
-            [3],
-            None,
-            True,
-            [np.full((10, 10, 10), 0.5, dtype=np.float32)],
-        ),
-        (
-            [(10, 10, 10)],
-            np.float32,
-            set_all_values_to_float_sample,
-            [dali_types.FLOAT],
-            [dali_types.FLOAT],
-            [3],
-            [3],
-            None,
-            None,
-            [np.full((10, 10, 10), 0.5, dtype=np.float32)],
-        ),
-        (
-            [(10, 20, 30), (20, 10, 30)],
-            np.int64,
-            change_out_shape_batch,
-            [dali_types.INT64],
-            [dali_types.INT64],
-            [3],
-            [3],
-            setup_change_out_shape,
-            True,
-            [np.full((20, 30, 10), 42, dtype=np.int32), np.full((10, 30, 20), 42, dtype=np.int32)],
-        ),
-        (
-            [(10, 20, 30), (20, 10, 30)],
-            np.int64,
-            change_out_shape_sample,
-            [dali_types.INT64],
-            [dali_types.INT64],
-            [3],
-            [3],
-            setup_change_out_shape,
-            None,
-            [np.full((20, 30, 10), 42, dtype=np.int32), np.full((10, 30, 20), 42, dtype=np.int32)],
-        ),
-    )
-    def test_numba_func(
-        self,
-        shape,
-        dtype,
-        run_fn,
-        out_types,
-        in_types,
-        outs_ndim,
-        ins_ndim,
-        setup_fn,
-        batch_processing,
-        expected_out,
-    ):
-        device = "cpu"
-        _testimpl_numba_func(
-            device,
-            shape,
-            dtype,
-            run_fn,
-            out_types,
-            in_types,
-            outs_ndim,
-            ins_ndim,
-            setup_fn,
-            batch_processing,
-            expected_out,
-        )
-
-    @attr("sanitizer_skip")
-    def test_numba_func_with_cond(self):
-        # When the function is not converted, the numba still works with no issues.
-        # AG conversion or using a complex enough decorator would break this.
-        # TODO(klecki): Can we add any additional safeguards?
-        _testimpl_numba_func(
-            device="cpu",
-            shapes=[(10, 10, 10)],
-            dtype=np.uint8,
-            run_fn=set_all_values_to_255_batch,
-            out_types=[dali_types.UINT8],
-            in_types=[dali_types.UINT8],
-            outs_ndim=[3],
-            ins_ndim=[3],
-            setup_fn=None,
-            batch_processing=True,
-            expected_out=[np.full((10, 10, 10), 255, dtype=np.uint8)],
-            enable_conditionals=True,
-        )
-
-    @attr("sanitizer_skip")
-    def test_numba_func_with_cond_do_not_convert(self):
-        # Test if do_not_convert decorated functions still work.
-        _testimpl_numba_func(
-            device="cpu",
-            shapes=[(10, 10, 10)],
-            dtype=np.uint8,
-            run_fn=do_not_convert(set_all_values_to_255_batch),
-            out_types=[dali_types.UINT8],
-            in_types=[dali_types.UINT8],
-            outs_ndim=[3],
-            ins_ndim=[3],
-            setup_fn=None,
-            batch_processing=True,
-            expected_out=[np.full((10, 10, 10), 255, dtype=np.uint8)],
-            enable_conditionals=True,
-        )
-
-
-class TestNumbaFuncGPU:
-    def setUp(self):
-        check_numba_compatibility_gpu()
-
-    @attr("sanitizer_skip")
-    @params(
-        (
-            [(10, 10, 10)],
-            np.bool_,
-            set_all_values_to_1_sample_gpu,
-            [dali_types.BOOL],
-            [dali_types.BOOL],
-            [3],
-            [3],
-            None,
-            None,
-            [np.full((10, 10, 10), 1, dtype=np.bool_)],
-        ),
-        (
-            [(10, 10, 10)],
-            np.uint8,
-            set_all_values_to_255_sample_gpu,
-            [dali_types.UINT8],
-            [dali_types.UINT8],
-            [3],
-            [3],
-            None,
-            None,
-            [np.full((10, 10, 10), 255, dtype=np.uint8)],
-        ),
-        (
-            [(10, 10, 10)],
-            np.float32,
-            set_all_values_to_float_sample_gpu,
-            [dali_types.FLOAT],
-            [dali_types.FLOAT],
-            [3],
-            [3],
-            None,
-            None,
-            [np.full((10, 10, 10), 0.5, dtype=np.float32)],
-        ),
-        (
-            [(100, 20, 30), (20, 100, 30)],
-            np.int64,
-            change_out_shape_sample_gpu,
-            [dali_types.INT64],
-            [dali_types.INT64],
-            [3],
-            [3],
-            setup_change_out_shape,
-            None,
-            [
-                np.full((20, 30, 100), 42, dtype=np.int32),
-                np.full((100, 30, 20), 42, dtype=np.int32),
-            ],
-        ),
-        (
-            [(20), (30)],
-            np.int32,
-            change_ndim_gpu,
-            [dali_types.INT32],
-            [dali_types.INT32],
-            [4],
-            [1],
-            change_ndim_setup,
-            None,
-            [change_dim_expected_out(20), change_dim_expected_out(30)],
-        ),
-    )
-    def test_numba_func_gpu(
-        self,
-        shape,
-        dtype,
-        run_fn,
-        out_types,
-        in_types,
-        outs_ndim,
-        ins_ndim,
-        setup_fn,
-        batch_processing,
-        expected_out,
-    ):
-        device = "gpu"
-        blocks = [32, 32, 1]
-        threads_per_block = [32, 16, 1]
-        _testimpl_numba_func(
-            device,
-            shape,
-            dtype,
-            run_fn,
-            out_types,
-            in_types,
-            outs_ndim,
-            ins_ndim,
-            setup_fn,
-            batch_processing,
-            expected_out,
-            blocks,
-            threads_per_block,
-        )
-
-
 @pipeline_def
 def numba_func_image_pipe(
     device="cpu",
@@ -583,134 +322,6 @@ def rot_image_setup(outs, ins):
         out0[sample_id][2] = in0[sample_id][2]
 
 
-class TestNumbaFuncImageCPU:
-    def setUp(self):
-        check_numba_compatibility_cpu()
-
-    @attr("sanitizer_skip")
-    @params(
-        (
-            reverse_col_batch,
-            [dali_types.UINT8],
-            [dali_types.UINT8],
-            [3],
-            [3],
-            None,
-            True,
-            lambda x: 255 - x,
-        ),
-        (
-            reverse_col_sample,
-            [dali_types.UINT8],
-            [dali_types.UINT8],
-            [3],
-            [3],
-            None,
-            None,
-            lambda x: 255 - x,
-        ),
-        (
-            rot_image_batch,
-            [dali_types.UINT8],
-            [dali_types.UINT8],
-            [3],
-            [3],
-            rot_image_setup,
-            True,
-            lambda x: np.rot90(x),
-        ),
-        (
-            rot_image_sample,
-            [dali_types.UINT8],
-            [dali_types.UINT8],
-            [3],
-            [3],
-            rot_image_setup,
-            None,
-            lambda x: np.rot90(x),
-        ),
-    )
-    def test_numba_func_image(
-        self,
-        run_fn,
-        out_types,
-        in_types,
-        outs_ndim,
-        ins_ndim,
-        setup_fn,
-        batch_processing,
-        transform,
-    ):
-        device = "cpu"
-        _testimpl_numba_func_image(
-            device,
-            run_fn,
-            out_types,
-            in_types,
-            outs_ndim,
-            ins_ndim,
-            setup_fn,
-            batch_processing,
-            transform,
-        )
-
-
-class TestNumbaFuncImageGPU:
-    def setUp(self):
-        check_numba_compatibility_gpu()
-
-    @attr("sanitizer_skip")
-    @params(
-        (
-            reverse_col_sample_gpu,
-            [dali_types.UINT8],
-            [dali_types.UINT8],
-            [3],
-            [3],
-            None,
-            None,
-            lambda x: 255 - x,
-        ),
-        (
-            rot_image_sample_gpu,
-            [dali_types.UINT8],
-            [dali_types.UINT8],
-            [3],
-            [3],
-            rot_image_setup,
-            None,
-            np.rot90,
-        ),
-    )
-    def test_numba_func_image_gpu(
-        self,
-        run_fn,
-        out_types,
-        in_types,
-        outs_ndim,
-        ins_ndim,
-        setup_fn,
-        batch_processing,
-        transform,
-    ):
-        device = "gpu"
-        blocks = [32, 32, 1]
-        threads_per_block = [32, 8, 1]
-        _testimpl_numba_func_image(
-            device,
-            run_fn,
-            out_types,
-            in_types,
-            outs_ndim,
-            ins_ndim,
-            setup_fn,
-            batch_processing,
-            transform,
-            blocks,
-            threads_per_block,
-        )
-
-
 def split_images_col_sample(out0, out1, out2, in0):
     for i in range(in0.shape[0]):
         for j in range(in0.shape[1]):
@@ -775,63 +386,6 @@ def numba_func_split_image_pipe(
     return images_in, out0, out1, out2
 
 
-class TestSplitImagesCol:
-    def setUp(self):
-        check_numba_compatibility_cpu()
-
-    @attr("sanitizer_skip")
-    def test_split_images_col(self):
-        pipe = numba_func_split_image_pipe(
-            batch_size=8,
-            num_threads=1,
-            device_id=0,
-            run_fn=split_images_col_sample,
-            setup_fn=setup_split_images_col,
-            out_types=[dali_types.UINT8 for i in range(3)],
-            in_types=[dali_types.UINT8],
-            outs_ndim=[2, 2, 2],
-            ins_ndim=[3],
-            device="cpu",
-        )
-        for _ in range(3):
-            images_in, R, G, B = pipe.run()
-            for i in range(len(images_in)):
-                assert np.array_equal(
-                    images_in.at(i), np.stack([R.at(i), G.at(i), B.at(i)], axis=2)
-                )
-
-
-class TestSplitImagesColGPU:
-    def setUp(self):
-        check_numba_compatibility_gpu()
-
-    @attr("sanitizer_skip")
-    def test_split_images_col_gpu(self):
-        blocks = [32, 32, 1]
-        threads_per_block = [32, 8, 1]
-        pipe = numba_func_split_image_pipe(
-            batch_size=8,
-            num_threads=1,
-            device_id=0,
-            run_fn=split_images_col_sample_gpu,
-            setup_fn=setup_split_images_col,
-            out_types=[dali_types.UINT8 for i in range(3)],
-            in_types=[dali_types.UINT8],
-            outs_ndim=[2, 2, 2],
-            ins_ndim=[3],
-            device="gpu",
-            blocks=blocks,
-            threads_per_block=threads_per_block,
-        )
-        for _ in range(3):
-            images_in, R, G, B = pipe.run()
-            for i in range(len(images_in)):
-                assert np.array_equal(
-                    to_array(images_in[i]),
-                    np.stack([to_array(R[i]), to_array(G[i]), to_array(B[i])], axis=2),
-                )
-
-
 def multiple_ins_setup(outs, ins):
     out0 = outs[0]
     in0 = ins[0]
@@ -896,62 +450,6 @@ def numba_multiple_ins_pipe(
     )
 
 
-class TestMultipleIns:
-    def setUp(self):
-        check_numba_compatibility_cpu()
-
-    @attr("sanitizer_skip")
-    def test_multiple_ins(self):
-        pipe = numba_multiple_ins_pipe(
-            shapes=[(10, 10)],
-            dtype=np.uint8,
-            batch_size=8,
-            num_threads=1,
-            device_id=0,
-            run_fn=multiple_ins_run,
-            setup_fn=multiple_ins_setup,
-            out_types=[dali_types.UINT8],
-            in_types=[dali_types.UINT8 for i in range(3)],
-            outs_ndim=[3],
-            ins_ndim=[2, 2, 2],
-            device="cpu",
-        )
-        for _ in range(3):
-            outs = pipe.run()
-            out_arr = np.array(outs[0][0])
-            assert np.array_equal(out_arr, np.zeros((10, 10, 3), dtype=np.uint8))
-
-
-class TestMultipleInsGPU:
-    def setUp(self):
-        check_numba_compatibility_gpu()
-
-    @attr("sanitizer_skip")
-    def test_multiple_ins_gpu(self):
-        blocks = [32, 32, 1]
-        threads_per_block = [32, 8, 1]
-        pipe = numba_multiple_ins_pipe(
-            shapes=[(10, 10)],
-            dtype=np.uint8,
-            batch_size=8,
-            num_threads=1,
-            device_id=0,
-            run_fn=multiple_ins_run_gpu,
-            setup_fn=multiple_ins_setup,
-            out_types=[dali_types.UINT8],
-            in_types=[dali_types.UINT8 for i in range(3)],
-            outs_ndim=[3],
-            ins_ndim=[2, 2, 2],
-            device="gpu",
-            blocks=blocks,
-            threads_per_block=threads_per_block,
-        )
-        for _ in range(3):
-            outs = pipe.run()
-            out_arr = to_array(outs[0][0])
-            assert np.array_equal(out_arr, np.zeros((10, 10, 3), dtype=np.uint8))
-
-
 def nonuniform_types_setup(outs, ins):
     out0 = outs[0]
     out1 = outs[1]
@@ -1013,50 +511,498 @@ def nonuniform_types_pipe(
     return images_in, out_img, out_shape
 
 
-class TestNonuniformTypes:
+class TestNumbaFuncCPU:
     def setUp(self):
         check_numba_compatibility_cpu()
 
     @attr("sanitizer_skip")
-    def test_nonuniform_types_cpu(self):
-        pipe = nonuniform_types_pipe(
-            batch_size=8,
-            num_threads=1,
-            device_id=0,
-            run_fn=nonuniform_types_run_cpu,
-            out_types=[dali_types.UINT8, dali_types.INT64],
-            in_types=[dali_types.UINT8],
-            outs_ndim=[3, 1],
-            ins_ndim=[3],
-            device="cpu",
-        )
-        for _ in range(3):
-            images_in, images_out, img_shape = pipe.run()
-            for i in range(len(images_in)):
-                assert np.array_equal(255 - images_in.at(i), images_out.at(i))
-                assert np.array_equal(images_out.at(i).shape, img_shape.at(i))
-
-
-class TestNonuniformTypesGPU:
-    def setUp(self):
-        check_numba_compatibility_gpu()
-
-    @attr("sanitizer_skip")
-    def test_nonuniform_types_gpu(self):
-        blocks = [16, 16, 1]
-        threads_per_block = [32, 16, 1]
-        pipe = nonuniform_types_pipe(
-            batch_size=8,
-            num_threads=1,
-            device_id=0,
-            run_fn=nonuniform_types_run_gpu,
-            out_types=[dali_types.UINT8, dali_types.INT64],
-            in_types=[dali_types.UINT8],
-            outs_ndim=[3, 1],
-            ins_ndim=[3],
-            device="gpu",
-            blocks=blocks,
-            threads_per_block=threads_per_block,
+    @params(
+        (
+            [(10, 10, 10)],
+            np.bool_,
+            set_all_values_to_1_batch,
+            [dali_types.BOOL],
+            [dali_types.BOOL],
+            [3],
+            [3],
+            None,
+            True,
+            [np.full((10, 10, 10), 1, dtype=np.bool_)],
+        ),
+        (
+            [(10, 10, 10)],
+            np.uint8,
+            set_all_values_to_255_batch,
+            [dali_types.UINT8],
+            [dali_types.UINT8],
+            [3],
+            [3],
+            None,
+            True,
+            [np.full((10, 10, 10), 255, dtype=np.uint8)],
+        ),
+        (
+            [(10, 10, 10)],
+            np.uint8,
+            set_all_values_to_255_sample,
+            [dali_types.UINT8],
+            [dali_types.UINT8],
+            [3],
+            [3],
+            None,
+            None,
+            [np.full((10, 10, 10), 255, dtype=np.uint8)],
+        ),
+        (
+            [(10, 10, 10)],
+            np.float32,
+            set_all_values_to_float_batch,
+            [dali_types.FLOAT],
+            [dali_types.FLOAT],
+            [3],
+            [3],
+            None,
+            True,
+            [np.full((10, 10, 10), 0.5, dtype=np.float32)],
+        ),
+        (
+            [(10, 10, 10)],
+            np.float32,
+            set_all_values_to_float_sample,
+            [dali_types.FLOAT],
+            [dali_types.FLOAT],
+            [3],
+            [3],
+            None,
+            None,
+            [np.full((10, 10, 10), 0.5, dtype=np.float32)],
+        ),
+        (
+            [(10, 20, 30), (20, 10, 30)],
+            np.int64,
+            change_out_shape_batch,
+            [dali_types.INT64],
+            [dali_types.INT64],
+            [3],
+            [3],
+            setup_change_out_shape,
+            True,
+            [np.full((20, 30, 10), 42, dtype=np.int32), np.full((10, 30, 20), 42, dtype=np.int32)],
+        ),
+        (
+            [(10, 20, 30), (20, 10, 30)],
+            np.int64,
+            change_out_shape_sample,
+            [dali_types.INT64],
+            [dali_types.INT64],
+            [3],
+            [3],
+            setup_change_out_shape,
+            None,
+            [np.full((20, 30, 10), 42, dtype=np.int32), np.full((10, 30, 20), 42, dtype=np.int32)],
+        ),
+    )
+    def test_numba_func(
+        self,
+        shape,
+        dtype,
+        run_fn,
+        out_types,
+        in_types,
+        outs_ndim,
+        ins_ndim,
+        setup_fn,
+        batch_processing,
+        expected_out,
+    ):
+        _testimpl_numba_func(
+            "cpu",
+            shape,
+            dtype,
+            run_fn,
+            out_types,
+            in_types,
+            outs_ndim,
+            ins_ndim,
+            setup_fn,
+            batch_processing,
+            expected_out,
+        )
+
+    @attr("sanitizer_skip")
+    def test_numba_func_with_cond(self):
+        # When the function is not converted, the numba still works with no issues.
+        # AG conversion or using a complex enough decorator would break this.
+        # TODO(klecki): Can we add any additional safeguards?
+        _testimpl_numba_func(
+            device="cpu",
+            shapes=[(10, 10, 10)],
+            dtype=np.uint8,
+            run_fn=set_all_values_to_255_batch,
+            out_types=[dali_types.UINT8],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[3],
+            ins_ndim=[3],
+            setup_fn=None,
+            batch_processing=True,
+            expected_out=[np.full((10, 10, 10), 255, dtype=np.uint8)],
+            enable_conditionals=True,
+        )
+
+    @attr("sanitizer_skip")
+    def test_numba_func_with_cond_do_not_convert(self):
+        # Test if do_not_convert decorated functions still work.
+        _testimpl_numba_func(
+            device="cpu",
+            shapes=[(10, 10, 10)],
+            dtype=np.uint8,
+            run_fn=do_not_convert(set_all_values_to_255_batch),
+            out_types=[dali_types.UINT8],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[3],
+            ins_ndim=[3],
+            setup_fn=None,
+            batch_processing=True,
+            expected_out=[np.full((10, 10, 10), 255, dtype=np.uint8)],
+            enable_conditionals=True,
+        )
+
+    @attr("sanitizer_skip")
+    @params(
+        (
+            reverse_col_batch,
+            [dali_types.UINT8],
+            [dali_types.UINT8],
+            [3],
+            [3],
+            None,
+            True,
+            lambda x: 255 - x,
+        ),
+        (
+            reverse_col_sample,
+            [dali_types.UINT8],
+            [dali_types.UINT8],
+            [3],
+            [3],
+            None,
+            None,
+            lambda x: 255 - x,
+        ),
+        (
+            rot_image_batch,
+            [dali_types.UINT8],
+            [dali_types.UINT8],
+            [3],
+            [3],
+            rot_image_setup,
+            True,
+            lambda x: np.rot90(x),
+        ),
+        (
+            rot_image_sample,
+            [dali_types.UINT8],
+            [dali_types.UINT8],
+            [3],
+            [3],
+            rot_image_setup,
+            None,
+            lambda x: np.rot90(x),
+        ),
+    )
+    def test_numba_func_image(
+        self,
+        run_fn,
+        out_types,
+        in_types,
+        outs_ndim,
+        ins_ndim,
+        setup_fn,
+        batch_processing,
+        transform,
+    ):
+        _testimpl_numba_func_image(
+            "cpu",
+            run_fn,
+            out_types,
+            in_types,
+            outs_ndim,
+            ins_ndim,
+            setup_fn,
+            batch_processing,
+            transform,
+        )
+
+    @attr("sanitizer_skip")
+    def test_split_images_col(self):
+        pipe = numba_func_split_image_pipe(
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=split_images_col_sample,
+            setup_fn=setup_split_images_col,
+            out_types=[dali_types.UINT8 for i in range(3)],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[2, 2, 2],
+            ins_ndim=[3],
+            device="cpu",
+        )
+        for _ in range(3):
+            images_in, R, G, B = pipe.run()
+            for i in range(len(images_in)):
+                assert np.array_equal(
+                    images_in.at(i), np.stack([R.at(i), G.at(i), B.at(i)], axis=2)
+                )
+
+    @attr("sanitizer_skip")
+    def test_multiple_ins(self):
+        pipe = numba_multiple_ins_pipe(
+            shapes=[(10, 10)],
+            dtype=np.uint8,
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=multiple_ins_run,
+            setup_fn=multiple_ins_setup,
+            out_types=[dali_types.UINT8],
+            in_types=[dali_types.UINT8 for i in range(3)],
+            outs_ndim=[3],
+            ins_ndim=[2, 2, 2],
+            device="cpu",
+        )
+        for _ in range(3):
+            outs = pipe.run()
+            out_arr = np.array(outs[0][0])
+            assert np.array_equal(out_arr, np.zeros((10, 10, 3), dtype=np.uint8))
+
+    @attr("sanitizer_skip")
+    def test_nonuniform_types(self):
+        pipe = nonuniform_types_pipe(
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=nonuniform_types_run_cpu,
+            out_types=[dali_types.UINT8, dali_types.INT64],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[3, 1],
+            ins_ndim=[3],
+            device="cpu",
+        )
+        for _ in range(3):
+            images_in, images_out, img_shape = pipe.run()
+            for i in range(len(images_in)):
+                assert np.array_equal(255 - images_in.at(i), images_out.at(i))
+                assert np.array_equal(images_out.at(i).shape, img_shape.at(i))
+
+
+class TestNumbaFuncGPU:
+    def setUp(self):
+        check_numba_compatibility_gpu()
+
+    @attr("sanitizer_skip")
+    @params(
+        (
+            [(10, 10, 10)],
+            np.bool_,
+            set_all_values_to_1_sample_gpu,
+            [dali_types.BOOL],
+            [dali_types.BOOL],
+            [3],
+            [3],
+            None,
+            None,
+            [np.full((10, 10, 10), 1, dtype=np.bool_)],
+        ),
+        (
+            [(10, 10, 10)],
+            np.uint8,
+            set_all_values_to_255_sample_gpu,
+            [dali_types.UINT8],
+            [dali_types.UINT8],
+            [3],
+            [3],
+            None,
+            None,
+            [np.full((10, 10, 10), 255, dtype=np.uint8)],
+        ),
+        (
+            [(10, 10, 10)],
+            np.float32,
+            set_all_values_to_float_sample_gpu,
+            [dali_types.FLOAT],
+            [dali_types.FLOAT],
+            [3],
+            [3],
+            None,
+            None,
+            [np.full((10, 10, 10), 0.5, dtype=np.float32)],
+        ),
+        (
+            [(100, 20, 30), (20, 100, 30)],
+            np.int64,
+            change_out_shape_sample_gpu,
+            [dali_types.INT64],
+            [dali_types.INT64],
+            [3],
+            [3],
+            setup_change_out_shape,
+            None,
+            [
+                np.full((20, 30, 100), 42, dtype=np.int32),
+                np.full((100, 30, 20), 42, dtype=np.int32),
+            ],
+        ),
+        (
+            [(20), (30)],
+            np.int32,
+            change_ndim_gpu,
+            [dali_types.INT32],
+            [dali_types.INT32],
+            [4],
+            [1],
+            change_ndim_setup,
+            None,
+            [change_dim_expected_out(20), change_dim_expected_out(30)],
+        ),
+    )
+    def test_numba_func(
+        self,
+        shape,
+        dtype,
+        run_fn,
+        out_types,
+        in_types,
+        outs_ndim,
+        ins_ndim,
+        setup_fn,
+        batch_processing,
+        expected_out,
+    ):
+        _testimpl_numba_func(
+            "gpu",
+            shape,
+            dtype,
+            run_fn,
+            out_types,
+            in_types,
+            outs_ndim,
+            ins_ndim,
+            setup_fn,
+            batch_processing,
+            expected_out,
+            blocks=[32, 32, 1],
+            threads_per_block=[32, 16, 1],
+        )
+
+    @attr("sanitizer_skip")
+    @params(
+        (
+            reverse_col_sample_gpu,
+            [dali_types.UINT8],
+            [dali_types.UINT8],
+            [3],
+            [3],
+            None,
+            None,
+            lambda x: 255 - x,
+        ),
+        (
+            rot_image_sample_gpu,
+            [dali_types.UINT8],
+            [dali_types.UINT8],
+            [3],
+            [3],
+            rot_image_setup,
+            None,
+            np.rot90,
+        ),
+    )
+    def test_numba_func_image(
+        self,
+        run_fn,
+        out_types,
+        in_types,
+        outs_ndim,
+        ins_ndim,
+        setup_fn,
+        batch_processing,
+        transform,
+    ):
+        _testimpl_numba_func_image(
+            "gpu",
+            run_fn,
+            out_types,
+            in_types,
+            outs_ndim,
+            ins_ndim,
+            setup_fn,
+            batch_processing,
+            transform,
+            blocks=[32, 32, 1],
+            threads_per_block=[32, 8, 1],
+        )
+
+    @attr("sanitizer_skip")
+    def test_split_images_col(self):
+        pipe = numba_func_split_image_pipe(
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=split_images_col_sample_gpu,
+            setup_fn=setup_split_images_col,
+            out_types=[dali_types.UINT8 for i in range(3)],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[2, 2, 2],
+            ins_ndim=[3],
+            device="gpu",
+            blocks=[32, 32, 1],
+            threads_per_block=[32, 8, 1],
+        )
+        for _ in range(3):
+            images_in, R, G, B = pipe.run()
+            for i in range(len(images_in)):
+                assert np.array_equal(
+                    to_array(images_in[i]),
+                    np.stack([to_array(R[i]), to_array(G[i]), to_array(B[i])], axis=2),
+                )
+
+    @attr("sanitizer_skip")
+    def test_multiple_ins(self):
+        pipe = numba_multiple_ins_pipe(
+            shapes=[(10, 10)],
+            dtype=np.uint8,
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=multiple_ins_run_gpu,
+            setup_fn=multiple_ins_setup,
+            out_types=[dali_types.UINT8],
+            in_types=[dali_types.UINT8 for i in range(3)],
+            outs_ndim=[3],
+            ins_ndim=[2, 2, 2],
+            device="gpu",
+            blocks=[32, 32, 1],
+            threads_per_block=[32, 8, 1],
+        )
+        for _ in range(3):
+            outs = pipe.run()
+            out_arr = to_array(outs[0][0])
+            assert np.array_equal(out_arr, np.zeros((10, 10, 3), dtype=np.uint8))
+
+    @attr("sanitizer_skip")
+    def test_nonuniform_types(self):
+        pipe = nonuniform_types_pipe(
+            batch_size=8,
+            num_threads=1,
+            device_id=0,
+            run_fn=nonuniform_types_run_gpu,
+            out_types=[dali_types.UINT8, dali_types.INT64],
+            in_types=[dali_types.UINT8],
+            outs_ndim=[3, 1],
+            ins_ndim=[3],
+            device="gpu",
+            blocks=[16, 16, 1],
+            threads_per_block=[32, 16, 1],
         )
         for _ in range(3):
             images_in, images_out, img_shape = pipe.run()

From ccaf8c43255f98a57c3bdeaacbdc8a259c1f7c37 Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Fri, 13 Mar 2026 12:28:39 +0100
Subject: [PATCH 12/19] Review fix: document -A parsing copy, pass np.rot90 directly

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 dali/test/python/nose2_attrib_generators.py    | 12 +++++++++---
 dali/test/python/operator_1/test_numba_func.py |  4 ++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/dali/test/python/nose2_attrib_generators.py b/dali/test/python/nose2_attrib_generators.py
index 389fd63a50d..dd08c1fce25 100644
--- a/dali/test/python/nose2_attrib_generators.py
+++ b/dali/test/python/nose2_attrib_generators.py
@@ -46,12 +46,18 @@ def _get_attrib_plugin(self):
     def _build_attribs_list(self, attrib_plugin):
         """Build the attribs list from the attrib plugin's -A configuration.
 
-        This replicates the logic from AttributeSelector.moduleLoadedSuite
-        for -A filters only (not -E eval filters).
+        NOTE: This intentionally replicates the -A parsing logic from
+        nose2's AttributeSelector.moduleLoadedSuite (nose2/plugins/attrib.py).
+        nose2 does not cache a pre-parsed form of attrib_plugin.attribs; the
+        raw -A strings are parsed on every moduleLoadedSuite call. Because we
+        need the parsed representation here (to call validateAttrib), we must
+        duplicate this parsing. If nose2 changes how it parses -A expressions
+        (e.g. adding quoting, ranges, or OR-groups), this copy must be updated
+        to match.
         """
         attribs = []
 
-        # Handle -A (attribute) filters
+        # Handle -A (attribute) filters — mirrors AttributeSelector.moduleLoadedSuite
         for attr in attrib_plugin.attribs:
             attr_group = []
             for attrib in attr.strip().split(","):
diff --git a/dali/test/python/operator_1/test_numba_func.py b/dali/test/python/operator_1/test_numba_func.py
index e464ee69406..41ff114cff0 100644
--- a/dali/test/python/operator_1/test_numba_func.py
+++ b/dali/test/python/operator_1/test_numba_func.py
@@ -697,7 +697,7 @@ def test_numba_func_with_cond_do_not_convert(self):
             [3],
             rot_image_setup,
             True,
-            lambda x: np.rot90(x),
+            np.rot90,
         ),
         (
             rot_image_sample,
@@ -707,7 +707,7 @@ def test_numba_func_with_cond_do_not_convert(self):
             [3],
             rot_image_setup,
             None,
-            lambda x: np.rot90(x),
+            np.rot90,
         ),
     )
     def test_numba_func_image(

From b031f79d5eb3b22c4c98b5df7a7d2454a2ffb59c Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Fri, 17 Apr 2026 16:00:15 +0200
Subject: [PATCH 13/19] Defer callback construction to test run time to avoid
 OOM at discovery

- nose_utils.py: consolidate double unittest import into single form
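- test_numba_func.py: expected outputs are passed as lightweight
  (shape, fill_value, dtype) or (callable, *args) descriptors and
  materialized by _make_expected_out at test run time
- test_dali_tf_es_pipelines.py: gen_tf_with_dali_external_source no
  longer takes test_run and yields plain parameter tuples (get_callback,
  is_batched, cycle, batch_info, dense, dev, es_dev, dtype, iter_limit)
  instead of pre-built es_args, so no data is created during test
  discovery
- test_dali_tf_dataset_eager.py, test_dali_tf_dataset_graph.py: build
  es_args (calling get_callback) inside the test at run time; in the
  graph tests the multi-input start values are kept as (shape, value,
  dtype) specs and turned into arrays inside the test
- test_external_source_parallel.py: _make_all_kinds_parallel_cases
  yields (epoch_size, ...) descriptors; ExtCallback/SampleCallbackBatched/
  SampleCallbackIterator are constructed inside test_all_kinds_parallel;
  test_cycle_raise builds its Iterable at run time; test_parallel_fork
  is renamed to _test_parallel_fork, always uses the "fork" start method
  and is invoked explicitly from the QA scripts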

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 dali/test/python/nose_utils.py                |  7 ++--
 .../test/python/operator_1/test_numba_func.py | 41 +++++++++++--------
 .../test/python/test_dali_tf_dataset_eager.py | 15 +++++--
 .../test/python/test_dali_tf_dataset_graph.py | 38 +++++++++--------
 dali/test/python/test_dali_tf_es_pipelines.py | 23 ++++++-----
 .../python/test_external_source_parallel.py   | 40 ++++++++----------
 .../test_body.sh                              |  1 +
 qa/TL0_python-self-test_tegra/test_body.sh    |  1 +
 8 files changed, 95 insertions(+), 71 deletions(-)

diff --git a/dali/test/python/nose_utils.py b/dali/test/python/nose_utils.py
index 3580cc011cc..ca94f511ee6 100644
--- a/dali/test/python/nose_utils.py
+++ b/dali/test/python/nose_utils.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from unittest import SkipTest  # noqa: F401
-import unittest
-import re
 import fnmatch
 import functools
+import re
+import unittest
+
+SkipTest = unittest.SkipTest  # noqa: F401  # re-exported for callers
 
 
 def attr(*tags):
diff --git a/dali/test/python/operator_1/test_numba_func.py b/dali/test/python/operator_1/test_numba_func.py
index 41ff114cff0..f1028c49b5f 100644
--- a/dali/test/python/operator_1/test_numba_func.py
+++ b/dali/test/python/operator_1/test_numba_func.py
@@ -123,6 +123,18 @@ def change_dim_expected_out(d):
     return np.array(list(range(d)) * 8).reshape(2, 2, 2, d)
 
 
+def _make_expected_out(spec):
+    """Build an expected-output array from a lightweight descriptor.
+
+    spec is either (shape, fill_value, dtype) for np.full, or
+    (callable, *args) to call the function at test run time.
+    """
+    if callable(spec[0]):
+        return spec[0](*spec[1:])
+    shape, fill_value, dtype = spec
+    return np.full(shape, fill_value, dtype=dtype)
+
+
 def get_data(shapes, dtype):
     return [np.empty(shape, dtype=dtype) for shape in shapes]
 
@@ -200,7 +212,7 @@ def numba_func_pipe(
         outs = pipe.run()
         for i in range(batch_size):
             out_arr = to_array(outs[0][i])
-            assert np.array_equal(out_arr, expected_out[i])
+            assert np.array_equal(out_arr, _make_expected_out(expected_out[i]))
 
 
 @pipeline_def
@@ -527,7 +539,7 @@ def setUp(self):
             [3],
             None,
             True,
-            [np.full((10, 10, 10), 1, dtype=np.bool_)],
+            [((10, 10, 10), 1, np.bool_)],
         ),
         (
             [(10, 10, 10)],
@@ -539,7 +551,7 @@ def setUp(self):
             [3],
             None,
             True,
-            [np.full((10, 10, 10), 255, dtype=np.uint8)],
+            [((10, 10, 10), 255, np.uint8)],
         ),
         (
             [(10, 10, 10)],
@@ -551,7 +563,7 @@ def setUp(self):
             [3],
             None,
             None,
-            [np.full((10, 10, 10), 255, dtype=np.uint8)],
+            [((10, 10, 10), 255, np.uint8)],
         ),
         (
             [(10, 10, 10)],
@@ -563,7 +575,7 @@ def setUp(self):
             [3],
             None,
             True,
-            [np.full((10, 10, 10), 0.5, dtype=np.float32)],
+            [((10, 10, 10), 0.5, np.float32)],
         ),
         (
             [(10, 10, 10)],
@@ -575,7 +587,7 @@ def setUp(self):
             [3],
             None,
             None,
-            [np.full((10, 10, 10), 0.5, dtype=np.float32)],
+            [((10, 10, 10), 0.5, np.float32)],
         ),
         (
             [(10, 20, 30), (20, 10, 30)],
@@ -587,7 +599,7 @@ def setUp(self):
             [3],
             setup_change_out_shape,
             True,
-            [np.full((20, 30, 10), 42, dtype=np.int32), np.full((10, 30, 20), 42, dtype=np.int32)],
+            [((20, 30, 10), 42, np.int32), ((10, 30, 20), 42, np.int32)],
         ),
         (
             [(10, 20, 30), (20, 10, 30)],
@@ -599,7 +611,7 @@ def setUp(self):
             [3],
             setup_change_out_shape,
             None,
-            [np.full((20, 30, 10), 42, dtype=np.int32), np.full((10, 30, 20), 42, dtype=np.int32)],
+            [((20, 30, 10), 42, np.int32), ((10, 30, 20), 42, np.int32)],
         ),
     )
     def test_numba_func(
@@ -811,7 +823,7 @@ def setUp(self):
             [3],
             None,
             None,
-            [np.full((10, 10, 10), 1, dtype=np.bool_)],
+            [((10, 10, 10), 1, np.bool_)],
         ),
         (
             [(10, 10, 10)],
@@ -823,7 +835,7 @@ def setUp(self):
             [3],
             None,
             None,
-            [np.full((10, 10, 10), 255, dtype=np.uint8)],
+            [((10, 10, 10), 255, np.uint8)],
         ),
         (
             [(10, 10, 10)],
@@ -835,7 +847,7 @@ def setUp(self):
             [3],
             None,
             None,
-            [np.full((10, 10, 10), 0.5, dtype=np.float32)],
+            [((10, 10, 10), 0.5, np.float32)],
         ),
         (
             [(100, 20, 30), (20, 100, 30)],
@@ -847,10 +859,7 @@ def setUp(self):
             [3],
             setup_change_out_shape,
             None,
-            [
-                np.full((20, 30, 100), 42, dtype=np.int32),
-                np.full((100, 30, 20), 42, dtype=np.int32),
-            ],
+            [((20, 30, 100), 42, np.int32), ((100, 30, 20), 42, np.int32)],
         ),
         (
             [(20), (30)],
@@ -862,7 +871,7 @@ def setUp(self):
             [1],
             change_ndim_setup,
             None,
-            [change_dim_expected_out(20), change_dim_expected_out(30)],
+            [(change_dim_expected_out, 20), (change_dim_expected_out, 30)],
         ),
     )
     def test_numba_func(
diff --git a/dali/test/python/test_dali_tf_dataset_eager.py b/dali/test/python/test_dali_tf_dataset_eager.py
index 8ab53534ec1..b7d1c1f7232 100644
--- a/dali/test/python/test_dali_tf_dataset_eager.py
+++ b/dali/test/python/test_dali_tf_dataset_eager.py
@@ -441,11 +441,20 @@ class TestTFWithDALIExternalSource:
     def setUp(self):
         skip_inputs_for_incompatible_tf()
 
-    @params(*gen_tf_with_dali_external_source(run_tf_with_dali_external_source))
+    @params(*gen_tf_with_dali_external_source())
     def test_tf_with_dali_external_source(
-        self, test_run, dev, es_args, es_dev, dtype, iter_limit, dense
+        self, get_callback, is_batched, cycle, batch_info, dense, dev, es_dev, dtype, iter_limit
     ):
-        test_run(dev, es_args, es_dev, dtype, iter_limit, dense)
+        bs = 12 if is_batched else None
+        es_args = {
+            "source": get_callback(dtype, iter_limit, bs, dense),
+            "batch": is_batched,
+            "cycle": cycle,
+            "batch_info": batch_info,
+        }
+        run_tf_with_dali_external_source(
+            dev, es_args, es_dev, tf.dtypes.as_dtype(dtype), iter_limit, dense
+        )
 
     def test_tf_dataset_layouts(self):
         for shape, layout in [((2, 3), "XY"), ((10, 20, 3), "HWC"), ((4, 128, 64, 3), "FHWC")]:
diff --git a/dali/test/python/test_dali_tf_dataset_graph.py b/dali/test/python/test_dali_tf_dataset_graph.py
index d35a9800e69..ee5d322a77f 100644
--- a/dali/test/python/test_dali_tf_dataset_graph.py
+++ b/dali/test/python/test_dali_tf_dataset_graph.py
@@ -210,26 +210,22 @@ def run_tf_dataset_multi_input(dev, start_values, input_names, batches):
     )
 
 
-start_values = [
-    [np.full((2, 4), -42, dtype=np.int64), np.full((3, 5), -123.0, dtype=np.float32)],
-    [np.full((3, 5), -3.14, dtype=np.float32)],
-    [
-        np.full((2, 4), -42, dtype=np.int64),
-        np.full((3, 5), -666.0, dtype=np.float32),
-        np.full((1, 7), 5, dtype=np.int8),
-    ],
+start_value_specs = [
+    [((2, 4), -42, np.int64), ((3, 5), -123.0, np.float32)],
+    [((3, 5), -3.14, np.float32)],
+    [((2, 4), -42, np.int64), ((3, 5), -666.0, np.float32), ((1, 7), 5, np.int8)],
 ]
 
-input_names = [["input_{}".format(i) for i, _ in enumerate(vals)] for vals in start_values]
+input_names = [["input_{}".format(i) for i in range(len(specs))] for specs in start_value_specs]
 
 
 def _generate_tf_dataset_multi_input_test_cases():
     cases = []
     for dev in ["cpu", "gpu"]:
-        for starts, names in zip(start_values, input_names):
-            cases.append((dev, starts, names, ["dataset" for _ in input_names]))
+        for specs, names in zip(start_value_specs, input_names):
+            cases.append((dev, specs, names, ["dataset" for _ in input_names]))
             for batches in list(itertools.product([True, False], repeat=len(input_names))):
-                cases.append((dev, starts, names, batches))
+                cases.append((dev, specs, names, batches))
     return cases
 
 
@@ -238,7 +234,8 @@ def setUp(self):
         skip_inputs_for_incompatible_tf()
 
     @params(*_generate_tf_dataset_multi_input_test_cases())
-    def test_tf_dataset_multi_input(self, dev, starts, names, batches):
+    def test_tf_dataset_multi_input(self, dev, start_specs, names, batches):
+        starts = [np.full(shape, val, dtype=dtype) for shape, val, dtype in start_specs]
         run_tf_dataset_multi_input(dev, starts, names, batches)
 
 
@@ -255,11 +252,20 @@ class TestTFWithDALIExternalSource:
     def setUp(self):
         skip_inputs_for_incompatible_tf()
 
-    @params(*gen_tf_with_dali_external_source(run_tf_with_dali_external_source))
+    @params(*gen_tf_with_dali_external_source())
     def test_tf_with_dali_external_source(
-        self, test_run, dev, es_args, es_dev, dtype, iter_limit, dense
+        self, get_callback, is_batched, cycle, batch_info, dense, dev, es_dev, dtype, iter_limit
     ):
-        test_run(dev, es_args, es_dev, dtype, iter_limit, dense)
+        bs = 12 if is_batched else None
+        es_args = {
+            "source": get_callback(dtype, iter_limit, bs, dense),
+            "batch": is_batched,
+            "cycle": cycle,
+            "batch_info": batch_info,
+        }
+        run_tf_with_dali_external_source(
+            dev, es_args, es_dev, tf.dtypes.as_dtype(dtype), iter_limit, dense
+        )
 
 
 tf_dataset_wrong_placement_error_msg = (
diff --git a/dali/test/python/test_dali_tf_es_pipelines.py b/dali/test/python/test_dali_tf_es_pipelines.py
index d2d328bc18a..94cf84fa3f1 100644
--- a/dali/test/python/test_dali_tf_es_pipelines.py
+++ b/dali/test/python/test_dali_tf_es_pipelines.py
@@ -221,19 +221,20 @@ def get_dense_options(is_batched):
         return [True]
 
 
-def gen_tf_with_dali_external_source(test_run):
+def gen_tf_with_dali_external_source():
     for dtype in [np.uint8, np.int32, np.float32]:
         for get_callback, is_batched, cycle, batch_info in es_configurations:
             for dense in get_dense_options(is_batched):
                 for dev, es_dev in [("cpu", "cpu"), ("gpu", "cpu"), ("gpu", "gpu")]:
                     for iter_limit in [3, 9, 10, 11, 100]:
-                        bs = 12 if is_batched else None
-                        es_args = {
-                            "source": get_callback(dtype, iter_limit, bs, dense),
-                            "batch": is_batched,
-                            "cycle": cycle,
-                            "batch_info": batch_info,
-                        }
-                        yield test_run, dev, es_args, es_dev, tf.dtypes.as_dtype(
-                            dtype
-                        ), iter_limit, dense
+                        yield (
+                            get_callback,
+                            is_batched,
+                            cycle,
+                            batch_info,
+                            dense,
+                            dev,
+                            es_dev,
+                            dtype,
+                            iter_limit,
+                        )
diff --git a/dali/test/python/test_external_source_parallel.py b/dali/test/python/test_external_source_parallel.py
index 3bf04e290fe..4d7c180c293 100644
--- a/dali/test/python/test_external_source_parallel.py
+++ b/dali/test/python/test_external_source_parallel.py
@@ -21,7 +21,6 @@
 from nose_utils import raises
 import unittest
 import functools
-import nvidia.dali.backend as _b
 
 
 def no_arg_fun():
@@ -180,11 +179,9 @@ def _test_parallel_fork_cpu_only(self):
             capture_processes(pipe1._py_pool)
             utils.compare_pipelines(pipe0, pipe1, batch_size, iters)
 
-    def test_parallel_fork(self):
+    def _test_parallel_fork(self):
         epoch_size = 250
         callback = utils.ExtCallback((4, 5), epoch_size, np.int32)
-        # if context is already initialized, use spawn to avoid fork wich will fail immediately
-        init_method = "fork" if not _b.IsDriverInitialized() else "spawn"
         pipes = [
             (
                 utils.create_pipe(
@@ -192,7 +189,7 @@ def test_parallel_fork(self):
                     "cpu",
                     batch_size,
                     py_num_workers=num_workers,
-                    py_start_method=init_method,
+                    py_start_method="fork",
                     parallel=True,
                 ),
                 utils.create_pipe(callback, "cpu", batch_size, parallel=False),
@@ -210,7 +207,7 @@ def test_parallel_fork(self):
                     "cpu",
                     32,
                     py_num_workers=1,
-                    py_start_method=init_method,
+                    py_start_method="fork",
                     parallel=True,
                     batch=True,
                 ),
@@ -492,15 +489,21 @@ def tearDown(self):
 
     @cartesian_params(
         [
-            (1, Iterable(BATCH_SIZE, (4, 5), epoch_size=1), False),
-            (4, Iterable(BATCH_SIZE, (4, 5), epoch_size=4), False),
+            (1, (4, 5), False),
+            (4, (4, 5), False),
             (1, generator_epoch_size_1, True),
             (4, generator_epoch_size_4, True),
         ],
         (1, 2, 6),
     )
     def test_cycle_raise(self, case_description, reader_queue_size):
-        epoch_size, cb, is_gen_fun = case_description
+        epoch_size, shape_or_gen_fun, is_gen_fun = case_description
+        if is_gen_fun:
+            cb = shape_or_gen_fun
+            refer_iter = cb()
+        else:
+            cb = Iterable(self.BATCH_SIZE, shape_or_gen_fun, epoch_size=epoch_size)
+            refer_iter = cb
         pipe = utils.create_pipe(
             cb,
             "cpu",
@@ -516,10 +519,6 @@ def test_cycle_raise(self, case_description, reader_queue_size):
         )
         pipe.build()
         capture_processes(pipe._py_pool)
-        if is_gen_fun:
-            refer_iter = cb()
-        else:
-            refer_iter = cb
         for _ in range(3):
             i = 0
             while True:
@@ -684,9 +683,6 @@ def _make_all_kinds_parallel_cases():
                 if trailing >= batch_size:
                     continue
                 epoch_size = num_iters * batch_size + trailing
-                sample_cb = utils.ExtCallback((4, 5), epoch_size, np.int32)
-                batch_cb = SampleCallbackBatched(sample_cb, batch_size, batch_info=True)
-                iterator_cb = SampleCallbackIterator(sample_cb, batch_size, batch_info=True)
                 for reader_queue_sizes in (
                     (1, 1, 1),
                     (2, 2, 2),
@@ -698,9 +694,7 @@ def _make_all_kinds_parallel_cases():
                     for num_workers in (1, 7):
                         cases.append(
                             (
-                                sample_cb,
-                                batch_cb,
-                                iterator_cb,
+                                epoch_size,
                                 batch_size,
                                 num_workers,
                                 reader_queue_sizes,
@@ -720,14 +714,16 @@ def tearDown(self):
     @params(*_make_all_kinds_parallel_cases())
     def test_all_kinds_parallel(
         self,
-        sample_cb,
-        batch_cb,
-        iterator_cb,
+        epoch_size,
         batch_size,
         num_workers,
         reader_queue_sizes,
         num_iters,
     ):
+        sample_cb = utils.ExtCallback((4, 5), epoch_size, np.int32)
+        batch_cb = SampleCallbackBatched(sample_cb, batch_size, batch_info=True)
+        iterator_cb = SampleCallbackIterator(sample_cb, batch_size, batch_info=True)
+
         @dali.pipeline_def(
             batch_size=batch_size,
             num_threads=4,
diff --git a/qa/TL0_python-self-test-readers-decoders/test_body.sh b/qa/TL0_python-self-test-readers-decoders/test_body.sh
index 5af20221c18..5d3c4f7a08c 100644
--- a/qa/TL0_python-self-test-readers-decoders/test_body.sh
+++ b/qa/TL0_python-self-test-readers-decoders/test_body.sh
@@ -19,6 +19,7 @@ test_py_with_framework() {
     # run this test explicitly as it needs not GPU context in the process
     if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
         ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' test_external_source_parallel.TestParallelFork._test_parallel_fork_cpu_only
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' test_external_source_parallel.TestParallelFork._test_parallel_fork
         ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' test_external_source_parallel_custom_serialization._test_no_pickling_in_forking_mode
     fi
 
diff --git a/qa/TL0_python-self-test_tegra/test_body.sh b/qa/TL0_python-self-test_tegra/test_body.sh
index 2dfe1ed8af0..c57043c10b3 100644
--- a/qa/TL0_python-self-test_tegra/test_body.sh
+++ b/qa/TL0_python-self-test_tegra/test_body.sh
@@ -28,6 +28,7 @@ test_py_with_framework() {
     done
 
     ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' test_external_source_parallel.TestParallelFork._test_parallel_fork_cpu_only
+    ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' test_external_source_parallel.TestParallelFork._test_parallel_fork
 
     XAVIER_OPERATOR_1_TESTS=""
     for test_script in $(ls operator_1/test_*.py); do

From 9a11b20371ec732518c7d43025ac4c4458cf98a6 Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Fri, 17 Apr 2026 21:37:37 +0200
Subject: [PATCH 14/19] Fix _make_expected_out crash when passed a pre-built
 ndarray

test_numba_func_with_cond tests passed np.full(...) directly in
expected_out and were not updated when the descriptor format was
introduced in 839dfb17a, causing a ValueError on unpack.

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 dali/test/python/operator_1/test_numba_func.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dali/test/python/operator_1/test_numba_func.py b/dali/test/python/operator_1/test_numba_func.py
index f1028c49b5f..0b63c7c60b2 100644
--- a/dali/test/python/operator_1/test_numba_func.py
+++ b/dali/test/python/operator_1/test_numba_func.py
@@ -126,9 +126,11 @@ def change_dim_expected_out(d):
 def _make_expected_out(spec):
     """Build an expected-output array from a lightweight descriptor.
 
-    spec is either (shape, fill_value, dtype) for np.full, or
-    (callable, *args) to call the function at test run time.
+    spec is either a numpy array (returned as-is), (shape, fill_value, dtype)
+    for np.full, or (callable, *args) to call the function at test run time.
     """
+    if isinstance(spec, np.ndarray):
+        return spec
     if callable(spec[0]):
         return spec[0](*spec[1:])
     shape, fill_value, dtype = spec

From 9083f2720736d8e1f7917fa0e63e094493fa9c20 Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Mon, 20 Apr 2026 15:22:23 +0200
Subject: [PATCH 15/19] Review fix: drop mxnet attribute filter, use ':'
 in params test id

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 qa/TL0_python-self-test-core/test_body.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/qa/TL0_python-self-test-core/test_body.sh b/qa/TL0_python-self-test-core/test_body.sh
index bb16187a154..799de09f0ba 100644
--- a/qa/TL0_python-self-test-core/test_body.sh
+++ b/qa/TL0_python-self-test-core/test_body.sh
@@ -30,9 +30,9 @@ test_py_with_framework() {
     done
 
     if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
-        ${python_new_invoke_test} -A "!slow,!pytorch,!mxnet,!cupy" test_dali_variable_batch_size
+        ${python_new_invoke_test} -A "!slow,!pytorch,!cupy" test_dali_variable_batch_size
     else
-        ${python_new_invoke_test} -A "!slow,!pytorch,!mxnet,!cupy,!numba" test_dali_variable_batch_size
+        ${python_new_invoke_test} -A "!slow,!pytorch,!cupy,!numba" test_dali_variable_batch_size
     fi
 
     ${python_new_invoke_test} -A '!slow,!pytorch,!cupy' test_backend_impl
@@ -93,7 +93,7 @@ test_checkpointing() {
         ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba,!sanitizer_skip' checkpointing.test_dali_checkpointing
 
         # External source tests are slow and Python-side mostly, but let's run just one of them
-        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' checkpointing.test_dali_checkpointing.test_external_source_checkpointing.1
+        ${python_new_invoke_test} -A '!slow,!pytorch,!cupy,!numba' checkpointing.test_dali_checkpointing.test_external_source_checkpointing:1
     fi
 }
 

From 84110e2073a6ea1c8744ea00a0a7f1beb1befb46 Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Mon, 20 Apr 2026 17:25:40 +0200
Subject: [PATCH 16/19] Consolidate TF dataset test classes sharing setUp into
 single class

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 .../test/python/test_dali_tf_dataset_eager.py | 293 ++++++++----------
 .../test/python/test_dali_tf_dataset_graph.py | 122 +++-----
 2 files changed, 171 insertions(+), 244 deletions(-)

diff --git a/dali/test/python/test_dali_tf_dataset_eager.py b/dali/test/python/test_dali_tf_dataset_eager.py
index b7d1c1f7232..c9dc491c931 100644
--- a/dali/test/python/test_dali_tf_dataset_eager.py
+++ b/dali/test/python/test_dali_tf_dataset_eager.py
@@ -18,7 +18,7 @@
 import nvidia.dali.plugin.tf as dali_tf
 from nvidia.dali.plugin.tf.experimental import Input
 from nvidia.dali import fn
-from nose2.tools import params
+from nose2.tools import params, cartesian_params
 from nose_utils import raises
 from test_dali_tf_dataset_pipelines import (
     FixedSampleIterator,
@@ -93,15 +93,6 @@ def _generate_tf_dataset_with_constant_input_test_cases():
     return cases
 
 
-class TestTFDatasetWithInputs:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @params(*_generate_tf_dataset_with_constant_input_test_cases())
-    def test_tf_dataset_with_constant_input(self, dev, shape, value, dtype, batch):
-        run_tf_dataset_with_constant_input(dev, shape, value, dtype, batch)
-
-
 def run_tf_dataset_with_random_input(dev, max_shape, dtype, batch="dataset"):
     min_shape = get_min_shape_helper(batch, max_shape)
     it = RandomSampleIterator(max_shape, dtype(0), min_shape=min_shape)
@@ -121,15 +112,6 @@ def run_tf_dataset_with_random_input(dev, max_shape, dtype, batch="dataset"):
 ]
 
 
-class TestTFDatasetWithRandomInput:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @params(*_tf_dataset_with_random_input_test_cases)
-    def test_tf_dataset_with_random_input(self, dev, max_shape, dtype, batch):
-        run_tf_dataset_with_random_input(dev, max_shape, dtype, batch)
-
-
 # Run with everything on GPU (External Source op as well)
 def run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch):
     min_shape = get_min_shape_helper(batch, max_shape)
@@ -149,15 +131,6 @@ def run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch):
 ]
 
 
-class TestTFDatasetWithRandomInputGPU:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @params(*_tf_dataset_with_random_input_gpu_test_cases)
-    def test_tf_dataset_with_random_input_gpu(self, max_shape, dtype, batch):
-        run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch)
-
-
 def run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy):
     get_pipeline_desc = external_source_tester(
         max_shape, dtype, RandomSampleIterator(max_shape, dtype(0)), es_dev, no_copy
@@ -181,15 +154,6 @@ def _generate_tf_dataset_with_no_copy_test_cases():
     return cases
 
 
-class TestTFDatasetWithNoCopy:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @params(*_generate_tf_dataset_with_no_copy_test_cases())
-    def test_tf_dataset_with_no_copy(self, max_shape, dtype, dataset_dev, es_dev, no_copy):
-        run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy)
-
-
 def run_tf_dataset_with_stop_iter(dev, max_shape, dtype, stop_samples):
     it1 = RandomSampleIterator(max_shape, dtype(0), start=0, stop=stop_samples)
     get_pipeline_desc = external_source_tester(max_shape, dtype, it1)
@@ -201,24 +165,6 @@ def run_tf_dataset_with_stop_iter(dev, max_shape, dtype, stop_samples):
     )
 
 
-class TestTFDatasetWithStopIter:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    def test_tf_dataset_with_stop_iter(self):
-        batch_size = 12
-        for dev in ["cpu", "gpu"]:
-            for max_shape in [(10, 20), (120, 120, 3), (3, 40, 40, 4)]:
-                for dtype in [np.uint8, np.int32, np.float32]:
-                    for iters in [1, 2, 3, 4, 5]:
-                        run_tf_dataset_with_stop_iter(
-                            dev,
-                            max_shape,
-                            dtype,
-                            iters * batch_size - 3,
-                        )
-
-
 def run_tf_dataset_multi_input(dev, start_values, input_names, batches):
     run_tf_dataset_eager_mode(
         dev,
@@ -250,43 +196,6 @@ def _generate_tf_dataset_multi_input_test_cases():
     return cases
 
 
-class TestTFDatasetMultiInput:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @params(*_generate_tf_dataset_multi_input_test_cases())
-    def test_tf_dataset_multi_input(self, dev, starts, names, batches):
-        run_tf_dataset_multi_input(dev, starts, names, batches)
-
-
-@raises(tf.errors.InternalError, glob="TF device and DALI device mismatch")
-def test_tf_dataset_wrong_placement_cpu():
-    batch_size = 12
-    num_threads = 4
-
-    pipeline = get_image_pipeline(batch_size, num_threads, "cpu", 0)
-
-    with tf.device("/gpu:0"):
-        dataset = get_dali_dataset_from_pipeline(pipeline, "gpu", 0)
-
-    for sample in dataset:
-        pass
-
-
-@raises(tf.errors.InternalError, glob="TF device and DALI device mismatch")
-def test_tf_dataset_wrong_placement_gpu():
-    batch_size = 12
-    num_threads = 4
-
-    pipeline = get_image_pipeline(batch_size, num_threads, "gpu", 0)
-
-    with tf.device("/cpu:0"):
-        dataset = get_dali_dataset_from_pipeline(pipeline, "cpu", 0)
-
-    for sample in dataset:
-        pass
-
-
 def check_basic_dataset_build(input_datasets):
     input_names = ["a", "b"]
     batches = ["dataset" for _ in input_names]
@@ -315,47 +224,13 @@ def check_tf_dataset_wrong_input_type(wrong_input_datasets):
     check_basic_dataset_build(wrong_input_datasets)
 
 
-class TestTFDatasetInputValidation:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    def test_tf_dataset_wrong_input_type(self):
-        input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
-        # wrong `input_datasets` type (no dictionary)
-        for wrong_input_dataset in ["a", input_dataset, [input_dataset]]:
-            check_tf_dataset_wrong_input_type(wrong_input_dataset)
-        # wrong values in dictionary
-        for wrong_input_dataset in ["str", [input_dataset]]:
-            check_tf_dataset_wrong_input_type(
-                {
-                    "a": wrong_input_dataset,
-                    "b": wrong_input_dataset,
-                }
-            )
-        # wrong keys in dictionary
-        for wrong_input_name in [42, ("a", "b")]:
-            check_tf_dataset_wrong_input_type({wrong_input_name: input_dataset})
-
-
-class TestTFDatasetExternalSourceValidation:
-    def setUp(self):
-        skip_for_incompatible_tf()
-
-    @raises(
-        ValueError,
-        glob="Found External Source nodes in the Pipeline, that were not assigned any inputs.",
-    )
-    def test_input_not_provided(self):
-        input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
-        check_basic_dataset_build({"a": input_dataset})
-
-    @raises(
-        ValueError,
-        glob="Did not find an External Source placeholder node * in the provided pipeline",
+def run_tf_with_dali_external_source(dev, es_args, ed_dev, dtype, *_):
+    run_tf_dataset_eager_mode(
+        dev,
+        get_pipeline_desc=get_external_source_pipe(es_args, dtype, ed_dev),
+        to_dataset=external_source_to_tf_dataset,
+        to_stop_iter=True,
     )
-    def test_missing_es_node(self):
-        input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
-        check_basic_dataset_build({"a": input_dataset, "b": input_dataset, "c": input_dataset})
 
 
 @pipeline_def(batch_size=10, num_threads=4, device_id=0)
@@ -379,10 +254,78 @@ def check_single_es_pipeline(kwargs, input_datasets):
         return dali_dataset
 
 
-class TestTFDatasetESParameters:
+def check_layout(kwargs, input_datasets, layout):
+    pipe = Pipeline(10, 4, 0)
+    with pipe:
+        input = fn.external_source(**kwargs)
+        # Rely on the Pad internal check to ensure that External Source set layout
+        pipe.set_outputs(fn.pad(input, axis_names=layout))
+
+    with tf.device("/cpu:0"):
+        dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
+            input_datasets=input_datasets,
+            pipeline=pipe,
+            batch_size=pipe.max_batch_size,
+            output_shapes=None,
+            output_dtypes=tf.int64,
+            num_threads=pipe.num_threads,
+            device_id=pipe.device_id,
+        )
+
+    run_dataset_eager_mode(dali_dataset, 10)
+
+
+class TestTFDatasetWithInputs:
     def setUp(self):
         skip_inputs_for_incompatible_tf()
 
+    @params(*_generate_tf_dataset_with_constant_input_test_cases())
+    def test_tf_dataset_with_constant_input(self, dev, shape, value, dtype, batch):
+        run_tf_dataset_with_constant_input(dev, shape, value, dtype, batch)
+
+    @params(*_tf_dataset_with_random_input_test_cases)
+    def test_tf_dataset_with_random_input(self, dev, max_shape, dtype, batch):
+        run_tf_dataset_with_random_input(dev, max_shape, dtype, batch)
+
+    @params(*_tf_dataset_with_random_input_gpu_test_cases)
+    def test_tf_dataset_with_random_input_gpu(self, max_shape, dtype, batch):
+        run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch)
+
+    @params(*_generate_tf_dataset_with_no_copy_test_cases())
+    def test_tf_dataset_with_no_copy(self, max_shape, dtype, dataset_dev, es_dev, no_copy):
+        run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy)
+
+    @cartesian_params(
+        ["cpu", "gpu"],
+        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
+        [np.uint8, np.int32, np.float32],
+        [1, 2, 3, 4, 5],
+    )
+    def test_tf_dataset_with_stop_iter(self, dev, max_shape, dtype, iters):
+        batch_size = 12
+        run_tf_dataset_with_stop_iter(dev, max_shape, dtype, iters * batch_size - 3)
+
+    @params(*_generate_tf_dataset_multi_input_test_cases())
+    def test_tf_dataset_multi_input(self, dev, starts, names, batches):
+        run_tf_dataset_multi_input(dev, starts, names, batches)
+
+    def test_tf_dataset_wrong_input_type(self):
+        input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
+        # wrong `input_datasets` type (no dictionary)
+        for wrong_input_dataset in ["a", input_dataset, [input_dataset]]:
+            check_tf_dataset_wrong_input_type(wrong_input_dataset)
+        # wrong values in dictionary
+        for wrong_input_dataset in ["str", [input_dataset]]:
+            check_tf_dataset_wrong_input_type(
+                {
+                    "a": wrong_input_dataset,
+                    "b": wrong_input_dataset,
+                }
+            )
+        # wrong keys in dictionary
+        for wrong_input_name in [42, ("a", "b")]:
+            check_tf_dataset_wrong_input_type({wrong_input_name: input_dataset})
+
     @raises(
         ValueError,
         glob="Did not find an External Source placeholder node * in the provided pipeline",
@@ -406,41 +349,6 @@ def test_tf_dataset_es_num_outputs_provided(self):
     def test_tf_dataset_disallowed_es(self):
         check_single_es_pipeline({}, {})
 
-
-def check_layout(kwargs, input_datasets, layout):
-    pipe = Pipeline(10, 4, 0)
-    with pipe:
-        input = fn.external_source(**kwargs)
-        # Rely on the Pad internal check to ensure that External Source set layout
-        pipe.set_outputs(fn.pad(input, axis_names=layout))
-
-    with tf.device("/cpu:0"):
-        dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
-            input_datasets=input_datasets,
-            pipeline=pipe,
-            batch_size=pipe.max_batch_size,
-            output_shapes=None,
-            output_dtypes=tf.int64,
-            num_threads=pipe.num_threads,
-            device_id=pipe.device_id,
-        )
-
-    run_dataset_eager_mode(dali_dataset, 10)
-
-
-def run_tf_with_dali_external_source(dev, es_args, ed_dev, dtype, *_):
-    run_tf_dataset_eager_mode(
-        dev,
-        get_pipeline_desc=get_external_source_pipe(es_args, dtype, ed_dev),
-        to_dataset=external_source_to_tf_dataset,
-        to_stop_iter=True,
-    )
-
-
-class TestTFWithDALIExternalSource:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
     @params(*gen_tf_with_dali_external_source())
     def test_tf_with_dali_external_source(
         self, get_callback, is_batched, cycle, batch_info, dense, dev, es_dev, dtype, iter_limit
@@ -467,6 +375,55 @@ def test_tf_dataset_layouts(self):
             check_layout({"name": "in"}, {"in": Input(in_dataset, layout=layout)}, layout)
 
 
+@raises(tf.errors.InternalError, glob="TF device and DALI device mismatch")
+def test_tf_dataset_wrong_placement_cpu():
+    batch_size = 12
+    num_threads = 4
+
+    pipeline = get_image_pipeline(batch_size, num_threads, "cpu", 0)
+
+    with tf.device("/gpu:0"):
+        dataset = get_dali_dataset_from_pipeline(pipeline, "gpu", 0)
+
+    for sample in dataset:
+        pass
+
+
+@raises(tf.errors.InternalError, glob="TF device and DALI device mismatch")
+def test_tf_dataset_wrong_placement_gpu():
+    batch_size = 12
+    num_threads = 4
+
+    pipeline = get_image_pipeline(batch_size, num_threads, "gpu", 0)
+
+    with tf.device("/cpu:0"):
+        dataset = get_dali_dataset_from_pipeline(pipeline, "cpu", 0)
+
+    for sample in dataset:
+        pass
+
+
+class TestTFDatasetExternalSourceValidation:
+    def setUp(self):
+        skip_for_incompatible_tf()
+
+    @raises(
+        ValueError,
+        glob="Found External Source nodes in the Pipeline, that were not assigned any inputs.",
+    )
+    def test_input_not_provided(self):
+        input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
+        check_basic_dataset_build({"a": input_dataset})
+
+    @raises(
+        ValueError,
+        glob="Did not find an External Source placeholder node * in the provided pipeline",
+    )
+    def test_missing_es_node(self):
+        input_dataset = tf.data.Dataset.from_tensors(np.full((2, 2), 42)).repeat()
+        check_basic_dataset_build({"a": input_dataset, "b": input_dataset, "c": input_dataset})
+
+
 # Test if the TypeError is raised for unsupported arguments for regular DALIDataset
 @raises(TypeError, glob="Dataset inputs are allowed only in *DALIDatasetWithInputs")
 def test_tf_experimental_inputs_disabled():
diff --git a/dali/test/python/test_dali_tf_dataset_graph.py b/dali/test/python/test_dali_tf_dataset_graph.py
index ee5d322a77f..63a7038742e 100644
--- a/dali/test/python/test_dali_tf_dataset_graph.py
+++ b/dali/test/python/test_dali_tf_dataset_graph.py
@@ -76,15 +76,6 @@ def _generate_tf_dataset_with_constant_input_test_cases():
     return cases
 
 
-class TestTFDatasetWithConstantInput:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @params(*_generate_tf_dataset_with_constant_input_test_cases())
-    def test_tf_dataset_with_constant_input(self, dev, shape, value, dtype, batch):
-        run_tf_dataset_with_constant_input(dev, shape, value, dtype, batch)
-
-
 def run_tf_dataset_with_random_input(dev, max_shape, dtype, batch):
     min_shape = get_min_shape_helper(batch, max_shape)
     iterator = RandomSampleIterator(max_shape, dtype(0), min_shape=min_shape)
@@ -97,20 +88,6 @@ def run_tf_dataset_with_random_input(dev, max_shape, dtype, batch):
     )
 
 
-class TestTFDatasetWithRandomInput:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @cartesian_params(
-        ["cpu", "gpu"],
-        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
-        [np.uint8, np.int32, np.float32],
-        ["dataset", True, False, None],
-    )
-    def test_tf_dataset_with_random_input(self, dev, max_shape, dtype, batch):
-        run_tf_dataset_with_random_input(dev, max_shape, dtype, batch)
-
-
 # Run with everything on GPU (External Source op as well)
 def run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch):
     min_shape = get_min_shape_helper(batch, max_shape)
@@ -124,19 +101,6 @@ def run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch):
     )
 
 
-class TestTFDatasetWithRandomInputGPU:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @cartesian_params(
-        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
-        [np.uint8, np.int32, np.float32],
-        ["dataset", True, False, None],
-    )
-    def test_tf_dataset_with_random_input_gpu(self, max_shape, dtype, batch):
-        run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch)
-
-
 def run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy):
     run_tf_dataset_graph(
         dataset_dev,
@@ -160,15 +124,6 @@ def _generate_tf_dataset_with_no_copy_test_cases():
     return cases
 
 
-class TestTFDatasetWithNoCopy:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @params(*_generate_tf_dataset_with_no_copy_test_cases())
-    def test_tf_dataset_with_no_copy(self, max_shape, dtype, dataset_dev, es_dev, no_copy):
-        run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy)
-
-
 def run_tf_dataset_with_stop_iter(dev, max_shape, dtype, stop_samples):
     run_tf_dataset_graph(
         dev,
@@ -182,26 +137,6 @@ def run_tf_dataset_with_stop_iter(dev, max_shape, dtype, stop_samples):
     )
 
 
-class TestTFDatasetWithStopIter:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @cartesian_params(
-        ["cpu", "gpu"],
-        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
-        [np.uint8, np.int32, np.float32],
-        [1, 2, 3, 4, 5],
-    )
-    def test_tf_dataset_with_stop_iter(self, dev, max_shape, dtype, iters):
-        batch_size = 12
-        run_tf_dataset_with_stop_iter(
-            dev,
-            max_shape,
-            dtype,
-            iters * batch_size - 3,
-        )
-
-
 def run_tf_dataset_multi_input(dev, start_values, input_names, batches):
     run_tf_dataset_graph(
         dev,
@@ -229,16 +164,6 @@ def _generate_tf_dataset_multi_input_test_cases():
     return cases
 
 
-class TestTFDatasetMultiInput:
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    @params(*_generate_tf_dataset_multi_input_test_cases())
-    def test_tf_dataset_multi_input(self, dev, start_specs, names, batches):
-        starts = [np.full(shape, val, dtype=dtype) for shape, val, dtype in start_specs]
-        run_tf_dataset_multi_input(dev, starts, names, batches)
-
-
 def run_tf_with_dali_external_source(dev, es_args, ed_dev, dtype, *_):
     run_tf_dataset_graph(
         dev,
@@ -248,10 +173,55 @@ def run_tf_with_dali_external_source(dev, es_args, ed_dev, dtype, *_):
     )
 
 
-class TestTFWithDALIExternalSource:
+class TestTFDatasetWithInputs:
     def setUp(self):
         skip_inputs_for_incompatible_tf()
 
+    @params(*_generate_tf_dataset_with_constant_input_test_cases())
+    def test_tf_dataset_with_constant_input(self, dev, shape, value, dtype, batch):
+        run_tf_dataset_with_constant_input(dev, shape, value, dtype, batch)
+
+    @cartesian_params(
+        ["cpu", "gpu"],
+        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
+        [np.uint8, np.int32, np.float32],
+        ["dataset", True, False, None],
+    )
+    def test_tf_dataset_with_random_input(self, dev, max_shape, dtype, batch):
+        run_tf_dataset_with_random_input(dev, max_shape, dtype, batch)
+
+    @cartesian_params(
+        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
+        [np.uint8, np.int32, np.float32],
+        ["dataset", True, False, None],
+    )
+    def test_tf_dataset_with_random_input_gpu(self, max_shape, dtype, batch):
+        run_tf_dataset_with_random_input_gpu(max_shape, dtype, batch)
+
+    @params(*_generate_tf_dataset_with_no_copy_test_cases())
+    def test_tf_dataset_with_no_copy(self, max_shape, dtype, dataset_dev, es_dev, no_copy):
+        run_tf_dataset_no_copy(max_shape, dtype, dataset_dev, es_dev, no_copy)
+
+    @cartesian_params(
+        ["cpu", "gpu"],
+        [(10, 20), (120, 120, 3), (3, 40, 40, 4)],
+        [np.uint8, np.int32, np.float32],
+        [1, 2, 3, 4, 5],
+    )
+    def test_tf_dataset_with_stop_iter(self, dev, max_shape, dtype, iters):
+        batch_size = 12
+        run_tf_dataset_with_stop_iter(
+            dev,
+            max_shape,
+            dtype,
+            iters * batch_size - 3,
+        )
+
+    @params(*_generate_tf_dataset_multi_input_test_cases())
+    def test_tf_dataset_multi_input(self, dev, start_specs, names, batches):
+        starts = [np.full(shape, val, dtype=dtype) for shape, val, dtype in start_specs]
+        run_tf_dataset_multi_input(dev, starts, names, batches)
+
     @params(*gen_tf_with_dali_external_source())
     def test_tf_with_dali_external_source(
         self, get_callback, is_batched, cycle, batch_info, dense, dev, es_dev, dtype, iter_limit

From c24561426059cfd28ac2f35e261045885435626a Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Mon, 20 Apr 2026 19:48:39 +0200
Subject: [PATCH 17/19] Fix missing comma in nose2 attribute filter

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 qa/TL0_python-self-test-core/test_body.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qa/TL0_python-self-test-core/test_body.sh b/qa/TL0_python-self-test-core/test_body.sh
index 799de09f0ba..8ce59202866 100644
--- a/qa/TL0_python-self-test-core/test_body.sh
+++ b/qa/TL0_python-self-test-core/test_body.sh
@@ -30,9 +30,9 @@ test_py_with_framework() {
     done
 
     if [ -z "$DALI_ENABLE_SANITIZERS" ]; then
-        ${python_new_invoke_test} -A "!slow,!pytorch!cupy" test_dali_variable_batch_size
+        ${python_new_invoke_test} -A "!slow,!pytorch,!cupy" test_dali_variable_batch_size
     else
-        ${python_new_invoke_test} -A "!slow,!pytorch!cupy,!numba" test_dali_variable_batch_size
+        ${python_new_invoke_test} -A "!slow,!pytorch,!cupy,!numba" test_dali_variable_batch_size
     fi
 
     ${python_new_invoke_test} -A '!slow,!pytorch,!cupy' test_backend_impl

From 8f41ff09fbe73638174b96edbda3bbd3a17fb320 Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Mon, 20 Apr 2026 20:00:13 +0200
Subject: [PATCH 18/19] Add missing file

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 dali/test/python/test_dali_tf_exec2.py | 95 +++++++++++++-------------
 1 file changed, 46 insertions(+), 49 deletions(-)

diff --git a/dali/test/python/test_dali_tf_exec2.py b/dali/test/python/test_dali_tf_exec2.py
index 0aa7ad0ebb7..5db421cae7c 100644
--- a/dali/test/python/test_dali_tf_exec2.py
+++ b/dali/test/python/test_dali_tf_exec2.py
@@ -43,55 +43,52 @@ def dali_exec2_pipeline():
     return output.cpu()
 
 
-class TestTFDatasetExec2(unittest.TestCase):
-    def setUp(self):
-        skip_inputs_for_incompatible_tf()
-
-    def test_tf_dataset_exec2(self):
-        """Test that exec_dynamic is propagated to DALI pipeline
-        by dali_tf.DALIDatasetWithInputs"""
-        # From Tensorflow's perspective, this is a CPU pipeline
-        with tf.device("/cpu:0"):
-            dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
-                pipeline=dali_exec2_pipeline(),
-                batch_size=5,
-                output_shapes=(5,),
-                output_dtypes=(tf.int32),
-                num_threads=4,
-                device_id=0,
-            )
-
-            @tf.function
-            def tf_function_with_conditionals(dali_dataset):
-                negative = tf.constant(0)
-                positive = tf.constant(0)
-                for input in dali_dataset:
-                    if tf.reduce_sum(input) < 0:
-                        negative = negative + 1
-                    else:
-                        positive = positive + 1
-                return negative, positive
-
-            pos, neg = tf_function_with_conditionals(dali_dataset.take(5))
-            # Eager mode: integers, graph mode: tensors, need to fetch value if it's Tensor
-            if (
-                tf.executing_eagerly() is False
-                or getattr(tf.compat.v1, "_eager_context", None) is not None
-            ):
-                # get concrete function and run in session for static graph mode
-                # fallback for session-based TF execution (e.g. when other test turned eager off)
-                try:
-                    from tensorflow.compat.v1 import Session
-                except ImportError:
-                    # Older TF versions don't have compat.v1 layer
-                    from tensorflow import Session
-
-                with Session() as sess:
-                    pos_val, neg_val = sess.run([pos, neg])
-            else:
-                pos_val, neg_val = pos, neg
-            assert pos_val == 3
-            assert neg_val == 2
+def test_tf_dataset_exec2():
+    """Test that exec_dynamic is propagated to DALI pipeline
+    by dali_tf.DALIDatasetWithInputs"""
+    skip_inputs_for_incompatible_tf()
+    # From Tensorflow's perspective, this is a CPU pipeline
+    with tf.device("/cpu:0"):
+        dali_dataset = dali_tf.experimental.DALIDatasetWithInputs(
+            pipeline=dali_exec2_pipeline(),
+            batch_size=5,
+            output_shapes=(5,),
+            output_dtypes=(tf.int32),
+            num_threads=4,
+            device_id=0,
+        )
+
+        @tf.function
+        def tf_function_with_conditionals(dali_dataset):
+            negative = tf.constant(0)
+            positive = tf.constant(0)
+            for input in dali_dataset:
+                if tf.reduce_sum(input) < 0:
+                    negative = negative + 1
+                else:
+                    positive = positive + 1
+            return negative, positive
+
+        neg, pos = tf_function_with_conditionals(dali_dataset.take(5))
+        # Eager mode: integers, graph mode: tensors, need to fetch value if it's Tensor
+        if (
+            tf.executing_eagerly() is False
+            or getattr(tf.compat.v1, "_eager_context", None) is not None
+        ):
+            # get concrete function and run in session for static graph mode
+            # fallback for session-based TF execution (e.g. when other test turned eager off)
+            try:
+                from tensorflow.compat.v1 import Session
+            except ImportError:
+                # Older TF versions don't have compat.v1 layer
+                from tensorflow import Session
+
+            with Session() as sess:
+                neg_val, pos_val = sess.run([neg, pos])
+        else:
+            neg_val, pos_val = neg, pos
+        assert neg_val == 3
+        assert pos_val == 2
 
 
 @pipeline_def(num_threads=4, exec_dynamic=True)

From 4a861c362076b63485574eda8117785a6e5aefc8 Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <jlisiecki@nvidia.com>
Date: Mon, 20 Apr 2026 20:30:18 +0200
Subject: [PATCH 19/19] Remove unused unittest import from test_dali_tf_exec2

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 dali/test/python/test_dali_tf_exec2.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dali/test/python/test_dali_tf_exec2.py b/dali/test/python/test_dali_tf_exec2.py
index 5db421cae7c..15457ca9660 100644
--- a/dali/test/python/test_dali_tf_exec2.py
+++ b/dali/test/python/test_dali_tf_exec2.py
@@ -21,7 +21,6 @@
 import nvidia.dali.plugin.tf as dali_tf
 from test_utils_tensorflow import skip_inputs_for_incompatible_tf
 from test_utils import get_dali_extra_path
-import unittest
 
 test_data_root = get_dali_extra_path()
 lmdb_folder = os.path.join(test_data_root, "db", "lmdb")