
Commit 1dd288f

Author: zhangyue
test(conftest): joint (device, implementation_index) parametrize
Replaces the per-test `@pytest.mark.parametrize("implementation_index", ...)` + runtime `if impl not in active_indices: skip` pattern with a single hook in `conftest.pytest_generate_tests` that emits only the (device, impl) pairs actually active on each device.

Rationale: kernel dispatch is per-device, so the previous cross-device union (the `all_active_implementation_indices` helper) polluted the matrix with impls the selected device can't run, leaving runtime-skipped noise. Joint generation keeps the matrix to its semantic cell: "this device has this impl, so run it".

- `tests/conftest.py`: when both `device` and `implementation_index` are in fixturenames, emit pairs via `op_cls.active_implementation_indices(dev)`; fall back to a skipped placeholder (`id="skip"`) when no device has an active impl, avoiding `[NOTSET-...]` test IDs.
- `tests/{test_add,test_gemm,test_rms_norm,test_swiglu}.py`: drop the hardcoded `implementation_index` parametrize decorator and the runtime `active_indices` guard; conftest now handles both.
- `tests/utils.py`: remove the `all_active_implementation_indices` helper (superseded by per-device generation in conftest).

Same test outcome on Ascend CI (1935 passed / 1686 skipped), but the remaining skips are now semantically mandatory (uint dtypes unsupported by `torch_npu`, the Gemm impl=2 SFINAE-only workaround, ops missing an ascend impl on op-simple pending PR #66) rather than mechanism artifacts.
1 parent 9d7cb0e commit 1dd288f

6 files changed: 62 additions & 76 deletions
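The mechanism in one self-contained sketch (the `FAKE_ACTIVE` table is an illustrative stand-in for `op_cls.active_implementation_indices(device)`, not the repo's API):

# Minimal sketch of joint (device, implementation_index) generation.
# FAKE_ACTIVE stands in for op_cls.active_implementation_indices(device).
FAKE_ACTIVE = {"cpu": (0, 1), "npu": (0,)}


def pytest_generate_tests(metafunc):
    if {"device", "implementation_index"} <= set(metafunc.fixturenames):
        # Emit only runnable cells: (npu, 1) is never generated, so it
        # can never show up as a runtime skip.
        pairs = [(dev, idx) for dev in FAKE_ACTIVE for idx in FAKE_ACTIVE[dev]]
        metafunc.parametrize("device, implementation_index", pairs)

A test that declares both fixture names then runs once per pair: `[cpu-0]`, `[cpu-1]`, `[npu-0]`.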


tests/conftest.py

Lines changed: 59 additions & 1 deletion
@@ -191,7 +191,65 @@ def pytest_generate_tests(metafunc):
     else:
         devices = ()
 
-    metafunc.parametrize("device", devices or available)
+    devices = devices or available
+
+    # Joint `(device, implementation_index)` parametrize: generate only
+    # pairs where the op has an active implementation on that device.
+    # Avoids cross-device pollution — an impl active on `cpu` but not on
+    # `npu` no longer appears as a runtime skip in the npu column.
+    if (
+        "implementation_index" in metafunc.fixturenames
+        and "implementation_index" not in already_parametrized
+    ):
+        op_cls = _op_class_from_module(metafunc.module)
+
+        if op_cls is not None and hasattr(op_cls, "active_implementation_indices"):
+            pairs = [
+                (dev, idx)
+                for dev in devices
+                for idx in op_cls.active_implementation_indices(dev)
+            ]
+
+            if not pairs:
+                # Emit one skipped placeholder so test IDs read
+                # `[skip-dtype0-...]` instead of `[NOTSET-...]`.
+                pairs = [
+                    pytest.param(
+                        devices[0] if devices else "cpu",
+                        0,
+                        marks=pytest.mark.skip(
+                            reason=(
+                                f"{op_cls.__name__} has no active "
+                                "implementation on any available device"
+                            )
+                        ),
+                        id="skip",
+                    )
+                ]
+
+            metafunc.parametrize("device, implementation_index", pairs)
+
+            return
+
+    metafunc.parametrize("device", devices)
+
+
+def _op_class_from_module(module):
+    """Derive the `infini.ops.<Op>` class from a `tests/test_<snake>.py` module."""
+    module_name = module.__name__.rsplit(".", 1)[-1]
+
+    if not module_name.startswith("test_"):
+        return None
+
+    op_snake = module_name[len("test_") :]
+    op_pascal = "".join(part.capitalize() for part in op_snake.split("_"))
+
+    try:
+        import infini.ops as _ops
+    except ImportError:
+        return None
+
+    return getattr(_ops, op_pascal, None)
 
 
 @pytest.hookimpl(tryfirst=True)
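The snake-to-Pascal derivation in `_op_class_from_module` can be sanity-checked outside pytest; a standalone snippet reproducing just that transform for the four modules touched here:

# Reproduces the module-name -> op-class-name transform from
# _op_class_from_module, without importing infini.
def op_name_from_module(module_name):
    op_snake = module_name[len("test_"):]
    return "".join(part.capitalize() for part in op_snake.split("_"))


assert op_name_from_module("test_add") == "Add"
assert op_name_from_module("test_gemm") == "Gemm"
assert op_name_from_module("test_rms_norm") == "RmsNorm"
assert op_name_from_module("test_swiglu") == "Swiglu"

Note that `str.capitalize` lowercases the rest of each part, so a hypothetical `test_fft_2d` would map to `Fft2d`; the four current names round-trip cleanly.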

tests/test_add.py

Lines changed: 0 additions & 9 deletions
@@ -4,7 +4,6 @@
 
 from tests.utils import (
     Payload,
-    all_active_implementation_indices,
     empty_strided,
     get_stream,
     randint_strided,
@@ -36,9 +35,6 @@
         ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
     ),
 )
-@pytest.mark.parametrize(
-    "implementation_index", all_active_implementation_indices(infini.ops.Add)
-)
 @pytest.mark.parametrize(
     ("dtype", "rtol", "atol"),
     (
@@ -64,11 +60,6 @@ def test_add(
             "The `torch.musa` test cloning path does not support `uint16`, `uint32`, or `uint64`."
         )
 
-    active_indices = infini.ops.Add.active_implementation_indices(device)
-
-    if implementation_index not in active_indices:
-        pytest.skip(f"implementation `{implementation_index}` not active on `{device}`")
-
     if implementation_index == 1 and dtype in _UINT_DTYPES:
         pytest.skip("ATen `add` does not support unsigned integer types")
 
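The same deletion repeats in the three files below: each test body sheds the five-line guard and one decorator. Hypothetically trimmed, a test now just declares the two fixture names (real signatures keep their other parametrized arguments):

# Hypothetical trimmed shape: `device` and `implementation_index` arrive
# as a jointly generated pair that is already known to be runnable, so
# no `pytest.skip` guard for inactive implementations is needed.
def test_add(device, implementation_index, dtype, rtol, atol):
    ...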

tests/test_gemm.py

Lines changed: 1 addition & 14 deletions
@@ -2,12 +2,7 @@
 import pytest
 import torch
 
-from tests.utils import (
-    Payload,
-    all_active_implementation_indices,
-    get_stream,
-    randn_strided,
-)
+from tests.utils import Payload, get_stream, randn_strided
 
 
 @pytest.mark.auto_act_and_assert
@@ -25,9 +20,6 @@
 @pytest.mark.parametrize("beta", (-1, -0.5, 0, 0.5, 1))
 @pytest.mark.parametrize("trans_a", (False, True))
 @pytest.mark.parametrize("trans_b", (False, True))
-@pytest.mark.parametrize(
-    "implementation_index", all_active_implementation_indices(infini.ops.Gemm)
-)
 @pytest.mark.parametrize(
     ("dtype", "rtol", "atol"),
     (
@@ -61,11 +53,6 @@ def test_gemm(
     if device == "mlu" and dtype == torch.bfloat16:
         pytest.skip("`bfloat16` is not supported by `cnnlBatchMatMulEx`")
 
-    active_indices = infini.ops.Gemm.active_implementation_indices(device)
-
-    if implementation_index not in active_indices:
-        pytest.skip(f"implementation `{implementation_index}` not active on `{device}`")
-
     if implementation_index == 1 and dtype in (torch.float16, torch.bfloat16):
         pytest.skip("cuBLASLt half-precision exceeds current tolerances")
 

tests/test_rms_norm.py

Lines changed: 1 addition & 15 deletions
@@ -2,13 +2,7 @@
 import pytest
 import torch
 
-from tests.utils import (
-    Payload,
-    all_active_implementation_indices,
-    empty_strided,
-    get_stream,
-    randn_strided,
-)
+from tests.utils import Payload, empty_strided, get_stream, randn_strided
 
 
 @pytest.mark.auto_act_and_assert
@@ -24,9 +18,6 @@
     ),
 )
 @pytest.mark.parametrize("eps", (1e-6, 1e-5))
-@pytest.mark.parametrize(
-    "implementation_index", all_active_implementation_indices(infini.ops.RmsNorm)
-)
 @pytest.mark.parametrize(
     ("dtype", "rtol", "atol"),
     (
@@ -48,11 +39,6 @@ def test_rms_norm(
     rtol,
     atol,
 ):
-    active_indices = infini.ops.RmsNorm.active_implementation_indices(device)
-
-    if implementation_index not in active_indices:
-        pytest.skip(f"implementation `{implementation_index}` not active on `{device}`")
-
     input = randn_strided(input_shape, input_strides, dtype=dtype, device=device)
     weight = randn_strided(weight_shape, weight_strides, dtype=dtype, device=device)
     out = empty_strided(input_shape, out_strides, dtype=dtype, device=device)

tests/test_swiglu.py

Lines changed: 1 addition & 15 deletions
@@ -2,13 +2,7 @@
 import pytest
 import torch
 
-from tests.utils import (
-    Payload,
-    all_active_implementation_indices,
-    empty_strided,
-    get_stream,
-    rand_strided,
-)
+from tests.utils import Payload, empty_strided, get_stream, rand_strided
 
 
 @pytest.mark.auto_act_and_assert
@@ -25,9 +19,6 @@
         ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
     ),
 )
-@pytest.mark.parametrize(
-    "implementation_index", all_active_implementation_indices(infini.ops.Swiglu)
-)
 @pytest.mark.parametrize(
     ("dtype", "rtol", "atol"),
     (
@@ -47,11 +38,6 @@ def test_swiglu(
     rtol,
    atol,
 ):
-    active_indices = infini.ops.Swiglu.active_implementation_indices(device)
-
-    if implementation_index not in active_indices:
-        pytest.skip(f"implementation `{implementation_index}` not active on `{device}`")
-
     input = rand_strided(shape, input_strides, dtype=dtype, device=device)
     gate = rand_strided(shape, gate_strides, dtype=dtype, device=device)
     out = empty_strided(shape, out_strides, dtype=dtype, device=device)

tests/utils.py

Lines changed: 0 additions & 22 deletions
@@ -122,28 +122,6 @@ def get_stream(device):
     return getattr(stream, attr, 0)
 
 
-def all_active_implementation_indices(op_cls):
-    """Union of `op_cls.active_implementation_indices(device)` across every
-    locally-available torch device type.
-
-    Use as the `@pytest.mark.parametrize("implementation_index", ...)` value so
-    the test matrix grows automatically when a new backend implementation is
-    added. Per-device filtering (skipping indices not active on the currently
-    selected device) stays the test body's responsibility — see the `skip`
-    pattern in `test_gemm.py`.
-
-    Limited to `get_available_devices()` to avoid `DispatchFunc::std::abort`
-    for device types outside the build's `ActiveDevices` set (e.g., querying
-    `"cuda"` on an Ascend-only build).
-    """
-    indices = set()
-
-    for device in get_available_devices():
-        indices.update(op_cls.active_implementation_indices(device))
-
-    return tuple(sorted(indices))
-
-
 def clone_strided(input):
     output = empty_strided(
         input.size(), input.stride(), dtype=input.dtype, device=input.device
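To make the removed helper's cross-device pollution concrete, a toy calculation (index sets invented for illustration):

# Invented per-device active-index sets.
active = {"cpu": {0, 1, 2}, "npu": {0, 2}}

# Old scheme: one cross-device union fed every device's parametrize
# list, so the dead cell (npu, 1) existed and had to be runtime-skipped.
union = sorted(set().union(*active.values()))  # [0, 1, 2]
old_cells = [(dev, idx) for dev in active for idx in union]  # 6 cells

# New scheme: joint generation never creates the dead cell.
new_cells = [(dev, idx) for dev in active for idx in sorted(active[dev])]

assert ("npu", 1) in old_cells
assert ("npu", 1) not in new_cells  # 5 cells, all runnable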
