import infini.ops
import pytest
import torch

from tests.utils import Payload, empty_strided, get_npu_stream, randn_strided


@pytest.mark.auto_act_and_assert
@pytest.mark.parametrize(
    "shape, strides",
    (
        ((1, 64), None),
        ((2, 128), None),
        ((4, 48, 64), None),
        ((2, 4, 2048), None),
        # Explicitly specified strides (these match the contiguous layout).
        ((1, 64), (64, 1)),
        ((4, 48, 64), (3072, 64, 1)),
    ),
)
@pytest.mark.parametrize("eps", (1e-6, 1e-5))
@pytest.mark.parametrize("implementation_index", (0, 1))
@pytest.mark.parametrize(
    ("dtype", "rtol", "atol"),
    (
        # Looser tolerances for the reduced-precision dtypes.
        (torch.float32, 1e-4, 1e-4),
        (torch.float16, 1e-2, 1e-2),
        (torch.bfloat16, 2e-2, 1e-2),
    ),
)
def test_add_rms_norm(
    shape,
    strides,
    eps,
    implementation_index,
    dtype,
    device,
    rtol,
    atol,
):
    active_indices = infini.ops.AddRmsNorm.active_implementation_indices(device)

    if implementation_index not in active_indices:
        pytest.skip(f"implementation `{implementation_index}` not active on `{device}`")

    # gamma spans only the last (normalized) dimension.
    weight_shape = (shape[-1],)
    x1 = randn_strided(shape, strides, dtype=dtype, device=device)
    x2 = randn_strided(shape, strides, dtype=dtype, device=device)
    gamma = randn_strided(weight_shape, None, dtype=dtype, device=device)
    # Preallocated outputs: y_out receives the normalized result, x_out the raw sum.
    y_out = empty_strided(shape, strides, dtype=dtype, device=device)
    x_out = empty_strided(shape, strides, dtype=dtype, device=device)

    return Payload(
        lambda *args, **kwargs: _add_rms_norm(
            *args, **kwargs, implementation_index=implementation_index
        ),
        _torch_add_rms_norm,
        (x1, x2, gamma),
        {"eps": eps, "y_out": y_out, "x_out": x_out},
        rtol=rtol,
        atol=atol,
    )
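

# How the returned Payload is consumed is up to the `auto_act_and_assert`
# hook provided by the harness (see `tests.utils` / conftest). A hypothetical
# sketch of what such a hook might do; the attribute names below are
# assumptions for illustration, not the actual implementation:
#
#   actual = payload.act(*payload.args, **payload.kwargs)
#   expected = payload.ref(*payload.args, **payload.kwargs)
#   torch.testing.assert_close(actual, expected, rtol=payload.rtol, atol=payload.atol)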


def _add_rms_norm(
    x1, x2, gamma, *, eps=1e-6, y_out=None, x_out=None, implementation_index=0
):
    # The NPU backend requires an explicit stream; other devices use defaults.
    if x1.device.type == "npu":
        infini.ops.add_rms_norm(
            x1, x2, gamma, eps, y_out, x_out,
            implementation_index=implementation_index,
            stream=get_npu_stream(x1),
        )
    else:
        infini.ops.add_rms_norm(
            x1, x2, gamma, eps, y_out, x_out,
            implementation_index=implementation_index,
        )

    # Concatenate both outputs into a single flat tensor so the assert hook can
    # compare y and x against the reference in a single allclose call.
    return torch.cat([y_out.contiguous().flatten(), x_out.contiguous().flatten()])


def _torch_add_rms_norm(x1, x2, gamma, *, eps=1e-6, y_out=None, x_out=None):
    """Reference: x = x1 + x2; y = x / sqrt(mean(x**2, dim=-1) + eps) * gamma."""
    x_sum = x1 + x2

    if x_out is not None:
        x_out.copy_(x_sum)

    # Accumulate in float32 for numerical stability, then cast back to the
    # input dtype.
    rms = torch.sqrt(
        torch.mean(x_sum.float() * x_sum.float(), dim=-1, keepdim=True) + eps
    )
    y = (x_sum.float() / rms * gamma.float()).to(x1.dtype)

    if y_out is not None:
        y_out.copy_(y)

    # Mirror _add_rms_norm's return layout; fall back to the freshly computed
    # tensors so the optional output arguments are genuinely optional.
    y_res = y if y_out is None else y_out
    x_res = x_sum if x_out is None else x_out
    return torch.cat([y_res.contiguous().flatten(), x_res.contiguous().flatten()])
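

# Standalone sanity check for the reference implementation above: a minimal
# sketch, assuming PyTorch >= 2.4 (where torch.nn.functional.rms_norm exists).
# It is not part of the parametrized suite and runs only when this module is
# executed directly.
if __name__ == "__main__":
    import torch.nn.functional as F

    x1, x2 = torch.randn(2, 64), torch.randn(2, 64)
    gamma = torch.randn(64)
    y_out, x_out = torch.empty(2, 64), torch.empty(2, 64)

    _torch_add_rms_norm(x1, x2, gamma, eps=1e-6, y_out=y_out, x_out=x_out)

    # y_out should agree with torch's built-in RMSNorm applied to x1 + x2, and
    # x_out should hold the raw elementwise sum.
    expected = F.rms_norm(x1 + x2, (64,), gamma, eps=1e-6)
    torch.testing.assert_close(y_out, expected, rtol=1e-5, atol=1e-5)
    torch.testing.assert_close(x_out, x1 + x2)
    print("reference matches torch.nn.functional.rms_norm")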