diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index e66df4659..23d2457a8 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -1,5 +1,8 @@
 #pragma once
 
-#include "op/matmul.hpp"
-#include "op/ones.hpp"
-#include "op/rearrange.hpp"
+#include "ops/add.hpp"
+#include "ops/attention.hpp"
+#include "ops/matmul.hpp"
+#include "ops/ones.hpp"
+#include "ops/rearrange.hpp"
+#include "ops/rms_norm.hpp"
diff --git a/include/infinicore/ops/add.hpp b/include/infinicore/ops/add.hpp
new file mode 100644
index 000000000..1dd5df0ff
--- /dev/null
+++ b/include/infinicore/ops/add.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Add {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor);
+    static void execute(Tensor c, Tensor a, Tensor b);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor add(Tensor a, Tensor b);
+void add_(Tensor c, Tensor a, Tensor b);
+Tensor operator+(Tensor a, Tensor b);
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/attention.hpp b/include/infinicore/ops/attention.hpp
new file mode 100644
index 000000000..1bc447c77
--- /dev/null
+++ b/include/infinicore/ops/attention.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Attention {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, size_t);
+    static void execute(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor attention(Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos);
+void attention_(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos);
+} // namespace infinicore::op
diff --git a/include/infinicore/op/common/cache.hpp b/include/infinicore/ops/common/cache.hpp
similarity index 100%
rename from include/infinicore/op/common/cache.hpp
rename to include/infinicore/ops/common/cache.hpp
diff --git a/include/infinicore/op/common/dispatcher.hpp b/include/infinicore/ops/common/dispatcher.hpp
similarity index 86%
rename from include/infinicore/op/common/dispatcher.hpp
rename to include/infinicore/ops/common/dispatcher.hpp
index 092cc099b..d0bf93f0f 100644
--- a/include/infinicore/op/common/dispatcher.hpp
+++ b/include/infinicore/ops/common/dispatcher.hpp
@@ -8,19 +8,19 @@ namespace infinicore::op::common {
 template <typename Fn>
 class OpDispatcher {
 public:
-    void registerDevice(Device::Type device_type, Fn fn, bool override_existing=true) {
-        if (table_[(size_t)device_type] == nullptr || override_existing){
+    void registerDevice(Device::Type device_type, Fn fn, bool override_existing = true) {
+        if (table_[(size_t)device_type] == nullptr || override_existing) {
             table_[(size_t)device_type] = fn;
         }
     }
 
-    void registerDevice(std::initializer_list<Device::Type> device_types, Fn fn, bool override_existing=true) {
+    void registerDevice(std::initializer_list<Device::Type> device_types, Fn fn, bool override_existing = true) {
         for (auto device_type : device_types) {
             registerDevice(device_type, fn, override_existing);
         }
     }
 
-    void registerAll(Fn fn, bool override_existing=true) {
+    void registerAll(Fn fn, bool override_existing = true) {
         for (size_t device_type = 0; device_type < static_cast<size_t>(Device::Type::COUNT); ++device_type) {
             registerDevice((Device::Type)device_type, fn, override_existing);
         }
diff --git a/include/infinicore/op/common/op.hpp b/include/infinicore/ops/common/op.hpp
similarity index 100%
rename from include/infinicore/op/common/op.hpp
rename to include/infinicore/ops/common/op.hpp
diff --git a/include/infinicore/op/matmul.hpp b/include/infinicore/ops/matmul.hpp
similarity index 100%
rename from include/infinicore/op/matmul.hpp
rename to include/infinicore/ops/matmul.hpp
diff --git a/include/infinicore/op/ones.hpp b/include/infinicore/ops/ones.hpp
similarity index 100%
rename from include/infinicore/op/ones.hpp
rename to include/infinicore/ops/ones.hpp
diff --git a/include/infinicore/op/rearrange.hpp b/include/infinicore/ops/rearrange.hpp
similarity index 100%
rename from include/infinicore/op/rearrange.hpp
rename to include/infinicore/ops/rearrange.hpp
diff --git a/include/infinicore/ops/rms_norm.hpp b/include/infinicore/ops/rms_norm.hpp
new file mode 100644
index 000000000..1212c446e
--- /dev/null
+++ b/include/infinicore/ops/rms_norm.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class RMSNorm {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, float);
+    static void execute(Tensor y, Tensor x, Tensor weight, float epsilon = 1e-5f);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor rms_norm(Tensor x, Tensor weight, float epsilon = 1e-5f);
+void rms_norm_(Tensor y, Tensor x, Tensor weight, float epsilon = 1e-5f);
+} // namespace infinicore::op
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 961806b15..4757d7f29 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -25,8 +25,11 @@
     uint8,
 )
 from infinicore.ntops import use_ntops
+from infinicore.ops.add import add
+from infinicore.ops.attention import attention
 from infinicore.ops.matmul import matmul
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.rms_norm import rms_norm
 from infinicore.tensor import (
     empty,
     from_blob,
@@ -66,8 +69,11 @@
     # `ntops` integration.
     "use_ntops",
     # Operations.
+    "add",
+    "attention",
     "matmul",
     "rearrange",
+    "rms_norm",
     "empty",
     "from_blob",
     "ones",
diff --git a/python/infinicore/dtype.py b/python/infinicore/dtype.py
index ec06f5d7f..a323471c2 100644
--- a/python/infinicore/dtype.py
+++ b/python/infinicore/dtype.py
@@ -4,7 +4,6 @@
 class dtype:
     def __init__(self, data_type):
         """An internal method. Please do not use this directly."""
-
         self._underlying = data_type
 
     def __repr__(self):
@@ -29,9 +28,31 @@ def __repr__(self):
             _infinicore.DataType.C128: "complex128",
             _infinicore.DataType.BF16: "bfloat16",
         }
-
         return f"infinicore.{repr_map[self._underlying]}"
 
+    def __eq__(self, other):
+        """
+        Compare two dtype objects for equality.
+
+        Args:
+            other: The object to compare with
+
+        Returns:
+            bool: True if both objects are dtype instances with the same underlying data type
+        """
+        if not isinstance(other, dtype):
+            return False
+        return self._underlying == other._underlying
+
+    def __hash__(self):
+        """
+        Return a hash value for the dtype object.
+
+        Returns:
+            int: Hash value based on the underlying data type
+        """
+        return hash(self._underlying)
+
 
 float32 = dtype(_infinicore.DataType.F32)
 float = float32
diff --git a/python/infinicore/ops/add.py b/python/infinicore/ops/add.py
new file mode 100644
index 000000000..239c7c6fd
--- /dev/null
+++ b/python/infinicore/ops/add.py
@@ -0,0 +1,9 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def add(input, other, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.add(input._underlying, other._underlying))
+
+    _infinicore.add_(out._underlying, input._underlying, other._underlying)
diff --git a/python/infinicore/ops/attention.py b/python/infinicore/ops/attention.py
new file mode 100644
index 000000000..70e3d913a
--- /dev/null
+++ b/python/infinicore/ops/attention.py
@@ -0,0 +1,26 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def attention(q, k, v, k_cache, v_cache, pos, *, out=None):
+    if out is None:
+        return Tensor(
+            _infinicore.attention(
+                q._underlying,
+                k._underlying,
+                v._underlying,
+                k_cache._underlying,
+                v_cache._underlying,
+                pos,
+            )
+        )
+
+    _infinicore.attention_(
+        out._underlying,
+        q._underlying,
+        k._underlying,
+        v._underlying,
+        k_cache._underlying,
+        v_cache._underlying,
+        pos,
+    )
diff --git a/python/infinicore/ops/rms_norm.py b/python/infinicore/ops/rms_norm.py
new file mode 100644
index 000000000..6e8f788ee
--- /dev/null
+++ b/python/infinicore/ops/rms_norm.py
@@ -0,0 +1,13 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def rms_norm(input, weight, epsilon=1e-5, *, out=None):
+    if out is None:
+        return Tensor(
+            _infinicore.rms_norm(input._underlying, weight._underlying, epsilon)
+        )
+
+    _infinicore.rms_norm_(
+        out._underlying, input._underlying, weight._underlying, epsilon
+    )
diff --git a/src/infinicore/ops/add/add.cc b/src/infinicore/ops/add/add.cc
new file mode 100644
index 000000000..4962e9dd8
--- /dev/null
+++ b/src/infinicore/ops/add/add.cc
@@ -0,0 +1,24 @@
+#include "infinicore/ops/add.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Add::schema> &Add::dispatcher() {
+    static common::OpDispatcher<Add::schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Add::execute(Tensor c, Tensor a, Tensor b) {
+    dispatcher().lookup(context::getDevice().getType())(c, a, b);
+}
+
+Tensor add(Tensor a, Tensor b) {
+    auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
+    add_(c, a, b);
+    return c;
+}
+
+void add_(Tensor c, Tensor a, Tensor b) {
+    Add::execute(c, a, b);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/add/add_infiniop.cc b/src/infinicore/ops/add/add_infiniop.cc
new file mode 100644
index 000000000..e034b94de
--- /dev/null
+++ b/src/infinicore/ops/add/add_infiniop.cc
@@ -0,0 +1,52 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/add.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::add_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopAddDescriptor_t> caches(
+    100, // capacity
+    [](infiniopAddDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyAddDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor c, Tensor a, Tensor b) {
+    size_t seed = hash_combine(c, b, a);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopAddDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateAddDescriptor(
+            context::getInfiniopHandle(), &desc,
+            c->desc(), a->desc(), b->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAddWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopAdd(
+        desc, workspace->data(), workspace_size,
+        c->data(), a->data(), b->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    Add::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::add_impl::infiniop
diff --git a/src/infinicore/ops/attention/attention.cc b/src/infinicore/ops/attention/attention.cc
new file mode 100644
index 000000000..bf4fd8203
--- /dev/null
+++ b/src/infinicore/ops/attention/attention.cc
@@ -0,0 +1,28 @@
+#include "infinicore/ops/attention.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Attention::schema> &Attention::dispatcher() {
+    static common::OpDispatcher<Attention::schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Attention::execute(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
+    dispatcher().lookup(context::getDevice().getType())(out, q, k, v, k_cache, v_cache, pos);
+}
+
+Tensor attention(Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
+    size_t n_q_head = q->shape()[0];
+    size_t seq_len = q->shape()[1];
+    size_t head_dim = q->shape()[2];
+    Shape shape = {seq_len, n_q_head, head_dim};
+    auto out = Tensor::empty(shape, q->dtype(), q->device());
+    attention_(out, q, k, v, k_cache, v_cache, pos);
+    return out;
+}
+
+void attention_(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
+    Attention::execute(out, q, k, v, k_cache, v_cache, pos);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/attention/attention_infiniop.cc b/src/infinicore/ops/attention/attention_infiniop.cc
new file mode 100644
index 000000000..816cd884c
--- /dev/null
+++ b/src/infinicore/ops/attention/attention_infiniop.cc
@@ -0,0 +1,54 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/attention.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::attention_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopAttentionDescriptor_t> caches(
+    100, // capacity
+    [](infiniopAttentionDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyAttentionDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
+    size_t seed = hash_combine(out, q, k, v, k_cache, v_cache, pos);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopAttentionDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateAttentionDescriptor(
+            context::getInfiniopHandle(), &desc,
+            out->desc(), q->desc(), k->desc(), v->desc(),
+            k_cache->desc(), v_cache->desc(), pos));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAttentionWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopAttention(
+        desc, workspace->data(), workspace_size,
+        out->data(), q->data(), k->data(), v->data(),
+        k_cache->data(), v_cache->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    Attention::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::attention_impl::infiniop
diff --git a/src/infinicore/op/matmul/matmul.cc b/src/infinicore/ops/matmul/matmul.cc
similarity index 94%
rename from src/infinicore/op/matmul/matmul.cc
rename to src/infinicore/ops/matmul/matmul.cc
index 33f5a157c..d04e98268 100644
--- a/src/infinicore/op/matmul/matmul.cc
+++ b/src/infinicore/ops/matmul/matmul.cc
@@ -1,4 +1,4 @@
-#include "infinicore/op/matmul.hpp"
+#include "infinicore/ops/matmul.hpp"
 
 namespace infinicore::op {
 
diff --git a/src/infinicore/op/matmul/matmul_infiniop.cc b/src/infinicore/ops/matmul/matmul_infiniop.cc
similarity index 85%
rename from src/infinicore/op/matmul/matmul_infiniop.cc
rename to src/infinicore/ops/matmul/matmul_infiniop.cc
index b68a4a243..3bd69c3f8 100644
--- a/src/infinicore/op/matmul/matmul_infiniop.cc
+++ b/src/infinicore/ops/matmul/matmul_infiniop.cc
@@ -1,7 +1,7 @@
 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
-#include "infinicore/op/common/cache.hpp"
-#include "infinicore/op/matmul.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/matmul.hpp"
 #include <infiniop.h>
 
 namespace infinicore::op::matmul_impl::infiniop {
@@ -27,7 +27,9 @@ void calculate(Tensor c, Tensor a, Tensor b) {
     infiniopGemmDescriptor_t desc = nullptr;
 
     if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateGemmDescriptor(context::getInfiniopHandle(), &desc, c->desc(), a->desc(), b->desc()));
+        INFINICORE_CHECK_ERROR(infiniopCreateGemmDescriptor(
+            context::getInfiniopHandle(), &desc,
+            c->desc(), a->desc(), b->desc()));
         cache.put(seed, desc);
     } else {
         desc = *desc_opt;
diff --git a/src/infinicore/op/ones/ones.cc b/src/infinicore/ops/ones/ones.cc
similarity index 87%
rename from src/infinicore/op/ones/ones.cc
rename to src/infinicore/ops/ones/ones.cc
index 9974202e5..c28403eaf 100644
--- a/src/infinicore/op/ones/ones.cc
+++ b/src/infinicore/ops/ones/ones.cc
@@ -1,4 +1,4 @@
-#include "infinicore/op/ones.hpp"
+#include "infinicore/ops/ones.hpp"
 
 namespace infinicore::op {
 
diff --git a/src/infinicore/op/rearrange/rearrange.cc b/src/infinicore/ops/rearrange/rearrange.cc
similarity index 93%
rename from src/infinicore/op/rearrange/rearrange.cc
rename to src/infinicore/ops/rearrange/rearrange.cc
index 808a8e442..fe9cb4e99 100644
--- a/src/infinicore/op/rearrange/rearrange.cc
+++ b/src/infinicore/ops/rearrange/rearrange.cc
@@ -1,4 +1,4 @@
-#include "infinicore/op/rearrange.hpp"
+#include "infinicore/ops/rearrange.hpp"
 
 namespace infinicore::op {
 
diff --git a/src/infinicore/op/rearrange/rearrange_infiniop.cc b/src/infinicore/ops/rearrange/rearrange_infiniop.cc
similarity index 94%
rename from src/infinicore/op/rearrange/rearrange_infiniop.cc
rename to src/infinicore/ops/rearrange/rearrange_infiniop.cc
index 8b9d15162..d0a02105b 100644
--- a/src/infinicore/op/rearrange/rearrange_infiniop.cc
+++ b/src/infinicore/ops/rearrange/rearrange_infiniop.cc
@@ -1,7 +1,7 @@
 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
-#include "infinicore/op/common/cache.hpp"
-#include "infinicore/op/rearrange.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/rearrange.hpp"
 #include <infiniop.h>
 
 namespace infinicore::op::rearrange_impl::infiniop {
diff --git a/src/infinicore/ops/rms_norm/rms_norm.cc b/src/infinicore/ops/rms_norm/rms_norm.cc
new file mode 100644
index 000000000..613608b0f
--- /dev/null
+++ b/src/infinicore/ops/rms_norm/rms_norm.cc
@@ -0,0 +1,24 @@
+#include "infinicore/ops/rms_norm.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<RMSNorm::schema> &RMSNorm::dispatcher() {
+    static common::OpDispatcher<RMSNorm::schema> dispatcher_;
+    return dispatcher_;
+};
+
+void RMSNorm::execute(Tensor y, Tensor x, Tensor weight, float epsilon) {
+    dispatcher().lookup(context::getDevice().getType())(y, x, weight, epsilon);
+}
+
+Tensor rms_norm(Tensor x, Tensor weight, float epsilon) {
+    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
+    rms_norm_(y, x, weight, epsilon);
+    return y;
+}
+
+void rms_norm_(Tensor y, Tensor x, Tensor weight, float epsilon) {
+    RMSNorm::execute(y, x, weight, epsilon);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/rms_norm/rms_norm_infiniop.cc b/src/infinicore/ops/rms_norm/rms_norm_infiniop.cc
new file mode 100644
index 000000000..3a4cdbefa
--- /dev/null
+++ b/src/infinicore/ops/rms_norm/rms_norm_infiniop.cc
@@ -0,0 +1,52 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/rms_norm.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::rms_norm_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopRMSNormDescriptor_t> caches(
+    100, // capacity
+    [](infiniopRMSNormDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyRMSNormDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor x, Tensor weight, float epsilon) {
+    size_t seed = hash_combine(y, x, weight, epsilon);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopRMSNormDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateRMSNormDescriptor(
+            context::getInfiniopHandle(), &desc,
+            y->desc(), x->desc(), weight->desc(), epsilon));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetRMSNormWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopRMSNorm(
+        desc, workspace->data(), workspace_size,
+        y->data(), x->data(), weight->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    RMSNorm::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::rms_norm_impl::infiniop
diff --git a/src/infinicore/pybind11/infinicore.cc b/src/infinicore/pybind11/infinicore.cc
index 31b159dfd..981c727d6 100644
--- a/src/infinicore/pybind11/infinicore.cc
+++ b/src/infinicore/pybind11/infinicore.cc
@@ -4,7 +4,7 @@
 #include "context.hpp"
 #include "device.hpp"
 #include "dtype.hpp"
-#include "op.hpp"
+#include "ops.hpp"
 #include "tensor.hpp"
 
 namespace infinicore {
@@ -13,7 +13,7 @@ PYBIND11_MODULE(_infinicore, m) {
     context::bind(m);
     device::bind(m);
     dtype::bind(m);
-    op::bind(m);
+    ops::bind(m);
     tensor::bind(m);
 }
 
diff --git a/src/infinicore/pybind11/op.hpp b/src/infinicore/pybind11/op.hpp
deleted file mode 100644
index 04d1160df..000000000
--- a/src/infinicore/pybind11/op.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-#include "op/matmul.hpp"
-#include "op/rearrange.hpp"
-
-namespace py = pybind11;
-
-namespace infinicore::op {
-
-inline void bind(py::module &m) {
-    bind_matmul(m);
-    bind_rearrange(m);
-}
-
-} // namespace infinicore::op
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
new file mode 100644
index 000000000..3cfef16ac
--- /dev/null
+++ b/src/infinicore/pybind11/ops.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "ops/add.hpp"
+#include "ops/attention.hpp"
+#include "ops/matmul.hpp"
+#include "ops/rearrange.hpp"
+#include "ops/rms_norm.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind(py::module &m) {
+    bind_add(m);
+    bind_attention(m);
+    bind_matmul(m);
+    bind_rearrange(m);
+    bind_rms_norm(m);
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/add.hpp b/src/infinicore/pybind11/ops/add.hpp
new file mode 100644
index 000000000..fe7ac852f
--- /dev/null
+++ b/src/infinicore/pybind11/ops/add.hpp
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/add.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_add(py::module &m) {
+    m.def("add",
+          &op::add,
+          py::arg("a"),
+          py::arg("b"),
+          R"doc(Addition of two tensors.)doc");
+
+    m.def("add_",
+          &op::add_,
+          py::arg("c"),
+          py::arg("a"),
+          py::arg("b"),
+          R"doc(In-place tensor addition.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/attention.hpp b/src/infinicore/pybind11/ops/attention.hpp
new file mode 100644
index 000000000..4af2d5f74
--- /dev/null
+++ b/src/infinicore/pybind11/ops/attention.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/attention.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_attention(py::module &m) {
+    m.def("attention",
+          &op::attention,
+          py::arg("q"),
+          py::arg("k"),
+          py::arg("v"),
+          py::arg("k_cache"),
+          py::arg("v_cache"),
+          py::arg("pos"),
+          R"doc(Attention mechanism with KV caching.
+
+Args:
+    q: Query tensor
+    k: Key tensor  
+    v: Value tensor
+    k_cache: Key cache tensor
+    v_cache: Value cache tensor
+    pos: Current position in the sequence
+
+Returns:
+    Output tensor from attention computation
+)doc");
+
+    m.def("attention_",
+          &op::Attention::execute,
+          py::arg("out"),
+          py::arg("q"),
+          py::arg("k"),
+          py::arg("v"),
+          py::arg("k_cache"),
+          py::arg("v_cache"),
+          py::arg("pos"),
+          R"doc(In-place attention mechanism with KV caching.
+
+Args:
+    out: Output tensor
+    q: Query tensor
+    k: Key tensor
+    v: Value tensor  
+    k_cache: Key cache tensor
+    v_cache: Value cache tensor
+    pos: Current position in the sequence
+)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/op/matmul.hpp b/src/infinicore/pybind11/ops/matmul.hpp
similarity index 82%
rename from src/infinicore/pybind11/op/matmul.hpp
rename to src/infinicore/pybind11/ops/matmul.hpp
index ad98021e4..13591c90b 100644
--- a/src/infinicore/pybind11/op/matmul.hpp
+++ b/src/infinicore/pybind11/ops/matmul.hpp
@@ -2,11 +2,11 @@
 
 #include <pybind11/pybind11.h>
 
-#include "infinicore/op/matmul.hpp"
+#include "infinicore/ops/matmul.hpp"
 
 namespace py = pybind11;
 
-namespace infinicore::op {
+namespace infinicore::ops {
 
 inline void bind_matmul(py::module &m) {
     m.def("matmul",
@@ -23,4 +23,4 @@ inline void bind_matmul(py::module &m) {
           R"doc(In-place matrix multiplication.)doc");
 }
 
-} // namespace infinicore::op
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/op/rearrange.hpp b/src/infinicore/pybind11/ops/rearrange.hpp
similarity index 80%
rename from src/infinicore/pybind11/op/rearrange.hpp
rename to src/infinicore/pybind11/ops/rearrange.hpp
index 94ac919bc..816b00079 100644
--- a/src/infinicore/pybind11/op/rearrange.hpp
+++ b/src/infinicore/pybind11/ops/rearrange.hpp
@@ -2,11 +2,11 @@
 
 #include <pybind11/pybind11.h>
 
-#include "infinicore/op/rearrange.hpp"
+#include "infinicore/ops/rearrange.hpp"
 
 namespace py = pybind11;
 
-namespace infinicore::op {
+namespace infinicore::ops {
 
 inline void bind_rearrange(py::module &m) {
     m.def("rearrange",
@@ -21,4 +21,4 @@ inline void bind_rearrange(py::module &m) {
           R"doc(In-place tensor rearrangement.)doc");
 }
 
-} // namespace infinicore::op
+} // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/rms_norm.hpp b/src/infinicore/pybind11/ops/rms_norm.hpp
new file mode 100644
index 000000000..1fd899c44
--- /dev/null
+++ b/src/infinicore/pybind11/ops/rms_norm.hpp
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/rms_norm.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_rms_norm(py::module &m) {
+    m.def("rms_norm",
+          &op::rms_norm,
+          py::arg("x"),
+          py::arg("weight"),
+          py::arg("epsilon") = 1e-5f,
+          R"doc(Root Mean Square Normalization.
+
+Args:
+    x: Input tensor
+    weight: Scale weights
+    epsilon: Small constant for numerical stability, default is 1e-5
+
+Returns:
+    Normalized tensor with same shape as input
+)doc");
+
+    m.def("rms_norm_",
+          &op::rms_norm_,
+          py::arg("y"),
+          py::arg("x"),
+          py::arg("weight"),
+          py::arg("epsilon") = 1e-5f,
+          R"doc(In-place Root Mean Square Normalization.
+
+Args:
+    y: Output tensor
+    x: Input tensor
+    weight: Scale weights
+    epsilon: Small constant for numerical stability, default is 1e-5
+)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/test/infinicore/framework/__init__.py b/test/infinicore/framework/__init__.py
index 6b74d43e1..87d32f25f 100644
--- a/test/infinicore/framework/__init__.py
+++ b/test/infinicore/framework/__init__.py
@@ -1,31 +1,46 @@
-from .base import TestConfig, TestRunner, TestCase
+# [file name]: __init__.py
+# [file content begin]
+from .base import TestConfig, TestRunner, TestCase, BaseOperatorTest
+from .tensor import TensorSpec, TensorInitializer
 from .utils import (
-    create_infinicore_tensor,
     compare_results,
+    create_test_comparator,
     debug,
     get_tolerance,
+    infinicore_tensor_from_torch,
     profile_operation,
     rearrange_tensor,
+    convert_infinicore_to_torch,
 )
 from .config import get_test_devices, get_args
 from .devices import InfiniDeviceEnum, InfiniDeviceNames, torch_device_map
 from .datatypes import to_torch_dtype, to_infinicore_dtype
+from .runner import GenericTestRunner
+from .templates import BinaryOperatorTest, UnaryOperatorTest
 
 __all__ = [
+    "TensorSpec",
+    "TensorInitializer",
     "TestConfig",
     "TestRunner",
     "TestCase",
-    "create_infinicore_tensor",
+    "BaseOperatorTest",
     "compare_results",
+    "create_test_comparator",
+    "convert_infinicore_to_torch",
     "debug",
+    "get_args",
+    "get_test_devices",
     "get_tolerance",
+    "infinicore_tensor_from_torch",
     "profile_operation",
     "rearrange_tensor",
-    "get_test_devices",
-    "get_args",
     "InfiniDeviceEnum",
     "InfiniDeviceNames",
     "torch_device_map",
     "to_torch_dtype",
     "to_infinicore_dtype",
+    "GenericTestRunner",
+    "BinaryOperatorTest",
+    "UnaryOperatorTest",
 ]
diff --git a/test/infinicore/framework/base.py b/test/infinicore/framework/base.py
index 174137efe..1ee4a5294 100644
--- a/test/infinicore/framework/base.py
+++ b/test/infinicore/framework/base.py
@@ -1,18 +1,90 @@
 import torch
 import infinicore
-from .devices import InfiniDeviceNames
-from .utils import synchronize_device
+
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any, Tuple, Union, Callable, Optional
+
+from .datatypes import to_torch_dtype, to_infinicore_dtype
+from .devices import InfiniDeviceNames, torch_device_map
+from .tensor import TensorSpec, TensorInitializer
+from .utils import (
+    create_test_comparator,
+    infinicore_tensor_from_torch,
+    profile_operation,
+    rearrange_tensor,
+    synchronize_device,
+)
 
 
 class TestCase:
-    """Base test case class"""
+    """Test case"""
+
+    OUT_OF_PLACE = "out_of_place"
+    IN_PLACE = "in_place"
+    BOTH = "both"
+
+    def __init__(self, operation_mode, inputs, output=None, **kwargs):
+        if operation_mode not in [self.IN_PLACE, self.OUT_OF_PLACE, self.BOTH]:
+            raise ValueError(f"Invalid operation_mode: {operation_mode}")
+
+        if operation_mode == self.IN_PLACE and output is None:
+            raise ValueError("IN_PLACE mode requires output specification")
+
+        self.operation_mode = operation_mode
+        self.inputs = []
+
+        for inp in inputs:
+            if isinstance(inp, (list, tuple)):
+                self.inputs.append(TensorSpec.from_tensor(inp))
+            elif isinstance(inp, TensorSpec):
+                self.inputs.append(inp)
+            else:
+                self.inputs.append(inp)
+
+        if isinstance(output, (list, tuple)):
+            self.output = TensorSpec.from_tensor(output)
+        else:
+            self.output = output
 
-    def __init__(self, *args, **kwargs):
-        self.args = args
         self.kwargs = kwargs
+        self.description = kwargs.pop("description", "")
 
     def __str__(self):
-        return f"TestCase{self.args}"
+        mode_str = self.operation_mode.upper()
+        input_strs = []
+        for inp in self.inputs:
+            if hasattr(inp, "is_scalar") and inp.is_scalar:
+                dtype_str = f", dtype={inp.dtype}" if inp.dtype else ""
+                input_strs.append(f"scalar({inp.value}{dtype_str})")
+            elif hasattr(inp, "shape"):
+                dtype_str = f", dtype={inp.dtype}" if inp.dtype else ""
+                init_str = (
+                    f", init={inp.init_mode}"
+                    if inp.init_mode != TensorInitializer.RANDOM
+                    else ""
+                )
+                if hasattr(inp, "is_contiguous") and not inp.is_contiguous:
+                    input_strs.append(f"strided_tensor{inp.shape}{dtype_str}{init_str}")
+                else:
+                    input_strs.append(f"tensor{inp.shape}{dtype_str}{init_str}")
+            else:
+                input_strs.append(str(inp))
+
+        base_str = f"TestCase(mode={mode_str}, inputs=[{', '.join(input_strs)}]"
+        if self.output:
+            dtype_str = f", dtype={self.output.dtype}" if self.output.dtype else ""
+            init_str = (
+                f", init={self.output.init_mode}"
+                if self.output.init_mode != TensorInitializer.RANDOM
+                else ""
+            )
+            base_str += f", output=tensor{self.output.shape}{dtype_str}{init_str}"
+        if self.kwargs:
+            base_str += f", kwargs={self.kwargs}"
+        if self.description:
+            base_str += f", desc='{self.description}'"
+        base_str += ")"
+        return base_str
 
 
 class TestConfig:
@@ -26,6 +98,7 @@ def __init__(
         bench=False,
         num_prerun=10,
         num_iterations=1000,
+        dtype_combinations=None,
     ):
         self.tensor_dtypes = tensor_dtypes
         self.tolerance_map = tolerance_map
@@ -33,6 +106,7 @@ def __init__(
         self.bench = bench
         self.num_prerun = num_prerun
         self.num_iterations = num_iterations
+        self.dtype_combinations = dtype_combinations
 
 
 class TestRunner:
@@ -41,45 +115,61 @@ class TestRunner:
     def __init__(self, test_cases, test_config):
         self.test_cases = test_cases
         self.config = test_config
-        self.failed_tests = []  # Track failures
+        self.failed_tests = []
 
-    def run_tests(self, devices, test_func):
-        """Run tests and track failures"""
+    def run_tests(self, devices, test_func, test_type="Test"):
         for device in devices:
             print(f"\n{'='*60}")
-            print(f"Testing on {InfiniDeviceNames[device]}")
+            print(f"Testing {test_type} on {InfiniDeviceNames[device]}")
             print(f"{'='*60}")
 
-            # filter unsupported data types
             tensor_dtypes = self._filter_tensor_dtypes_by_device(
                 device, self.config.tensor_dtypes
             )
 
             for test_case in self.test_cases:
-                for dtype in tensor_dtypes:
-                    try:
-                        test_func(device, test_case, dtype, self.config)
-                        print(f"✓ {test_case} with {dtype} passed")
-                    except Exception as e:
-                        error_msg = f"{test_case} with {dtype} on {InfiniDeviceNames[device]}: {e}"
-                        print(f"✗ {error_msg}")
-                        self.failed_tests.append(error_msg)
-                        if self.config.debug:
-                            raise
-
-        # Return whether any tests failed
+                if self.config.dtype_combinations:
+                    for dtype_combo in self.config.dtype_combinations:
+                        try:
+                            test_func(device, test_case, dtype_combo, self.config)
+                            combo_str = self._format_dtype_combo(dtype_combo)
+                            print(f"✓ {test_case} with {combo_str} passed")
+                        except Exception as e:
+                            combo_str = self._format_dtype_combo(dtype_combo)
+                            error_msg = f"{test_case} with {combo_str} on {InfiniDeviceNames[device]}: {e}"
+                            print(f"✗ {error_msg}")
+                            self.failed_tests.append(error_msg)
+                            if self.config.debug:
+                                raise
+                else:
+                    for dtype in tensor_dtypes:
+                        try:
+                            test_func(device, test_case, dtype, self.config)
+                            print(f"✓ {test_case} with {dtype} passed")
+                        except Exception as e:
+                            error_msg = f"{test_case} with {dtype} on {InfiniDeviceNames[device]}: {e}"
+                            print(f"✗ {error_msg}")
+                            self.failed_tests.append(error_msg)
+                            if self.config.debug:
+                                raise
+
         return len(self.failed_tests) == 0
 
+    def _format_dtype_combo(self, dtype_combo):
+        if isinstance(dtype_combo, dict):
+            return f"dtypes({dtype_combo})"
+        elif isinstance(dtype_combo, (list, tuple)):
+            return f"dtypes{tuple(dtype_combo)}"
+        else:
+            return str(dtype_combo)
+
     def _filter_tensor_dtypes_by_device(self, device, tensor_dtypes):
-        """Filter data types based on device"""
         if device in ():
-            # Filter out unsupported data types on specified devices
             return [dt for dt in tensor_dtypes if dt != infinicore.bfloat16]
         else:
             return tensor_dtypes
 
     def print_summary(self):
-        """Print test summary"""
         if self.failed_tests:
             print(f"\n\033[91m{len(self.failed_tests)} tests failed:\033[0m")
             for failure in self.failed_tests:
@@ -88,3 +178,246 @@ def print_summary(self):
         else:
             print("\n\033[92mAll tests passed!\033[0m")
             return True
+
+
+class BaseOperatorTest(ABC):
+    """Base operator test"""
+
+    def __init__(self, operator_name):
+        self.operator_name = operator_name
+        self.test_cases = self.get_test_cases()
+        self.tensor_dtypes = self.get_tensor_dtypes()
+        self.tolerance_map = self.get_tolerance_map()
+        self.dtype_combinations = self.get_dtype_combinations()
+
+    @abstractmethod
+    def get_test_cases(self):
+        """Return list of TestCase objects"""
+        pass
+
+    @abstractmethod
+    def get_tensor_dtypes(self):
+        """Return supported data types"""
+        pass
+
+    @abstractmethod
+    def get_tolerance_map(self):
+        """Return tolerance configuration"""
+        pass
+
+    def get_dtype_combinations(self):
+        """Return dtype combinations for mixed dtype tests"""
+        return None
+
+    @abstractmethod
+    def torch_operator(self, *inputs, out=None, **kwargs):
+        """Unified PyTorch operator function"""
+        pass
+
+    @abstractmethod
+    def infinicore_operator(self, *inputs, out=None, **kwargs):
+        """Unified Infinicore operator function"""
+        pass
+
+    def create_strided_tensor(
+        self, shape, strides, dtype, device, init_mode=TensorInitializer.RANDOM
+    ):
+        """Create a non-contiguous tensor with specific strides"""
+        spec = TensorSpec.from_strided_tensor(shape, strides, dtype, init_mode)
+        return spec.create_torch_tensor(device, dtype)
+
+    def prepare_inputs(self, test_case, device, dtype_config):
+        """Prepare input data"""
+        inputs = []
+
+        for i, input_spec in enumerate(test_case.inputs):
+            if isinstance(input_spec, TensorSpec):
+                if input_spec.is_scalar:
+                    inputs.append(input_spec.value)
+                else:
+                    tensor = input_spec.create_torch_tensor(device, dtype_config, i)
+                    inputs.append(tensor)
+            else:
+                inputs.append(input_spec)
+
+        return inputs, test_case.kwargs
+
+    def get_output_dtype(self, test_case, dtype_config, torch_result=None):
+        """Determine output dtype - returns infinicore dtype, not torch dtype"""
+        if test_case.output and test_case.output.dtype is not None:
+            return test_case.output.dtype
+        elif isinstance(dtype_config, dict) and "output" in dtype_config:
+            return dtype_config["output"]
+        elif torch_result is not None:
+            return to_infinicore_dtype(torch_result.dtype)
+        else:
+            if isinstance(dtype_config, (list, tuple)):
+                return dtype_config[0]
+            else:
+                return dtype_config
+
+    def run_test(self, device, test_case, dtype_config, config):
+        """Unified test execution flow"""
+        device_str = torch_device_map[device]
+
+        if test_case.operation_mode == TestCase.BOTH:
+            out_of_place_case = TestCase(
+                TestCase.OUT_OF_PLACE,
+                test_case.inputs,
+                test_case.output,
+                **test_case.kwargs,
+            )
+            self._run_single_test(
+                device, out_of_place_case, dtype_config, config, "OUT_OF_PLACE"
+            )
+
+            if test_case.output is not None:
+                in_place_case = TestCase(
+                    TestCase.IN_PLACE,
+                    test_case.inputs,
+                    test_case.output,
+                    **test_case.kwargs,
+                )
+                self._run_single_test(
+                    device, in_place_case, dtype_config, config, "IN_PLACE"
+                )
+            return
+
+        self._run_single_test(
+            device, test_case, dtype_config, config, test_case.operation_mode.upper()
+        )
+
+    def _run_single_test(self, device, test_case, dtype_config, config, mode_name):
+        """Run a single test with specified operation mode"""
+        device_str = torch_device_map[device]
+
+        inputs, kwargs = self.prepare_inputs(test_case, device, dtype_config)
+
+        infini_inputs = []
+        for inp in inputs:
+            if isinstance(inp, torch.Tensor):
+                infini_tensor = infinicore_tensor_from_torch(inp)
+                infini_inputs.append(infini_tensor)
+            else:
+                infini_inputs.append(inp)
+
+        if test_case.operation_mode == TestCase.OUT_OF_PLACE:
+
+            def torch_op():
+                return self.torch_operator(*inputs, **kwargs)
+
+            torch_result = torch_op()
+
+            if (
+                isinstance(torch_result, torch.Tensor)
+                and not torch_result.is_contiguous()
+            ):
+                torch_result = torch_result.contiguous()
+
+            def infini_op():
+                return self.infinicore_operator(*infini_inputs, **kwargs)
+
+            infini_result = infini_op()
+
+            # Get comparison dtype (infinicore dtype)
+            comparison_dtype = self.get_output_dtype(
+                test_case, dtype_config, torch_result
+            )
+
+            compare_fn = create_test_comparator(
+                config, comparison_dtype, mode_name=f"{self.operator_name} {mode_name}"
+            )
+            is_valid = compare_fn(infini_result, torch_result)
+            assert is_valid, f"{self.operator_name} {mode_name} test failed"
+
+            if config.bench:
+                profile_operation(
+                    f"PyTorch {self.operator_name} {mode_name}",
+                    torch_op,
+                    device_str,
+                    config.num_prerun,
+                    config.num_iterations,
+                )
+                profile_operation(
+                    f"Infinicore {self.operator_name} {mode_name}",
+                    infini_op,
+                    device_str,
+                    config.num_prerun,
+                    config.num_iterations,
+                )
+
+        else:
+            if not test_case.output:
+                raise ValueError("IN_PLACE test requires output specification")
+
+            # Get output dtype and create output tensor
+            output_dtype = self.get_output_dtype(test_case, dtype_config)
+            output_shape = test_case.output.shape
+
+            # Use TensorSpec to create output tensor with specified initialization mode
+            if test_case.output.is_contiguous or test_case.output.strides is None:
+                output_spec = TensorSpec.from_tensor(
+                    output_shape, output_dtype, init_mode=test_case.output.init_mode
+                )
+            else:
+                output_spec = TensorSpec.from_strided_tensor(
+                    output_shape,
+                    test_case.output.strides,
+                    output_dtype,
+                    init_mode=test_case.output.init_mode,
+                )
+
+            torch_output = output_spec.create_torch_tensor(device, output_dtype)
+
+            # For non-contiguous tensors, we need to ensure zeros initialization
+            if (
+                not test_case.output.is_contiguous
+                and test_case.output.strides is not None
+            ):
+                torch_output.zero_()
+
+            def torch_op_inplace():
+                self.torch_operator(*inputs, out=torch_output, **kwargs)
+
+            torch_op_inplace()
+
+            # Create infinicore output tensor
+            torch_dummy = torch.zeros(
+                output_shape, dtype=to_torch_dtype(output_dtype), device=device_str
+            )
+            if (
+                not test_case.output.is_contiguous
+                and not test_case.output.strides is None
+            ):
+                rearrange_tensor(torch_dummy, list(torch_output.stride()))
+            infini_output = infinicore_tensor_from_torch(torch_dummy)
+
+            def infini_op_inplace():
+                self.infinicore_operator(*infini_inputs, out=infini_output, **kwargs)
+
+            infini_op_inplace()
+
+            comparison_dtype = self.get_output_dtype(
+                test_case, dtype_config, torch_output
+            )
+            compare_fn = create_test_comparator(
+                config, comparison_dtype, mode_name=f"{self.operator_name} {mode_name}"
+            )
+            is_valid = compare_fn(infini_output, torch_output)
+            assert is_valid, f"{self.operator_name} {mode_name} test failed"
+
+            if config.bench:
+                profile_operation(
+                    f"PyTorch {self.operator_name} {mode_name}",
+                    torch_op_inplace,
+                    device_str,
+                    config.num_prerun,
+                    config.num_iterations,
+                )
+                profile_operation(
+                    f"Infinicore {self.operator_name} {mode_name}",
+                    infini_op_inplace,
+                    device_str,
+                    config.num_prerun,
+                    config.num_iterations,
+                )
diff --git a/test/infinicore/framework/runner.py b/test/infinicore/framework/runner.py
new file mode 100644
index 000000000..eb364daad
--- /dev/null
+++ b/test/infinicore/framework/runner.py
@@ -0,0 +1,49 @@
+"""
+Generic test runner that handles the common execution flow for all operators
+"""
+
+import sys
+from . import TestConfig, TestRunner, get_args, get_test_devices
+
+
+class GenericTestRunner:
+    """Generic test runner that handles the common execution flow"""
+
+    def __init__(self, operator_test_class):
+        """
+        Args:
+            operator_test_class: A class that implements BaseOperatorTest interface
+        """
+        self.operator_test = operator_test_class()
+        self.args = get_args()
+
+    def run(self):
+        """Execute the complete test suite"""
+        config = TestConfig(
+            tensor_dtypes=self.operator_test.tensor_dtypes,
+            tolerance_map=self.operator_test.tolerance_map,
+            debug=self.args.debug,
+            bench=self.args.bench,
+            num_prerun=self.args.num_prerun,
+            num_iterations=self.args.num_iterations,
+            dtype_combinations=self.operator_test.dtype_combinations,
+        )
+
+        runner = TestRunner(self.operator_test.test_cases, config)
+        devices = get_test_devices(self.args)
+
+        # Run unified tests
+        all_passed = runner.run_tests(
+            devices, self.operator_test.run_test, self.operator_test.operator_name
+        )
+
+        # Print summary
+        summary_passed = runner.print_summary()
+        all_passed = all_passed and summary_passed
+
+        return all_passed
+
+    def run_and_exit(self):
+        """Run tests and exit with appropriate status code"""
+        success = self.run()
+        sys.exit(0 if success else 1)
diff --git a/test/infinicore/framework/templates.py b/test/infinicore/framework/templates.py
new file mode 100644
index 000000000..367c6e108
--- /dev/null
+++ b/test/infinicore/framework/templates.py
@@ -0,0 +1,108 @@
+"""
+Templates for common operator patterns to minimize code duplication
+
+Available configuration methods in BaseOperatorTest:
+
+1. get_test_cases() -> List[TestCase]
+   - Define input/output shapes, strides, and operation modes
+   - Operation modes: TestCase.OUT_OF_PLACE, TestCase.IN_PLACE, TestCase.BOTH
+
+2. get_tensor_dtypes() -> List[infinicore.dtype]
+   - Define supported data types for single-dtype tests
+   - Used when dtype_combinations is None
+
+3. get_tolerance_map() -> Dict[infinicore.dtype, Dict[str, float]]
+   - Set tolerance (atol, rtol) for each data type
+   - Example: {infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}}
+
+4. get_dtype_combinations() -> Optional[List[Dict]]
+   - Define mixed dtype configurations for multi-dtype tests
+   - Return None for single-dtype tests
+
+5. torch_operator(*inputs, out=None, **kwargs) -> torch.Tensor
+   - Implement PyTorch reference implementation
+
+6. infinicore_operator(*inputs, out=None, **kwargs) -> infinicore.Tensor
+   - Implement Infinicore operator implementation
+
+New Tensor Initialization Modes:
+- TensorInitializer.RANDOM (default): Random values using torch.rand
+- TensorInitializer.ZEROS: All zeros using torch.zeros
+- TensorInitializer.ONES: All ones using torch.ones
+- TensorInitializer.RANDINT: Random integers using torch.randint
+- TensorInitializer.MANUAL: Use a pre-existing tensor with shape/strides validation
+- TensorInitializer.BINARY: Use a pre-existing tensor with shape validation only
+
+Usage examples in TestCase creation:
+- Basic: TensorSpec.from_tensor(shape)
+- With initialization: TensorSpec.from_tensor(shape, init_mode=TensorInitializer.ZEROS)
+- Strided with custom init: TensorSpec.from_strided_tensor(shape, strides, init_mode=TensorInitializer.ONES)
+"""
+
+import torch
+import infinicore
+from .base import BaseOperatorTest
+from .tensor import TensorSpec, TensorInitializer
+
+
+class BinaryOperatorTest(BaseOperatorTest):
+    """Template for binary operators (matmul, add, mul, etc.)"""
+
+    def __init__(self, operator_name, test_cases, tensor_dtypes, tolerance_map):
+        self._operator_name = operator_name
+        self._test_cases = test_cases
+        self._tensor_dtypes = tensor_dtypes
+        self._tolerance_map = tolerance_map
+        super().__init__(operator_name)
+
+    def get_test_cases(self):
+        return self._test_cases
+
+    def get_tensor_dtypes(self):
+        return self._tensor_dtypes
+
+    def get_tolerance_map(self):
+        return self._tolerance_map
+
+    def torch_operator(self, *inputs, **kwargs):
+        """Generic torch operator dispatch"""
+        # Support both functional and method calls
+        if hasattr(torch, self._operator_name):
+            op = getattr(torch, self._operator_name)
+        else:
+            # Fallback to common operator mappings
+            op_mapping = {
+                "matmul": torch.matmul,
+                "add": torch.add,
+                "mul": torch.mul,
+                "sub": torch.sub,
+                "div": torch.div,
+            }
+            op = op_mapping.get(self._operator_name)
+            if op is None:
+                raise NotImplementedError(
+                    f"Torch operator {self._operator_name} not implemented"
+                )
+
+        return op(*inputs, **kwargs)
+
+    def infinicore_operator(self, *inputs, **kwargs):
+        """Generic infinicore operator dispatch"""
+        op = getattr(infinicore, self._operator_name)
+        return op(*inputs, **kwargs)
+
+
+class UnaryOperatorTest(BinaryOperatorTest):
+    """Template for unary operators (exp, log, sin, etc.)"""
+
+    def torch_operator(self, *inputs, **kwargs):
+        # For unary operators, we only use the first input
+        if hasattr(torch, self._operator_name):
+            op = getattr(torch, self._operator_name)
+            return op(inputs[0], **kwargs)
+        else:
+            return super().torch_operator(*inputs, **kwargs)
+
+    def infinicore_operator(self, *inputs, **kwargs):
+        op = getattr(infinicore, self._operator_name)
+        return op(inputs[0], **kwargs)
diff --git a/test/infinicore/framework/tensor.py b/test/infinicore/framework/tensor.py
new file mode 100644
index 000000000..6aa5ca7b4
--- /dev/null
+++ b/test/infinicore/framework/tensor.py
@@ -0,0 +1,416 @@
+import torch
+from pathlib import Path
+from .datatypes import to_torch_dtype
+from .devices import torch_device_map
+
+
+class TensorInitializer:
+    """Tensor data initializer with multiple modes"""
+
+    RANDOM = "random"
+    ZEROS = "zeros"
+    ONES = "ones"
+    RANDINT = "randint"
+    MANUAL = "manual"
+    BINARY = "binary"
+    FROM_FILE = "from_file"
+
+    @staticmethod
+    def create_tensor(
+        shape, dtype, device, mode=RANDOM, strides=None, set_tensor=None, file_path=None
+    ):
+        """
+        Create a torch tensor with specified initialization mode
+
+        Args:
+            shape: Tensor shape
+            dtype: infinicore dtype
+            device: InfiniDeviceEnum
+            mode: Initialization mode
+            strides: Optional strides for strided tensors
+            set_tensor: Pre-existing tensor for manual/binary mode
+            file_path: Path to file for FROM_FILE mode
+
+        Returns:
+            torch.Tensor: Initialized tensor
+        """
+        # Convert InfiniDeviceEnum to torch device string
+        torch_device_str = torch_device_map[device]
+        torch_dtype = to_torch_dtype(dtype)
+
+        # Handle strided tensors - calculate required storage size
+        if strides is not None:
+            # Calculate the required storage size for strided tensor
+            storage_size = 0
+            for i in range(len(shape)):
+                if shape[i] > 0:
+                    storage_size += (shape[i] - 1) * abs(strides[i])
+            storage_size += 1  # Add 1 for the base element
+
+            # Create base storage with sufficient size
+            if mode == TensorInitializer.RANDOM:
+                base_tensor = torch.rand(
+                    storage_size, dtype=torch_dtype, device=torch_device_str
+                )
+            elif mode == TensorInitializer.ZEROS:
+                base_tensor = torch.zeros(
+                    storage_size, dtype=torch_dtype, device=torch_device_str
+                )
+            elif mode == TensorInitializer.ONES:
+                base_tensor = torch.ones(
+                    storage_size, dtype=torch_dtype, device=torch_device_str
+                )
+            elif mode == TensorInitializer.RANDINT:
+                base_tensor = torch.randint(
+                    -2000000000,
+                    2000000000,
+                    (storage_size,),
+                    dtype=torch_dtype,
+                    device=torch_device_str,
+                )
+            elif mode == TensorInitializer.MANUAL:
+                assert set_tensor is not None, "Manual mode requires set_tensor"
+                base_tensor = set_tensor.to(torch_dtype).to(torch_device_str)
+            elif mode == TensorInitializer.BINARY:
+                assert set_tensor is not None, "Binary mode requires set_tensor"
+                base_tensor = set_tensor.to(torch_dtype).to(torch_device_str)
+            elif mode == TensorInitializer.FROM_FILE:
+                base_tensor = TensorInitializer._load_from_file(
+                    file_path, storage_size, torch_dtype, torch_device_str
+                )
+            else:
+                raise ValueError(f"Unsupported initialization mode: {mode}")
+
+            # Create strided view
+            tensor = torch.as_strided(base_tensor, shape, strides)
+        else:
+            # Contiguous tensor
+            if mode == TensorInitializer.RANDOM:
+                tensor = torch.rand(shape, dtype=torch_dtype, device=torch_device_str)
+            elif mode == TensorInitializer.ZEROS:
+                tensor = torch.zeros(shape, dtype=torch_dtype, device=torch_device_str)
+            elif mode == TensorInitializer.ONES:
+                tensor = torch.ones(shape, dtype=torch_dtype, device=torch_device_str)
+            elif mode == TensorInitializer.RANDINT:
+                tensor = torch.randint(
+                    -2000000000,
+                    2000000000,
+                    shape,
+                    dtype=torch_dtype,
+                    device=torch_device_str,
+                )
+            elif mode == TensorInitializer.MANUAL:
+                assert set_tensor is not None, "Manual mode requires set_tensor"
+                assert shape == list(set_tensor.shape), "Shape mismatch in manual mode"
+                tensor = set_tensor.to(torch_dtype).to(torch_device_str)
+            elif mode == TensorInitializer.BINARY:
+                assert set_tensor is not None, "Binary mode requires set_tensor"
+                assert shape == list(set_tensor.shape), "Shape mismatch in binary mode"
+                tensor = set_tensor.to(torch_dtype).to(torch_device_str)
+            elif mode == TensorInitializer.FROM_FILE:
+                tensor = TensorInitializer._load_from_file(
+                    file_path, shape, torch_dtype, torch_device_str
+                )
+            else:
+                raise ValueError(f"Unsupported initialization mode: {mode}")
+
+        return tensor
+
+    @staticmethod
+    def _load_from_file(file_path, shape_or_size, torch_dtype, torch_device_str):
+        """
+        Load tensor data from file using PyTorch's native methods
+
+        Args:
+            file_path: Path to the file
+            shape_or_size: Tensor shape for contiguous or size for strided
+            torch_dtype: Target torch dtype
+            torch_device_str: Target device string
+
+        Returns:
+            torch.Tensor: Tensor with data loaded from file
+        """
+        if file_path is None:
+            raise ValueError("FROM_FILE mode requires file_path")
+
+        file_path = Path(file_path)
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        # Determine file type and load accordingly
+        file_extension = file_path.suffix.lower()
+
+        if file_extension in [".pt", ".pth"]:
+            # PyTorch native format
+            tensor = torch.load(file_path, map_location=torch_device_str)
+
+        elif file_extension in [".bin", ".dat", ".raw"]:
+            # Raw binary format - we need to know the expected shape
+            tensor = TensorInitializer._load_binary_file(
+                file_path, shape_or_size, torch_dtype, torch_device_str
+            )
+
+        elif file_extension in [".npy"]:
+            # NumPy format - fallback to numpy if needed
+            try:
+                import numpy as np
+
+                numpy_array = np.load(file_path)
+                tensor = (
+                    torch.from_numpy(numpy_array).to(torch_dtype).to(torch_device_str)
+                )
+            except ImportError:
+                raise ImportError("NumPy is required to load .npy files")
+
+        else:
+            # Try to load as PyTorch format first, then fallback to binary
+            try:
+                tensor = torch.load(file_path, map_location=torch_device_str)
+            except:
+                # Fallback to binary loading
+                tensor = TensorInitializer._load_binary_file(
+                    file_path, shape_or_size, torch_dtype, torch_device_str
+                )
+
+        # Ensure correct dtype and device
+        tensor = tensor.to(torch_dtype).to(torch_device_str)
+
+        # Validate shape/size
+        if isinstance(shape_or_size, (list, tuple)):
+            # Contiguous tensor - check shape
+            if list(tensor.shape) != list(shape_or_size):
+                raise ValueError(
+                    f"Tensor shape mismatch: expected {shape_or_size}, got {tensor.shape}"
+                )
+        else:
+            # Strided tensor - check total size
+            if tensor.numel() != shape_or_size:
+                raise ValueError(
+                    f"Tensor size mismatch: expected {shape_or_size} elements, got {tensor.numel()}"
+                )
+
+        return tensor
+
+    @staticmethod
+    def _load_binary_file(file_path, shape_or_size, torch_dtype, torch_device_str):
+        """
+        Load tensor from raw binary file
+
+        Args:
+            file_path: Path to binary file
+            shape_or_size: Expected shape or size
+            torch_dtype: Target dtype
+            torch_device_str: Target device
+
+        Returns:
+            torch.Tensor: Loaded tensor
+        """
+        # Read binary data
+        with open(file_path, "rb") as f:
+            binary_data = f.read()
+
+        # Create tensor from buffer
+        if isinstance(shape_or_size, (list, tuple)):
+            # Contiguous tensor with known shape
+            tensor = torch.frombuffer(binary_data, dtype=torch_dtype).reshape(
+                shape_or_size
+            )
+        else:
+            # Strided tensor - just 1D buffer
+            tensor = torch.frombuffer(binary_data, dtype=torch_dtype)
+
+        return tensor.to(torch_device_str)
+
+    @staticmethod
+    def save_to_file(tensor, file_path, format="auto"):
+        """
+        Save tensor data to file using PyTorch's native methods
+
+        Args:
+            tensor: torch.Tensor to save
+            file_path: Path to save the file
+            format: File format ('auto', 'torch', 'binary', 'numpy')
+        """
+        file_path = Path(file_path)
+
+        if format == "auto":
+            # Determine format from file extension
+            file_extension = file_path.suffix.lower()
+            if file_extension in [".pt", ".pth"]:
+                format = "torch"
+            elif file_extension in [".npy"]:
+                format = "numpy"
+            else:
+                format = "binary"
+
+        if format == "torch":
+            # PyTorch native format (preserves metadata)
+            torch.save(tensor, file_path)
+
+        elif format == "binary":
+            # Raw binary format
+            with open(file_path, "wb") as f:
+                f.write(tensor.cpu().numpy().tobytes())
+
+        elif format == "numpy":
+            # NumPy format
+            try:
+                import numpy as np
+
+                np.save(file_path, tensor.cpu().numpy())
+            except ImportError:
+                raise ImportError("NumPy is required to save .npy files")
+
+        else:
+            raise ValueError(f"Unsupported format: {format}")
+
+        print(
+            f"Tensor saved to {file_path} (shape: {tensor.shape}, dtype: {tensor.dtype}, format: {format})"
+        )
+
+    @staticmethod
+    def list_supported_formats():
+        """Return list of supported file formats"""
+        return {
+            "torch": [".pt", ".pth"],  # PyTorch native format
+            "binary": [".bin", ".dat", ".raw"],  # Raw binary
+            "numpy": [".npy"],  # NumPy format
+        }
+
+
+class TensorSpec:
+    """Tensor specification supporting various input types and per-tensor dtype"""
+
+    def __init__(
+        self,
+        shape=None,
+        dtype=None,
+        strides=None,
+        value=None,
+        is_scalar=False,
+        is_contiguous=True,
+        init_mode=TensorInitializer.RANDOM,  # Default to random initialization
+        custom_tensor=None,  # For manual/binary mode
+        file_path=None,  # For FROM_FILE mode
+        file_format=None,  # Optional file format hint
+    ):
+        self.shape = shape
+        self.dtype = dtype
+        self.strides = strides
+        self.value = value
+        self.is_scalar = is_scalar
+        self.is_contiguous = is_contiguous
+        self.init_mode = init_mode
+        self.custom_tensor = custom_tensor
+        self.file_path = file_path
+        self.file_format = file_format
+
+    @classmethod
+    def from_tensor(
+        cls,
+        shape,
+        dtype=None,
+        strides=None,
+        is_contiguous=True,
+        init_mode=TensorInitializer.RANDOM,
+        custom_tensor=None,
+        file_path=None,
+    ):
+        return cls(
+            shape=shape,
+            dtype=dtype,
+            strides=strides,
+            is_scalar=False,
+            is_contiguous=is_contiguous,
+            init_mode=init_mode,
+            custom_tensor=custom_tensor,
+            file_path=file_path,
+        )
+
+    @classmethod
+    def from_scalar(cls, value, dtype=None):
+        return cls(value=value, dtype=dtype, is_scalar=True)
+
+    @classmethod
+    def from_strided_tensor(
+        cls,
+        shape,
+        strides,
+        dtype=None,
+        init_mode=TensorInitializer.RANDOM,
+        custom_tensor=None,
+        file_path=None,
+    ):
+        return cls(
+            shape=shape,
+            dtype=dtype,
+            strides=strides,
+            is_scalar=False,
+            is_contiguous=False,
+            init_mode=init_mode,
+            custom_tensor=custom_tensor,
+            file_path=file_path,
+        )
+
+    @classmethod
+    def from_file(
+        cls,
+        file_path,
+        shape,
+        dtype=None,
+        strides=None,
+        is_contiguous=True,
+        file_format=None,
+    ):
+        """
+        Create TensorSpec that loads data from file
+
+        Args:
+            file_path: Path to file
+            shape: Tensor shape
+            dtype: infinicore dtype (inferred from file if None)
+            strides: Optional strides for strided tensors
+            is_contiguous: Whether tensor is contiguous
+            file_format: Optional file format hint
+
+        Returns:
+            TensorSpec: Configured for file loading
+        """
+        return cls(
+            shape=shape,
+            dtype=dtype,
+            strides=strides,
+            is_scalar=False,
+            is_contiguous=is_contiguous,
+            init_mode=TensorInitializer.FROM_FILE,
+            file_path=file_path,
+            file_format=file_format,
+        )
+
+    def create_torch_tensor(self, device, dtype_config, tensor_index=0):
+        """Create a torch tensor based on this specification"""
+        if self.is_scalar:
+            return self.value
+
+        # Determine dtype - ensure we're using infinicore dtype, not torch dtype
+        if self.dtype is not None:
+            tensor_dtype = self.dtype
+        elif isinstance(dtype_config, dict) and f"input_{tensor_index}" in dtype_config:
+            tensor_dtype = dtype_config[f"input_{tensor_index}"]
+        elif isinstance(dtype_config, (list, tuple)) and tensor_index < len(
+            dtype_config
+        ):
+            tensor_dtype = dtype_config[tensor_index]
+        else:
+            tensor_dtype = dtype_config
+
+        # Create tensor using the specified initialization mode
+        return TensorInitializer.create_tensor(
+            shape=self.shape,
+            dtype=tensor_dtype,
+            device=device,
+            mode=self.init_mode,
+            strides=self.strides,
+            set_tensor=self.custom_tensor,
+            file_path=self.file_path,
+        )
diff --git a/test/infinicore/framework/utils.py b/test/infinicore/framework/utils.py
index ed0cc4f71..7e6a138bb 100644
--- a/test/infinicore/framework/utils.py
+++ b/test/infinicore/framework/utils.py
@@ -4,18 +4,6 @@
 from .datatypes import to_infinicore_dtype, to_torch_dtype
 
 
-def create_infinicore_tensor(torch_tensor, device_str):
-    """Create infinicore tensor from PyTorch tensor"""
-    infini_device = infinicore.device(device_str, 0)
-
-    return infinicore.from_blob(
-        torch_tensor.data_ptr(),
-        list(torch_tensor.shape),
-        dtype=to_infinicore_dtype(torch_tensor.dtype),
-        device=infini_device,
-    )
-
-
 def synchronize_device(torch_device):
     """Device synchronization"""
     if torch_device == "cuda":
@@ -117,7 +105,6 @@ def add_color(text, color_code):
                 f"delta: {add_color(delta_str, 33)}"
             )
 
-        print(add_color(" INFO:", 35))
         print(f"  - Actual dtype: {actual.dtype}")
         print(f"  - Desired dtype: {expected.dtype}")
         print(f"  - Atol: {atol}")
@@ -149,44 +136,103 @@ def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3
     return tolerance["atol"], tolerance["rtol"]
 
 
-def compare_results(
-    infini_result, torch_result, dtype, config, device_str, tolerance_map=None
-):
+def infinicore_tensor_from_torch(torch_tensor):
+    infini_device = infinicore.device(torch_tensor.device.type, 0)
+    if torch_tensor.is_contiguous():
+        return infinicore.from_blob(
+            torch_tensor.data_ptr(),
+            list(torch_tensor.shape),
+            dtype=to_infinicore_dtype(torch_tensor.dtype),
+            device=infini_device,
+        )
+    else:
+        return infinicore.strided_from_blob(
+            torch_tensor.data_ptr(),
+            list(torch_tensor.shape),
+            list(torch_tensor.stride()),
+            dtype=to_infinicore_dtype(torch_tensor.dtype),
+            device=infini_device,
+        )
+
+
+def convert_infinicore_to_torch(infini_result, torch_reference):
     """
-    Compare infinicore result with PyTorch reference result
+    Convert infinicore tensor to PyTorch tensor for comparison
 
     Args:
         infini_result: infinicore tensor result
-        torch_result: PyTorch tensor reference result
+        torch_reference: PyTorch tensor reference (for shape and device)
         dtype: infinicore data type
-        config: test config
         device_str: torch device string
-        device: device enum
-        tolerance_map: optional tolerance map (defaults to config's tolerance_map)
 
     Returns:
-        bool: True if results match within tolerance
+        torch.Tensor: PyTorch tensor with infinicore data
     """
-    # Convert infinicore result to PyTorch tensor for comparison
     torch_result_from_infini = torch.zeros(
-        torch_result.shape, dtype=to_torch_dtype(dtype), device=device_str
+        torch_reference.shape,
+        dtype=to_torch_dtype(infini_result.dtype),
+        device=infini_result.device.type,
     )
-    temp_tensor = create_infinicore_tensor(torch_result_from_infini, device_str)
+    temp_tensor = infinicore_tensor_from_torch(torch_result_from_infini)
     temp_tensor.copy_(infini_result)
+    return torch_result_from_infini
 
-    # Retrieve tolerance - use provided map or config's map
-    if tolerance_map is None:
-        tolerance_map = config.tolerance_map
-    atol, rtol = get_tolerance(tolerance_map, dtype)
+
+def compare_results(
+    infini_result, torch_result, atol=1e-5, rtol=1e-5, debug_mode=False
+):
+    """
+    Generic function to compare infinicore result with PyTorch reference result
+
+    Args:
+        infini_result: infinicore tensor result
+        torch_result: PyTorch tensor reference result
+        atol: absolute tolerance
+        rtol: relative tolerance
+        debug_mode: whether to enable debug output
+
+    Returns:
+        bool: True if results match within tolerance
+    """
+    # Convert infinicore result to PyTorch tensor for comparison
+    torch_result_from_infini = convert_infinicore_to_torch(infini_result, torch_result)
 
     # Debug mode: detailed comparison
-    if config.debug:
+    if debug_mode:
         debug(torch_result_from_infini, torch_result, atol=atol, rtol=rtol)
 
     # Check if results match within tolerance
     return torch.allclose(torch_result_from_infini, torch_result, atol=atol, rtol=rtol)
 
 
+def create_test_comparator(config, dtype, tolerance_map=None, mode_name=""):
+    """
+    Create a test-specific comparison function that handles test configuration
+
+    Args:
+        config: test configuration
+        dtype: infinicore data type
+        tolerance_map: optional tolerance map (defaults to config's tolerance_map)
+        mode_name: operation mode name for debug output
+
+    Returns:
+        callable: function that takes (infini_result, torch_result) and returns bool
+    """
+    if tolerance_map is None:
+        tolerance_map = config.tolerance_map
+
+    atol, rtol = get_tolerance(tolerance_map, dtype)
+
+    def compare_test_results(infini_result, torch_result):
+        if config.debug and mode_name:
+            print(f"\n\033[94mDEBUG INFO - {mode_name}:\033[0m")
+        return compare_results(
+            infini_result, torch_result, atol=atol, rtol=rtol, debug_mode=config.debug
+        )
+
+    return compare_test_results
+
+
 def rearrange_tensor(tensor, new_strides):
     """
     Given a PyTorch tensor and a list of new strides, return a new PyTorch tensor with the given strides.
diff --git a/test/infinicore/op/matmul.py b/test/infinicore/op/matmul.py
deleted file mode 100644
index 4ae708c60..000000000
--- a/test/infinicore/op/matmul.py
+++ /dev/null
@@ -1,232 +0,0 @@
-import torch
-import infinicore
-import sys
-import os
-
-# Framework path
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-
-from framework import (
-    TestConfig,
-    TestRunner,
-    TestCase,
-    create_infinicore_tensor,
-    compare_results,
-    get_args,
-    get_test_devices,
-    profile_operation,
-    to_torch_dtype,
-    InfiniDeviceNames,
-    torch_device_map,
-)
-
-# ==============================================================================
-# Test Setup
-# ==============================================================================
-
-# Test cases
-_TEST_CASES = [
-    # (a_shape, b_shape, result_shape, a_stride, b_stride, c_stride)
-    TestCase((2, 3), (3, 4), (2, 4), None, None, None),
-    TestCase((128, 256), (256, 64), (128, 64), None, None, None),
-    TestCase((2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None),
-    TestCase((1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
-    TestCase((6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
-    TestCase((4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
-]
-
-# Data types - now using infinicore native types
-_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
-
-# Tolerance
-_TOLERANCE_MAP = {
-    infinicore.float16: {"atol": 0, "rtol": 1e-2},
-    infinicore.float32: {"atol": 0, "rtol": 1e-3},
-    infinicore.bfloat16: {"atol": 0, "rtol": 5e-2},
-}
-
-# ==============================================================================
-# Test Method
-# ==============================================================================
-
-
-def test_matmul(device, test_case, dtype, config):
-    """
-    Test matmul operation
-
-    Args:
-        device: device enum
-        test_case: test case
-        dtype: infinicore data type
-        config: test config
-    """
-    a_shape, b_shape, result_shape, a_stride, b_stride, c_stride = test_case.args
-
-    print(
-        f"Testing Matmul on {InfiniDeviceNames[device]} with "
-        f"a_shape:{a_shape}, b_shape:{b_shape}, result_shape:{result_shape}, "
-        f"a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, "
-        f"dtype:{dtype}"
-    )
-
-    # Create PyTorch tensors
-    device_str = torch_device_map[device]
-    torch_dtype = to_torch_dtype(dtype)
-
-    torch_a = torch.rand(a_shape, dtype=torch_dtype, device=device_str)
-    torch_b = torch.rand(b_shape, dtype=torch_dtype, device=device_str)
-
-    # Calculate PyTorch reference result
-    def torch_matmul():
-        return torch.matmul(torch_a, torch_b)
-
-    torch_result = torch_matmul()
-
-    # Create infinicore tensors
-    infini_a = create_infinicore_tensor(torch_a, device_str)
-    infini_b = create_infinicore_tensor(torch_b, device_str)
-
-    # Out-of-place matmul
-    def infini_matmul():
-        return infinicore.matmul(infini_a, infini_b)
-
-    infini_result = infini_matmul()
-
-    # Validate results using common method
-    is_valid = compare_results(infini_result, torch_result, dtype, config, device_str)
-    assert is_valid, "Matmul test failed"
-
-    # Performance test
-    if config.bench:
-        profile_operation(
-            "PyTorch",
-            torch_matmul,
-            device_str,
-            config.num_prerun,
-            config.num_iterations,
-        )
-        profile_operation(
-            "Infinicore",
-            infini_matmul,
-            device_str,
-            config.num_prerun,
-            config.num_iterations,
-        )
-
-
-def test_matmul_inplace(device, test_case, dtype, config):
-    """
-    Test in-place matmul operation
-
-    Args:
-        device: device enum
-        test_case: test case
-        dtype: infinicore data type
-        config: test config
-    """
-    a_shape, b_shape, result_shape, a_stride, b_stride, c_stride = test_case.args
-
-    print(
-        f"Testing In-place Matmul on {InfiniDeviceNames[device]} with "
-        f"a_shape:{a_shape}, b_shape:{b_shape}, result_shape:{result_shape}, "
-        f"dtype:{dtype}"
-    )
-
-    device_str = torch_device_map[device]
-    torch_dtype = to_torch_dtype(dtype)
-
-    # Create PyTorch tensors
-    torch_a = torch.rand(a_shape, dtype=torch_dtype, device=device_str)
-    torch_b = torch.rand(b_shape, dtype=torch_dtype, device=device_str)
-
-    # Create pre-allocated result tensor
-    torch_preallocated = torch.zeros(result_shape, dtype=torch_dtype, device=device_str)
-
-    # Calculate PyTorch reference result using in-place operation
-    def torch_matmul_inplace():
-        torch.matmul(torch_a, torch_b, out=torch_preallocated)
-
-    # Execute in-place operation
-    torch_matmul_inplace()
-
-    # Create infinicore tensors
-    infini_a = create_infinicore_tensor(torch_a, device_str)
-    infini_b = create_infinicore_tensor(torch_b, device_str)
-    infini_c = infinicore.empty(
-        result_shape, dtype=dtype, device=infinicore.device(device_str, 0)
-    )
-
-    # Test in-place matmul
-    def infini_matmul_inplace():
-        infinicore.matmul(infini_a, infini_b, out=infini_c)
-
-    # Execute in-place operation
-    infini_matmul_inplace()
-
-    # Validate results using common method
-    is_valid = compare_results(infini_c, torch_preallocated, dtype, config, device_str)
-    assert is_valid, "In-place matmul test failed"
-
-    # Performance test
-    if config.bench:
-        profile_operation(
-            "PyTorch In-place",
-            torch_matmul_inplace,
-            device_str,
-            config.num_prerun,
-            config.num_iterations,
-        )
-        profile_operation(
-            "Infinicore In-place",
-            infini_matmul_inplace,
-            device_str,
-            config.num_prerun,
-            config.num_iterations,
-        )
-
-
-# ==============================================================================
-# Main Execution Function
-# ==============================================================================
-
-
-def main():
-    args = get_args()
-
-    # Create test configuration
-    config = TestConfig(
-        tensor_dtypes=_TENSOR_DTYPES,
-        tolerance_map=_TOLERANCE_MAP,
-        debug=args.debug,
-        bench=args.bench,
-        num_prerun=args.num_prerun,
-        num_iterations=args.num_iterations,
-    )
-
-    # Create test runner
-    runner = TestRunner(_TEST_CASES, config)
-
-    # Get test devices
-    devices = get_test_devices(args)
-
-    print("Starting matmul tests...")
-
-    all_passed = True
-
-    # Run out-of-place tests
-    print("\n--- Testing Out-of-place Matmul ---")
-    out_of_place_passed = runner.run_tests(devices, test_matmul)
-    all_passed = all_passed and out_of_place_passed
-
-    # Run in-place tests
-    print("\n--- Testing In-place Matmul ---")
-    in_place_passed = runner.run_tests(devices, test_matmul_inplace)
-    all_passed = all_passed and in_place_passed
-
-    runner.print_summary()
-
-    sys.exit(0 if all_passed else 1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/test/infinicore/ops.py b/test/infinicore/ops.py
new file mode 100644
index 000000000..f36f8b8c3
--- /dev/null
+++ b/test/infinicore/ops.py
@@ -0,0 +1,260 @@
+import os
+import sys
+import subprocess
+import argparse
+from pathlib import Path
+
+
+def find_ops_directory(start_dir=None):
+    """
+    Find the ops directory by searching from start_dir upwards.
+    """
+    if start_dir is None:
+        start_dir = Path(__file__).parent
+
+    ops_dir = start_dir / "ops"
+    if ops_dir.exists() and (ops_dir / "rms_norm.py").exists():
+        return ops_dir
+
+
+def run_all_op_tests(ops_dir=None, verbose=False, specific_ops=None, extra_args=None):
+    """
+    Run all operator test scripts in the ops directory.
+
+    Args:
+        ops_dir (str, optional): Path to the ops directory. If None, uses the current directory.
+        verbose (bool): Whether to print detailed output.
+        specific_ops (list, optional): List of specific operator names to test (e.g., ['add', 'matmul']).
+        extra_args (list, optional): Extra command line arguments to pass to test scripts.
+
+    Returns:
+        dict: Results dictionary with test names as keys and (success, return_code, output) as values.
+    """
+    if ops_dir is None:
+        ops_dir = find_ops_directory()
+    else:
+        ops_dir = Path(ops_dir)
+
+    if not ops_dir.exists():
+        print(f"Error: Ops directory '{ops_dir}' does not exist.")
+        return {}
+
+    print(f"Looking for test files in: {ops_dir}")
+
+    # Find all Python test files (looking for actual operator test files)
+    test_files = list(ops_dir.glob("*.py"))
+
+    # Filter out this script itself and non-operator test files
+    current_script = Path(__file__).name
+    test_files = [f for f in test_files if f.name != current_script]
+
+    # Further filter to include only files that look like operator tests
+    # (they typically import infinicore and BaseOperatorTest)
+    operator_test_files = []
+    for test_file in test_files:
+        try:
+            with open(test_file, "r", encoding="utf-8") as f:
+                content = f.read()
+                if "infinicore" in content and "BaseOperatorTest" in content:
+                    operator_test_files.append(test_file)
+                elif verbose:
+                    print(f"  Skipping {test_file.name}: not an operator test file")
+        except Exception as e:
+            if verbose:
+                print(f"  Could not read {test_file.name}: {e}")
+            continue
+
+    if specific_ops:
+        # Filter for specific operators (case insensitive)
+        filtered_files = []
+        for test_file in operator_test_files:
+            test_name = test_file.stem.lower()
+            if any(op.lower() in test_name for op in specific_ops):
+                filtered_files.append(test_file)
+            elif verbose:
+                print(f"  Filtered out {test_file.name}: not in specific_ops list")
+        operator_test_files = filtered_files
+
+    if not operator_test_files:
+        print(f"No operator test files found in {ops_dir}")
+        print(f"Available Python files: {[f.name for f in test_files]}")
+        print(f"Current directory: {Path.cwd()}")
+        return {}
+
+    print(f"Found {len(operator_test_files)} operator test files:")
+    for test_file in operator_test_files:
+        print(f"  - {test_file.name}")
+
+    results = {}
+
+    for test_file in operator_test_files:
+        test_name = test_file.stem
+
+        try:
+            # Run the test script
+            cmd = [sys.executable, str(test_file)]
+
+            # Add extra arguments if provided
+            if extra_args:
+                cmd.extend(extra_args)
+
+            if verbose:
+                print(f"Command: {' '.join(cmd)}")
+                print(f"Working directory: {ops_dir}")
+
+            # Always capture output to display it
+            result = subprocess.run(cmd, cwd=ops_dir, capture_output=True, text=True)
+
+            success = result.returncode == 0
+            results[test_name] = (
+                success,
+                result.returncode,
+                result.stdout,
+                result.stderr,
+            )
+
+            # Print the output from the test script
+            if result.stdout:
+                print(result.stdout)
+
+            if result.stderr:
+                print("STDERR:")
+                print(result.stderr)
+
+            if success:
+                print(f"✅ {test_name}: PASSED (return code: {result.returncode})")
+            else:
+                print(f"❌ {test_name}: FAILED (return code: {result.returncode})")
+
+        except Exception as e:
+            print(f"❌ {test_name}: ERROR - {str(e)}")
+            results[test_name] = (False, -1, "", str(e))
+
+    return results
+
+
+def print_summary(results):
+    """Print a summary of test results."""
+    print(f"\n{'='*80}")
+    print("TEST SUMMARY")
+    print(f"{'='*80}")
+
+    if not results:
+        print("No tests were run.")
+        return
+
+    passed = sum(1 for success, _, _, _ in results.values() if success)
+    total = len(results)
+
+    print(f"Total tests: {total}")
+    print(f"Passed: {passed}")
+    print(f"Failed: {total - passed}")
+
+    if total > 0:
+        print(f"Success rate: {passed/total*100:.1f}%")
+
+    if passed == total:
+        print("\n🎉 All tests passed!")
+    else:
+        print("\nFailed tests:")
+        for test_name, (success, returncode, stdout, stderr) in results.items():
+            if not success:
+                print(f"  - {test_name} (return code: {returncode})")
+                # Print brief error info for failed tests
+                if stderr:
+                    error_lines = stderr.strip().split("\n")
+                    if error_lines:
+                        print(f"    Error: {error_lines[0]}")
+
+
+def main():
+    """Main entry point with command line argument parsing."""
+    parser = argparse.ArgumentParser(
+        description="Run all operator tests in the ops directory", add_help=False
+    )
+
+    # Our script's specific arguments
+    parser.add_argument(
+        "--ops-dir", type=str, help="Path to the ops directory (default: auto-detect)"
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Print detailed command information for each test",
+    )
+    parser.add_argument(
+        "--ops", nargs="+", help="Run specific operators only (e.g., --ops add matmul)"
+    )
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="List all available test files without running them",
+    )
+    parser.add_argument(
+        "-h", "--help", action="store_true", help="Show this help message and exit"
+    )
+
+    # Parse known args first, leave the rest for the test scripts
+    args, unknown_args = parser.parse_known_args()
+
+    if args.help:
+        parser.print_help()
+        print("\nExtra arguments that will be passed to test scripts:")
+        print("  --nvidia, --cpu, --bench, --debug, etc.")
+        return
+
+    # Auto-detect ops directory if not provided
+    if args.ops_dir is None:
+        ops_dir = find_ops_directory()
+    else:
+        ops_dir = Path(args.ops_dir)
+
+    if args.list:
+        # Just list available test files
+        test_files = list(ops_dir.glob("*.py"))
+        current_script = Path(__file__).name
+        test_files = [f for f in test_files if f.name != current_script]
+
+        operator_test_files = []
+        for test_file in test_files:
+            try:
+                with open(test_file, "r", encoding="utf-8") as f:
+                    content = f.read()
+                    if "infinicore" in content and "BaseOperatorTest" in content:
+                        operator_test_files.append(test_file)
+            except:
+                continue
+
+        if operator_test_files:
+            print(f"Available operator test files in {ops_dir}:")
+            for test_file in operator_test_files:
+                print(f"  - {test_file.name}")
+        else:
+            print(f"No operator test files found in {ops_dir}")
+            print(f"Available Python files: {[f.name for f in test_files]}")
+        return
+
+    # Show what extra arguments will be passed
+    if unknown_args:
+        print(f"Passing extra arguments to test scripts: {unknown_args}")
+
+    # Run all tests
+    results = run_all_op_tests(
+        ops_dir=ops_dir,
+        verbose=args.verbose,
+        specific_ops=args.ops,
+        extra_args=unknown_args,
+    )
+
+    print_summary(results)
+
+    # Exit with appropriate code
+    if results and all(success for success, _, _, _ in results.values()):
+        sys.exit(0)
+    else:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/add.py b/test/infinicore/ops/add.py
new file mode 100644
index 000000000..a5bb13443
--- /dev/null
+++ b/test/infinicore/ops/add.py
@@ -0,0 +1,106 @@
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import torch
+import infinicore
+from framework.base import BaseOperatorTest, TensorSpec, TestCase
+from framework.runner import GenericTestRunner
+
+# ==============================================================================
+# Operator-specific configuration
+# ==============================================================================
+
+# Test cases format: (operation_mode, shape, a_strides, b_strides, c_strides)
+_TEST_CASES_DATA = [
+    (TestCase.BOTH, (13, 4), None, None, None),
+    (TestCase.BOTH, (13, 4), (10, 1), (10, 1), (10, 1)),
+    (TestCase.BOTH, (13, 4), (0, 1), None, None),
+    (TestCase.BOTH, (13, 4, 4), None, None, None),
+    (TestCase.BOTH, (13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    (TestCase.BOTH, (13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    (TestCase.BOTH, (16, 5632), None, None, None),
+    (TestCase.BOTH, (16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+]
+
+
+def parse_add_test_case(data):
+    """
+    Parse add test case data according to format:
+    (operation_mode, shape, a_strides, b_strides, c_strides)
+    """
+    operation_mode = data[0]
+    shape = data[1]
+    a_strides = data[2] if len(data) > 2 else None
+    b_strides = data[3] if len(data) > 3 else None
+    c_strides = data[4] if len(data) > 4 else None
+
+    # Create input specifications
+    inputs = []
+
+    # Input tensor a
+    if a_strides is not None:
+        inputs.append(TensorSpec.from_strided_tensor(shape, a_strides))
+    else:
+        inputs.append(TensorSpec.from_tensor(shape))
+
+    # Input tensor b (same shape as a)
+    if b_strides is not None:
+        inputs.append(TensorSpec.from_strided_tensor(shape, b_strides))
+    else:
+        inputs.append(TensorSpec.from_tensor(shape))
+
+    # Output tensor
+    if c_strides is not None:
+        output = TensorSpec.from_strided_tensor(shape, c_strides)
+    else:
+        output = TensorSpec.from_tensor(shape)
+
+    return TestCase(operation_mode, inputs, output)
+
+
+# Parse test cases
+_TEST_CASES = [parse_add_test_case(data) for data in _TEST_CASES_DATA]
+
+# Data types
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+# Tolerance
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 0, "rtol": 1e-2},
+    infinicore.float32: {"atol": 0, "rtol": 1e-3},
+    infinicore.bfloat16: {"atol": 0, "rtol": 5e-2},
+}
+
+
+class OpTest(BaseOperatorTest):
+    """Add test with simplified test case parsing"""
+
+    def __init__(self):
+        super().__init__("Add")
+
+    def get_test_cases(self):
+        return _TEST_CASES
+
+    def get_tensor_dtypes(self):
+        return _TENSOR_DTYPES
+
+    def get_tolerance_map(self):
+        return _TOLERANCE_MAP
+
+    def torch_operator(self, a, b, out=None, **kwargs):
+        return torch.add(a, b, out=out)
+
+    def infinicore_operator(self, a, b, out=None, **kwargs):
+        return infinicore.add(a, b, out=out)
+
+
+def main():
+    """Main entry point"""
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/attention_temp.py b/test/infinicore/ops/attention_temp.py
new file mode 100644
index 000000000..59af180bc
--- /dev/null
+++ b/test/infinicore/ops/attention_temp.py
@@ -0,0 +1,268 @@
+"""
+This is for framework validation
+"""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import torch
+import infinicore
+from framework.base import BaseOperatorTest, TensorSpec, TestCase
+from framework.runner import GenericTestRunner
+
+# ==============================================================================
+# Operator-specific configuration
+# ==============================================================================
+
+# Test cases format: (operation_mode, n_q_head, n_kv_head, seq_len, head_dim, pos,
+#                    k_cache_buf_len, v_cache_buf_len, q_strides, k_strides, v_strides,
+#                    k_cache_strides, v_cache_strides)
+_TEST_CASES_DATA = [
+    # Prefill stage
+    (
+        TestCase.OUT_OF_PLACE,
+        32,
+        4,
+        5,
+        64,
+        0,
+        2048,
+        2048,
+        [64, 2560, 1],
+        [64, 2560, 1],
+        [64, 2560, 1],
+        [64, 11264, 1],
+        [64, 11264, 1],
+    ),
+    # Decode stage
+    (
+        TestCase.OUT_OF_PLACE,
+        32,
+        4,
+        1,
+        64,
+        3,
+        2048,
+        2048,
+        [64, 2560, 1],
+        [64, 2560, 1],
+        [64, 2560, 1],
+        [64, 11264, 1],
+        [64, 11264, 1],
+    ),
+    # Small test case
+    (TestCase.OUT_OF_PLACE, 8, 4, 2, 16, 1, 8, 8, None, None, None, None, None),
+    # Another prefill case
+    (
+        TestCase.OUT_OF_PLACE,
+        28,
+        28,
+        15,
+        128,
+        0,
+        2048,
+        2048,
+        [128, 10752, 1],
+        [128, 10752, 1],
+        [128, 10752, 1],
+        [128, 3584, 1],
+        [128, 3584, 1],
+    ),
+]
+
+# Epsilon constant for causal softmax
+_EPSILON = 1e-5
+
+
+def causal_softmax(x):
+    """Apply causal mask and softmax to attention scores"""
+    input_dtype = x.dtype
+    # Create causal mask
+    mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
+    # Apply mask: set masked positions to -inf
+    masked = torch.where(mask == 1, -torch.inf, x.to(torch.float32))
+    # Apply softmax and convert back to original dtype
+    return torch.nn.functional.softmax(masked, dim=-1).to(input_dtype)
+
+
+def torch_attention(q, k, v, k_cache, v_cache, pos):
+    """PyTorch reference implementation of attention"""
+    input_dtype = q.dtype
+
+    n_q_head = q.shape[0]
+    n_kv_head = k.shape[0]
+
+    # Concatenate key and value caches
+    k_cache = k_cache[:, :pos, :]  # (n_kv_head, pos, head_dim)
+    v_cache = v_cache[:, :pos, :]  # (n_kv_head, pos, head_dim)
+    k = torch.cat([k_cache, k], dim=1)  # (n_kv_head, total_seq_len, head_dim)
+    v = torch.cat([v_cache, v], dim=1)  # (n_kv_head, total_seq_len, head_dim)
+
+    total_seq_len = k.shape[1]
+    head_dim = v.shape[-1]
+
+    # Handle grouped query attention (GQA)
+    if n_q_head != n_kv_head:
+        q = q.reshape(
+            n_kv_head, -1, head_dim
+        )  # (n_kv_head, n_group * seq_len, head_dim)
+
+    # Scaled dot-product attention
+    attn_scores = (
+        torch.einsum("hqd,hkd->hqk", q.to(torch.float32), k.to(torch.float32))
+        .to(input_dtype)
+        .reshape(n_q_head, -1, total_seq_len)
+    )  # (n_q_head, seq_len, total_seq_len)
+
+    # Scale by sqrt(head_dim)
+    attn_scores = attn_scores / (head_dim**0.5)
+
+    # Apply causal softmax
+    attn_weights = causal_softmax(attn_scores).reshape(
+        n_kv_head, -1, total_seq_len
+    )  # (n_kv_head, seq_len, total_seq_len)
+
+    # Weighted sum of values
+    attn_output = (
+        torch.einsum(
+            "hqk,hkd->hqd", attn_weights.to(torch.float32), v.to(torch.float32)
+        )
+        .to(input_dtype)
+        .reshape(n_q_head, -1, head_dim)
+        .permute(1, 0, 2)
+    )  # (seq_len, n_q_head, head_dim)
+
+    return attn_output
+
+
+def parse_attention_test_case(data):
+    """
+    Parse attention test case data according to format:
+    (operation_mode, n_q_head, n_kv_head, seq_len, head_dim, pos,
+     k_cache_buf_len, v_cache_buf_len, q_strides, k_strides, v_strides,
+     k_cache_strides, v_cache_strides)
+    """
+    operation_mode = data[0]
+    n_q_head, n_kv_head, seq_len, head_dim, pos = (
+        data[1],
+        data[2],
+        data[3],
+        data[4],
+        data[5],
+    )
+    k_cache_buf_len, v_cache_buf_len = data[6], data[7]
+    q_strides = data[8] if len(data) > 8 else None
+    k_strides = data[9] if len(data) > 9 else None
+    v_strides = data[10] if len(data) > 10 else None
+    k_cache_strides = data[11] if len(data) > 11 else None
+    v_cache_strides = data[12] if len(data) > 12 else None
+
+    # Create input specifications
+    inputs = []
+
+    # Query tensor: (n_q_head, seq_len, head_dim)
+    if q_strides is not None:
+        inputs.append(
+            TensorSpec.from_strided_tensor((n_q_head, seq_len, head_dim), q_strides)
+        )
+    else:
+        inputs.append(TensorSpec.from_tensor((n_q_head, seq_len, head_dim)))
+
+    # Key tensor: (n_kv_head, seq_len, head_dim)
+    if k_strides is not None:
+        inputs.append(
+            TensorSpec.from_strided_tensor((n_kv_head, seq_len, head_dim), k_strides)
+        )
+    else:
+        inputs.append(TensorSpec.from_tensor((n_kv_head, seq_len, head_dim)))
+
+    # Value tensor: (n_kv_head, seq_len, head_dim)
+    if v_strides is not None:
+        inputs.append(
+            TensorSpec.from_strided_tensor((n_kv_head, seq_len, head_dim), v_strides)
+        )
+    else:
+        inputs.append(TensorSpec.from_tensor((n_kv_head, seq_len, head_dim)))
+
+    # Key cache: (n_kv_head, k_cache_buf_len, head_dim)
+    if k_cache_strides is not None:
+        inputs.append(
+            TensorSpec.from_strided_tensor(
+                (n_kv_head, k_cache_buf_len, head_dim), k_cache_strides
+            )
+        )
+    else:
+        inputs.append(TensorSpec.from_tensor((n_kv_head, k_cache_buf_len, head_dim)))
+
+    # Value cache: (n_kv_head, v_cache_buf_len, head_dim)
+    if v_cache_strides is not None:
+        inputs.append(
+            TensorSpec.from_strided_tensor(
+                (n_kv_head, v_cache_buf_len, head_dim), v_cache_strides
+            )
+        )
+    else:
+        inputs.append(TensorSpec.from_tensor((n_kv_head, v_cache_buf_len, head_dim)))
+
+    # Position (scalar)
+    inputs.append(TensorSpec.from_scalar(pos))
+
+    # Output tensor: (seq_len, n_q_head, head_dim)
+    output_shape = (seq_len, n_q_head, head_dim)
+    output = TensorSpec.from_tensor(output_shape)
+
+    return TestCase(operation_mode, inputs, output)
+
+
+# Parse test cases
+_TEST_CASES = [parse_attention_test_case(data) for data in _TEST_CASES_DATA]
+
+# Data types
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+# Tolerance
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 1e-4, "rtol": 1e-2},
+    infinicore.float32: {"atol": 1e-5, "rtol": 1e-3},
+    infinicore.bfloat16: {"atol": 1e-3, "rtol": 5e-2},
+}
+
+
+class OpTest(BaseOperatorTest):
+    """Attention test with simplified test case parsing"""
+
+    def __init__(self):
+        super().__init__("Attention")
+
+    def get_test_cases(self):
+        return _TEST_CASES
+
+    def get_tensor_dtypes(self):
+        return _TENSOR_DTYPES
+
+    def get_tolerance_map(self):
+        return _TOLERANCE_MAP
+
+    def torch_operator(self, q, k, v, k_cache, v_cache, pos, out=None, **kwargs):
+        result = torch_attention(q, k, v, k_cache, v_cache, pos)
+
+        if out is not None:
+            out.set_(result)
+            return out
+        else:
+            return result
+
+    def infinicore_operator(self, q, k, v, k_cache, v_cache, pos, out=None, **kwargs):
+        return infinicore.attention(q, k, v, k_cache, v_cache, pos, out=out)
+
+
+def main():
+    """Main entry point"""
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/matmul.py b/test/infinicore/ops/matmul.py
new file mode 100644
index 000000000..e243edd95
--- /dev/null
+++ b/test/infinicore/ops/matmul.py
@@ -0,0 +1,122 @@
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import torch
+import infinicore
+from framework.base import BaseOperatorTest, TensorSpec, TestCase
+from framework.runner import GenericTestRunner
+
+# ==============================================================================
+# Operator-specific configuration
+# ==============================================================================
+
+# Test cases format: (operation_mode, nbatch, m, n, k, a_strides, b_strides, c_strides)
+# If nbatch is None: a_shape=(m, k), b_shape=(k, n), c_shape=(m, n)
+# If nbatch is provided: a_shape=(nbatch, m, k), b_shape=(nbatch, k, n), c_shape=(nbatch, m, n)
+_TEST_CASES_DATA = [
+    # Basic 2D matmul
+    (TestCase.BOTH, None, 2, 4, 3, None, None, None),
+    (TestCase.BOTH, None, 128, 64, 256, None, None, None),
+    # Batched matmul
+    (TestCase.BOTH, 2, 4, 2048, 2048, None, None, None),
+    (TestCase.BOTH, 4, 48, 6, 64, None, None, None),
+    # Strided tensors
+    (TestCase.BOTH, None, 1, 2048, 2048, (4096, 1), (4096, 1), (4096, 1)),
+    (TestCase.BOTH, None, 6, 2560, 2048, (2048, 1), (1, 2048), (2560, 1)),
+    # Mixed cases
+    (TestCase.BOTH, 8, 16, 32, 16, None, None, None),
+]
+
+
+def parse_matmul_test_case(data):
+    """
+    Parse matmul test case data according to format:
+    (operation_mode, nbatch, m, n, k, a_strides, b_strides, c_strides)
+    """
+    operation_mode = data[0]
+    nbatch = data[1]
+    m, n, k = data[2], data[3], data[4]
+    a_strides = data[5] if len(data) > 5 else None
+    b_strides = data[6] if len(data) > 6 else None
+    c_strides = data[7] if len(data) > 7 else None
+
+    # Determine shapes based on batch dimension
+    if nbatch is None:
+        a_shape = (m, k)
+        b_shape = (k, n)
+        c_shape = (m, n)
+    else:
+        a_shape = (nbatch, m, k)
+        b_shape = (nbatch, k, n)
+        c_shape = (nbatch, m, n)
+
+    # Create input specifications
+    inputs = []
+
+    # Tensor a
+    if a_strides is not None:
+        inputs.append(TensorSpec.from_strided_tensor(a_shape, a_strides))
+    else:
+        inputs.append(TensorSpec.from_tensor(a_shape))
+
+    # Tensor b
+    if b_strides is not None:
+        inputs.append(TensorSpec.from_strided_tensor(b_shape, b_strides))
+    else:
+        inputs.append(TensorSpec.from_tensor(b_shape))
+
+    # Output tensor
+    if c_strides is not None:
+        output = TensorSpec.from_strided_tensor(c_shape, c_strides)
+    else:
+        output = TensorSpec.from_tensor(c_shape)
+
+    return TestCase(operation_mode, inputs, output)
+
+
+# Parse test cases
+_TEST_CASES = [parse_matmul_test_case(data) for data in _TEST_CASES_DATA]
+
+# Data types
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+# Tolerance
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 0, "rtol": 1e-2},
+    infinicore.float32: {"atol": 0, "rtol": 1e-3},
+    infinicore.bfloat16: {"atol": 0, "rtol": 5e-2},
+}
+
+
+class OpTest(BaseOperatorTest):
+    """Matmul test with simplified test case parsing"""
+
+    def __init__(self):
+        super().__init__("Matmul")
+
+    def get_test_cases(self):
+        return _TEST_CASES
+
+    def get_tensor_dtypes(self):
+        return _TENSOR_DTYPES
+
+    def get_tolerance_map(self):
+        return _TOLERANCE_MAP
+
+    def torch_operator(self, a, b, out=None, **kwargs):
+        return torch.matmul(a, b, out=out)
+
+    def infinicore_operator(self, a, b, out=None, **kwargs):
+        return infinicore.matmul(a, b, out=out)
+
+
+def main():
+    """Main entry point"""
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/rms_norm.py b/test/infinicore/ops/rms_norm.py
new file mode 100644
index 000000000..40249a745
--- /dev/null
+++ b/test/infinicore/ops/rms_norm.py
@@ -0,0 +1,132 @@
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import torch
+import infinicore
+from framework.base import BaseOperatorTest, TensorSpec, TestCase
+from framework.runner import GenericTestRunner
+
+# ==============================================================================
+# Operator-specific configuration
+# ==============================================================================
+
+# Test cases format: (operation_mode, y_shape, x_shape, w_shape, y_strides, x_strides)
+_TEST_CASES_DATA = [
+    (TestCase.BOTH, (1, 4), (1, 4), (4,), None, None),
+    (TestCase.BOTH, (2, 4), (2, 4), (4,), None, None),
+    (TestCase.BOTH, (2, 2, 4), (2, 2, 4), (4,), None, None),
+    (TestCase.BOTH, (2, 2, 4), (2, 2, 4), (4,), (12, 8, 1), (12, 8, 1)),
+    (TestCase.BOTH, (16, 2048), (16, 2048), (2048,), None, None),
+    (TestCase.BOTH, (16, 2048), (16, 2048), (2048,), (4096, 1), (4096, 1)),
+]
+
+
+def parse_rms_norm_test_case(data):
+    """
+    Parse RMSNorm test case data according to format:
+    (operation_mode, y_shape, x_shape, w_shape, y_strides, x_strides)
+    """
+    operation_mode = data[0]
+    y_shape = data[1]  # Output shape
+    x_shape = data[2]  # Input shape
+    w_shape = data[3]  # Weight shape (1D)
+    y_strides = data[4] if len(data) > 4 else None
+    x_strides = data[5] if len(data) > 5 else None
+
+    # Create input specifications
+    inputs = []
+
+    # Input tensor x
+    if x_strides is not None:
+        inputs.append(TensorSpec.from_strided_tensor(x_shape, x_strides))
+    else:
+        inputs.append(TensorSpec.from_tensor(x_shape))
+
+    # Weight tensor (1D, always contiguous)
+    inputs.append(TensorSpec.from_tensor(w_shape))
+
+    # Output tensor
+    if y_strides is not None:
+        output = TensorSpec.from_strided_tensor(y_shape, y_strides)
+    else:
+        output = TensorSpec.from_tensor(y_shape)
+
+    return TestCase(operation_mode, inputs, output)
+
+
+# Parse test cases
+_TEST_CASES = [parse_rms_norm_test_case(data) for data in _TEST_CASES_DATA]
+
+# Data types for individual tensors
+_INPUT_DTYPES = [infinicore.float16, infinicore.bfloat16]
+_WEIGHT_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+# Generate all dtype combinations
+_DTYPE_COMBINATIONS = []
+for input_dtype in _INPUT_DTYPES:
+    for weight_dtype in _WEIGHT_DTYPES:
+        _DTYPE_COMBINATIONS.append(
+            {
+                "input_0": input_dtype,  # x tensor
+                "input_1": weight_dtype,  # weight tensor
+                "output": input_dtype,  # output tensor (same as input)
+            }
+        )
+
+# Base data types
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16]
+
+# Tolerance
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 2e-3, "rtol": 2e-3},
+    infinicore.bfloat16: {"atol": 1e-2, "rtol": 1e-2},
+}
+
+# EPSILON constant for RMSNorm
+_EPSILON = 1e-5
+
+
+class OpTest(BaseOperatorTest):
+    """RMSNorm test with simplified test case parsing"""
+
+    def __init__(self):
+        super().__init__("RMS_Norm")
+
+    def get_test_cases(self):
+        return _TEST_CASES
+
+    def get_tensor_dtypes(self):
+        return _TENSOR_DTYPES
+
+    def get_tolerance_map(self):
+        return _TOLERANCE_MAP
+
+    def get_dtype_combinations(self):
+        return _DTYPE_COMBINATIONS
+
+    def torch_operator(self, x, weight, out=None, **kwargs):
+        input_dtype = x.dtype
+        hidden_states = x.to(torch.float32)
+        scale = hidden_states.pow(2).mean(-1, keepdim=True).add_(_EPSILON).rsqrt_()
+        result = (hidden_states * scale * weight).to(input_dtype)
+
+        if out is not None:
+            out.set_(result)
+            return out
+        else:
+            return result
+
+    def infinicore_operator(self, x, weight, out=None, **kwargs):
+        return infinicore.rms_norm(x, weight, _EPSILON, out=out)
+
+
+def main():
+    """Main entry point"""
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/xmake.lua b/xmake.lua
index 10c47aa69..ac3fad2ca 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -345,7 +345,7 @@ target("_infinicore")
     add_files("src/infinicore/context/*.cc")
     add_files("src/infinicore/context/*/*.cc")
     add_files("src/infinicore/tensor/*.cc")
-    add_files("src/infinicore/op/*/*.cc")
+    add_files("src/infinicore/ops/*/*.cc")
     add_files("src/infinicore/pybind11/**.cc")
 
     set_installdir("python/infinicore")