Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions include/infinicore/nn/layer_norm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#pragma once

#include "../ops.hpp"
#include "module.hpp"

namespace infinicore::nn {

/**
 * @brief Layer Normalization module.
 *
 * Owns learnable `weight` (scale) and `bias` (shift) parameters, each of
 * shape {normalized_shape}, and normalizes the last dimension of the input.
 */
class LayerNorm : public Module {
public:
    /**
     * @brief Construct a LayerNorm layer
     *
     * @param normalized_shape Size of the feature dimension to normalize (typically hidden_size)
     * @param eps Small constant for numerical stability (default: 1e-6)
     * @param dtype Data type for the weight (default: DataType::F32)
     * @param device Device to create the weight on
     *
     * NOTE(review): the default eps here (1e-6) differs from the op-level
     * op::layer_norm default (1e-5) — confirm the discrepancy is intentional.
     */
    LayerNorm(size_t normalized_shape,
              double eps = 1e-6,
              const DataType &dtype = DataType::F32,
              const Device &device = Device());

    /**
     * @brief Forward pass: apply LayerNorm
     *
     * @param x Input tensor of shape (*, normalized_shape) where * is any number of dimensions
     * @return Normalized tensor with same shape as input
     *
     * The normalization is applied over the last dimension.
     * For example:
     *   Input: [batch, seq_len, hidden_size] -> normalize over hidden_size
     *   Input: [batch, hidden_size]          -> normalize over hidden_size
     */
    Tensor forward(const Tensor &x) const;

    // Module information
    size_t normalized_shape() const { return normalized_shape_; } // feature-dim size
    double eps() const { return eps_; }                           // numerical-stability epsilon
    DataType dtype() const { return dtype_; }                     // parameter data type

    // Human-readable summary of the layer's configuration
    std::string extra_repr() const;

    // Accessors for the parameters (both of shape {normalized_shape})
    Tensor weight() const { return weight_; }
    Tensor bias() const { return bias_; }

protected:
    // Parameters registered with the Module machinery
    INFINICORE_NN_PARAMETER(weight);
    INFINICORE_NN_PARAMETER(bias);

private:
    size_t normalized_shape_; // Size of the feature dimension
    double eps_;              // Epsilon for numerical stability
    DataType dtype_;          // Data type for weight
};

} // namespace infinicore::nn
1 change: 1 addition & 0 deletions include/infinicore/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/kv_caching.hpp"
#include "ops/layer_norm.hpp"
#include "ops/matmul.hpp"
#include "ops/ones.hpp"
#include "ops/paged_attention.hpp"
Expand Down
16 changes: 16 additions & 0 deletions include/infinicore/ops/layer_norm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once

#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Graph-op class; execute signature is
// (y, standardization, std_deviation, x, weight, bias, epsilon).
INFINICORE_GRAPH_OP_CLASS(LayerNorm, Tensor, Tensor, Tensor, const Tensor &, const Tensor &, const Tensor &, float);

// Out-of-place LayerNorm over the last dimension of `x`; allocates and returns the result.
Tensor layer_norm(const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon = 1e-5f);
// Full in-place variant: caller supplies the output `y` plus the intermediate buffers
// `standardization` (same shape as x) and `std_deviation` (x's shape minus the last dim).
void layer_norm_(Tensor y, Tensor standardization, Tensor std_deviation, const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon = 1e-5f);
// Convenience in-place variant: allocates the intermediate buffers internally.
void layer_norm_(Tensor y, const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon = 1e-5f);
// Non-overloaded alias of the convenience variant so pybind11 can bind it unambiguously.
void layer_norm_for_pybind(Tensor y, const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon = 1e-5f);

} // namespace infinicore::op
2 changes: 2 additions & 0 deletions python/infinicore/nn/functional/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .hinge_embedding_loss import hinge_embedding_loss
from .huber_loss import huber_loss
from .interpolate import interpolate
from .layer_norm import layer_norm
from .linear import linear
from .linear_w8a8i8 import linear_w8a8i8
from .log_softmax import log_softmax
Expand Down Expand Up @@ -83,4 +84,5 @@
"softplus",
"softsign",
"huber_loss",
"layer_norm",
]
33 changes: 33 additions & 0 deletions python/infinicore/nn/functional/layer_norm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import List

from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def layer_norm(
    input: Tensor,
    normalized_shape: List[int],
    weight: Tensor,
    bias: Tensor,
    eps: float = 1e-5,
    *,
    out=None,
) -> Tensor:
    r"""Apply Layer Normalization over the trailing dimension(s) of ``input``.

    Args:
        input: Input tensor of shape ``(*, normalized_shape)``.
        normalized_shape: Expected shape of the normalized dimensions; must
            match ``weight.shape``.
        weight: Scale parameter.
        bias: Shift parameter.
        eps: Small constant added for numerical stability.
        out: Optional pre-allocated output tensor; written in place when given.

    Returns:
        The normalized tensor (``out`` if provided, otherwise a new tensor).
    """

    # Compare as lists so a tuple-vs-list mismatch (e.g. caller passes a
    # tuple while weight.shape is a list, or vice versa) does not trip a
    # spurious assertion for otherwise-equal shapes.
    assert list(normalized_shape) == list(weight.shape), (
        "normalized_shape does not match weight.shape."
    )

    if out is None:
        return Tensor(
            _infinicore.layer_norm(
                input._underlying, weight._underlying, bias._underlying, eps
            )
        )

    _infinicore.layer_norm_(
        out._underlying, input._underlying, weight._underlying, bias._underlying, eps
    )

    return out
27 changes: 27 additions & 0 deletions src/infinicore/nn/layer_norm.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#include "infinicore/nn/layer_norm.hpp"
#include "infinicore/ops.hpp"
#include <cmath>
#include <stdexcept>

namespace infinicore::nn {

// Construct the layer and allocate its learnable parameters.
LayerNorm::LayerNorm(size_t normalized_shape, double eps, const DataType &dtype, const Device &device)
    : normalized_shape_(normalized_shape),
      eps_(eps),
      dtype_(dtype) {

    // Module-owned device member; parameters below are created on it.
    device_ = device;

    // Learnable scale and shift, each of shape {normalized_shape}.
    INFINICORE_NN_PARAMETER_INIT(weight, ({normalized_shape}, dtype_, device));
    INFINICORE_NN_PARAMETER_INIT(bias, ({normalized_shape}, dtype_, device));
}

// Normalize the last dimension of `x` using the learned weight and bias.
// eps is stored as double for precision but the op API takes float.
Tensor LayerNorm::forward(const Tensor &x) const {
    return op::layer_norm(x, weight_, bias_, static_cast<float>(eps_));
}

// Human-readable configuration summary, e.g.
// "LayerNorm(normalized_shape=768, eps=0.000001, dtype=2)".
// Note: dtype is rendered as its underlying integer value.
std::string LayerNorm::extra_repr() const {
    std::string repr{"LayerNorm(normalized_shape="};
    repr += std::to_string(normalized_shape_);
    repr += ", eps=";
    repr += std::to_string(eps_);
    repr += ", dtype=";
    repr += std::to_string(static_cast<int>(dtype_));
    repr += ")";
    return repr;
}

} // namespace infinicore::nn
42 changes: 42 additions & 0 deletions src/infinicore/ops/layer_norm/layer_norm.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include "infinicore/ops/layer_norm.hpp"
#include "../../utils.hpp"

namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(LayerNorm);

// Construct the graph op and dispatch to the device-specific implementation.
LayerNorm::LayerNorm(Tensor y, Tensor standardization, Tensor std_deviation, const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon) {
    // `bias` is read by the kernel too, so it must be on the same device as
    // the other operands (the original check omitted it).
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, standardization, std_deviation, x, weight, bias);
    INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, standardization, std_deviation, x, weight, bias, epsilon);
}

// Either record the op into the active graph or run it immediately,
// depending on the current graph-capture state.
void LayerNorm::execute(Tensor y, Tensor standardization, Tensor std_deviation, const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(LayerNorm, y, standardization, std_deviation, x, weight, bias, epsilon);
}

/**
 * Out-of-place LayerNorm: allocates the output tensor, then delegates to the
 * convenience in-place overload so the intermediate-buffer allocation logic
 * (standardization / std_deviation) lives in exactly one place instead of
 * being duplicated here.
 */
Tensor layer_norm(const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon) {
    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
    layer_norm_(y, x, weight, bias, epsilon);
    return y;
}

// Full in-place variant: caller provides the output and both intermediate
// buffers (useful when the caller wants to reuse or inspect them).
void layer_norm_(Tensor y, Tensor standardization, Tensor std_deviation, const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon) {
    LayerNorm::execute(y, standardization, std_deviation, x, weight, bias, epsilon);
}

// Convenience in-place variant: allocates the two intermediate buffers
// internally before executing the op.
void layer_norm_(Tensor y, const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon) {
    // The per-row statistics drop the last (normalized) dimension.
    auto stats_shape = x->shape();
    stats_shape.pop_back();
    auto std_deviation = Tensor::empty(stats_shape, x->dtype(), x->device());
    // Standardized values have the same shape as the input.
    auto standardization = Tensor::empty(x->shape(), x->dtype(), x->device());
    LayerNorm::execute(y, standardization, std_deviation, x, weight, bias, epsilon);
}

// Non-overloaded entry point: pybind11 cannot bind an overload set by name,
// so the binding layer targets this thin forwarder instead.
void layer_norm_for_pybind(Tensor y, const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon) {
    layer_norm_(y, x, weight, bias, epsilon);
}

} // namespace infinicore::op
65 changes: 65 additions & 0 deletions src/infinicore/ops/layer_norm/layer_norm_infiniop.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#include "infinicore/ops/layer_norm.hpp"

#include "../infiniop_impl.hpp"

namespace infinicore::op::layer_norm_impl::infiniop {

// Cachable infiniop descriptor type for LayerNorm (cache capacity 100).
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, LayerNorm, 100);

// Everything captured at plan time and consumed by run(): the cached infiniop
// descriptor plus graph handles for the workspace and every operand tensor.
struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, y, standardization, std_deviation, x, weight, bias;
};

/**
 * Plan a LayerNorm launch: fetch (or build) the infiniop descriptor matching
 * these tensor descriptors, allocate the workspace, and capture graph handles
 * to every operand. Returns a heap-allocated PlannedMeta that cleanup() frees.
 */
void *plan(Tensor y, Tensor standardization, Tensor std_deviation, const Tensor &x, const Tensor &weight, const Tensor &bias, float epsilon) {
    // Cache key covers all operands plus epsilon, so a different epsilon
    // yields a different descriptor.
    size_t seed = hash_combine(y, standardization, std_deviation, x, weight, bias, epsilon);

    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, LayerNorm,
        seed,
        y->desc(),
        standardization->desc(),
        std_deviation->desc(),
        x->desc(),
        weight->desc(),
        bias->desc(),
        epsilon);

    // Workspace tensor sized per the descriptor's requirements.
    INFINIOP_WORKSPACE_TENSOR(workspace, LayerNorm, descriptor);

    return new PlannedMeta{
        descriptor,
        graph::GraphTensor(workspace),
        graph::GraphTensor(y),
        graph::GraphTensor(standardization),
        graph::GraphTensor(std_deviation),
        graph::GraphTensor(x),
        graph::GraphTensor(weight),
        graph::GraphTensor(bias)};
}

// Execute a previously planned LayerNorm on the current stream.
void run(void *planned_meta) {
    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);

    INFINICORE_CHECK_ERROR(
        infiniopLayerNorm(
            planned->descriptor->desc,
            planned->workspace->data(),
            // NOTE(review): this passes the workspace's element count as its
            // size — confirm infiniopLayerNorm expects elements, not bytes.
            planned->workspace->numel(),
            planned->y->data(),
            planned->standardization->data(),
            planned->std_deviation->data(),
            planned->x->data(),
            planned->weight->data(),
            planned->bias->data(),
            context::getStream()));
}

// Free the PlannedMeta allocated by plan() and null out the caller's pointer
// so a double-cleanup becomes a harmless no-op (delete on nullptr).
void cleanup(void **planned_meta_ptr) {
    auto **meta = reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    delete *meta;
    *meta = nullptr;
}

// Register this implementation for every device type.
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(LayerNorm, &plan, &run, &cleanup);

} // namespace infinicore::op::layer_norm_impl::infiniop
2 changes: 2 additions & 0 deletions src/infinicore/pybind11/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
#include "ops/kron.hpp"
#include "ops/kthvalue.hpp"
#include "ops/kv_caching.hpp"
#include "ops/layer_norm.hpp"
#include "ops/ldexp.hpp"
#include "ops/lerp.hpp"
#include "ops/linear.hpp"
Expand Down Expand Up @@ -216,6 +217,7 @@ inline void bind(py::module &m) {
bind_triplet_margin_loss(m);
bind_selu(m);
bind_sinh(m);
bind_layer_norm(m);
}

} // namespace infinicore::ops
48 changes: 48 additions & 0 deletions src/infinicore/pybind11/ops/layer_norm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/layer_norm.hpp"

namespace py = pybind11;

namespace infinicore::ops {

// Register the out-of-place (layer_norm) and in-place (layer_norm_) Python
// bindings. The in-place binding targets layer_norm_for_pybind because
// pybind11 cannot resolve the C++ overload set by name.
inline void bind_layer_norm(py::module &m) {
    m.def("layer_norm",
          &op::layer_norm,
          py::arg("x"),
          py::arg("weight"),
          py::arg("bias"),
          py::arg("epsilon") = 1e-5f,
          R"doc(Layer Normalization.

Args:
    x: Input tensor
    weight: Scale weights
    bias: Bias weights
    epsilon: Small constant for numerical stability, default is 1e-5

Returns:
    Normalized tensor with same shape as input
)doc");

    m.def("layer_norm_",
          &op::layer_norm_for_pybind,
          py::arg("y"),
          py::arg("x"),
          py::arg("weight"),
          py::arg("bias"),
          py::arg("epsilon") = 1e-5f,
          R"doc(In-place Layer Normalization.

Args:
    y: Output tensor
    x: Input tensor
    weight: Scale weights
    bias: Bias weights
    epsilon: Small constant for numerical stability, default is 1e-5
)doc");
}

} // namespace infinicore::ops
2 changes: 2 additions & 0 deletions src/infiniop/devices/metax/metax_kernel_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#include <maca_fp8.h>
#include <mccub/block/block_reduce.cuh>
#else
#include <hccub/block/block_reduce.cuh>
#include <hpcc_bfloat16.h>
#include <hpcc_fp16.h>
#include <hpcc_fp8.h>
Expand Down
Loading
Loading