
Commit e38d08b

zhangyue committed
feat(ascend): op-norm-rope group — Swiglu, SiluAndMul, CausalSoftmax, RmsNorm, AddRmsNorm, ApplyRotaryPosEmb, RotaryEmbedding
Seven layer-level Ascend operators:

| op | impl |
|---|---|
| Swiglu | aclnnSilu + aclnnMul (decomposed); `kernel_fused.h` wraps fused swiglu where available |
| SiluAndMul | custom AscendC kernel |
| CausalSoftmax | aclnnSoftmax + pre-computed mask |
| RmsNorm | aclnnRmsNorm (kernel.h); custom AscendC variant (kernel_custom.h) |
| AddRmsNorm | 3 impls: decomposed aclnnAdd + aclnnRmsNorm (kernel.h); fused aclnnAddRmsNorm (kernel_fused.h); custom AscendC (kernel_custom.h) |
| ApplyRotaryPosEmb | aclnnApplyRotaryPosEmbV2 (kernel.h); ATB RopeParam (kernel_atb.h) |
| RotaryEmbedding | **3 impls**: aclnnApplyRotaryPosEmbV2 (kernel.h); ATB RopeParam with both neox/interleave (kernel_atb.h); aclnnRopeWithSinCosCache for partial rotary (kernel_sincos_cache.h) |

Bundles the RotaryEmbedding API alignment: `query_out` / `key_out` are now `std::optional<Tensor>`; when omitted, the op writes in place on `query` / `key` (matching vLLM `RotaryEmbedding.forward(positions, query, key)`).

New `src/base/<op>.h`: apply_rotary_pos_emb, silu_and_mul. Modified: add_rms_norm (constructor signature alignment), rotary_embedding (optional query_out/key_out).
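To make the in-place convention concrete, here is a minimal sketch of the call shape it implies. The free function, the `apply_rope` helper, and the `Tensor` stub are assumptions for illustration, not code from this commit; the real `Tensor` is a shared handle passed by value, which the sketch mirrors:

```cpp
#include <optional>

struct Tensor { /* shared-handle type, mirroring the by-value usage in the diff */ };

// Hypothetical stand-in for the real rope kernel dispatch.
void apply_rope(const Tensor&, const Tensor&, const Tensor&,
                Tensor&, Tensor&) {}

// Omitted query_out / key_out mean "write in place on query / key",
// matching vLLM RotaryEmbedding.forward(positions, query, key).
void rotary_embedding(const Tensor& positions, Tensor query, Tensor key,
                      std::optional<Tensor> query_out = std::nullopt,
                      std::optional<Tensor> key_out = std::nullopt) {
  Tensor q_dst = query_out.value_or(query);  // fall back to in-place target
  Tensor k_dst = key_out.value_or(key);
  apply_rope(positions, query, key, q_dst, k_dst);
}
```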
1 parent 13cf84a commit e38d08b

22 files changed

Lines changed: 3683 additions & 41 deletions

src/ascend/add_rms_norm/kernel.h

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_
#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_

#include <cstdint>
#include <vector>

#include "acl/acl.h"
#include "aclnn/aclnn_base.h"
#include "aclnn_add.h"
#include "aclnn_rms_norm.h"
#include "ascend/common.h"
#include "ascend/workspace_pool_.h"
#include "base/add_rms_norm.h"
#include "operator.h"

namespace infini::ops {

// Decomposed implementation: aclnnAdd + aclnnRmsNorm.
//
// The fused aclnnAddRmsNorm API has ~200 us host-side launch overhead that
// dominates small-tensor dispatch. Decomposing into two fast ACLNN calls
// reduces host dispatch from ~224 us to ~56 us (4x faster) with negligible
// NPU-side impact for inference tensor sizes.
template <>
class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
 public:
  Operator(const Tensor x1, const Tensor x2, const Tensor gamma, float eps,
           Tensor y_out, Tensor x_out)
      : AddRmsNorm(x1, x2, gamma, eps, y_out, x_out),
        x1_cache_(x1),
        x2_cache_(x2),
        gamma_cache_(gamma),
        y_out_cache_(y_out),
        x_out_cache_(x_out) {
    // Alpha scalar for aclnnAdd (x_out = x1 + 1.0 * x2).
    alpha_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT);

    // aclnnRmsNorm writes rstd as a required side output.
    // Size computed here; buffer obtained from pool in `operator()`.
    rstd_shape_ = {static_cast<int64_t>(batch_size_),
                   static_cast<int64_t>(nhead_)};
    rstd_size_ = batch_size_ * nhead_ * sizeof(float);
  }

  ~Operator() {
    if (!ascend::IsAclRuntimeAlive()) return;

    // Null cached descriptors — see `AclTensorCache::release()`.
    x1_cache_.release();
    x2_cache_.release();
    gamma_cache_.release();
    y_out_cache_.release();
    x_out_cache_.release();

    // `rstd_tensor_` leaks with `norm_exec_` at shutdown (see `64c367c`).
    if (alpha_) aclDestroyScalar(alpha_);
  }

  void operator()(const Tensor x1, const Tensor x2, const Tensor gamma,
                  float eps, Tensor y_out, Tensor x_out) const override {
    auto t_x1 = x1_cache_.get(const_cast<void*>(x1.data()));
    auto t_x2 = x2_cache_.get(const_cast<void*>(x2.data()));
    auto t_gamma = gamma_cache_.get(const_cast<void*>(gamma.data()));
    auto t_y_out = y_out_cache_.get(y_out.data());
    auto t_x_out = x_out_cache_.get(x_out.data());
    auto stream = static_cast<aclrtStream>(stream_);

    // Step 1: x_out = x1 + x2.
    if (!add_exec_) {
      aclnnAddGetWorkspaceSize(t_x1, t_x2, alpha_, t_x_out, &add_ws_,
                               &add_exec_);
      aclSetAclOpExecutorRepeatable(add_exec_);
    } else {
      aclSetInputTensorAddr(add_exec_, 0, t_x1, const_cast<void*>(x1.data()));
      aclSetInputTensorAddr(add_exec_, 1, t_x2, const_cast<void*>(x2.data()));
      aclSetOutputTensorAddr(add_exec_, 0, t_x_out, x_out.data());
    }
    auto& add_arena = ascend::GetWorkspacePool().Ensure(stream, add_ws_);
    aclnnAdd(add_arena.buf, add_ws_, add_exec_, stream);

    // Obtain shared rstd buffer from pool.
    auto& rstd_arena =
        ascend::GetWorkspacePool().Ensure(stream, rstd_size_, "temp");

    // Lazily create rstd tensor descriptor on first call.
    if (!rstd_tensor_) {
      rstd_tensor_ = aclCreateTensor(rstd_shape_.data(), 2, ACL_FLOAT,
                                     /*strides=*/nullptr, 0, ACL_FORMAT_ND,
                                     rstd_shape_.data(), 2, rstd_arena.buf);
    } else {
      aclSetRawTensorAddr(rstd_tensor_, rstd_arena.buf);
    }

    // Step 2: y_out = rms_norm(x_out, gamma, eps).
    if (!norm_exec_) {
      aclnnRmsNormGetWorkspaceSize(t_x_out, t_gamma, eps, t_y_out, rstd_tensor_,
                                   &norm_ws_, &norm_exec_);
      aclSetAclOpExecutorRepeatable(norm_exec_);
    } else {
      aclSetInputTensorAddr(norm_exec_, 0, t_x_out, x_out.data());
      aclSetInputTensorAddr(norm_exec_, 1, t_gamma,
                            const_cast<void*>(gamma.data()));
      aclSetOutputTensorAddr(norm_exec_, 0, t_y_out, y_out.data());
      aclSetOutputTensorAddr(norm_exec_, 1, rstd_tensor_, rstd_arena.buf);
    }
    auto& norm_arena = ascend::GetWorkspacePool().Ensure(stream, norm_ws_);
    aclnnRmsNorm(norm_arena.buf, norm_ws_, norm_exec_, stream);
  }

 private:
  mutable ascend::AclTensorCache x1_cache_;
  mutable ascend::AclTensorCache x2_cache_;
  mutable ascend::AclTensorCache gamma_cache_;
  mutable ascend::AclTensorCache y_out_cache_;
  mutable ascend::AclTensorCache x_out_cache_;

  float alpha_storage_ = 1.0f;
  aclScalar* alpha_ = nullptr;

  std::vector<int64_t> rstd_shape_;
  uint64_t rstd_size_ = 0;
  mutable aclTensor* rstd_tensor_ = nullptr;

  mutable aclOpExecutor* add_exec_ = nullptr;
  mutable uint64_t add_ws_ = 0;
  mutable aclOpExecutor* norm_exec_ = nullptr;
  mutable uint64_t norm_ws_ = 0;
};

}  // namespace infini::ops

#endif
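The dispatch-overhead claim in the header comment (~224 us fused vs ~56 us decomposed) refers to host-side enqueue time only. A hedged sketch of how one might measure that quantity; `HostDispatchMicros` is a name invented here, and it assumes `operator()` returns once work is queued on the stream:

```cpp
#include <chrono>
#include <utility>

// Measures CPU-side enqueue cost only: no stream synchronization between the
// timestamps, so NPU execution time is deliberately excluded.
template <typename Op, typename... Args>
double HostDispatchMicros(const Op& op, Args&&... args) {
  auto t0 = std::chrono::steady_clock::now();
  op(std::forward<Args>(args)...);  // e.g. enqueues aclnnAdd + aclnnRmsNorm
  auto t1 = std::chrono::steady_clock::now();
  return std::chrono::duration<double, std::micro>(t1 - t0).count();
}
```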
src/ascend/add_rms_norm/kernel_custom.h

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_
#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_

#ifdef INFINI_HAS_CUSTOM_KERNELS

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

#include "acl/acl.h"
#include "aclnn/aclnn_base.h"
#include "aclnnop/aclnn_cast.h"
#include "ascend/common.h"
#include "ascend/workspace_pool_.h"
#include "base/add_rms_norm.h"
#include "operator.h"

// Forward-declare the generated AscendC kernel launch function.
// This symbol is provided by the `no_workspace_kernel` static library
// built from `ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp`
// via `ascendc_library()`.
extern "C" uint32_t aclrtlaunch_add_rms_norm(
    uint32_t blockDim, void* stream, void* x1, void* x2, void* weight, void* y,
    void* x_out, int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign,
    int64_t formerNum, int64_t formerLength, int64_t tailLength, float eps,
    int64_t dtypeSize);

namespace infini::ops {

// Custom AscendC fused AddRmsNorm kernel (implementation index 2).
//
// A single-kernel implementation that computes x_out = x1 + x2 followed by
// y = rms_norm(x_out, gamma, eps) in one launch, avoiding the decomposed
// aclnnAdd + aclnnRmsNorm calls (index 0) or the fused aclnnAddRmsNorm call
// (index 1). Migrated from the custom RmsNorm kernel (index 1 of RmsNorm).
//
// Select via `implementation_index=2` in Python:
//   infini.ops.add_rms_norm(x1, x2, gamma, eps, y_out, x_out,
//                           implementation_index=2, stream=s)
//
// Requirements:
//   - Input last dimension must be 32-byte aligned (divisible by 16 for fp16
//     or 8 for fp32). All standard LLM hidden dimensions satisfy this.
//   - Weight must have the same dtype as input.
//   - The custom kernel binary must be linked (`BUILD_CUSTOM_KERNEL=ON`).
template <>
class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
 public:
  Operator(const Tensor x1, const Tensor x2, const Tensor gamma, float eps,
           Tensor y_out, Tensor x_out)
      : AddRmsNorm(x1, x2, gamma, eps, y_out, x_out) {
    // Dtype size in bytes.
    dtype_size_ = (x1.dtype() == DataType::kFloat16) ? 2 : 4;

    // Alignment check (32-byte boundary).
    int64_t align_elems = 32 / dtype_size_;
    dim_length_align_ =
        ((static_cast<int64_t>(dim_) + align_elems - 1) / align_elems) *
        align_elems;
    assert(static_cast<int64_t>(dim_) == dim_length_align_ &&
           "Custom AddRmsNorm kernel requires 32-byte aligned last dimension");

    total_rows_ =
        static_cast<int64_t>(batch_size_) * static_cast<int64_t>(nhead_);

    // For fp16 input, weight needs fp32 conversion because the custom
    // kernel always reads weight as fp32.
    needs_weight_cast_ = (dtype_size_ == 2);

    if (needs_weight_cast_) {
      // Allocate persistent fp32 weight buffer on device.
      size_t fp32_bytes = static_cast<size_t>(dim_) * sizeof(float);
      aclrtMalloc(&weight_fp32_data_, fp32_bytes, ACL_MEM_MALLOC_NORMAL_ONLY);

      // `AclTensorCache` for the cast source (fp16 weight descriptor).
      weight_src_cache_ = ascend::AclTensorCache({static_cast<int64_t>(dim_)},
                                                 ACL_FLOAT16, nullptr);

      // `AclTensorCache` for the cast destination (fp32 weight buffer).
      weight_dst_cache_ = ascend::AclTensorCache({static_cast<int64_t>(dim_)},
                                                 ACL_FLOAT, weight_fp32_data_);
    }
  }

  ~Operator() {
    if (!ascend::IsAclRuntimeAlive()) return;

    // Null cached descriptors — see `AclTensorCache::release()`.
    weight_src_cache_.release();
    weight_dst_cache_.release();

    if (weight_fp32_data_) aclrtFree(weight_fp32_data_);
  }

  void operator()(const Tensor x1, const Tensor x2, const Tensor gamma,
                  float eps, Tensor y_out, Tensor x_out) const override {
    auto stream = static_cast<aclrtStream>(stream_);

    // Determine fp32 weight pointer.
    void* weight_fp32;

    if (needs_weight_cast_) {
      // Only re-cast when the weight data pointer changes. Model weights
      // are fixed after loading, so this typically runs once on the first
      // call and is skipped on all subsequent calls.
      const void* cur_weight = gamma.data();

      if (cur_weight != last_weight_ptr_) {
        auto t_src = weight_src_cache_.get(const_cast<void*>(cur_weight));
        auto t_dst = weight_dst_cache_.get(weight_fp32_data_);

        if (!cast_exec_) {
          aclnnCastGetWorkspaceSize(t_src, ACL_FLOAT, t_dst, &cast_ws_,
                                    &cast_exec_);
          aclSetAclOpExecutorRepeatable(cast_exec_);
        } else {
          aclSetInputTensorAddr(cast_exec_, 0, t_src,
                                const_cast<void*>(cur_weight));
          aclSetOutputTensorAddr(cast_exec_, 0, t_dst, weight_fp32_data_);
        }

        auto& arena = ascend::GetWorkspacePool().Ensure(stream, cast_ws_);
        aclnnCast(arena.buf, cast_ws_, cast_exec_, stream);
        last_weight_ptr_ = cur_weight;
      }

      weight_fp32 = weight_fp32_data_;
    } else {
      // Input is fp32 — weight is already fp32.
      weight_fp32 = const_cast<void*>(gamma.data());
    }

    // Block-level tiling: distribute rows across cores.
    static constexpr int64_t kMaxBlockDim = 40;
    int64_t used_cores = std::min(total_rows_, kMaxBlockDim);
    int64_t former_length = (total_rows_ + used_cores - 1) / used_cores;
    int64_t tail_length = former_length - 1;
    int64_t former_num = total_rows_ - tail_length * used_cores;
    uint32_t block_dim = static_cast<uint32_t>(used_cores);

    // Launch custom AscendC kernel.
    aclrtlaunch_add_rms_norm(
        block_dim, stream, const_cast<void*>(x1.data()),
        const_cast<void*>(x2.data()), weight_fp32, y_out.data(), x_out.data(),
        total_rows_, static_cast<int64_t>(dim_), dim_length_align_, former_num,
        former_length, tail_length, eps, dtype_size_);
  }

 private:
  int64_t dtype_size_;
  int64_t dim_length_align_;
  int64_t total_rows_;

  bool needs_weight_cast_;
  void* weight_fp32_data_ = nullptr;
  mutable ascend::AclTensorCache weight_src_cache_;
  mutable ascend::AclTensorCache weight_dst_cache_;
  mutable const void* last_weight_ptr_ = nullptr;

  mutable aclOpExecutor* cast_exec_ = nullptr;
  mutable uint64_t cast_ws_ = 0;
};

}  // namespace infini::ops

#endif  // INFINI_HAS_CUSTOM_KERNELS
#endif  // INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_
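The block-level tiling in `operator()` gives `former_num` cores one extra row each so that all `total_rows_` rows are covered by at most 40 cores. A standalone check of the same arithmetic; the inputs (98 rows, a 40-core cap) are illustrative, not values from this diff:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>

// Same tiling arithmetic as in operator(): `former_num` cores each take
// `former_length` rows, the remaining cores take `tail_length` rows.
int main() {
  const int64_t kMaxBlockDim = 40;  // AI-core cap used by the kernel
  int64_t total_rows = 98;          // e.g. batch_size_ * nhead_
  int64_t used = std::min(total_rows, kMaxBlockDim);
  int64_t former_length = (total_rows + used - 1) / used;  // ceil division
  int64_t tail_length = former_length - 1;
  int64_t former_num = total_rows - tail_length * used;
  // Every row is assigned exactly once.
  assert(former_num * former_length + (used - former_num) * tail_length ==
         total_rows);
  std::cout << former_num << " cores x " << former_length << " rows, "
            << (used - former_num) << " cores x " << tail_length << " rows\n";
}
```

With 98 rows, 18 cores process 3 rows and 22 cores process 2 rows, which sums back to 98.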

0 commit comments
