Skip to content

Commit d34542d

Browse files
author
zhangyue
committed
feat(ascend-custom): add bf16 support + Google-style identifier renames
bf16 was silently producing garbage / NaN on impl 1 (`rms_norm`) and impl 2 (`add_rms_norm`): the kernels only instantiated `<half>` and `<float>`, and the launchers mapped bf16 to the fp32 byte-size path, so bf16 weight was read as if it were fp32 and the fp16 output cast used `CAST_ROUND` (fp16-only alias). Kernel dispatch: - `op_kernel/rms_norm.cpp` / `op_kernel/add_rms_norm.cpp`: add a `KernelXxx<bfloat16_t>` instantiation; dispatch in the `extern "C"` entry is now `switch (static_cast<infini::ops::DataType>(dtypeCode))` (shared enum forwarded from the launcher via `int64_t`). The fp16/bf16 branch uses `CAST_RINT` for the fp32 → T writeback — defined for both `half` and `bfloat16_t` destinations, whereas `CAST_ROUND` is a `half`-specific alias. Launchers (`kernel_custom.h`): - Store `DataType dtype_` (replaces the old `int64_t dtype_size_` which collapsed fp16 and bf16 onto the same code). - Use `ascend::ToAclDtype(dtype_)` and `kDataTypeToSize.at(dtype_)` instead of hand-rolled ternaries (consistent with the rest of the Ascend backend). - Forward `static_cast<int64_t>(dtype_)` as the kernel's `dtypeCode`. - `extern "C" aclrtlaunch_*` forward-decl parameters renamed to `snake_case`; the function name itself is generated by `ascendc_add_operator(OP_NAME …)` and carries `// NOLINTNEXTLINE(readability-identifier-naming)` so `clang-tidy` accepts it. Identifier naming (Google C++ Style): - `op_kernel/*.cpp` members `snake_case_`, params / locals `snake_case`, constants `kPascalCase` (was `BUFFER_NUM` / `dimLength` / `inQueueX1` / `blockRows`, etc. — inherited from the `vllm-ascend` sample style). Verified: `pytest tests/test_rms_norm.py tests/test_add_rms_norm.py --devices ascend` → 144 passed / 0 failed (fp32 / fp16 / bf16 × both ops × full shape + stride matrix).
1 parent d9b1e09 commit d34542d

4 files changed

Lines changed: 396 additions & 355 deletions

File tree

src/ascend/add_rms_norm/kernel_custom.h

Lines changed: 33 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -19,11 +19,14 @@
1919
// This symbol is provided by the `no_workspace_kernel` static library
2020
// built from `ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp`
2121
// via `ascendc_library()`.
22+
// `aclrtlaunch_*` symbol name is generated by `ascendc_library()` /
23+
// `ascendc_add_operator()` and cannot be `PascalCase`d.
24+
// NOLINTNEXTLINE(readability-identifier-naming)
2225
extern "C" uint32_t aclrtlaunch_add_rms_norm(
23-
uint32_t blockDim, void* stream, void* x1, void* x2, void* weight, void* y,
24-
void* x_out, int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign,
25-
int64_t formerNum, int64_t formerLength, int64_t tailLength, float eps,
26-
int64_t dtypeSize);
26+
uint32_t block_dim, void* stream, void* x1, void* x2, void* weight, void* y,
27+
void* x_out, int64_t total_rows, int64_t dim_length,
28+
int64_t dim_length_align, int64_t former_num, int64_t former_length,
29+
int64_t tail_length, float eps, int64_t dtype_code);
2730

2831
namespace infini::ops {
2932

@@ -50,36 +53,36 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
5053
public:
5154
Operator(const Tensor input, const Tensor other, const Tensor weight,
5255
float eps, Tensor out, Tensor residual_out)
53-
: AddRmsNorm(input, other, weight, eps, out, residual_out) {
54-
// Dtype size in bytes.
55-
dtype_size_ = (input.dtype() == DataType::kFloat16) ? 2 : 4;
56-
57-
// Alignment check (32-byte boundary).
58-
int64_t align_elems = 32 / dtype_size_;
56+
: AddRmsNorm(input, other, weight, eps, out, residual_out),
57+
dtype_{input.dtype()} {
58+
assert((dtype_ == DataType::kFloat16 || dtype_ == DataType::kBFloat16 ||
59+
dtype_ == DataType::kFloat32) &&
60+
"`AddRmsNorm` custom kernel: `input` must be `fp16`, `bf16`, or "
61+
"`fp32`.");
62+
63+
// 32-byte alignment on the last dimension — kernel relies on aligned
64+
// `DataCopyPad` loads/stores.
65+
int64_t align_elems = 32 / static_cast<int64_t>(kDataTypeToSize.at(dtype_));
5966
dim_length_align_ =
6067
((static_cast<int64_t>(dim_) + align_elems - 1) / align_elems) *
6168
align_elems;
6269
assert(static_cast<int64_t>(dim_) == dim_length_align_ &&
63-
"`AddRmsNorm`: custom kernel requires 32-byte aligned last "
64-
"dimension.");
70+
"`AddRmsNorm` custom kernel: last dimension must be 32-byte "
71+
"aligned.");
6572

6673
total_rows_ =
6774
static_cast<int64_t>(batch_size_) * static_cast<int64_t>(nhead_);
6875

69-
// For `float16` input, `weight` needs fp32 conversion because the custom
70-
// kernel always reads `weight` as `float32`.
71-
needs_weight_cast_ = (dtype_size_ == 2);
72-
73-
if (needs_weight_cast_) {
74-
// Allocate persistent fp32 `weight` buffer on device.
76+
// The custom kernel always reads `weight` as fp32. fp16 / bf16 inputs
77+
// trigger a lazy cast in `operator()` (guarded by `last_weight_ptr_`
78+
// so that the cast runs only when the weight pointer changes — model
79+
// weights are typically fixed after loading).
80+
if (dtype_ != DataType::kFloat32) {
7581
size_t fp32_bytes = static_cast<size_t>(dim_) * sizeof(float);
7682
aclrtMalloc(&weight_fp32_data_, fp32_bytes, ACL_MEM_MALLOC_NORMAL_ONLY);
7783

78-
// `AclTensorCache` for the cast source (`float16` `weight` descriptor).
79-
weight_src_cache_ = ascend::AclTensorCache({static_cast<int64_t>(dim_)},
80-
ACL_FLOAT16, nullptr);
81-
82-
// `AclTensorCache` for the cast destination (`float32` `weight` buffer).
84+
weight_src_cache_ = ascend::AclTensorCache(
85+
{static_cast<int64_t>(dim_)}, ascend::ToAclDtype(dtype_), nullptr);
8386
weight_dst_cache_ = ascend::AclTensorCache({static_cast<int64_t>(dim_)},
8487
ACL_FLOAT, weight_fp32_data_);
8588
}
@@ -99,15 +102,13 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
99102
float eps, Tensor out, Tensor residual_out) const override {
100103
auto stream = static_cast<aclrtStream>(stream_);
101104

102-
// Determine `float32` `weight` pointer.
103105
void* weight_fp32;
104106

105-
if (needs_weight_cast_) {
106-
// Only re-cast when the `weight` data pointer changes. Model weights
107-
// are fixed after loading, so this typically runs once on the first
108-
// call and is skipped on all subsequent calls.
107+
if (dtype_ != DataType::kFloat32) {
109108
const void* cur_weight = weight.data();
110109

110+
// Model weights are fixed after loading, so the cast typically runs
111+
// once on the first call and is skipped on all subsequent calls.
111112
if (cur_weight != last_weight_ptr_) {
112113
auto t_src = weight_src_cache_.get(const_cast<void*>(cur_weight));
113114
auto t_dst = weight_dst_cache_.get(weight_fp32_data_);
@@ -129,36 +130,33 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
129130

130131
weight_fp32 = weight_fp32_data_;
131132
} else {
132-
// `input` is `float32` — `weight` is already `float32`.
133133
weight_fp32 = const_cast<void*>(weight.data());
134134
}
135135

136-
// Block-level tiling: distribute rows across cores.
136+
// Block-level tiling. Ascend 910B has 20–40 AIV cores; over-subscribing
137+
// is safe (runtime multiplexes) but wastes one weight load per block.
137138
static constexpr int64_t kMaxBlockDim = 40;
138139
int64_t used_cores = std::min(total_rows_, kMaxBlockDim);
139140
int64_t former_length = (total_rows_ + used_cores - 1) / used_cores;
140141
int64_t tail_length = former_length - 1;
141142
int64_t former_num = total_rows_ - tail_length * used_cores;
142143
uint32_t block_dim = static_cast<uint32_t>(used_cores);
143144

144-
// Launch custom AscendC kernel.
145145
aclrtlaunch_add_rms_norm(block_dim, stream, const_cast<void*>(input.data()),
146146
const_cast<void*>(other.data()), weight_fp32,
147147
out.data(), residual_out.data(), total_rows_,
148148
static_cast<int64_t>(dim_), dim_length_align_,
149149
former_num, former_length, tail_length, eps,
150-
dtype_size_);
150+
static_cast<int64_t>(dtype_));
151151
}
152152

153153
private:
154-
int64_t dtype_size_;
154+
DataType dtype_;
155155

156156
int64_t dim_length_align_;
157157

158158
int64_t total_rows_;
159159

160-
bool needs_weight_cast_;
161-
162160
void* weight_fp32_data_ = nullptr;
163161

164162
mutable ascend::AclTensorCache weight_src_cache_;

0 commit comments

Comments (0)