#include "acl/acl.h"
#include "aclnn/aclnn_base.h"
#include "aclnnop/aclnn_cast.h"
#include "ascend/add_rms_norm/registry.h"
#include "ascend/common.h"
#include "ascend/workspace_pool_.h"
#include "base/add_rms_norm.h"
#include "operator.h"
1818
// Forward-declare the generated AscendC kernel launch function.
// This symbol is provided by the `no_workspace_kernel` static library
// built from
// `ascend/custom_kernel/csrc/ops/add_rms_norm/op_kernel/add_rms_norm.cpp` via
// `ascendc_library()`.
// Launch wrapper for the custom AddRmsNorm AscendC kernel.
// NOTE(review): reconstructed from a diff-rendered paste — the duplicated
// '-'/'+' parameter lists and the stray space in the `extern "C"` linkage
// string were extraction artifacts, not code. Parameter meanings below are
// taken from the names only; confirm against the kernel source.
//   blockDim             - number of AI cores to launch on
//   stream               - ACL runtime stream handle
//   x1, x2, weight       - device pointers to the two addends and the gamma
//   y, x_out             - device pointers for normalized output and x1+x2
//   totalRows            - rows to normalize (batch * heads, presumably)
//   dimLength/..Align    - last-dim length and its 32-byte aligned length
//   formerNum/..Length,
//   tailLength           - per-core row tiling split
//   eps                  - RMSNorm epsilon
//   dtypeSize            - element size in bytes of the input dtype
extern "C" uint32_t aclrtlaunch_add_rms_norm(
    uint32_t blockDim, void* stream, void* x1, void* x2, void* weight, void* y,
    void* x_out, int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign,
    int64_t formerNum, int64_t formerLength, int64_t tailLength, float eps,
    int64_t dtypeSize);
2929
3030namespace infini ::ops {
3131
@@ -62,8 +62,8 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
6262 assert (static_cast <int64_t >(dim_) == dim_length_align_ &&
6363 " Custom AddRmsNorm kernel requires 32-byte aligned last dimension" );
6464
65- total_rows_ = static_cast < int64_t >(batch_size_) *
66- static_cast <int64_t >(nhead_);
65+ total_rows_ =
66+ static_cast < int64_t >(batch_size_) * static_cast <int64_t >(nhead_);
6767
6868 // For fp16 input, weight needs fp32 conversion because the custom
6969 // kernel always reads weight as fp32.
@@ -72,16 +72,15 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
7272 if (needs_weight_cast_) {
7373 // Allocate persistent fp32 weight buffer on device.
7474 size_t fp32_bytes = static_cast <size_t >(dim_) * sizeof (float );
75- aclrtMalloc (&weight_fp32_data_, fp32_bytes,
76- ACL_MEM_MALLOC_NORMAL_ONLY);
75+ aclrtMalloc (&weight_fp32_data_, fp32_bytes, ACL_MEM_MALLOC_NORMAL_ONLY);
7776
7877 // AclTensorCache for the cast source (fp16 weight descriptor).
79- weight_src_cache_ = ascend::AclTensorCache (
80- { static_cast < int64_t >(dim_)}, ACL_FLOAT16, nullptr );
78+ weight_src_cache_ = ascend::AclTensorCache ({ static_cast < int64_t >(dim_)},
79+ ACL_FLOAT16, nullptr );
8180
8281 // AclTensorCache for the cast destination (fp32 weight buffer).
83- weight_dst_cache_ = ascend::AclTensorCache (
84- { static_cast < int64_t >(dim_)}, ACL_FLOAT, weight_fp32_data_);
82+ weight_dst_cache_ = ascend::AclTensorCache ({ static_cast < int64_t >(dim_)},
83+ ACL_FLOAT, weight_fp32_data_);
8584 }
8685 }
8786
@@ -105,8 +104,7 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
105104 const void * cur_weight = gamma.data ();
106105
107106 if (cur_weight != last_weight_ptr_) {
108- auto t_src =
109- weight_src_cache_.get (const_cast <void *>(cur_weight));
107+ auto t_src = weight_src_cache_.get (const_cast <void *>(cur_weight));
110108 auto t_dst = weight_dst_cache_.get (weight_fp32_data_);
111109
112110 if (!cast_exec_) {
@@ -133,25 +131,17 @@ class Operator<AddRmsNorm, Device::Type::kAscend, 2> : public AddRmsNorm {
133131 // Block-level tiling: distribute rows across cores.
134132 static constexpr int64_t kMaxBlockDim = 40 ;
135133 int64_t used_cores = std::min (total_rows_, kMaxBlockDim );
136- int64_t former_length =
137- (total_rows_ + used_cores - 1 ) / used_cores;
134+ int64_t former_length = (total_rows_ + used_cores - 1 ) / used_cores;
138135 int64_t tail_length = former_length - 1 ;
139136 int64_t former_num = total_rows_ - tail_length * used_cores;
140137 uint32_t block_dim = static_cast <uint32_t >(used_cores);
141138
142139 // Launch custom AscendC kernel.
143140 aclrtlaunch_add_rms_norm (
144- block_dim, stream,
145- const_cast <void *>(x1.data ()),
146- const_cast <void *>(x2.data ()),
147- weight_fp32,
148- y_out.data (),
149- x_out.data (),
150- total_rows_,
151- static_cast <int64_t >(dim_),
152- dim_length_align_,
153- former_num, former_length, tail_length,
154- eps, dtype_size_);
141+ block_dim, stream, const_cast <void *>(x1.data ()),
142+ const_cast <void *>(x2.data ()), weight_fp32, y_out.data (), x_out.data (),
143+ total_rows_, static_cast <int64_t >(dim_), dim_length_align_, former_num,
144+ former_length, tail_length, eps, dtype_size_);
155145 }
156146
157147 private:
0 commit comments