diff --git a/csrc/CMakeLists.txt b/csrc/CMakeLists.txt
index 6b3236f8b..e92836172 100644
--- a/csrc/CMakeLists.txt
+++ b/csrc/CMakeLists.txt
@@ -22,6 +22,7 @@ FILE(GLOB OP_SRCS
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/lightning_indexer.cpp
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/tiling/lightning_indexer_tiling.cpp
     ${PROJECT_OP_SRC_BASE}/tri_inv/op_host/tri_inv.cpp
+    ${PROJECT_OP_SRC_BASE}/tri_inv/op_host/tri_inv_cube.cpp
 )
 if(BUILD_CATLASS_MODULE)
     list(APPEND OP_SRCS
@@ -53,6 +54,7 @@ set(WORKSPACE_KERNEL_SRCS
     ${PROJECT_OP_SRC_BASE}/alloc_extend/op_kernel/alloc_extend_kernel.cpp
     ${PROJECT_OP_SRC_BASE}/build_tree/op_kernel/build_tree_kernel.cpp
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_kernel/lightning_indexer_kernel.cpp
+    ${PROJECT_OP_SRC_BASE}/tri_inv/op_kernel/tri_inv_cube_kernel.cpp
 )
 if(BUILD_CATLASS_MODULE)
     list(APPEND WORKSPACE_KERNEL_SRCS
diff --git a/csrc/pytorch_extensions.cpp b/csrc/pytorch_extensions.cpp
index a7d093730..9cbced6ee 100644
--- a/csrc/pytorch_extensions.cpp
+++ b/csrc/pytorch_extensions.cpp
@@ -98,6 +98,7 @@ TORCH_LIBRARY_FRAGMENT(npu, m)
         "int? sparse_count=None, int? sparse_mode=None) -> Tensor");
 
     m.def("triangular_inverse(Tensor x) -> Tensor");
+    m.def("cube_triangular_inverse(Tensor x) -> Tensor");
 }
 } // namespace
@@ -141,5 +142,7 @@ TORCH_LIBRARY_IMPL(npu, PrivateUse1, m)
     m.impl("lightning_indexer", TORCH_FN(sglang::npu_kernel::lightning_indexer));
 
     m.impl("triangular_inverse", TORCH_FN(sglang::npu_kernel::tri_inv_col_sweep));
+
+    m.impl("cube_triangular_inverse", TORCH_FN(sglang::npu_kernel::tri_inv_cube_col_sweep));
 }
 } // namespace
diff --git a/csrc/tri_inv/README.md b/csrc/tri_inv/README.md
index 67a4dc2fc..d05b56a91 100644
--- a/csrc/tri_inv/README.md
+++ b/csrc/tri_inv/README.md
@@ -1,5 +1,7 @@
 ##### Description of tri_inv
 
-This is a vector-only AscendC triangular inversion kernel on Ascend NPU.
+This implements several AscendC triangular inversion kernels on Ascend NPU.
 
-The kernel supports matrix sizes `16, 32, 64, 128` and data types `fp16` and `fp32`.
+The AIV-based kernels support matrix sizes `16, 32, 64, 128` and data types `fp16` and `fp32`.
+
+The AIC-based kernel supports matrix sizes `16, 32, 64, 128` and data type `fp16`.
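For reviewers, a minimal usage sketch of the op registered above (not part of the change). It assumes an Ascend environment with `torch_npu` and this extension loaded; per the README, the cube path accepts `fp16` only and sizes 16/32/64/128, and the tests feed unit-triangular inputs (ones on the diagonal):

```python
import torch
import torch_npu  # noqa: F401 -- registers the `npu` device and op namespace

# Batch of 4 unit lower-triangular fp16 matrices of size 64.
a = torch.rand(4, 64, 64, dtype=torch.float16).tril()
a.diagonal(dim1=-2, dim2=-1).fill_(1.0)

a_inv = torch.ops.npu.cube_triangular_inverse(a.npu())
print(a_inv.shape, a_inv.dtype)  # same shape, fp32 output (see host wrapper below)
```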
diff --git a/csrc/tri_inv/op_host/tiling_tri_inv_cube.h b/csrc/tri_inv/op_host/tiling_tri_inv_cube.h
new file mode 100644
index 000000000..687819d10
--- /dev/null
+++ b/csrc/tri_inv/op_host/tiling_tri_inv_cube.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <cstdint>
+
+namespace sglang {
+
+namespace npu_kernel {
+
+/**
+ * @brief `tri_inv_cube_col_sweep` kernel tiling parameter structure.
+ */
+struct TriInvColumnSweepCubeTiling {
+    /// @brief Number of blocks.
+    uint32_t num_blocks;
+    /// @brief Total number of input elements.
+    uint32_t num_elems;
+    /// @brief Input matrix size.
+    uint32_t matrix_size;
+    /// @brief Workspace circular buffer length.
+    uint32_t ws_circular_buffer_len;
+};
+
+} // namespace npu_kernel
+} // namespace sglang
diff --git a/csrc/tri_inv/op_host/tri_inv_cube.cpp b/csrc/tri_inv/op_host/tri_inv_cube.cpp
new file mode 100644
index 000000000..e767090ba
--- /dev/null
+++ b/csrc/tri_inv/op_host/tri_inv_cube.cpp
@@ -0,0 +1,81 @@
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "defines.h"
+#include "torch_helper.h"
+
+#include "tiling_tri_inv_cube.h"
+#include "aclrtlaunch_tri_inv_cube_col_sweep_fp16.h"
+#include "tiling/platform/platform_ascendc.h"
+
+namespace sglang {
+
+namespace npu_kernel {
+
+at::Tensor calc_tiling(const TriInvColumnSweepCubeTiling &tiling)
+{
+    constexpr uint32_t PADDING_BYTE = 32U;
+
+    // align to 32 bytes
+    int32_t tiling_size = (sizeof(TriInvColumnSweepCubeTiling) + PADDING_BYTE - 1) / PADDING_BYTE * PADDING_BYTE;
+    auto tiling_buffer = at::empty({tiling_size}, at::TensorOptions().dtype(at::kByte).device(at::kCPU));
+
+    TriInvColumnSweepCubeTiling *tiling_data =
+        reinterpret_cast<TriInvColumnSweepCubeTiling *>(tiling_buffer.data_ptr());
+    tiling_data->num_blocks = tiling.num_blocks;
+    tiling_data->num_elems = tiling.num_elems;
+    tiling_data->matrix_size = tiling.matrix_size;
+    tiling_data->ws_circular_buffer_len = tiling.ws_circular_buffer_len;
+
+    auto tiling_tensor = TorchNpuHelper::CopyTensorHostToDevice(tiling_buffer);
+    return tiling_tensor;
+}
+
+HOST_API at::Tensor tri_inv_cube_col_sweep(const at::Tensor &tensor)
+{
+    platform_ascendc::PlatformAscendC *platformAscendC = platform_ascendc::PlatformAscendCManager::GetInstance();
+    auto acl_stream = c10_npu::getCurrentNPUStream().stream(false);
+
+    const auto dtype = tensor.options().dtype();
+    if (tensor.dim() < 2) {
+        throw std::runtime_error("Input tensor must have at least 2 dimensions.\n");
+    }
+
+    const uint32_t matrix_size = static_cast<uint32_t>(tensor.size(-1));
+    if (matrix_size != tensor.size(-2)) {
+        throw std::runtime_error("Only square matrices are supported.\n");
+    }
+
+    const uint32_t num_elems = static_cast<uint32_t>(tensor.numel());
+    const uint32_t block_dim = static_cast<uint32_t>(num_elems / (matrix_size * matrix_size));
+
+    auto tensor_out = at::empty_like(tensor, at::kFloat);
+
+    const uint32_t WS_CIRCULAR_BUFFER_LEN = 4;
+    const TriInvColumnSweepCubeTiling tiling{block_dim, num_elems, matrix_size, WS_CIRCULAR_BUFFER_LEN};
+    const at::Tensor tiling_device = calc_tiling(tiling);
+
+    // workspace
+    const uint64_t system_workspace_size = static_cast<uint64_t>(platformAscendC->GetLibApiWorkSpaceSize());
+    const uint64_t workspace_size = system_workspace_size + num_elems * WS_CIRCULAR_BUFFER_LEN * tensor.itemsize();
+    const auto options = at::TensorOptions().dtype(at::kByte).device(tensor.options().device());
+    auto workspace = at::empty({static_cast<int64_t>(workspace_size)}, options);
+
+    if (dtype == at::kHalf) {
+        EXEC_KERNEL_CMD(tri_inv_cube_col_sweep_fp16, block_dim, tensor, tensor_out, workspace, tiling_device);
+    } else {
+        throw std::runtime_error("Unsupported data type for tri_inv_cube_col_sweep. Only fp16 is currently supported.");
+    }
+    aclrtSynchronizeStream(acl_stream);
+    return tensor_out;
+}
+
+} // namespace npu_kernel
+} // namespace sglang
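The size arithmetic in `calc_tiling` and the workspace allocation above can be checked off-device; a plain-Python restatement of the same formulas (the names here are illustrative, not part of the change):

```python
PADDING_BYTE = 32
TILING_FIELDS = 4  # num_blocks, num_elems, matrix_size, ws_circular_buffer_len

# calc_tiling: round the 16-byte struct up to a whole 32-byte block.
tiling_size = (TILING_FIELDS * 4 + PADDING_BYTE - 1) // PADDING_BYTE * PADDING_BYTE
assert tiling_size == 32

def workspace_size(system_ws: int, num_elems: int, circ_len: int = 4, itemsize: int = 2) -> int:
    """Host-side workspace: system workspace + `circ_len` fp16 ring-buffer slots per input stream."""
    return system_ws + num_elems * circ_len * itemsize
```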
diff --git a/csrc/tri_inv/op_kernel/kernel_mat_gen.h b/csrc/tri_inv/op_kernel/kernel_mat_gen.h
new file mode 100644
index 000000000..9345c6fae
--- /dev/null
+++ b/csrc/tri_inv/op_kernel/kernel_mat_gen.h
@@ -0,0 +1,221 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * @file kernel_mat_gen.h
+ * @brief Kernel implementing an AIV matrix generator that produces the matrix
+ * formulation of the column sweep.
+ */
+#pragma once
+
+namespace sglang {
+
+namespace npu_kernel {
+/**
+ * @brief Produces the sequence of matrices that encode the column-sweep steps
+ * in matrix notation. On the first iteration, it produces the identity matrix.
+ *
+ * @tparam T Input data type. Supports fp16/half.
+ *
+ * See the discussion in Section 3.2.1 of [1], in particular Equations 3.8 and
+ * 3.9 on page 54.
+ *
+ * \code{.python}
+ def aiv_matrix_gen(A: npt.ArrayLike):
+     n = A.shape[0]
+     I_n = np.eye(n, dtype=A.dtype)
+
+     # Transform A into 2 * I_n - A.
+     A = 2 * I_n - A
+
+     for k in reversed(range(n)):
+         M = I_n.copy()
+         M[:, k] = A[:, k]
+         yield M
+ * \endcode
+ *
+ * [1] E. Gallopoulos, B. Philippe and A. H. Sameh, "Parallelism in Matrix
+ * Computations".
+ * Hardcover (ISBN 978-94-017-7187-0),
+ * Softcover (ISBN 978-94-024-0317-6),
+ * Electronic (ISBN 978-94-017-7188-7).
+ **/
+template <typename T>
+class KernelMatGen
+{
+public:
+    /**
+     * @brief Class constructor.
+     *
+     * @param [in] matrix_size Input square matrix size.
+     * @param [in] circular_buffer_len Length of the workspace circular buffer used to
+     * overcome GM memory consistency issues.
+     */
+    __aicore__ inline KernelMatGen(uint32_t matrix_size, uint32_t circular_buffer_len)
+        : matrix_size_(matrix_size),
+          tile_len_(matrix_size * matrix_size),
+          aic_id_(AscendC::GetBlockIdx() / AscendC::GetTaskRation()),
+          global_in_offset_(aic_id_ * tile_len_),
+          ws_circular_buffer_len_(circular_buffer_len),
+          global_out_offset_(aic_id_ * tile_len_ * ws_circular_buffer_len_)
+    {}
+
+    /**
+     * @brief Initialize global and local memory structures.
+     *
+     * @param [in] vec_in Pointer to the input vector in global memory.
+     * @param [in] vec_out Pointer to the output vector in global memory.
+     */
+    __aicore__ inline void Init(GM_ADDR vec_in, GM_ADDR vec_out)
+    {
+        const uint32_t vec_len = AscendC::GetBlockNum() * tile_len_;
+        global_in_.SetGlobalBuffer((__gm__ T *)vec_in, vec_len);
+        global_out_.SetGlobalBuffer((__gm__ T *)vec_out, vec_len * ws_circular_buffer_len_);
+
+        pipe_.InitBuffer(in_q_, 1, tile_len_ * sizeof(T));
+        pipe_.InitBuffer(out_q_, 1, tile_len_ * sizeof(T));
+        pipe_.InitBuffer(work_buf_, tile_len_ * sizeof(T));
+    }
+
+    /**
+     * @brief Run the kernel.
+     */
+    __aicore__ inline void Process()
+    {
+        // Read input matrix into work_buf_.
+        const AscendC::LocalTensor<T> in_lt = in_q_.template AllocTensor<T>();
+        DataCopy(in_lt, global_in_[global_in_offset_], in_lt.GetSize());
+        in_q_.EnQue(in_lt);
+
+        ReadInputMatrixInUB();
+
+        // AIV-0 writes the identity matrix for the AIC.
+        if (AscendC::GetSubBlockIdx() == 0) {
+            EnQueueIdentityMatrix();
+            AscendC::LocalTensor<T> out_lt = out_q_.template DeQue<T>();
+            DataCopy(global_out_[global_out_offset_], out_lt, out_lt.GetSize());
+            out_q_.FreeTensor(out_lt);
+        }
+
+        // Sync with all AIVs in the group, to write the matrix.
+        SyncGroup();
+
+        // First matrix is the identity (just wait one more round).
+        SyncGroup();
+
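+        // NOTE: the workspace ring buffer exists because AIC reads and AIV
+        // writes race on global memory: a slot is only rewritten
+        // `ws_circular_buffer_len_` handshakes after it was produced, by which
+        // point the paired AIC has already consumed it. Slot 0 holds the
+        // identity; generated matrices start at slot 1 and wrap around.
+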
+        const AscendC::LocalTensor<T> work_lt = work_buf_.Get<T>();
+        uint32_t circular_buf_idx = 1;
+
+        // The column sweep needs `matrix_size_` matrices in total; the
+        // identity has already been written, so `matrix_size_ - 1` remain.
+        for (int32_t col_index = matrix_size_ - 2; col_index >= 0; col_index--) {
+            // AIV-0 assembles M: an identity matrix whose `col_index`-th column
+            // is replaced by the corresponding column of the transformed input.
+            if (AscendC::GetSubBlockIdx() == 0) {
+                const AscendC::LocalTensor<T> vec_out_lt = out_q_.AllocTensor<T>();
+                AscendC::Duplicate(vec_out_lt, static_cast<T>(0), matrix_size_ * matrix_size_);
+                AscendC::PipeBarrier<PIPE_V>();
+
+                // Set ones on the main diagonal.
+                for (uint32_t i = 0; i < matrix_size_; i++) {
+                    vec_out_lt.SetValue(i * matrix_size_ + i, static_cast<T>(1));
+                }
+
+                AscendC::PipeBarrier<PIPE_V>();
+                // Write the (col_index)-th column of matrix M.
+                const uint32_t col_offset = col_index * matrix_size_;
+                DataCopy(vec_out_lt[col_offset], work_lt[col_offset], matrix_size_);
+                AscendC::PipeBarrier<PIPE_V>();
+                out_q_.EnQue(vec_out_lt);
+
+                AscendC::LocalTensor<T> out_lt = out_q_.template DeQue<T>();
+                DataCopy(global_out_[global_out_offset_ + circular_buf_idx * tile_len_], out_lt, out_lt.GetSize());
+                out_q_.FreeTensor(out_lt);
+                circular_buf_idx = (circular_buf_idx + 1) % ws_circular_buffer_len_;
+            }
+
+            // Sync with all AIVs in the group, to write the matrix.
+            SyncGroup();
+        }
+    }
+
+private:
+    /**
+     * @brief Read (and transform) the input triangular matrix A into
+     * `work_buf_`. The transformation is `2*I_n - A`.
+     */
+    __aicore__ inline void ReadInputMatrixInUB()
+    {
+        AscendC::LocalTensor<T> vec_in_lt = in_q_.DeQue<T>();
+        AscendC::LocalTensor<T> work_lt = work_buf_.Get<T>();
+        Muls(work_lt, vec_in_lt, static_cast<T>(-1), vec_in_lt.GetSize());
+        for (uint32_t i = 0; i < matrix_size_; i++) {
+            work_lt.SetValue(i * matrix_size_ + i, static_cast<T>(1));
+        }
+        in_q_.FreeTensor(vec_in_lt);
+    }
+
+    /**
+     * @brief EnQue the identity matrix on the output queue.
+     */
+    __aicore__ inline void EnQueueIdentityMatrix()
+    {
+        const AscendC::LocalTensor<T> vec_out_lt = out_q_.AllocTensor<T>();
+        AscendC::Duplicate(vec_out_lt, static_cast<T>(0), matrix_size_ * matrix_size_);
+        AscendC::PipeBarrier<PIPE_V>();
+
+        // Set ones on the main diagonal.
+        for (uint32_t i = 0; i < matrix_size_; i++) {
+            vec_out_lt.SetValue(i * matrix_size_ + i, static_cast<T>(1));
+        }
+
+        out_q_.EnQue(vec_out_lt);
+    }
+
+    /**
+     * @brief Returns a synchronization config.
+     *
+     * @param [in] mode Synchronization mode.
+     * @param [in] flag_id Flag to use for synchronization.
+     * @return Synchronization config.
+     */
+    __aicore__ inline int GetSyncConf(int mode, int flag_id)
+    {
+        return 1 | (mode << 4) | (flag_id << 8);
+    }
+
+    /**
+     * @brief Synchronize cube and vector cores within a single group.
+     */
+    __aicore__ inline void SyncGroup()
+    {
+        const int mode = 2;
+
+        const int AIV_SET_FLAG_ID = 11;
+        const int AIC_SET_FLAG_ID = 12;
+        ffts_cross_core_sync(PIPE_MTE3, GetSyncConf(mode, AIV_SET_FLAG_ID));
+        wait_flag_dev(AIC_SET_FLAG_ID);
+        return;
+    }
+
+    AscendC::TPipe pipe_;
+
+    AscendC::TQue<AscendC::TPosition::VECIN, 1> in_q_;
+    AscendC::TQue<AscendC::TPosition::VECOUT, 1> out_q_;
+
+    AscendC::TBuf<AscendC::TPosition::VECCALC> work_buf_;
+
+    AscendC::GlobalTensor<T> global_in_;
+    AscendC::GlobalTensor<T> global_out_;
+
+    const uint32_t matrix_size_;
+    const uint32_t tile_len_;
+    const uint32_t aic_id_;
+    const uint32_t global_in_offset_;
+    const uint32_t ws_circular_buffer_len_;
+    const uint32_t global_out_offset_;
+};
+} // namespace npu_kernel
+} // namespace sglang
\ No newline at end of file
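For intuition, the AIC consumer works through exactly the stream the generator above produces: starting from C = I, it left-multiplies each generated matrix, and the final product is the inverse. A self-contained numpy sketch of that contract in exact arithmetic (no fp16 rounding; `column_sweep_inverse` is illustrative only):

```python
import numpy as np

def column_sweep_inverse(A: np.ndarray) -> np.ndarray:
    """Inverse of a unit upper-triangular A via the matrix-form column sweep."""
    n = A.shape[0]
    I_n = np.eye(n, dtype=np.float64)
    B = 2 * I_n - A              # the transform applied by ReadInputMatrixInUB
    C = I_n.copy()               # seeded by the identity round
    for k in reversed(range(n)):
        M = I_n.copy()
        M[:, k] = B[:, k]        # the single swept column
        C = M @ C                # the Mmad step: C = M @ C
    return C

A = np.triu(np.random.rand(16, 16))
np.fill_diagonal(A, 1.0)
assert np.allclose(column_sweep_inverse(A) @ A, np.eye(16))
```

The same recurrence, with an fp16 round after every product, is what the test reference `np_tril_inv_cube_cs` further below implements.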
diff --git a/csrc/tri_inv/op_kernel/kernel_tri_inv_cube.h b/csrc/tri_inv/op_kernel/kernel_tri_inv_cube.h
new file mode 100644
index 000000000..5389a525f
--- /dev/null
+++ b/csrc/tri_inv/op_kernel/kernel_tri_inv_cube.h
@@ -0,0 +1,491 @@
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2026. All rights reserved.
+ *
+ * @file kernel_tri_inv_cube.h
+ * @brief Kernel implementing a Cube (AIC) matrix inverse operation.
+ */
+
+#pragma once
+
+#include "kernel_operator.h"
+#include "kernel_mat_gen.h"
+
+namespace sglang {
+
+namespace npu_kernel {
+/**
+ * @brief Returns the matrix inverse of an upper triangular square matrix of
+ * size `matrix_size`. The matrix has ones on the main diagonal.
+ *
+ * The column sweep algorithm is used for the linear system Ax = e_j, where
+ * e_j is the j-th standard basis vector.
+ *
+ * The kernel assumes `matrix_size_ + 1` synchronizations with the AIVs.
+ *
+ * @tparam InputT Input data type. Default value fp16/half.
+ */
+template <typename InputT = half>
+class KernelTriInvCubeColSweep
+{
+    using OutputT = float;
+
+public:
+    /**
+     * @brief Class constructor.
+     *
+     * @param [in] vec_len Total length of the input tensor.
+     * @param [in] matrix_size Input square matrix size.
+     * @param [in] circular_buffer_len Length of the workspace circular buffer used to
+     * overcome GM memory consistency issues.
+     */
+    __aicore__ inline KernelTriInvCubeColSweep(uint32_t vec_len, uint32_t matrix_size, uint32_t circular_buffer_len)
+        : vec_len_(vec_len),
+          matrix_size_(matrix_size),
+          ws_circular_buffer_len_(circular_buffer_len),
+          tile_len_(matrix_size * matrix_size),
+          global_in_offset_(AscendC::GetBlockIdx() * tile_len_ * ws_circular_buffer_len_),
+          global_out_offset_(AscendC::GetBlockIdx() * tile_len_)
+    {}
+
+    /**
+     * @brief Initialize global and local memory structures.
+     *
+     * @param [in] matrix_stream_in Pointer where the input matrices will be
+     * written by AIVs in global memory.
+     * @param [in] inv_matrix_out Pointer where the output matrix inverses
+     * will be written.
+     */
+    __aicore__ inline void Init(GM_ADDR matrix_stream_in, GM_ADDR inv_matrix_out)
+    {
+        global_A_.SetGlobalBuffer((__gm__ InputT *)matrix_stream_in, vec_len_ * ws_circular_buffer_len_);
+        global_C_.SetGlobalBuffer((__gm__ OutputT *)inv_matrix_out, vec_len_);
+
+        pipe_.InitBuffer(a1_q_, 1, tile_len_ * sizeof(InputT));
+        pipe_.InitBuffer(a2_q_, 1, tile_len_ * sizeof(InputT));
+        pipe_.InitBuffer(b1_q_, 1, tile_len_ * sizeof(InputT));
+        pipe_.InitBuffer(b2_q_, 1, tile_len_ * sizeof(InputT));
+        pipe_.InitBuffer(co1_q_, 1, tile_len_ * sizeof(OutputT));
+    }
+
+    /**
+     * @brief Run the kernel.
+     */
+    __aicore__ inline void Process()
+    {
+        // On the first iteration, the AIVs will send the identity matrix to the AIC.
+        SyncGroup();
+        LoadIdentityMatrixinL0C();
+
+        // Read the identity matrix again from the AIV cores (slot 0).
+        uint32_t circular_buf_idx = 0;
+
+        // The matrix column sweep algorithm requires `matrix_size_` iterations.
+        for (uint32_t iter = 0; iter < matrix_size_; iter++) {
+            // Sync with all AIVs in the group, to read the next matrix.
+            SyncGroup();
+
+            // Load the next matrix A and perform C = A @ C.
+            LoadMatrixAintoL0A(circular_buf_idx);
+            MultiplyAWithC();
+            circular_buf_idx = (circular_buf_idx + 1) % ws_circular_buffer_len_;
+        }
+
+        // Write the L0C matrix to global memory.
+        CopyCL0ToGlobal();
+    }
+
+private:
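+    // Per-iteration data path:
+    //   next matrix M:    GM --(ND->NZ)--> A1 (L1) --(NZ->ZZ)--> L0A
+    //   running product:  L0C --Fixpipe (fp32->fp16)--> B1 (L1) --(NZ->ZN)--> L0B
+    //   Mmad then writes M @ C into a fresh L0C tile.
+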
+ */ + __aicore__ inline void LoadMatrixAintoL0A(uint32_t iter) + { + // Load matrix from global_A_ into L1 + CopyGmToL1A(a1_q_, global_A_[global_in_offset_ + iter * tile_len_], m_blocks_, k_blocks_); + CopyL1ToL0A(a2_q_, a1_q_, m_blocks_, k_blocks_); + } + + /** + * @brief Copy a tensor from global memory to the queue in A L1 memory. + * + * The data layout is transformed from ND to NZ. + * + * @param [in] q Destination queue. The position must be A1. + * @param [in] global Source global tensor. + * @param [in] fractals_h Number of fractal patterns in the height dimension of + * the input matrix. + * @param [in] fractals_w Number of fractal patterns in the width dimension of + * the input matrix. + */ + __aicore__ inline void CopyGmToL1A(AscendC::TQue &q, + const AscendC::GlobalTensor &global, uint16_t fractals_h, + uint16_t fractals_w) + { + const AscendC::LocalTensor lt = q.template AllocTensor(); + AscendC::Nd2NzParams params; + params.ndNum = 1; + params.nValue = m_blocks_ * 16; + params.dValue = k_blocks_ * 16; + params.srcDValue = params.dValue; + params.dstNzC0Stride = params.nValue; + params.dstNzNStride = 1; + // params.dstNzMatrixStride = 1; // This should have no effect when ndNum=1; + DataCopy(lt, global, params); + q.EnQue(lt); + } + + /** + * @brief Copy a tensor from global memory to the queue in B L1 memory. + * + * The data layout is transformed from ND to NZ. + * + * @param [in] q Destination queue. The position must be B1. + * @param [in] global Source global tensor. + * @param [in] fractals_h Number of fractal patterns in the height dimension of + * the input matrix. + * @param [in] fractals_w Number of fractal patterns in the width dimension of + * the input matrix. + */ + __aicore__ inline void CopyGmToL1B(AscendC::TQue &q, + const AscendC::GlobalTensor &global, uint16_t fractals_h, + uint16_t fractals_w) + { + const AscendC::LocalTensor lt = q.template AllocTensor(); + AscendC::Nd2NzParams params; + params.ndNum = 1; + params.nValue = fractals_h * 16; + params.dValue = fractals_w * 16; + params.srcDValue = params.dValue; + params.dstNzC0Stride = params.nValue; + params.dstNzNStride = 1; + // params.dstNzMatrixStride = 1; // This should have no effect when ndNum=1; + DataCopy(lt, global, params); + q.EnQue(lt); + } + + /** + * @brief Copy a tensor from a given A L1 to A L0 queue. + * + * The function performs an NZ to ZZ layout transformation. + * + * @param [in] l0_q Destination queue. The position must be A2. + * @param [in] l1_q Source queue. The position must be A1. + * @param [in] fractals_h Number of fractal patterns in the height dimension of + * the input matrix. + * @param [in] fractals_w Number of fractal patterns in the width dimension of + * the input matrix. + */ + __aicore__ inline void CopyL1ToL0A(AscendC::TQue &l0_q, + AscendC::TQue &l1_q, uint16_t fractals_h, + uint16_t fractals_w) + { + const AscendC::LocalTensor l0_lt = l0_q.template AllocTensor(); + AscendC::LocalTensor l1_lt = l1_q.template DeQue(); + + int src_offset = 0; + int dst_offset = 0; + + for (uint16_t i = 0; i < fractals_h; ++i) { + AscendC::LoadData2dParams params; + params.repeatTimes = fractals_w; + params.srcStride = fractals_h; + params.ifTranspose = false; + + LoadData(l0_lt[dst_offset], l1_lt[src_offset], params); + + src_offset += 16 * 16; + dst_offset += fractals_w * 16 * 16; + } + + l1_q.FreeTensor(l1_lt); + l0_q.EnQue(l0_lt); + } + + /** + * @brief Copy a tensor from a given B L1 to B L0 queue. + * + * The function performs an NZ to ZN layout transformation. 
+    /**
+     * @brief Copy a tensor from a given B L1 queue to a B L0 queue.
+     *
+     * The function performs an NZ to ZN layout transformation.
+     *
+     * @param [in] l0_q Destination queue. The position must be B2.
+     * @param [in] l1_q Source queue. The position must be B1.
+     * @param [in] fractals_h Number of fractal patterns in the height dimension of
+     * the input matrix.
+     * @param [in] fractals_w Number of fractal patterns in the width dimension of
+     * the input matrix.
+     */
+    __aicore__ inline void CopyL1ToL0B(AscendC::TQue<AscendC::TPosition::B2, 1> &l0_q,
+                                       AscendC::TQue<AscendC::TPosition::B1, 1> &l1_q, uint16_t fractals_h,
+                                       uint16_t fractals_w)
+    {
+        AscendC::LocalTensor<InputT> l1_lt = l1_q.template DeQue<InputT>();
+        const AscendC::LocalTensor<InputT> l0_lt = l0_q.template AllocTensor<InputT>();
+        constexpr bool transpose = true;
+        int src_offset = 0;
+        int dst_offset = 0;
+
+        for (uint16_t i = 0; i < fractals_h; ++i) {
+            AscendC::LoadData2dParams params;
+            params.repeatTimes = fractals_w;
+            params.srcStride = fractals_h;
+            params.ifTranspose = transpose;
+
+            LoadData(l0_lt[dst_offset], l1_lt[src_offset], params);
+
+            src_offset += 16 * 16;
+            dst_offset += fractals_w * 16 * 16;
+        }
+
+        l1_q.FreeTensor(l1_lt);
+        l0_q.EnQue(l0_lt);
+    }
+
+    /**
+     * @brief Copy a tensor from the CO1 queue to the B1 queue.
+     *
+     * @param [in] dst_q Destination queue. The position must be B1.
+     * @param [in] src_q Source queue. The position must be CO1.
+     * @param [in] height Height of the matrix.
+     * @param [in] width Width of the matrix.
+     */
+    __aicore__ inline void CopyCO1ToB1(AscendC::TQue<AscendC::TPosition::B1, 1> &dst_q,
+                                       AscendC::TQue<AscendC::TPosition::CO1, 1> &src_q, uint32_t height,
+                                       uint32_t width)
+    {
+        AscendC::LocalTensor<OutputT> src_lt = src_q.template DeQue<OutputT>();
+        const AscendC::LocalTensor<InputT> dst_lt = dst_q.template AllocTensor<InputT>();
+
+        AscendC::FixpipeParamsV220 params;
+        params.nSize = width;
+        params.mSize = height;
+        params.srcStride = height;
+        params.dstStride = width;
+        params.ndNum = 1;
+        params.quantPre = QuantMode_t::F322F16;
+
+        AscendC::Fixpipe(dst_lt, src_lt, params);
+        src_q.FreeTensor(src_lt);
+        dst_q.EnQue(dst_lt);
+    }
+
+    /**
+     * @brief Returns a synchronization config.
+     *
+     * @param [in] mode Synchronization mode.
+     * @param [in] flag_id Flag to use for synchronization.
+     * @return Synchronization config.
+     */
+    __aicore__ inline int GetSyncConf(int mode, int flag_id)
+    {
+        return 1 | (mode << 4) | (flag_id << 8);
+    }
+
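+    // NOTE: the flag IDs below pair up with KernelMatGen::SyncGroup on the
+    // vector side: the AIC signals AIC_SET_FLAG_ID (12) once its FIX pipe is
+    // done and blocks on AIV_SET_FLAG_ID (11), while each AIV signals 11 after
+    // its MTE3 writes and blocks on 12. One handshake therefore hands exactly
+    // one freshly written workspace matrix to the cube core.
+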
+ */ + __aicore__ inline void LoadIdentityMatrixinL0C() + { + LoadIdentityMatrixinL0A(); + LoadIdentityMatrixinL0B(); + AscendC::LocalTensor a2_lt = a2_q_.DeQue(); + AscendC::LocalTensor b2_lt = b2_q_.DeQue(); + AscendC::LocalTensor c1_lt = co1_q_.AllocTensor(); + + Mmad(c1_lt, a2_lt, b2_lt, {M_, N_, K_, false, 0, false, false, false}); + + co1_q_.EnQue(c1_lt); + a2_q_.FreeTensor(a2_lt); + b2_q_.FreeTensor(b2_lt); + } + + /** + * @brief Loads the identity matrix from global memory to L0A (`a1_q_` + * queue). + */ + __aicore__ inline void LoadIdentityMatrixinL0A() + { + CopyGmToL1A(a1_q_, global_A_[global_in_offset_], m_blocks_, k_blocks_); + CopyL1ToL0A(a2_q_, a1_q_, m_blocks_, k_blocks_); + } + + /** + * @brief Loads the identity matrix from global memory to L0B (`b1_q_` + * queue). + */ + __aicore__ inline void LoadIdentityMatrixinL0B() + { + // Here, we "abuse" the 'global_A_' pointer + CopyGmToL1B(b1_q_, global_A_[global_in_offset_], k_blocks_, n_blocks_); + + // Plain copy from L1 to L0B, because the layout is already correct. + AscendC::LocalTensor src = b1_q_.template DeQue(); + const AscendC::LocalTensor dst = b2_q_.template AllocTensor(); + + AscendC::LoadData2dParams params; + params.repeatTimes = n_blocks_ * k_blocks_; + params.srcStride = 1; + params.ifTranspose = false; + + LoadData(dst, src, params); + + b1_q_.FreeTensor(src); + b2_q_.EnQue(dst); + } + + /** + * @brief Copy a tensor from CO1 queue to global memory. + * + */ + __aicore__ inline void CopyCL0ToGlobal() + { + constexpr uint16_t fractal_size = 16; + + AscendC::LocalTensor lt = co1_q_.template DeQue(); + + AscendC::FixpipeParams params; + params.cburstNum = M_; + params.burstLen = N_ * fractal_size * sizeof(OutputT) / 32; + params.dstStride = M_; + + AscendC::Nz2NdParams nz2nd_params; + nz2nd_params.nz2ndEn = true; + nz2nd_params.originalNSize = M_; + params.nz2ndParams = nz2nd_params; + + AscendC::Fixpipe(global_C_[global_out_offset_], lt, params); + + co1_q_.FreeTensor(lt); + } + + AscendC::TPipe pipe_; + + AscendC::TQue a1_q_; + AscendC::TQue a2_q_; + AscendC::TQue b1_q_; + AscendC::TQue b2_q_; + + AscendC::TQue co1_q_; + + AscendC::GlobalTensor global_A_; + AscendC::GlobalTensor global_C_; + + const uint32_t vec_len_; + const uint32_t matrix_size_; + const uint32_t ws_circular_buffer_len_; + const uint32_t tile_len_; + const uint32_t global_in_offset_; + const uint32_t global_out_offset_; + + constexpr static uint32_t M_CUBE_BLOCK_SIZE = 16; + constexpr static uint32_t N_CUBE_BLOCK_SIZE = 16; + constexpr static uint32_t K_CUBE_BLOCK_SIZE = 16; + + const uint16_t M_ = matrix_size_; + const uint16_t K_ = matrix_size_; + const uint16_t N_ = matrix_size_; + + const uint32_t n_blocks_ = N_ / N_CUBE_BLOCK_SIZE; + const uint32_t k_blocks_ = K_ / K_CUBE_BLOCK_SIZE; + const uint32_t m_blocks_ = M_ / M_CUBE_BLOCK_SIZE; +}; + +/** + * @brief Run the `tri_inv_cube_col_sweep` kernel. + * + * @tparam InputT Input data type. Supports fp16/half. + * + * @param [in] matrix_stream_in Pointer where the input matrices will be + * written by AIVs in global memory. + * @param [in] inv_matrix_out Pointer where the output matrix inverses + * will be written. + * @param [in] matrix_size Input square matrix size to invert. 
+ */ +template +__aicore__ inline void run_tri_inv_cube_col_sweep(GM_ADDR matrix_stream_in, GM_ADDR inv_matrix_out, GM_ADDR workspace, + uint32_t vec_len, uint32_t matrix_size, + uint32_t ws_circular_buffer_len) +{ + if ASCEND_IS_AIV { + KernelMatGen op(matrix_size, ws_circular_buffer_len); + op.Init(matrix_stream_in, workspace); + op.Process(); + } + + if ASCEND_IS_AIC { + KernelTriInvCubeColSweep op(vec_len, matrix_size, ws_circular_buffer_len); + op.Init(workspace, inv_matrix_out); + op.Process(); + } +} + +/** + * @brief Copies tiling structure from global memory to registers. + * + * @tparam TilingT Structure representing kernel tiling parameters. + * @param [in] tiling Pointer to the structure allocated in registers. + * @param [in] tiling_global Pointer to the structure in global memory. + */ +template +__aicore__ inline void GetTilingData(TilingT *const tiling, GM_ADDR tiling_global) +{ + uint32_t *const tiling_32b = reinterpret_cast(tiling); + const __gm__ uint32_t *const tiling_global_32b = reinterpret_cast<__gm__ uint32_t *>(tiling_global); + + for (uint32_t i = 0; i < sizeof(TilingT) / sizeof(uint32_t); i++) { + tiling_32b[i] = tiling_global_32b[i]; + } +} + +} // namespace npu_kernel +} // namespace sglang diff --git a/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp b/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp new file mode 100644 index 000000000..24b51f81b --- /dev/null +++ b/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp @@ -0,0 +1,30 @@ +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernel_tri_inv_cube.h" + +#include "../op_host/tiling_tri_inv_cube.h" + +/** + * @brief Run the `tri_inv_cube_col_sweep` kernel on dtype fp16/half. + * + * @param [in] vec_in Pointer to input vector. + * @param [in] vec_out Pointer to output vector. + * @param [in] workspace Pointer to workspace. + * @param [in] tiling_gm Pointer to tiling vector. + */ +extern "C" __global__ __aicore__ void tri_inv_cube_col_sweep_fp16(GM_ADDR vec_in, GM_ADDR vec_out, GM_ADDR workspace, + GM_ADDR tiling_gm) +{ + sglang::npu_kernel::TriInvColumnSweepCubeTiling tiling; + sglang::npu_kernel::GetTilingData(&tiling, tiling_gm); + sglang::npu_kernel::run_tri_inv_cube_col_sweep(vec_in, vec_out, workspace, tiling.num_elems, + tiling.matrix_size, tiling.ws_circular_buffer_len); +} diff --git a/include/sgl_kenel_npu_ops.h b/include/sgl_kenel_npu_ops.h index e2f5a6804..dbd72d177 100644 --- a/include/sgl_kenel_npu_ops.h +++ b/include/sgl_kenel_npu_ops.h @@ -114,7 +114,7 @@ at::Tensor lightning_indexer( c10::optional sparse_count, c10::optional sparse_mode); /** - * @brief Triangular inverse of input tensor where last two dimensions represent + * @brief AIV-based triangular inverse of input tensor where last two dimensions represent * a matrix. * * @param [in] tensor_in Tensor of dimensions (..., n, n) where `n` is @@ -123,6 +123,17 @@ at::Tensor lightning_indexer( * is inversed. */ at::Tensor tri_inv_col_sweep(const at::Tensor &tensor_in); + +/** + * @brief AIC-based triangular inverse of input tensor where last two dimensions represent + * a matrix. 
diff --git a/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp b/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp
new file mode 100644
index 000000000..24b51f81b
--- /dev/null
+++ b/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp
@@ -0,0 +1,30 @@
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "kernel_tri_inv_cube.h"
+
+#include "../op_host/tiling_tri_inv_cube.h"
+
+/**
+ * @brief Run the `tri_inv_cube_col_sweep` kernel on dtype fp16/half.
+ *
+ * @param [in] vec_in Pointer to the input vector.
+ * @param [in] vec_out Pointer to the output vector.
+ * @param [in] workspace Pointer to the workspace.
+ * @param [in] tiling_gm Pointer to the tiling vector.
+ */
+extern "C" __global__ __aicore__ void tri_inv_cube_col_sweep_fp16(GM_ADDR vec_in, GM_ADDR vec_out, GM_ADDR workspace,
+                                                                  GM_ADDR tiling_gm)
+{
+    sglang::npu_kernel::TriInvColumnSweepCubeTiling tiling;
+    sglang::npu_kernel::GetTilingData(&tiling, tiling_gm);
+    sglang::npu_kernel::run_tri_inv_cube_col_sweep<half>(vec_in, vec_out, workspace, tiling.num_elems,
+                                                         tiling.matrix_size, tiling.ws_circular_buffer_len);
+}
diff --git a/include/sgl_kenel_npu_ops.h b/include/sgl_kenel_npu_ops.h
index e2f5a6804..dbd72d177 100644
--- a/include/sgl_kenel_npu_ops.h
+++ b/include/sgl_kenel_npu_ops.h
@@ -114,7 +114,7 @@ at::Tensor lightning_indexer(
     c10::optional sparse_count, c10::optional sparse_mode);
 
 /**
- * @brief Triangular inverse of input tensor where last two dimensions represent
+ * @brief AIV-based triangular inverse of input tensor where the last two dimensions represent
  * a matrix.
  *
  * @param [in] tensor_in Tensor of dimensions (..., n, n) where `n` is
  * the matrix size.
  * @return at::Tensor Returns tensor of same shape where each matrix of size n
  * is inversed.
  */
 at::Tensor tri_inv_col_sweep(const at::Tensor &tensor_in);
+
+/**
+ * @brief AIC-based triangular inverse of input tensor where the last two dimensions represent
+ * a matrix.
+ *
+ * @param [in] tensor_in Tensor of dimensions (..., n, n) where `n` is
+ * the matrix size.
+ * @return at::Tensor Returns a tensor of the same shape where each matrix of size n
+ * is inverted.
+ */
+at::Tensor tri_inv_cube_col_sweep(const at::Tensor &tensor_in);
 } // namespace npu_kernel
 } // namespace sglang
diff --git a/python/sgl_kernel_npu/sgl_kernel_npu/fla/chunk.py b/python/sgl_kernel_npu/sgl_kernel_npu/fla/chunk.py
index a545b6d6a..d6caf5634 100644
--- a/python/sgl_kernel_npu/sgl_kernel_npu/fla/chunk.py
+++ b/python/sgl_kernel_npu/sgl_kernel_npu/fla/chunk.py
@@ -30,6 +30,15 @@ def fast_inv_tril(A: torch.Tensor):
     return A_inv.to(dtype)
 
 
+def fast_cube_inv_tril(A: torch.Tensor):
+    dtype = A.dtype
+    assert A.shape[-2] == A.shape[-1]
+    chunk_size = A.shape[-1]
+    identity = torch.eye(chunk_size, dtype=torch.float16, device=A.device)
+    A_inv = torch.ops.npu.cube_triangular_inverse(identity - A.to(torch.float16))
+    return A_inv.to(dtype)
+
+
 def inv_tril_inplace(A: torch.Tensor):
     """
     Returns the "matrix inverse" of the last two dimensions of A that must be a strict lower-triangular matrix.
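`fast_cube_inv_tril` hands the kernel `identity - A`; in the gated delta rule path `A` is strictly lower triangular, so the operand is unit lower triangular, which is the form the cube kernel accepts. A quick off-device check of that contract (sizes are illustrative):

```python
import torch

chunk_size = 64  # hypothetical chunk size
A = torch.rand(2, chunk_size, chunk_size, dtype=torch.float16).tril(-1)  # strictly lower
T = torch.eye(chunk_size, dtype=torch.float16) - A  # unit lower triangular
assert torch.all(T.diagonal(dim1=-2, dim2=-1) == 1)
```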
diff --git a/tests/python/sgl_kernel_npu/test_gated_delta_ascendc_tri_inv.py b/tests/python/sgl_kernel_npu/test_gated_delta_ascendc_tri_inv.py
index 624c49f00..c7fb1fe8c 100644
--- a/tests/python/sgl_kernel_npu/test_gated_delta_ascendc_tri_inv.py
+++ b/tests/python/sgl_kernel_npu/test_gated_delta_ascendc_tri_inv.py
@@ -3,7 +3,12 @@
 import pytest
 import torch
 import torch.nn.functional as F
-from sgl_kernel_npu.fla.chunk import chunk_gated_delta_rule_native, fast_inv_tril
+import typing
+from sgl_kernel_npu.fla.chunk import (
+    chunk_gated_delta_rule_native,
+    fast_inv_tril,
+    fast_cube_inv_tril,
+)
 
 device = "npu"
 
@@ -36,6 +41,7 @@ def print_diff(name, ref, tri, atol=0.005):
         print(f"Exceeds tolerance ({atol})!")
 
 
+@pytest.mark.parametrize("tri_inv_fn", [fast_inv_tril, fast_cube_inv_tril])
 @pytest.mark.parametrize(
     ("H", "D", "mask_p", "cu_seqlens", "dtype"),
     [
@@ -84,6 +90,7 @@ def test_chunk_varlen(
     mask_p: float,
     cu_seqlens: list[int],
     dtype: torch.dtype,
+    tri_inv_fn: typing.Callable,
 ):
     if D != 128:
         pytest.skip(
@@ -142,7 +149,7 @@ def test_chunk_varlen(
             initial_state=h0[i],
             output_final_state=True,
             use_qk_l2norm_in_kernel=False,
-            tri_inv_fn=fast_inv_tril,
+            tri_inv_fn=tri_inv_fn,
         )
         actual.append(actual_i)
         actual_ht.append(actual_ht_i)
@@ -151,8 +158,8 @@ def test_chunk_varlen(
 
     torch.npu.synchronize()
 
-    print_diff("o", ref, actual, 0.01)
-    print_diff("ht", ref_ht, actual_ht, 0.01)
+    print_diff("o", ref, actual, 0.05)
+    print_diff("ht", ref_ht, actual_ht, 0.05)
 
-    assert_close("o", ref, actual, 0.01)
-    assert_close("ht", ref_ht, actual_ht, 0.01)
+    assert_close("o", ref, actual, 0.05)
+    assert_close("ht", ref_ht, actual_ht, 0.05)
diff --git a/tests/python/sgl_kernel_npu/test_triangular_inverse.py b/tests/python/sgl_kernel_npu/test_triangular_inverse.py
index 22aed8daa..28e0c5c66 100644
--- a/tests/python/sgl_kernel_npu/test_triangular_inverse.py
+++ b/tests/python/sgl_kernel_npu/test_triangular_inverse.py
@@ -43,19 +43,46 @@ def np_triu_inv_cs(input_x, dtype: np.dtype = np.float16):
     return output.astype(dtype)
 
 
-def rand_np_tril(batch_size: int, n: int, dtype: np.dtype):
-    "Returns a random unit lower triangular matrix of size n."
-    A = np.random.rand(batch_size, n, n).astype(dtype)
-    A = np.tril(A)
+def np_tril_inv_cube_cs(input_x, dtype: np.dtype = np.float16):
+    """Return the matrix inverse of a 3d tensor whose first dimension is the batch dimension.
+
+    Each batch entry returns U_inv_n for input U_n, i.e., U_n U_inv_n = I_n.
+
+    The algorithm is a matrix formulation of the column-sweep algorithm.
+    """
+    output = np.zeros_like(input_x)
+    batch_dim, n, _ = input_x.shape
+    I_n = np.eye(n, dtype=np.float32)
+
+    for idx in range(batch_dim):
+        U_n = input_x[idx, :, :]
+        U_n = 2 * I_n - U_n
+        U_inv_n = I_n.copy()
+
+        for k in reversed(range(n)):
+            M = I_n.copy()
+            M[:, k] = U_n[:, k]
+            U_inv_n = M.astype(np.float32) @ U_inv_n.astype(np.float32)
+            # FIXPIPE fp32 -> fp16
+            U_inv_n = U_inv_n.astype(dtype)
+
+        output[idx, :, :] = U_inv_n
+
+    return output.astype(dtype)
+
+
+def rand_np_triu(batch_size: int, n: int, dtype: np.dtype):
+    "Returns a random unit upper triangular matrix of size n."
+    A = 0.1 * np.random.rand(batch_size, n, n).astype(dtype)
+    A = np.triu(A)
     for k in range(batch_size):
         np.fill_diagonal(A[k, :, :], 1.0)
 
     return A.astype(dtype)
 
 
-def ones_np_tril(batch_size: int, n: int, dtype: np.dtype):
-    "Returns an all-ones lower triangular matrix of size n."
+def ones_np_triu(batch_size: int, n: int, dtype: np.dtype):
+    "Returns an all-ones upper triangular matrix of size n."
     A = np.ones((batch_size, n, n)).astype(dtype)
-    A = np.tril(A)
+    A = np.triu(A)
     return A.astype(dtype)
 
 
@@ -64,7 +91,7 @@ def ones_np_tril(batch_size: int, n: int, dtype: np.dtype):
 @pytest.mark.parametrize("data_type", [np.float16, np.float32], ids=str)
 @pytest.mark.parametrize(
     "mat_gen",
-    (rand_np_tril, ones_np_tril),
+    (rand_np_triu, ones_np_triu),
 )
 def test_tri_inv_col_sweep(
     batch_size: int,
@@ -92,12 +119,56 @@ def test_tri_inv_col_sweep(
     assert torch.equal(actual, expected)
 
 
+@pytest.mark.parametrize("batch_size", [2, 4, 40, 256])
+@pytest.mark.parametrize("matrix_size", [16, 32, 64, 128])
+@pytest.mark.parametrize("data_type", [np.float16], ids=str)
+@pytest.mark.parametrize(
+    "mat_gen",
+    (rand_np_triu, ones_np_triu),
+)
+def test_tri_inv_cube_col_sweep(
+    batch_size: int,
+    matrix_size: int,
+    data_type: np.dtype,
+    mat_gen: callable,
+):
+
+    input_x_cpu = mat_gen(batch_size, matrix_size, data_type)
+    expected_cpu = np_tril_inv_cube_cs(input_x_cpu, data_type).transpose(0, 2, 1)
+
+    input_x_cpu = input_x_cpu.transpose(0, 2, 1)
+    torch.npu.synchronize()
+    input_x = torch.from_numpy(input_x_cpu).half().npu()
+    torch.npu.synchronize()
+    expected = torch.from_numpy(expected_cpu).half().npu()
+    torch.npu.synchronize()
+    actual = torch.ops.npu.cube_triangular_inverse(input_x).transpose(1, 2)
+    torch.npu.synchronize()
+
+    torch.set_printoptions(
+        threshold=10_000,  # bigger than 16 * 16
+        linewidth=200,  # avoid line wrapping
+        precision=4,  # optional
+        sci_mode=False,  # optional
+    )
+
+    print("Input")
+    print(input_x.cpu())
+    print("Expected")
+    print(expected_cpu)
+    print("Actual")
+    print(actual.cpu())
+
+    assert actual.shape == expected.shape, "Output shape does not match expected shape."
+    assert torch.allclose(actual.float(), expected.float(), atol=0.1, rtol=0.5)
+
+
 @pytest.mark.parametrize("batch_size", [1, 2, 4, 40, 256])
 @pytest.mark.parametrize("matrix_size", [16, 32, 64, 128])
 @pytest.mark.parametrize("data_type", [np.float32], ids=str)
 @pytest.mark.parametrize(
     "mat_gen,atol,rtol",
-    [(rand_np_tril, 1e-5, 1e-5), (ones_np_tril, 0, 0)],
+    [(rand_np_triu, 1e-5, 1e-5), (ones_np_triu, 0, 0)],
 )
 def test_tri_inv_col_sweep_np_linalg_inv(
     batch_size: int,
@@ -107,8 +178,8 @@ def test_tri_inv_col_sweep_np_linalg_inv(
     atol: float,
     rtol: float,
 ):
-
-    input_x_cpu = mat_gen(batch_size, matrix_size, data_type)
+    # the copy forces the input to be contiguous
+    input_x_cpu = mat_gen(batch_size, matrix_size, data_type).transpose(0, 2, 1).copy()
     golden_numpy_cpu = np.linalg.inv(input_x_cpu)
 
     # Convert input matrices from row-major order to column-major order
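As a standalone sanity check of the reference implementation, `np_tril_inv_cube_cs` can be compared against `np.linalg.inv` directly, mirroring the tolerances used above (assuming the helpers are importable from this test module):

```python
import numpy as np

# Assumes this test module is importable as `test_triangular_inverse`.
from test_triangular_inverse import np_tril_inv_cube_cs, rand_np_triu

x = rand_np_triu(2, 32, np.float16)
approx = np_tril_inv_cube_cs(x, np.float16)
exact = np.linalg.inv(x.astype(np.float32))
# The fp16 round after every column-sweep step keeps this comparison loose.
assert np.allclose(approx.astype(np.float32), exact, atol=0.1, rtol=0.5)
```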