2 changes: 2 additions & 0 deletions csrc/CMakeLists.txt
@@ -22,6 +22,7 @@ FILE(GLOB OP_SRCS
${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/lightning_indexer.cpp
${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/tiling/lightning_indexer_tiling.cpp
${PROJECT_OP_SRC_BASE}/tri_inv/op_host/tri_inv.cpp
${PROJECT_OP_SRC_BASE}/tri_inv/op_host/tri_inv_cube.cpp
)
if(BUILD_CATLASS_MODULE)
list(APPEND OP_SRCS
@@ -53,6 +54,7 @@ set(WORKSPACE_KERNEL_SRCS
${PROJECT_OP_SRC_BASE}/alloc_extend/op_kernel/alloc_extend_kernel.cpp
${PROJECT_OP_SRC_BASE}/build_tree/op_kernel/build_tree_kernel.cpp
${PROJECT_OP_SRC_BASE}/lightning_indexer/op_kernel/lightning_indexer_kernel.cpp
${PROJECT_OP_SRC_BASE}/tri_inv/op_kernel/tri_inv_cube_kernel.cpp
)
if(BUILD_CATLASS_MODULE)
list(APPEND WORKSPACE_KERNEL_SRCS
3 changes: 3 additions & 0 deletions csrc/pytorch_extensions.cpp
@@ -98,6 +98,7 @@ TORCH_LIBRARY_FRAGMENT(npu, m)
"int? sparse_count=None, int? sparse_mode=None) -> Tensor");

m.def("triangular_inverse(Tensor x) -> Tensor");
m.def("cube_triangular_inverse(Tensor x) -> Tensor");
}
} // namespace

@@ -141,5 +142,7 @@ TORCH_LIBRARY_IMPL(npu, PrivateUse1, m)
m.impl("lightning_indexer", TORCH_FN(sglang::npu_kernel::lightning_indexer));

m.impl("triangular_inverse", TORCH_FN(sglang::npu_kernel::tri_inv_col_sweep));

m.impl("cube_triangular_inverse", TORCH_FN(sglang::npu_kernel::tri_inv_cube_col_sweep));
}
} // namespace
6 changes: 4 additions & 2 deletions csrc/tri_inv/README.md
@@ -1,5 +1,7 @@
##### Description of tri_inv

This is a vector-only AscendC triangular inversion kernel on Ascend NPU.
This implements several AscendC triangular inversion kernels on Ascend NPU.

The kernel supports matrix sizes `16, 32, 64, 128` and data types `fp16` and `fp32`.
The AIV-based kernels support matrix sizes `16, 32, 64, 128` and data types `fp16` and `fp32`.

The AIC-based kernel supports matrix sizes `16, 32, 64, 128` and data type `fp16`.
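
A minimal usage sketch (an assumption on my part, not part of this PR: it presumes the extension is built and loaded so the ops registered in `csrc/pytorch_extensions.cpp` are visible under `torch.ops.npu`, that `torch_npu` provides the device, and that the input is a batch of square triangular matrices of a supported size):

```python
import torch
import torch_npu  # Ascend PyTorch adapter, assumed available

# Hypothetical batch of 8 unit lower-triangular fp16 matrices of size 128.
n, batch = 128, 8
a = torch.tril(torch.rand(batch, n, n, dtype=torch.float16))
a.diagonal(dim1=-2, dim2=-1).fill_(1.0)
a = a.npu()

inv_aiv = torch.ops.npu.triangular_inverse(a)        # AIV kernels: fp16/fp32
inv_aic = torch.ops.npu.cube_triangular_inverse(a)   # AIC kernel: fp16 only
```

Whether the kernels expect a lower or upper triangular (and unit-diagonal) input is not spelled out here, so treat the sketch as illustrative only.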
24 changes: 24 additions & 0 deletions csrc/tri_inv/op_host/tiling_tri_inv_cube.h
@@ -0,0 +1,24 @@
#pragma once

#include <cstdint>

namespace sglang {

namespace npu_kernel {

/**
* @brief `tri_inv_cube_col_sweep` kernel tiling parameter structure.
*/
struct TriInvColumnSweepCubeTiling {
/// @brief Number of blocks.
uint32_t num_blocks;
/// @brief Total number of input elements.
uint32_t num_elems;
/// @brief Input matrix size.
uint32_t matrix_size;
/// @brief Workspace circular buffer length.
uint32_t ws_circular_buffer_len;
};

} // namespace npu_kernel
} // namespace sglang
81 changes: 81 additions & 0 deletions csrc/tri_inv/op_host/tri_inv_cube.cpp
@@ -0,0 +1,81 @@
// Licensed under the BSD 3-Clause License (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "defines.h"
#include "torch_helper.h"

#include "tiling_tri_inv_cube.h"
#include "aclrtlaunch_tri_inv_cube_col_sweep_fp16.h"
#include "tiling/platform/platform_ascendc.h"

namespace sglang {

namespace npu_kernel {

at::Tensor calc_tiling(const TriInvColumnSweepCubeTiling &tiling)
{
constexpr uint32_t PADDING_BYTE = 32U;

// align to 32 bytes
int32_t tiling_size = (sizeof(TriInvColumnSweepCubeTiling) + PADDING_BYTE - 1) / PADDING_BYTE * PADDING_BYTE;
auto tiling_buffer = at::empty({tiling_size}, at::TensorOptions().dtype(at::kByte).device(at::kCPU));

TriInvColumnSweepCubeTiling *tiling_data =
reinterpret_cast<TriInvColumnSweepCubeTiling *>(tiling_buffer.data_ptr());
tiling_data->num_blocks = tiling.num_blocks;
tiling_data->num_elems = tiling.num_elems;
tiling_data->matrix_size = tiling.matrix_size;
tiling_data->ws_circular_buffer_len = tiling.ws_circular_buffer_len;

auto tiling_tensor = TorchNpuHelper::CopyTensorHostToDevice(tiling_buffer);
return tiling_tensor;
}

HOST_API at::Tensor tri_inv_cube_col_sweep(const at::Tensor &tensor)
{
platform_ascendc::PlatformAscendC *platformAscendC = platform_ascendc::PlatformAscendCManager::GetInstance();
auto acl_stream = c10_npu::getCurrentNPUStream().stream(false);

const auto dtype = tensor.options().dtype();
if (tensor.dim() < 2) {
throw std::runtime_error("Input tensor must have at least 2 dimensions.\n");
}

const uint32_t matrix_size = static_cast<uint32_t>(tensor.size(-1));
if (matrix_size != tensor.size(-2)) {
throw std::runtime_error("Only square matrices are supported.\n");
}

const uint32_t num_elems = static_cast<uint32_t>(tensor.numel());
const uint32_t block_dim = static_cast<uint32_t>(num_elems / (matrix_size * matrix_size));

auto tensor_out = at::empty_like(tensor, at::kFloat);

const uint32_t WS_CIRCULAR_BUFFER_LEN = 4;
const TriInvColumnSweepCubeTiling tiling{block_dim, num_elems, matrix_size, WS_CIRCULAR_BUFFER_LEN};
const at::Tensor tiling_device = calc_tiling(tiling);

// workspace
const uint64_t system_workspace_size = static_cast<uint64_t>(platformAscendC->GetLibApiWorkSpaceSize());
const uint64_t workspace_size = system_workspace_size + num_elems * WS_CIRCULAR_BUFFER_LEN * tensor.itemsize();
const auto options = at::TensorOptions().dtype(at::kByte).device(tensor.options().device());
auto workspace = at::empty({static_cast<int64_t>(workspace_size)}, options);

if (dtype == at::kHalf) {
EXEC_KERNEL_CMD(tri_inv_cube_col_sweep_fp16, block_dim, tensor, tensor_out, workspace, tiling_device);
} else {
throw std::runtime_error("Unsupported data type for tri_inv_cube_col_sweep. fp16 is currently supported.");
}
aclrtSynchronizeStream(acl_stream);
return tensor_out;
}

} // namespace npu_kernel
} // namespace sglang
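
For a rough sense of the sizing logic in `calc_tiling` and `tri_inv_cube_col_sweep` above, here is a worked example in Python (the batch shape is hypothetical and the system workspace term is platform-dependent, so it is left symbolic):

```python
# Hypothetical input: an (8, 128, 128) fp16 tensor.
matrix_size = 128
num_elems = 8 * 128 * 128                              # 131072 elements
block_dim = num_elems // (matrix_size * matrix_size)   # 8, one block per matrix

# Tiling buffer: 4 uint32 fields = 16 bytes, rounded up to a 32-byte multiple.
PADDING_BYTE = 32
tiling_size = (16 + PADDING_BYTE - 1) // PADDING_BYTE * PADDING_BYTE   # 32 bytes

# User workspace: the circular buffer holding the generated matrices.
WS_CIRCULAR_BUFFER_LEN = 4
itemsize = 2                                           # fp16
user_workspace = num_elems * WS_CIRCULAR_BUFFER_LEN * itemsize         # 1 MiB
# Total workspace = GetLibApiWorkSpaceSize() + user_workspace
```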
221 changes: 221 additions & 0 deletions csrc/tri_inv/op_kernel/kernel_mat_gen.h
@@ -0,0 +1,221 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
*
* @file kernel_mat_gen.h
* @brief AIV kernel that generates the sequence of matrices used by the
* column-sweep formulation of triangular inversion.
*/
#pragma once

namespace sglang {

namespace npu_kernel {
/**
* @brief Returns a sequence of matrices that encode the column-sweep steps in
* matrix notation. On the first iteration, it returns the identity matrix.
*
* @tparam T Input data type. Supports fp16/half.
*
* See discussion on Section 3.2.1 of [1], in particular Equations 3.8 and 3.9
* on page 54.
*
* \code{.python}
import numpy as np
import numpy.typing as npt

def aiv_matrix_gen(A: npt.ArrayLike):
    n = A.shape[0]
    I_n = np.eye(n, dtype=A.dtype)

    # Transform A into 2*I_n - A
    A = 2 * I_n - A

    for k in reversed(range(n)):
        M = I_n.copy()
        M[:, k] = A[:, k]
        yield M

* \endcode
*
* [1] Parallelism in Matrix Computations. E. Gallopoulos, B. Philippe and A. H. Sameh.
*     Hard cover (ISBN: 978-94-017-7187-0),
*     Soft cover (ISBN: 978-94-024-0317-6),
*     Electronic (ISBN: 978-94-017-7188-7)
**/
template <typename T = half>
class KernelMatGen
{
public:
/**
* @brief Class constructor.
*
* @param [in] matrix_size Input square matrix size.
* @param [in] circular_buffer_len Length of workspace circular buffer to
* overcome GM memory consistency issues.
*/
__aicore__ inline KernelMatGen(uint32_t matrix_size, uint32_t circular_buffer_len)
: matrix_size_(matrix_size),
tile_len_(matrix_size * matrix_size),
aic_id_(AscendC::GetBlockIdx() / AscendC::GetTaskRation()),
global_in_offset_(aic_id_ * tile_len_),
ws_circular_buffer_len_(circular_buffer_len),
global_out_offset_(aic_id_ * tile_len_ * ws_circular_buffer_len_)
{}

/**
* @brief Initialize global and local memory structures.
*
* @param [in] vec_in Pointer to the input vector in global memory.
* @param [in] vec_out Pointer to the output vector in global memory.
*/
__aicore__ inline void Init(GM_ADDR vec_in, GM_ADDR vec_out)
{
const uint32_t vec_len = AscendC::GetBlockNum() * tile_len_;
global_in_.SetGlobalBuffer((__gm__ T *)vec_in, vec_len);
global_out_.SetGlobalBuffer((__gm__ T *)vec_out, vec_len * ws_circular_buffer_len_);

pipe_.InitBuffer(in_q_, 1, tile_len_ * sizeof(T));
pipe_.InitBuffer(out_q_, 1, tile_len_ * sizeof(T));
pipe_.InitBuffer(work_buf_, tile_len_ * sizeof(T));
}

/**
* @brief Run the kernel.
*
*/
__aicore__ inline void Process()
{
// Read input matrix into work_buf_.
const AscendC::LocalTensor<T> in_lt = in_q_.template AllocTensor<T>();
DataCopy(in_lt, global_in_[global_in_offset_], in_lt.GetSize());
in_q_.EnQue(in_lt);

ReadInputMatrixInUB();

// AIV-0 writes identity matrix for AIC
if (AscendC::GetSubBlockIdx() == 0) {
EnQueueIdentityMatrix();
AscendC::LocalTensor<T> out_lt = out_q_.template DeQue<T>();
DataCopy(global_out_[global_out_offset_], out_lt, out_lt.GetSize());
out_q_.FreeTensor(out_lt);
}

// Sync with all AIVs in group, to write the matrix.
SyncGroup();

// First matrix is identity (just wait one more round)
SyncGroup();

const AscendC::LocalTensor<T> work_lt = work_buf_.Get<T>();
uint32_t circular_buf_idx = 1;

// The column sweep needs `matrix_size_` matrices in total; the identity above is the
// first, and this loop generates the remaining `matrix_size_ - 1`.
for (int32_t col_index = matrix_size_ - 2; col_index >= 0; col_index--) {
// AIV-0: builds the identity matrix and overwrites its (col_index)-th column with
// the corresponding "column-sweep" column of matrix `M`.
if (AscendC::GetSubBlockIdx() == 0) {
const AscendC::LocalTensor<T> vec_out_lt = out_q_.AllocTensor<T>();
AscendC::Duplicate(vec_out_lt, static_cast<T>(0), matrix_size_ * matrix_size_);
AscendC::PipeBarrier<PIPE_ALL>();

// Set one on the main diagonal
for (uint32_t i = 0; i < matrix_size_; i++) {
vec_out_lt.SetValue(i * matrix_size_ + i, static_cast<T>(1));
}

AscendC::PipeBarrier<PIPE_ALL>();
// Write the (col_index)-th column of matrix M.
const uint32_t col_offset = col_index * matrix_size_;
DataCopy(vec_out_lt[col_offset], work_lt[col_offset], matrix_size_);
AscendC::PipeBarrier<PIPE_ALL>();
out_q_.EnQue<T>(vec_out_lt);

AscendC::LocalTensor<T> out_lt = out_q_.template DeQue<T>();
DataCopy(global_out_[global_out_offset_ + circular_buf_idx * tile_len_], out_lt, out_lt.GetSize());
out_q_.FreeTensor(out_lt);
circular_buf_idx = (circular_buf_idx + 1) % ws_circular_buffer_len_;
}

// Sync with all AIVs in group, to write the matrix.
SyncGroup();
}
}

private:
/**
* @brief Read (and transform) the input triangular matrix A into the
* `work_buf_`. The transformation is `2*I_n - A`.
*/
__aicore__ inline void ReadInputMatrixInUB()
{
AscendC::LocalTensor<T> vec_in_lt = in_q_.DeQue<T>();
AscendC::LocalTensor<T> work_lt = work_buf_.Get<T>();
Muls(work_lt, vec_in_lt, static_cast<T>(-1), vec_in_lt.GetSize());
for (uint32_t i = 0; i < matrix_size_; i++) {
work_lt.SetValue(i * matrix_size_ + i, static_cast<T>(1));
}
in_q_.FreeTensor<T>(vec_in_lt);
}

/**
* @brief EnQue identity matrix on output queue.
*
*/
__aicore__ inline void EnQueueIdentityMatrix()
{
const AscendC::LocalTensor<T> vec_out_lt = out_q_.AllocTensor<T>();
AscendC::Duplicate(vec_out_lt, static_cast<T>(0), matrix_size_ * matrix_size_);
AscendC::PipeBarrier<PIPE_ALL>();

// Set one on the main diagonal
for (uint32_t i = 0; i < matrix_size_; i++) {
vec_out_lt.SetValue(i * matrix_size_ + i, static_cast<T>(1));
}

out_q_.EnQue<T>(vec_out_lt);
}

/**
* @brief Returns a synchronization config.
*
* @param [in] mode Synchronization mode.
* @param [in] flag_id Flag to use for synchronization.
* @return Synchronization config.
*/
__aicore__ inline int GetSyncConf(int mode, int flag_id)
{
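// Packs the config word passed to ffts_cross_core_sync: bit 0 enables the sync,
// `mode` occupies bits 4 and up, `flag_id` bits 8 and up.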
return 1 | (mode << 4) | (flag_id << 8);
}

/**
* @brief Synchronize cube and vector cores within a single group.
*
*/
__aicore__ inline void SyncGroup()
{
const int mode = 2;

const int AIV_SET_FLAG_ID = 11;
const int AIC_SET_FLAG_ID = 12;
ffts_cross_core_sync(PIPE_MTE3, GetSyncConf(mode, AIV_SET_FLAG_ID));
wait_flag_dev(AIC_SET_FLAG_ID);
return;
}

AscendC::TPipe pipe_;

AscendC::TQue<AscendC::QuePosition::VECIN, 1> in_q_;
AscendC::TQue<AscendC::QuePosition::VECOUT, 1> out_q_;

AscendC::TBuf<AscendC::QuePosition::VECCALC> work_buf_;

AscendC::GlobalTensor<T> global_in_;
AscendC::GlobalTensor<T> global_out_;

const uint32_t matrix_size_;
const uint32_t tile_len_;
const uint32_t aic_id_;
const uint32_t global_in_offset_;
const uint32_t ws_circular_buffer_len_;
const uint32_t global_out_offset_;
};
} // namespace npu_kernel
} // namespace sglang
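
A small host-side sketch of the same column-sweep product, assuming a unit-diagonal lower-triangular input (an assumption; the expected orientation and normalization are not stated explicitly in this diff). It checks that the product of the generated matrices reproduces the inverse:

```python
import numpy as np
from functools import reduce

def aiv_matrix_gen(a):
    n = a.shape[0]
    eye = np.eye(n, dtype=a.dtype)
    a = 2 * eye - a                      # same transform as ReadInputMatrixInUB
    for k in reversed(range(n)):
        m = eye.copy()
        m[:, k] = a[:, k]                # column-sweep column, identity elsewhere
        yield m

n = 16
lower = np.tril(np.random.rand(n, n))
np.fill_diagonal(lower, 1.0)             # unit diagonal assumed

inv = reduce(np.matmul, aiv_matrix_gen(lower))
assert np.allclose(inv @ lower, np.eye(n))
```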