diff --git a/csrc/CMakeLists.txt b/csrc/CMakeLists.txt
index dbedd6ac1..c5e33117b 100644
--- a/csrc/CMakeLists.txt
+++ b/csrc/CMakeLists.txt
@@ -19,9 +19,6 @@ FILE(GLOB OP_SRCS
     ${PROJECT_OP_SRC_BASE}/lora/op_host/sgmv_shrink.cpp
     ${PROJECT_OP_SRC_BASE}/lora/op_host/sgemmv_expand.cpp
     ${PROJECT_OP_SRC_BASE}/lora/op_host/sgemmv_shrink.cpp
-    ${PROJECT_OP_SRC_BASE}/lora/op_host/sgemmc_expand.cpp
-    ${PROJECT_OP_SRC_BASE}/lora/op_host/sgemmc_shrink.cpp
-    ${PROJECT_OP_SRC_BASE}/lora/op_host/tiling/sgemmc_tiling.cpp
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/lightning_indexer.cpp
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/tiling/lightning_indexer_tiling.cpp
     ${PROJECT_OP_SRC_BASE}/tri_inv/op_host/tri_inv.cpp
@@ -54,10 +51,6 @@ ascendc_library(no_workspace_kernel STATIC
     ${PROJECT_OP_SRC_BASE}/recurrent_gated_delta_rule/op_kernel/recurrent_gated_delta_rule_kernel.cpp
 )
 
-ascendc_include_directories(no_workspace_kernel PRIVATE
-    ${PROJECT_OP_SRC_BASE}/utils/kernel
-)
-
 # kernel side files with workspace
 set(WORKSPACE_KERNEL_SRCS
     ${PROJECT_OP_SRC_BASE}/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
@@ -66,8 +59,6 @@ set(WORKSPACE_KERNEL_SRCS
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_kernel/lightning_indexer_kernel.cpp
     ${PROJECT_OP_SRC_BASE}/causal_conv1d_update/op_kernel/causal_conv1d_update.cpp
     ${PROJECT_OP_SRC_BASE}/causal_conv1d/op_kernel/causal_conv1d.cpp
-    ${PROJECT_OP_SRC_BASE}/lora/op_kernel/sgemmc_expand_kernel.cpp
-    ${PROJECT_OP_SRC_BASE}/lora/op_kernel/sgemmc_shrink_kernel.cpp
 )
 if(BUILD_CATLASS_MODULE)
     list(APPEND WORKSPACE_KERNEL_SRCS
@@ -85,10 +76,6 @@ if(BUILD_CATLASS_MODULE)
     )
 endif()
 
-ascendc_include_directories(workspace_kernel PRIVATE
-    ${PROJECT_OP_SRC_BASE}/utils/kernel
-)
-
 ascendc_compile_definitions(workspace_kernel PRIVATE
        -DHAVE_WORKSPACE
        -DHAVE_TILING
@@ -100,7 +87,6 @@ add_library(${OP_PLUGIN_NAME} SHARED ${OP_SRCS})
 target_link_libraries(${OP_PLUGIN_NAME} PRIVATE
         workspace_kernel
         no_workspace_kernel
-        host_intf_pub
         torch_npu
         ascendcl
         tiling_api
@@ -129,7 +115,6 @@ target_include_directories(${OP_PLUGIN_NAME} PRIVATE
         ${TORCH_DIR}/include
         ${TORCH_DIR}/include/torch/csrc/api/include
         ${TORCH_NPU_DIR}/include
-        ${ASCEND_INCLUDE_DIR}
         ${ASCEND_INCLUDE_DIR}/external
         ${ASCEND_INCLUDE_DIR}/experiment/platform
         ${ASCEND_INCLUDE_DIR}/experiment/runtime
diff --git a/csrc/lora/op_host/sgemmc_expand.cpp b/csrc/lora/op_host/sgemmc_expand.cpp
deleted file mode 100644
index c9524e311..000000000
--- a/csrc/lora/op_host/sgemmc_expand.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#include "defines.h"
-#include "tiling/sgemmc_tiling.h"
-#include "torch_helper.h"
-
-#include "aclrtlaunch_sgemmc_expand.h"
-
-namespace sglang {
-namespace npu_kernel {
-
-HOST_API at::Tensor sgemmc_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len,
-                                  at::Tensor &lora_ranks, at::Tensor &slice_offsets, at::Tensor &y)
-{
-    at::ScalarType scalar_type = y.scalar_type();
-    TORCH_CHECK(scalar_type == at::kHalf || scalar_type == at::kBFloat16, "only support half and bf16");
-    TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]");
-    TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4,
-                "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]");
-    TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]");
-
-    at::Tensor y_out = y;
-    void *x_ptr = x.data_ptr();
-    void *weight_ptr = weight.data_ptr();
-    void *y_ptr = y.data_ptr();
-    void *y_out_ptr = y_out.data_ptr();
-
-    void *lora_indices_ptr = lora_indices.data_ptr();
-    int lora_indices_size = lora_indices.size(0);
-    void *seq_len_ptr = seq_len.data_ptr();
-    int seq_len_size = seq_len.size(0);
-    void *lora_ranks_ptr = lora_ranks.data_ptr();
-    int lora_ranks_size = lora_ranks.size(0);
-    void *slice_offsets_ptr = slice_offsets.data_ptr();
-    int slice_offsets_size = slice_offsets.size(0);
-    int slice_count = slice_offsets_size - 1;
-    int batch_size = x.size(0);
-    int max_lora_rank = x.size(1) / slice_count;
-    int output_full_dim = y.size(1);
-
-    uint32_t block_dim;
-    uint32_t workspace_size;
-
-    at::Tensor tiling_tensor = GenerateTiling(block_dim, workspace_size, batch_size, max_lora_rank, output_full_dim,
-                                              TorchNpuHelper::ConvertDataType(scalar_type));
-    auto workspace_tensor =
-        at::empty({workspace_size}, at::TensorOptions().dtype(at::kByte).device(x.options().device()));
-
-    /* launch the kernel function via torch */
-    EXEC_KERNEL_CMD(sgemmc_expand, block_dim, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr,
-                    seq_len_size, lora_ranks_ptr, lora_ranks_size, slice_offsets_ptr, slice_offsets_size, y_ptr,
-                    y_out_ptr, batch_size, max_lora_rank, output_full_dim, workspace_tensor, tiling_tensor);
-
-    return y_out;
-}
-
-}  // namespace npu_kernel
-}  // namespace sglang
diff --git a/csrc/lora/op_host/sgemmc_shrink.cpp b/csrc/lora/op_host/sgemmc_shrink.cpp
deleted file mode 100644
index 1ca57e955..000000000
--- a/csrc/lora/op_host/sgemmc_shrink.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#include "defines.h"
-#include "tiling/sgemmc_tiling.h"
-#include "torch_helper.h"
-
-#include "aclrtlaunch_sgemmc_shrink.h"
-
-namespace sglang {
-namespace npu_kernel {
-
-HOST_API void sgemmc_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len,
-                            at::Tensor &lora_ranks, at::Tensor &lora_scales, at::Tensor &y)
-{
-    at::ScalarType scalar_type = x.scalar_type();
-    TORCH_CHECK(scalar_type == at::kHalf || scalar_type == at::kBFloat16, "only support half and bf16");
-    TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]");
-    TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4,
-                "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]");
-    TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]");
-    TORCH_CHECK(x.size(1) > y.size(1), "hidden in should be greater than hidden out");
-    void *x_ptr = x.data_ptr();
-    void *weight_ptr = weight.data_ptr();
-
-    void *lora_indices_ptr = lora_indices.data_ptr();
-    int lora_indices_size = lora_indices.size(0);
-    void *seq_len_ptr = seq_len.data_ptr();
-    int seq_len_size = seq_len.size(0);
-    void *lora_ranks_ptr = lora_ranks.data_ptr();
-    int lora_ranks_size = lora_ranks.size(0);
-    void *lora_scales_ptr = lora_scales.data_ptr();
-    int lora_scales_size = lora_scales.size(0);
-
-    void *y_ptr = y.data_ptr();
-    int batch_size = x.size(0);
-    int input_hidden_token = x.size(1);
-    uint32_t max_lora_rank = y.size(1);
-
-    uint32_t block_dim;
-    uint32_t workspace_size;
-
-    at::Tensor tiling_tensor = GenerateTiling(block_dim, workspace_size, batch_size, input_hidden_token, max_lora_rank,
-                                              TorchNpuHelper::ConvertDataType(scalar_type));
-
-    auto workspace_tensor =
-        at::empty({workspace_size}, at::TensorOptions().dtype(at::kByte).device(x.options().device()));
-    /* launch the kernel function via torch */
-    EXEC_KERNEL_CMD(sgemmc_shrink, block_dim, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr,
-                    seq_len_size, lora_ranks_ptr, lora_ranks_size, lora_scales_ptr, lora_scales_size, y_ptr, batch_size,
-                    input_hidden_token, max_lora_rank, workspace_tensor, tiling_tensor);
-    return;
-}
-
-}  // namespace npu_kernel
-}  // namespace sglang
diff --git a/csrc/lora/op_host/tiling/sgemmc_tiling.cpp b/csrc/lora/op_host/tiling/sgemmc_tiling.cpp
deleted file mode 100644
index 69c5d4d3b..000000000
--- a/csrc/lora/op_host/tiling/sgemmc_tiling.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#include "common.h"
-#include "sgemmc_tiling.h"
-
-namespace sglang {
-namespace npu_kernel {
-
-matmul_tiling::DataType ConvertToMatMulTypes(host_utils::DataType data_type)
-{
-    switch (data_type) {
-        case host_utils::DataType::DT_BFLOAT16:
-            return matmul_tiling::DataType::DT_BFLOAT16;
-        case host_utils::DataType::DT_FLOAT:
-            return matmul_tiling::DataType::DT_FLOAT;
-        case host_utils::DataType::DT_FLOAT16:
-            return matmul_tiling::DataType::DT_FLOAT16;
-    }
-
-    return matmul_tiling::DataType::DT_FLOAT16;
-}
-
-at::Tensor GenerateTiling(uint32_t &block_dim, uint32_t &workspace_size, uint32_t batch_size, uint32_t inner_size,
-                          uint32_t output_size, const host_utils::DataType type)
-{
-    auto ascendc_platform = *platform_ascendc::PlatformAscendCManager::GetInstance();
-    uint32_t aiv_num = ascendc_platform.GetCoreNumAiv();
-    uint32_t aic_num = ascendc_platform.GetCoreNumAic();
-    workspace_size = ascendc_platform.GetLibApiWorkSpaceSize();
-
-    auto tilingBuffer = at::empty({sizeof(SGEMMCTilingData)}, at::TensorOptions().dtype(at::kByte).device(at::kCPU));
-    SGEMMCTilingData *tiling_data = reinterpret_cast<SGEMMCTilingData *>(tilingBuffer.data_ptr());
-
-    matmul_tiling::MultiCoreMatmulTiling cubeTiling(ascendc_platform);
-
-    const matmul_tiling::DataType data_type = ConvertToMatMulTypes(type);
-
-    cubeTiling.EnableBias(false);
-    cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::VECTOR, data_type, false);
-    cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, data_type, true);
-    cubeTiling.SetCType(matmul_tiling::TPosition::VECIN, matmul_tiling::CubeFormat::ND,
-                        matmul_tiling::DataType::DT_FLOAT);
-    cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, data_type);
-    cubeTiling.EnableMultiCoreSplitK(false);
-    cubeTiling.SetDim(aic_num);
-
-    cubeTiling.SetOrgShape(1, inner_size, output_size);
-    cubeTiling.SetShape(1, inner_size, output_size);
-    cubeTiling.SetBufferSpace(-1, -1, -1);
-
-    if (cubeTiling.GetTiling(tiling_data->cubeTiling) == -1) {
-        TORCH_CHECK(false, "Generate tiling failed.");
-        return {};
-    }
-
-    tiling_data->batch = batch_size;
-    tiling_data->dataType = (type == host_utils::DataType::DT_BFLOAT16);
-
-    block_dim = batch_size * tiling_data->cubeTiling.usedCoreNum;
-
-    return tilingBuffer;
-}
-
-}  // namespace npu_kernel
-}  // namespace sglang
diff --git a/csrc/lora/op_host/tiling/sgemmc_tiling.h b/csrc/lora/op_host/tiling/sgemmc_tiling.h
deleted file mode 100644
index 075bf6594..000000000
--- a/csrc/lora/op_host/tiling/sgemmc_tiling.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#ifndef SGEMMC_TILING_H
-#define SGEMMC_TILING_H
-
-#include <register/tilingdata_base.h>
-#include <tiling/tiling_api.h>
-
-#include "torch_helper.h"
-#include "common_tiling.h"
-#include "sgemmc_tiling_data.h"
-
-namespace sglang {
-namespace npu_kernel {
-
-at::Tensor GenerateTiling(uint32_t &blockDim, uint32_t &workspace, uint32_t batch, uint32_t hidden_size, uint32_t k,
-                          const host_utils::DataType type);
-
-}  // namespace npu_kernel
-}  // namespace sglang
-
-#endif  // SGEMMC_TILING_H
diff --git a/csrc/lora/op_host/tiling/sgemmc_tiling_data.h b/csrc/lora/op_host/tiling/sgemmc_tiling_data.h
deleted file mode 100644
index 88c99cd8a..000000000
--- a/csrc/lora/op_host/tiling/sgemmc_tiling_data.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#ifndef SGEMMC_TILING_DATA_H
-#define SGEMMC_TILING_DATA_H
-
-#include <cstdint>
-
-namespace AscendC {
-namespace tiling {
-
-struct TCubeTiling;
-
-}  // namespace tiling
-}  // namespace AscendC
-
-namespace sglang {
-namespace npu_kernel {
-
-#pragma pack(push, 1)
-struct SGEMMCTilingData {
-    uint32_t dataType;
-    uint32_t batch;
-    uint32_t hidden;
-    uint32_t k;
-    uint32_t slices;
-    AscendC::tiling::TCubeTiling cubeTiling;
-};
-#pragma pack(pop)
-
-}  // namespace npu_kernel
-}  // namespace sglang
-
-#endif  // SGEMMC_TILING_DATA_H
diff --git a/csrc/lora/op_kernel/lora_common_kernel.h b/csrc/lora/op_kernel/lora_common_kernel.h
deleted file mode 100644
index 34ce9bb59..000000000
--- a/csrc/lora/op_kernel/lora_common_kernel.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#ifndef SGL_KERNEL_NPU_KERNEL_LORA_COMMON_H
-#define SGL_KERNEL_NPU_KERNEL_LORA_COMMON_H
-
-#include "kernel_operator.h"
-
-namespace lora_common {
-
-template <typename scalar_t>
-class BlockIterator
-{
-    AscendC::GlobalTensor<scalar_t> blocks;
-    int64_t previous_block;
-    int64_t previous_offset;
-
-public:
-    __aicore__ explicit BlockIterator(AscendC::GlobalTensor<scalar_t> &blocks_)
-        : blocks(blocks_), previous_block(0), previous_offset(0)
-    {}
-    __aicore__ inline int64_t GetBlockIdx(int64_t index)
-    {
-        int64_t current_offset = previous_offset;
-        uint64_t blockIdx = previous_block;
-
-        for (; blockIdx < blocks.GetSize(); ++blockIdx) {
-            int64_t blockOffset = blocks.GetValue(blockIdx);
-            if (index >= current_offset + blockOffset) {
-                current_offset += blockOffset;
-            } else {
-                previous_offset = current_offset;
-                previous_block = blockIdx;
-                return blockIdx;
-            }
-        }
-
-        return -1;
-    }
-};
-
-}  // namespace lora_common
-
-#endif  // SGL_KERNEL_NPU_KERNEL_LORA_COMMON_H
diff --git a/csrc/lora/op_kernel/sgemmc_expand_kernel.cpp b/csrc/lora/op_kernel/sgemmc_expand_kernel.cpp
deleted file mode 100644
index 32bb42ad9..000000000
--- a/csrc/lora/op_kernel/sgemmc_expand_kernel.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#ifndef SGL_KERNEL_NPU_KERNEL_SGEMMC_EXPAND_H
-#define SGL_KERNEL_NPU_KERNEL_SGEMMC_EXPAND_H
-
-#include <type_traits>
-#include "kernel_operator.h"
-#include "lib/matmul_intf.h"
-#include "lora_common_kernel.h"
-#include "common_tiling_kernel.h"
-
-#include "../op_host/tiling/sgemmc_tiling_data.h"
-
-template <typename scalar_t, typename inner_t>
-class SGEMMCExpand
-{
-public:
-    using X_T = scalar_t;
-    using W_T = scalar_t;
-    using INNER_T = inner_t;
-    using Y_T = scalar_t;
-
-    using X_MAT_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::VECTOR, X_T, false>;
-    using W_MAT_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, W_T, true>;
-    using Y_MAT_TYPE = AscendC::MatmulType<AscendC::TPosition::VECIN, CubeFormat::ND, INNER_T>;
-    using BIAS_MAT_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, X_T>;
-
-    using MAT_TYPE = AscendC::Matmul<X_MAT_TYPE, W_MAT_TYPE, Y_MAT_TYPE, BIAS_MAT_TYPE, CFG_MDL>;
-
-public:
-    __aicore__ explicit SGEMMCExpand(AscendC::TPipe *pipe) : pipe_(pipe) {}
-    __aicore__ inline void Init(GM_ADDR x, GM_ADDR weight, GM_ADDR loraIndices, uint32_t loraIndicesSize,
-                                GM_ADDR seqLen, uint32_t seqLenSize, GM_ADDR loraRanks, uint32_t loraRanksSize,
-                                GM_ADDR sliceOffsets, uint32_t sliceOffsetsSize, GM_ADDR yIn, GM_ADDR yOut,
-                                uint32_t batchSize, uint32_t maxLoRARank, uint32_t outputFullDim, GM_ADDR workspace,
-                                TCubeTiling &tiling)
-    {
-        this->tiling = tiling;
-
-        batchSize_ = batchSize;
-        maxLoRARank_ = maxLoRARank;
-        sliceCount_ = sliceOffsetsSize - 1;
-        outputFullDim_ = outputFullDim;
-        singleLoRAWeightLen_ = maxLoRARank_ * outputFullDim_;
-
-        xInGm_.SetGlobalBuffer(reinterpret_cast<__gm__ X_T *>(x));
-        wInGm_.SetGlobalBuffer(reinterpret_cast<__gm__ W_T *>(weight));
-        yInGm_.SetGlobalBuffer(reinterpret_cast<__gm__ Y_T *>(yIn));
-        yOutGm_.SetGlobalBuffer(reinterpret_cast<__gm__ Y_T *>(yOut));
-        loraIndicesGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(loraIndices), loraIndicesSize);
-        seqLenGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(seqLen), seqLenSize);
-        loraRanksGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(loraRanks), loraRanksSize);
-        sliceOffsetsGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(sliceOffsets), sliceOffsetsSize);
-
-        workspaceGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ Y_T *>(workspace));
-
-        REGIST_MATMUL_OBJ(pipe_, GetSysWorkSpacePtr(), matmulObj, &tiling);
-    }
-
-    __aicore__ inline void Process()
-    {
-        int64_t blocks = AscendC::GetBlockNum();
-        int64_t blockIdx = AscendC::GetBlockIdx();
-
-        AscendC::WaitPreTaskEnd();
-
-        int64_t requestBlock = 0;
-        lora_common::BlockIterator blockIterator(seqLenGm_);
-        requestBlock = blockIterator.GetBlockIdx(blockIdx);
-        if (requestBlock < 0) {
-            return;
-        }
-
-        int32_t reqLoRAIndex_ = loraIndicesGm_.GetValue(requestBlock);
-        if (reqLoRAIndex_ < 0) {
-            return;
-        }
-
-        int64_t reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
-        int32_t reqLoRARank_ = loraRanksGm_.GetValue(reqLoRAIndex_);
-
-        if (reqLoRARank_ == 0) {
-            return;
-        }
-
-        matmulObj.SetWorkspace(workspaceGlobal);
-        matmulObj.SetTensorA(xInGm_);
-        matmulObj.SetTensorB(wInGm_);
-        matmulObj.template Iterate<false>();
-
-        uint32_t baseM = tiling.baseM;
-        uint32_t baseN = tiling.baseN;
-        pipe_->InitBuffer(vectorCalcBuf, baseM * baseN * sizeof(INNER_T));
-        pipe_->InitBuffer(vectorInQueue, 1, baseM * baseN * sizeof(INNER_T));
-        pipe_->InitBuffer(vectorYInQueue, 1, baseM * baseN * sizeof(INNER_T));
-        pipe_->InitBuffer(vectorOutQueue, 1, baseM * baseN * sizeof(Y_T));
-
-        AscendC::DataCopyParams copyParams = {(uint16_t)baseM,
-                                              (uint16_t)(baseN * sizeof(Y_T) / AscendC::DEFAULT_C0_SIZE), (uint16_t)0,
-                                              (uint16_t)((tiling.N - baseN) * sizeof(Y_T) / AscendC::DEFAULT_C0_SIZE)};
-        uint32_t iterateTimes = AscendC::Ceil(tiling.singleCoreM, baseM) * AscendC::Ceil(tiling.singleCoreN, baseN);
-        for (uint32_t i = 0; i < iterateTimes; ++i) {
-            auto cInLocal = vectorInQueue.AllocTensor<INNER_T>();
-            matmulObj.template GetTensorC<false>(cInLocal);
-            vectorInQueue.EnQue(cInLocal);
-
-            AscendC::LocalTensor<Y_T> yInLocalCube = vectorYInQueue.AllocTensor<Y_T>();
-            DataCopy(yInLocalCube, yInGm_[i], baseM * baseN);
-            vectorYInQueue.EnQue(yInLocalCube);
-
-            AscendC::LocalTensor<INNER_T> tmpTensor = vectorCalcBuf.Get<INNER_T>();
-            AscendC::LocalTensor<Y_T> yInLocal = vectorYInQueue.DeQue<Y_T>();
-            AscendC::LocalTensor<INNER_T> yLocal = vectorInQueue.DeQue<INNER_T>();
-            Cast(tmpTensor, yInLocal, AscendC::RoundMode::CAST_NONE, baseM * baseN);
-            pipe_barrier(PIPE_V);
-            vectorYInQueue.FreeTensor(yInLocal);
-
-            Add(yLocal, yLocal, tmpTensor, baseM * baseN);
-            pipe_barrier(PIPE_V);
-
-            AscendC::LocalTensor<Y_T> yOutLocal = vectorOutQueue.AllocTensor<Y_T>();
-            Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, baseM * baseN);
-            pipe_barrier(PIPE_V);
-
-            vectorOutQueue.EnQue<Y_T>(yOutLocal);
-
-            // copy out
-            auto cOutLocal = vectorOutQueue.DeQue<Y_T>();
-            DataCopy(yOutGm_[i], cOutLocal, copyParams);
-            vectorOutQueue.FreeTensor(cOutLocal);
-        }
-        matmulObj.End();
-        AscendC::SetNextTaskStart();
-    }
-
-private:
-    AscendC::TPipe *pipe_;
-    MAT_TYPE matmulObj;
-
-    AscendC::GlobalTensor<Y_T> workspaceGlobal;
-
-    TCubeTiling tiling;
-    AscendC::TQue<AscendC::QuePosition::VECIN, 1> vectorInQueue;
-    AscendC::TQue<AscendC::QuePosition::VECIN, 1> vectorYInQueue;
-    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> vectorOutQueue;
-    AscendC::TBuf<AscendC::QuePosition::VECCALC> vectorCalcBuf;
-
-    AscendC::GlobalTensor<X_T> xInGm_;
-    AscendC::GlobalTensor<W_T> wInGm_;
-    AscendC::GlobalTensor<Y_T> yInGm_;
-    AscendC::GlobalTensor<Y_T> yOutGm_;
-
-    AscendC::GlobalTensor<int32_t> seqLenGm_;
-    AscendC::GlobalTensor<int32_t> loraIndicesGm_;
-
-    AscendC::GlobalTensor<int32_t> loraRanksGm_;
-    AscendC::GlobalTensor<int32_t> sliceOffsetsGm_;
-
-    uint32_t batchSize_;
-    uint32_t sliceCount_;
-    uint32_t maxLoRARank_;
-    uint32_t outputHiddenDim_;
-    uint32_t sliceOffset_;
-    uint32_t outputFullDim_;
-    uint32_t singleLoRAWeightLen_;
-    int64_t reqLoRAIndex_;
-    int32_t reqLoRARank_;
-    uint64_t reqLoRAWeightOffset_;
-    int32_t reqSlice_;
-    uint32_t numOutputElementsPerInputTile_;
-    uint32_t numStreamInPerOutputTile_;
-    uint64_t yOffset_;
-};
-
-extern "C" __global__ __aicore__ void sgemmc_expand(GM_ADDR x, GM_ADDR weight, GM_ADDR loraIndices,
-                                                    uint32_t loraIndicesSize, GM_ADDR seqLen, uint32_t seqLenSize,
-                                                    GM_ADDR loraRanks, uint32_t loraRanksSize, GM_ADDR sliceOffsets,
-                                                    uint32_t sliceOffsetsSize, GM_ADDR yIn, GM_ADDR yOut,
-                                                    uint32_t batchSize, uint32_t maxLoRARank, uint32_t outputFullDim,
-                                                    GM_ADDR workspace, GM_ADDR tiling)
-{
-    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIC_1_1);
-
-    AscendC::TPipe pipe;
-    sglang::npu_kernel::SGEMMCTilingData tilingData;
-    kernel_utils::CopyTiling(&tilingData, tiling);
-
-    if (tilingData.dataType == 1) {
-        SGEMMCExpand<bfloat16_t, float> op(&pipe);
-        op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, loraRanks, loraRanksSize, sliceOffsets,
-                sliceOffsetsSize, yIn, yOut, batchSize, maxLoRARank, outputFullDim, workspace, tilingData.cubeTiling);
-        op.Process();
-    } else {
-        SGEMMCExpand<half, float> op(&pipe);
-        op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, loraRanks, loraRanksSize, sliceOffsets,
-                sliceOffsetsSize, yIn, yOut, batchSize, maxLoRARank, outputFullDim, workspace, tilingData.cubeTiling);
-        op.Process();
-    }
-}
-
-#endif  // SGL_KERNEL_NPU_KERNEL_SGEMMC_EXPAND_H
diff --git a/csrc/lora/op_kernel/sgemmc_shrink_kernel.cpp b/csrc/lora/op_kernel/sgemmc_shrink_kernel.cpp
deleted file mode 100644
index 0d7f69bb4..000000000
--- a/csrc/lora/op_kernel/sgemmc_shrink_kernel.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-#ifndef SGL_KERNEL_NPU_KERNEL_SGEMMC_SHRINK_H
-#define SGL_KERNEL_NPU_KERNEL_SGEMMC_SHRINK_H
-
-#include <type_traits>
-#include "kernel_operator.h"
-#include "lib/matmul_intf.h"
-#include "lora_common_kernel.h"
-#include "common_tiling_kernel.h"
-
-#include "../op_host/tiling/sgemmc_tiling_data.h"
-
-template <typename scalar_t, typename inner_t>
-class SGEMMCShrink
-{
-public:
-    using X_T = scalar_t;
-    using W_T = scalar_t;
-    using INNER_T = inner_t;
-    using Y_T = scalar_t;
-
-    using X_MAT_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::VECTOR, X_T, false>;
-    using W_MAT_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, W_T, true>;
-    using Y_MAT_TYPE = AscendC::MatmulType<AscendC::TPosition::VECIN, CubeFormat::ND, INNER_T>;
-    using BIAS_MAT_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, X_T>;
-
-    using MAT_TYPE = AscendC::Matmul<X_MAT_TYPE, W_MAT_TYPE, Y_MAT_TYPE, BIAS_MAT_TYPE, CFG_MDL>;
-
-public:
-    __aicore__ explicit SGEMMCShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
-    __aicore__ inline void Init(GM_ADDR x, GM_ADDR weight, GM_ADDR loraIndices, uint32_t loraIndicesSize,
-                                GM_ADDR seqLen, uint32_t seqLenSize, GM_ADDR loraRanks, uint32_t loraRanksSize,
-                                GM_ADDR loraScales, uint32_t loraScalesSize, GM_ADDR y, uint32_t batchSize,
-                                uint32_t inputHiddenDim, uint32_t maxLoRARank, GM_ADDR workspace, TCubeTiling &tiling)
-    {
-        this->tiling = tiling;
-
-        batchSize_ = batchSize;
-        inputHiddenDim_ = inputHiddenDim;
-        maxLoRARank_ = maxLoRARank;
-        singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_;
-
-        xInGm_.SetGlobalBuffer(reinterpret_cast<__gm__ X_T *>(x));
-        yOutGm_.SetGlobalBuffer(reinterpret_cast<__gm__ Y_T *>(y));
-        wInGm_.SetGlobalBuffer(reinterpret_cast<__gm__ W_T *>(weight));
-        loraIndicesGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(loraIndices), loraIndicesSize);
-        seqLenGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(seqLen), seqLenSize);
-        loraRanksGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(loraRanks), loraRanksSize);
-        loraScalesGm_.SetGlobalBuffer(reinterpret_cast<__gm__ half *>(loraScales), loraScalesSize);
-
-        workspaceGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ Y_T *>(workspace));
-
-        REGIST_MATMUL_OBJ(pipe_, GetSysWorkSpacePtr(), matmulObj, &tiling);
-    }
-
-    __aicore__ inline void Process()
-    {
-        int64_t blocks = AscendC::GetBlockNum();
-        int64_t blockIdx = AscendC::GetBlockIdx();
-
-        AscendC::WaitPreTaskEnd();
-
-        int64_t batchIdx = 0;
-        int64_t requestBlock = 0;
-        lora_common::BlockIterator blockIterator(seqLenGm_);
-        requestBlock = blockIterator.GetBlockIdx(batchIdx);
-        if (requestBlock < 0) {
-            return;
-        }
-
-        int32_t reqLoRAIndex_ = loraIndicesGm_.GetValue(requestBlock);
-        if (reqLoRAIndex_ < 0) {
-            return;
-        }
-
-        int64_t reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
-        int32_t reqLoRARank_ = loraRanksGm_.GetValue(reqLoRAIndex_);
-
-        if (reqLoRARank_ == 0) {
-            return;
-        }
-
-        matmulObj.SetWorkspace(workspaceGlobal);
-        matmulObj.SetTensorA(xInGm_);
-        matmulObj.SetTensorB(wInGm_);
-        matmulObj.template Iterate<false>();
-
-        half loraScale = loraScalesGm_.GetValue(reqLoRAIndex_);
-        INNER_T scalar = AscendC::ScalarCast<half, INNER_T, AscendC::RoundMode::CAST_ROUND>(loraScale);
-
-        uint32_t baseM = this->tiling.baseM;
-        uint32_t baseN = this->tiling.baseN;
-        pipe_->InitBuffer(vectorCalcBuf, baseM * baseN * sizeof(INNER_T));
-        pipe_->InitBuffer(vectorInQueue, 1, baseM * baseN * sizeof(INNER_T));
-        pipe_->InitBuffer(vectorOutQueue, 1, baseM * baseN * sizeof(Y_T));
-
-        AscendC::DataCopyParams copyParams = {
-            (uint16_t)baseM, (uint16_t)(baseN * sizeof(Y_T) / AscendC::DEFAULT_C0_SIZE), (uint16_t)0,
-            (uint16_t)((this->tiling.N - baseN) * sizeof(Y_T) / AscendC::DEFAULT_C0_SIZE)};
-        uint32_t iterateTimes =
-            AscendC::Ceil(this->tiling.singleCoreM, baseM) * AscendC::Ceil(this->tiling.singleCoreN, baseN);
-        for (uint32_t i = 0; i < iterateTimes; ++i) {
-            // compute
-            auto cInLocal = vectorInQueue.AllocTensor<INNER_T>();
-            matmulObj.template GetTensorC<false>(cInLocal);
-            vectorInQueue.EnQue(cInLocal);
-            // any vector operator
-            auto src = vectorInQueue.DeQue<INNER_T>();
-            auto dst = vectorOutQueue.AllocTensor<Y_T>();
-
-            AscendC::LocalTensor<INNER_T> tmpTensor = vectorCalcBuf.Get<INNER_T>();
-            AscendC::Muls(tmpTensor, src, scalar, baseM * baseN);
-            AscendC::PipeBarrier<PIPE_V>();
-            AscendC::Cast(dst, tmpTensor, AscendC::RoundMode::CAST_NONE, baseM * baseN);
-            AscendC::PipeBarrier<PIPE_V>();
-            vectorOutQueue.EnQue(dst);
-            vectorInQueue.FreeTensor(src);
-            // copy out
-            auto cOutLocal = vectorOutQueue.DeQue<Y_T>();
-            DataCopy(yOutGm_[i], cOutLocal, copyParams);
-            vectorOutQueue.FreeTensor(cOutLocal);
-        }
-        matmulObj.End();
-        AscendC::SetNextTaskStart();
-    }
-
-private:
-    AscendC::TPipe *pipe_;
-    MAT_TYPE matmulObj;
-
-    AscendC::GlobalTensor<X_T> xInGm_;
-    AscendC::GlobalTensor<W_T> wInGm_;
-    AscendC::GlobalTensor<Y_T> yOutGm_;
-    AscendC::GlobalTensor<int32_t> loraIndicesGm_;
-    AscendC::GlobalTensor<int32_t> seqLenGm_;
-    AscendC::GlobalTensor<int32_t> loraRanksGm_;
-    AscendC::GlobalTensor<half> loraScalesGm_;
-
-    AscendC::GlobalTensor<Y_T> workspaceGlobal;
-
-    TCubeTiling tiling;
-    AscendC::TQue<AscendC::QuePosition::VECIN, 1> vectorInQueue;
-    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> vectorOutQueue;
-    AscendC::TBuf<AscendC::QuePosition::VECCALC> vectorCalcBuf;
-
-    uint32_t batchSize_;
-    uint32_t inputHiddenDim_;
-    uint32_t maxLoRARank_;
-    uint32_t singleLoRAWeightLen_;
-
-    uint64_t reqLoRAWeightOffset_;
-    int32_t reqLoRAIndex_;
-    int32_t reqLoRARank_;
-};
-
-extern "C" __global__ __aicore__ void sgemmc_shrink(GM_ADDR x, GM_ADDR weight, GM_ADDR loraIndices,
-                                                    uint32_t loraIndicesSize, GM_ADDR seqLen, uint32_t seqLenSize,
-                                                    GM_ADDR loraRanks, uint32_t loraRanksSize, GM_ADDR loraScales,
-                                                    uint32_t loraScalesSize, GM_ADDR y, uint32_t batchSize,
-                                                    uint32_t inputHiddenDim, uint32_t maxLoRARank, GM_ADDR workspace,
-                                                    GM_ADDR tiling)
-{
-    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIC_1_1);
-
-    AscendC::TPipe pipe;
-    sglang::npu_kernel::SGEMMCTilingData tilingData;
-    kernel_utils::CopyTiling(&tilingData, tiling);
-
-    if (tilingData.dataType == 1) {
-        SGEMMCShrink<bfloat16_t, float> op(&pipe);
-        op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, loraRanks, loraRanksSize, loraScales,
-                loraScalesSize, y, batchSize, inputHiddenDim, maxLoRARank, workspace, tilingData.cubeTiling);
-        op.Process();
-    } else {
-        SGEMMCShrink<half, float> op(&pipe);
-        op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, loraRanks, loraRanksSize, loraScales,
-                loraScalesSize, y, batchSize, inputHiddenDim, maxLoRARank, workspace, tilingData.cubeTiling);
-        op.Process();
-    }
-}
-
-#endif  // SGL_KERNEL_NPU_KERNEL_SGEMMC_SHRINK_H
diff --git a/csrc/lora/op_kernel/sgemmv_expand_kernel.cpp b/csrc/lora/op_kernel/sgemmv_expand_kernel.cpp
index 8111ed3a8..284b75ccf 100644
--- a/csrc/lora/op_kernel/sgemmv_expand_kernel.cpp
+++ b/csrc/lora/op_kernel/sgemmv_expand_kernel.cpp
@@ -20,7 +20,6 @@
 #define SGL_KERNEL_NPU_KERNEL_SGEMMV_EXPAND_H
 
 #include "kernel_operator.h"
-#include "lora_common_kernel.h"
 
 template <typename scalar_t>
 class SGEMMVExpand
@@ -107,19 +106,11 @@ class SGEMMVExpand
         if (endIdx > batchSize_) {
             endIdx = batchSize_;
         }
-
-        int64_t requestBlock = 0;
-        lora_common::BlockIterator blockIterator(seqLenGm_);
         for (int64_t idx = startIdx; idx < endIdx; idx++) {
             yOffset_ = outputFullDim_ * idx + sliceOffset_;
 
-            requestBlock = blockIterator.GetBlockIdx(idx);
-            if (requestBlock < 0) {
-                continue;
-            }
-
             // Set up LoRA index
-            reqLoRAIndex_ = loraIndicesGm_.GetValue(requestBlock);
+            CopyInIndex(idx);
             if (reqLoRAIndex_ < 0) {
                 continue;
             }
@@ -153,6 +144,22 @@ class SGEMMVExpand
     }
 
 private:
+    __aicore__ inline void CopyInIndex(const int64_t idx)
+    {
+        // Look up the LoRA index
+        int64_t weightIdx = idx;
+        uint64_t i = 0;
+        for (; i < seqLenGm_.GetSize(); i++) {
+            int64_t repeatValue = seqLenGm_.GetValue(i);
+            if (weightIdx >= repeatValue) {
+                weightIdx -= repeatValue;
+                continue;
+            }
+            break;
+        }
+        reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
+    }
+
     __aicore__ inline void ComputeLastIteration()
     {
         int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS;
diff --git a/csrc/lora/op_kernel/sgemmv_shrink_kernel.cpp b/csrc/lora/op_kernel/sgemmv_shrink_kernel.cpp
index 54dbf27e6..a01fa4df7 100644
--- a/csrc/lora/op_kernel/sgemmv_shrink_kernel.cpp
+++ b/csrc/lora/op_kernel/sgemmv_shrink_kernel.cpp
@@ -20,7 +20,6 @@
 #define SGL_KERNEL_NPU_KERNEL_SGEMMV_SHRINK_H
 
 #include "kernel_operator.h"
-#include "lora_common_kernel.h"
 
 template <typename scalar_t>
 class SGEMMVShrink
@@ -72,16 +71,9 @@ class SGEMMVShrink
         if (endIdx > batchSize_) {
             endIdx = batchSize_;
         }
-        int64_t requestBlock = 0;
-        lora_common::BlockIterator blockIterator(seqLenGm_);
         for (int64_t idx = startIdx; idx < endIdx; idx++) {
             // set up LoRA index
-            requestBlock = blockIterator.GetBlockIdx(idx);
-            if (requestBlock < 0) {
-                continue;
-            }
-
-            reqLoRAIndex_ = loraIndicesGm_.GetValue(requestBlock);
+            CopyInIndex(idx);
             if (reqLoRAIndex_ < 0) {
                 continue;
             }
@@ -134,6 +126,22 @@ class SGEMMVShrink
         }
     }
 
+    __aicore__ inline void CopyInIndex(const int64_t idx)
+    {
+        // look up the LoRA index
+        int64_t weightIdx = idx;
+        uint64_t i = 0;
+        for (; i < seqLenGm_.GetSize(); i++) {
+            int64_t repeatValue = seqLenGm_.GetValue(i);
+            if (weightIdx >= repeatValue) {
+                weightIdx -= repeatValue;
+                continue;
+            }
+            break;
+        }
+        reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
+    }
+
     __aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
     {
         AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
diff --git a/csrc/lora/op_kernel/sgmv_expand_kernel.cpp b/csrc/lora/op_kernel/sgmv_expand_kernel.cpp
index 5bb573990..9be8b01ce 100644
--- a/csrc/lora/op_kernel/sgmv_expand_kernel.cpp
+++ b/csrc/lora/op_kernel/sgmv_expand_kernel.cpp
@@ -20,7 +20,6 @@
 #define SGL_KERNEL_NPU_KERNEL_SGMV_EXPAND_H
 
 #include "kernel_operator.h"
-#include "lora_common_kernel.h"
 
 template <typename scalar_t>
 class SGMVExpand
@@ -103,18 +102,11 @@ class SGMVExpand
         if (endIdx > batchSize_) {
             endIdx = batchSize_;
         }
-
-        int64_t requestBlock = 0;
-        lora_common::BlockIterator blockIterator(seqLenGm_);
         for (int64_t idx = startIdx; idx < endIdx; idx++) {
             yOffset_ = outputFullDim_ * idx + sliceOffset_;
 
-            requestBlock = blockIterator.GetBlockIdx(idx);
-            if (requestBlock < 0) {
-                continue;
-            }
-
-            reqLoRAIndex_ = loraIndicesGm_.GetValue(requestBlock);
+            // Set up LoRA index
+            CopyInIndex(idx);
             if (reqLoRAIndex_ < 0) {
                 continue;
             }
@@ -136,6 +128,22 @@ class SGMVExpand
     }
 
 private:
+    __aicore__ inline void CopyInIndex(const int64_t idx)
+    {
+        // Look up the LoRA index
+        int64_t weightIdx = idx;
+        uint64_t i = 0;
+        for (; i < seqLenGm_.GetSize(); i++) {
+            int64_t repeatValue = seqLenGm_.GetValue(i);
+            if (weightIdx >= repeatValue) {
+                weightIdx -= repeatValue;
+                continue;
+            }
+            break;
+        }
+        reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
+    }
+
     __aicore__ inline void ComputeLastIteration()
     {
         int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS;
diff --git a/csrc/lora/op_kernel/sgmv_shrink_kernel.cpp b/csrc/lora/op_kernel/sgmv_shrink_kernel.cpp
index e483e5ab1..05b7212a8 100644
--- a/csrc/lora/op_kernel/sgmv_shrink_kernel.cpp
+++ b/csrc/lora/op_kernel/sgmv_shrink_kernel.cpp
@@ -20,7 +20,6 @@
 #define SGL_KERNEL_NPU_KERNEL_SGMV_SHRINK_H
 
 #include "kernel_operator.h"
-#include "lora_common_kernel.h"
 
 template <typename scalar_t>
 class SGMVShrink
@@ -70,17 +69,9 @@ class SGMVShrink
         if (endIdx > batchSize_) {
             endIdx = batchSize_;
         }
-
-        int64_t requestBlock = 0;
-        lora_common::BlockIterator blockIterator(seqLenGm_);
         for (int64_t idx = startIdx; idx < endIdx; idx++) {
             // set up LoRA index
-            requestBlock = blockIterator.GetBlockIdx(idx);
-            if (requestBlock < 0) {
-                continue;
-            }
-
-            reqLoRAIndex_ = loraIndicesGm_.GetValue(requestBlock);
+            CopyInIndex(idx);
             if (reqLoRAIndex_ < 0) {
                 continue;
             }
@@ -125,6 +116,22 @@ class SGMVShrink
         }
     }
 
+    __aicore__ inline void CopyInIndex(const int64_t idx)
+    {
+        // look up the LoRA index
+        int64_t weightIdx = idx;
+        uint64_t i = 0;
+        for (; i < seqLenGm_.GetSize(); i++) {
+            int64_t repeatValue = seqLenGm_.GetValue(i);
+            if (weightIdx >= repeatValue) {
+                weightIdx -= repeatValue;
+                continue;
+            }
+            break;
+        }
+        reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
+    }
+
     __aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
     {
         AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
diff --git a/csrc/pytorch_extensions.cpp b/csrc/pytorch_extensions.cpp
index 4a6810cf6..5c761af07 100644
--- a/csrc/pytorch_extensions.cpp
+++ b/csrc/pytorch_extensions.cpp
@@ -72,17 +72,14 @@ TORCH_LIBRARY_FRAGMENT(npu, m)
         "bgmv_expand(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y,"
         "            int slice_offset, int slice_size) -> Tensor");
 
-    m.def(
-        "bgmv_shrink(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y,"
-        "            float scale) -> ()");
+    m.def("bgmv_shrink(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y, float scale) -> ()");
 
     m.def(
         "sgmv_expand(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y,"
         "            int slice_offset, int slice_size) -> Tensor");
 
     m.def(
-        "sgmv_shrink(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y,"
-        " float scale) -> ()");
+        "sgmv_shrink(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y, float scale) -> ()");
 
     m.def(
         "sgemmv_expand(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! lora_ranks,"
@@ -99,14 +96,6 @@ TORCH_LIBRARY_FRAGMENT(npu, m)
         "Tensor(b!)? intermediate_state=None, Tensor? cache_indices=None, "
         "Tensor? num_accepted_tokens=None, Tensor? g=None, Tensor? gk=None) -> Tensor");
 
-    m.def(
-        "sgemmc_expand(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! lora_ranks,"
-        "              Tensor! sliceOffsets, Tensor! y) -> Tensor");
-
-    m.def(
-        "sgemmc_shrink(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! lora_ranks,"
-        "              Tensor! lora_scales, Tensor! y) -> ()");
-
 #ifdef BUILD_CATLASS_MODULE
     m.def("catlass_matmul_basic(Tensor tensor_a, Tensor tensor_b, Tensor(a!) tensor_c, str? format_mode=None) -> ()");
 #endif
@@ -166,10 +155,6 @@ TORCH_LIBRARY_IMPL(npu, PrivateUse1, m)
 
     m.impl("recurrent_gated_delta_rule", TORCH_FN(sglang::npu_kernel::recurrent_gated_delta_rule));
 
-    m.impl("sgemmc_expand", TORCH_FN(sglang::npu_kernel::sgemmc_expand));
-
-    m.impl("sgemmc_shrink", TORCH_FN(sglang::npu_kernel::sgemmc_shrink));
-
 #ifdef BUILD_CATLASS_MODULE
     m.impl("catlass_matmul_basic", TORCH_FN(sglang::npu_kernel::catlass_matmul_basic));
 #endif
diff --git a/csrc/utils/common_tiling.h b/csrc/utils/common_tiling.h
index dd8b22371..a7543142c 100644
--- a/csrc/utils/common_tiling.h
+++ b/csrc/utils/common_tiling.h
@@ -19,27 +19,6 @@
 
 namespace host_utils {
 
-enum class DataType : int32_t {
-    DT_FLOAT16 = 0,   // fp16 type
-    DT_BFLOAT16 = 1,  // bfloat16 type
-    DT_FLOAT = 2,     // float type
-    DT_DOUBLE = 3,    // double type
-    DT_BOOL = 4,      // bool type
-    DT_INT8 = 5,      // int8 type
-    DT_INT16,         // int16 type
-    DT_INT32,         // int32 type
-    DT_INT64,         // int64 type
-    DT_UINT8,         // unsigned int8 type
-    DT_UINT16,        // unsigned int16 type
-    DT_UINT32,        // unsigned int32 type
-    DT_UINT64,        // unsigned int64 type
-    DT_COMPLEX64,     // complex64 type
-    DT_COMPLEX128,    // complex128 type
-
-    DT_UNDEFINED,
-    DT_MAX  // max type
-};
-
 constexpr uint32_t FP16_SIZE = 2;
 constexpr uint32_t FP32_SIZE = 4;
 constexpr uint32_t BLOCK_SIZE = 16;
@@ -257,7 +236,5 @@ inline __attribute__((always_inline)) void PpMatmulTilingCheck(const PpTilingDat
     TORCH_CHECK(tilingData.nLoop > 0, "nLoop is invalid");
     TORCH_CHECK(tilingData.blockDim > 0, "nLoop is invalid");
 }
-
 }  // namespace host_utils
-
 #endif
diff --git a/csrc/utils/kernel/common_tiling_kernel.h b/csrc/utils/kernel/common_tiling_kernel.h
deleted file mode 100644
index efa8135f4..000000000
--- a/csrc/utils/kernel/common_tiling_kernel.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2026 Huawei Technologies Co., Ltd.
- * This file is a part of the CANN Open Software.
- * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- */
-
-#ifndef COMMMON_TILING_KERNEL_H
-#define COMMMON_TILING_KERNEL_H
-
-#include "kernel_operator.h"
-
-namespace kernel_utils {
-
-template <typename T>
-__aicore__ inline void CopyTiling(T *tiling, GM_ADDR tilingGM)
-{
-    uint32_t *ptr = reinterpret_cast<uint32_t *>(tiling);
-    auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM);
-
-    for (int i = 0; i < sizeof(T) / sizeof(uint32_t); ++i, ++ptr) {
-        *ptr = *(tiling32 + i);
-    }
-}
-
-}  // namespace kernel_utils
-
-#endif  // COMMMON_TILING_KERNEL_H
diff --git a/csrc/utils/torch_helper.h b/csrc/utils/torch_helper.h
index 239f0824c..ab92de038 100644
--- a/csrc/utils/torch_helper.h
+++ b/csrc/utils/torch_helper.h
@@ -25,7 +25,6 @@
 
 #include "torch_npu/csrc/core/npu/NPUStream.h"
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "common_tiling.h"
 
 namespace sglang {
 namespace npu_kernel {
@@ -53,49 +52,6 @@ class TorchNpuHelper
         return const_cast<void *>(at_tensor.data_ptr());
     }
 
-    inline static host_utils::DataType ConvertDataType(const at::ScalarType type)
-    {
-        switch (type) {
-            case at::ScalarType::Float:
-                return host_utils::DataType::DT_FLOAT;
-
-            case at::ScalarType::Half:
-                return host_utils::DataType::DT_FLOAT16;
-
-            case at::ScalarType::BFloat16:
-                return host_utils::DataType::DT_BFLOAT16;
-
-            case at::ScalarType::Double:
-                return host_utils::DataType::DT_DOUBLE;
-
-            case at::ScalarType::Bool:
-                return host_utils::DataType::DT_BOOL;
-
-            case at::ScalarType::Char:
-                return host_utils::DataType::DT_INT8;
-
-            case at::ScalarType::Short:
-                return host_utils::DataType::DT_INT16;
-
-            case at::ScalarType::Int:
-                return host_utils::DataType::DT_INT32;
-
-            case at::ScalarType::Long:
-                return host_utils::DataType::DT_INT64;
-
-            case at::ScalarType::Byte:
-                return host_utils::DataType::DT_UINT8;
-
-            case at::ScalarType::ComplexFloat:
-                return host_utils::DataType::DT_COMPLEX64;
-
-            case at::ScalarType::ComplexDouble:
-                return host_utils::DataType::DT_COMPLEX128;
-        }
-
-        return host_utils::DataType::DT_MAX;
-    }
-
     template <typename T>
     inline static T ConvertType(T value)
     {
diff --git a/include/sgl_kenel_npu_ops.h b/include/sgl_kenel_npu_ops.h
index a156d93b1..95460b68a 100644
--- a/include/sgl_kenel_npu_ops.h
+++ b/include/sgl_kenel_npu_ops.h
@@ -106,15 +106,6 @@ at::Tensor recurrent_gated_delta_rule(
     c10::optional<at::Tensor> num_accepted_tokens_opt,
     c10::optional<at::Tensor> g_opt, c10::optional<at::Tensor> gk_opt);
 
-at::Tensor sgemmc_expand(at::Tensor &x, at::Tensor &weight,
-                         at::Tensor &lora_indices, at::Tensor &seq_len,
-                         at::Tensor &lora_ranks, at::Tensor &slice_offsets,
-                         at::Tensor &y);
-
-void sgemmc_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices,
-                   at::Tensor &seq_len, at::Tensor &lora_ranks,
-                   at::Tensor &lora_scales, at::Tensor &y);
-
 #ifdef BUILD_CATLASS_MODULE
 void catlass_matmul_basic(const at::Tensor &tensor_a,
                           const at::Tensor &tensor_b, at::Tensor &tensor_c,
diff --git a/python/sgl_kernel_npu/sgl_kernel_npu/mamba/mamba_state_update_triton.py b/python/sgl_kernel_npu/sgl_kernel_npu/mamba/mamba_state_update_triton.py
index 854910021..08542cd44 100644
--- a/python/sgl_kernel_npu/sgl_kernel_npu/mamba/mamba_state_update_triton.py
+++ b/python/sgl_kernel_npu/sgl_kernel_npu/mamba/mamba_state_update_triton.py
@@ -113,12 +113,8 @@ def move_intermediate_cache(
     dst_layer_stride, dst_size_stride = int(ssm_states.stride()[0]), int(
         ssm_states.stride()[1]
     )
-    assert len(dst_indices_tensor) == len(
-        last_steps_tensor
-    ), "Destination indices lengths must match"
-    assert len(src_indices_tensor) == len(
-        last_steps_tensor
-    ), "Source indices lengths must match"
+    assert len(dst_indices_tensor) == len(last_steps_tensor), "Destination indices lengths must match"
+    assert len(src_indices_tensor) == len(last_steps_tensor), "Source indices lengths must match"
 
     # Grid: one thread per valid index
     grid = (len(dst_indices_tensor),)
diff --git a/tests/python/sgl_kernel_npu/test_mamba_state_update.py b/tests/python/sgl_kernel_npu/test_mamba_state_update.py
index ed78f1797..e755f9767 100644
--- a/tests/python/sgl_kernel_npu/test_mamba_state_update.py
+++ b/tests/python/sgl_kernel_npu/test_mamba_state_update.py
@@ -139,9 +139,7 @@ def test_move_intermediate_cache(
     valid_indices = random.sample(population, num_valid)
     last_step_pos = [random.randint(0, D - 1) for _ in range(num_valid)]
     dst_indices_tensor = torch.tensor(valid_indices, device=device, dtype=torch.int32)
-    src_indices_tensor = torch.arange(
-        dst_indices_tensor.shape[0], device=device, dtype=torch.int32
-    )
+    src_indices_tensor = torch.arange(dst_indices_tensor.shape[0], device=device, dtype=torch.int32)
     last_steps_tensor = torch.tensor(last_step_pos, device=device, dtype=torch.int32)
 
     valid_mask = last_steps_tensor >= 0
@@ -153,12 +151,6 @@ def test_move_intermediate_cache(
         :, src_state_indices, valid_last_steps
     ]
 
-    move_intermediate_cache(
-        dst_cache_clone,
-        src_cache,
-        dst_indices_tensor,
-        src_indices_tensor,
-        last_steps_tensor,
-    )
+    move_intermediate_cache(dst_cache_clone, src_cache, dst_indices_tensor, src_indices_tensor, last_steps_tensor)
 
     assert_close("move_cache", dst_cache, dst_cache_clone, 1e-3)