sgl-project · ping1jing2 · May 6, 2026 · Jan 26, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/csrc/CMakeLists.txt b/csrc/CMakeLists.txt
@@ -19,6 +19,9 @@ FILE(GLOB OP_SRCS
     ${PROJECT_OP_SRC_BASE}/lora/op_host/sgmv_shrink.cpp
     ${PROJECT_OP_SRC_BASE}/lora/op_host/sgemmv_expand.cpp
     ${PROJECT_OP_SRC_BASE}/lora/op_host/sgemmv_shrink.cpp
+    ${PROJECT_OP_SRC_BASE}/lora/op_host/sgemmc_expand.cpp
+    ${PROJECT_OP_SRC_BASE}/lora/op_host/sgemmc_shrink.cpp
+    ${PROJECT_OP_SRC_BASE}/lora/op_host/tiling/sgemmc_tiling.cpp
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/lightning_indexer.cpp
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/tiling/lightning_indexer_tiling.cpp
     ${PROJECT_OP_SRC_BASE}/tri_inv/op_host/tri_inv.cpp
@@ -51,6 +54,10 @@ ascendc_library(no_workspace_kernel STATIC
     ${PROJECT_OP_SRC_BASE}/recurrent_gated_delta_rule/op_kernel/recurrent_gated_delta_rule_kernel.cpp
 )
 
+# Add utils/kernel to the PRIVATE include directories of the no_workspace_kernel
+# library so its kernel sources can include headers from that directory.
+ascendc_include_directories(no_workspace_kernel PRIVATE
+    ${PROJECT_OP_SRC_BASE}/utils/kernel
+)
+
 # kernel side files with workspace
 set(WORKSPACE_KERNEL_SRCS
     ${PROJECT_OP_SRC_BASE}/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
@@ -59,6 +66,8 @@ set(WORKSPACE_KERNEL_SRCS
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_kernel/lightning_indexer_kernel.cpp
     ${PROJECT_OP_SRC_BASE}/causal_conv1d_update/op_kernel/causal_conv1d_update.cpp
     ${PROJECT_OP_SRC_BASE}/causal_conv1d/op_kernel/causal_conv1d.cpp
+    ${PROJECT_OP_SRC_BASE}/lora/op_kernel/sgemmc_expand_kernel.cpp
+    ${PROJECT_OP_SRC_BASE}/lora/op_kernel/sgemmc_shrink_kernel.cpp
 )
 if(BUILD_CATLASS_MODULE)
     list(APPEND WORKSPACE_KERNEL_SRCS
@@ -76,6 +85,10 @@ if(BUILD_CATLASS_MODULE)
     )
 endif()
 
+# Add utils/kernel to the PRIVATE include directories of the workspace_kernel
+# library so its kernel sources can include headers from that directory.
+ascendc_include_directories(workspace_kernel PRIVATE
+    ${PROJECT_OP_SRC_BASE}/utils/kernel
+)
+
 ascendc_compile_definitions(workspace_kernel PRIVATE
        -DHAVE_WORKSPACE
        -DHAVE_TILING
@@ -115,6 +128,7 @@ target_include_directories(${OP_PLUGIN_NAME} PRIVATE
         ${TORCH_DIR}/include
         ${TORCH_DIR}/include/torch/csrc/api/include
         ${TORCH_NPU_DIR}/include
+        ${ASCEND_INCLUDE_DIR}
         ${ASCEND_INCLUDE_DIR}/external
         ${ASCEND_INCLUDE_DIR}/experiment/platform
         ${ASCEND_INCLUDE_DIR}/experiment/runtime

diff --git a/csrc/lora/op_host/sgemmc_expand.cpp b/csrc/lora/op_host/sgemmc_expand.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "defines.h"
+#include "tiling/sgemmc_tiling.h"
+#include "torch_helper.h"
+
+#include "aclrtlaunch_sgemmc_expand.h"
+
+namespace sglang {
+namespace npu_kernel {
+
+/**
+ * Host entry point that validates its arguments, builds the tiling and workspace
+ * tensors, and launches the sgemmc_expand kernel.
+ *
+ * @param x             2-D input, checked as [batch_size, hidden_in].
+ * @param weight        3-D or 4-D weight tensor.
+ * @param lora_indices  Passed to the kernel together with its size(0).
+ * @param seq_len       Passed to the kernel together with its size(0).
+ * @param lora_ranks    1-D tensor, passed to the kernel together with its size(0).
+ * @param slice_offsets 1-D tensor with at least two entries; the slice count is size(0) - 1.
+ * @param y             2-D tensor, checked as [batch_size, hidden_out]; its dtype must be half or bf16.
+ * @return A handle copy of y (the kernel receives y's data pointer as both input and output).
+ */
+HOST_API at::Tensor sgemmc_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len,
+                                  at::Tensor &lora_ranks, at::Tensor &slice_offsets, at::Tensor &y)
+{
+    // The supported element types are decided by the output tensor.
+    at::ScalarType out_dtype = y.scalar_type();
+    TORCH_CHECK(out_dtype == at::kHalf || out_dtype == at::kBFloat16, "only support half and bf16");
+    TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]");
+    TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4,
+                "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]");
+    TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]");
+    TORCH_CHECK(slice_offsets.dim() == 1 && slice_offsets.size(0) > 1,
+                "slice_offsets should be a vector of size 2 and more.");
+    TORCH_CHECK(lora_ranks.dim() == 1, "lora_ranks should be a vector.");
+
+    // The result aliases y; both pointers below come from the same tensor handle.
+    at::Tensor result = y;
+    void *x_addr = x.data_ptr();
+    void *weight_addr = weight.data_ptr();
+    void *y_addr = y.data_ptr();
+    void *result_addr = result.data_ptr();
+
+    // Auxiliary tensors are handed to the kernel as (pointer, element count) pairs.
+    void *indices_addr = lora_indices.data_ptr();
+    int indices_count = static_cast<int>(lora_indices.size(0));
+    void *seq_len_addr = seq_len.data_ptr();
+    int seq_len_count = static_cast<int>(seq_len.size(0));
+    void *ranks_addr = lora_ranks.data_ptr();
+    int ranks_count = static_cast<int>(lora_ranks.size(0));
+    void *offsets_addr = slice_offsets.data_ptr();
+    int offsets_count = static_cast<int>(slice_offsets.size(0));
+
+    // Problem sizes: the per-slice rank is x's second dimension divided (integer division) by the slice count.
+    int num_slices = offsets_count - 1;
+    int num_rows = static_cast<int>(x.size(0));
+    int rank_per_slice = static_cast<int>(x.size(1) / num_slices);
+    int out_width = static_cast<int>(y.size(1));
+
+    // Both values are filled in by GenerateTiling.
+    uint32_t launch_blocks;
+    uint32_t workspace_bytes;
+
+    at::Tensor tiling_buf = GenerateTiling(launch_blocks, workspace_bytes, num_rows, rank_per_slice, out_width,
+                                           num_slices, TorchNpuHelper::ConvertDataType(out_dtype));
+    auto workspace_buf = at::empty({workspace_bytes}, at::TensorOptions().dtype(at::kByte).device(x.device()));
+
+    /* launch the kernel function via torch */
+    EXEC_KERNEL_CMD(sgemmc_expand, launch_blocks, x_addr, weight_addr, indices_addr, indices_count, seq_len_addr,
+                    seq_len_count, ranks_addr, ranks_count, offsets_addr, offsets_count, y_addr, result_addr,
+                    num_rows, rank_per_slice, out_width, workspace_buf, tiling_buf);
+
+    return result;
+}
+
+}  // namespace npu_kernel
+}  // namespace sglang
diff --git a/csrc/lora/op_host/sgemmc_shrink.cpp b/csrc/lora/op_host/sgemmc_shrink.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "defines.h"
+#include "tiling/sgemmc_tiling.h"
+#include "torch_helper.h"
+
+#include "aclrtlaunch_sgemmc_shrink.h"
+
+namespace sglang {
+namespace npu_kernel {
+
+/**
+ * Host entry point that validates its arguments, builds the tiling and workspace
+ * tensors, and launches the sgemmc_shrink kernel.
+ *
+ * @param x            2-D input, checked as [batch_size, hidden_in]; its dtype must be half or bf16.
+ * @param weight       3-D or 4-D weight tensor.
+ * @param lora_indices Passed to the kernel together with its size(0).
+ * @param seq_len      Passed to the kernel together with its size(0).
+ * @param lora_ranks   Passed to the kernel together with its size(0).
+ * @param lora_scales  Passed to the kernel together with its size(0).
+ * @param y            2-D output, checked as [batch_size, hidden_out]; x.size(1) must exceed y.size(1).
+ * @param slices       Number of slices; must be positive. The per-slice rank is y.size(1) / slices.
+ */
+HOST_API void sgemmc_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len,
+                            at::Tensor &lora_ranks, at::Tensor &lora_scales, at::Tensor &y, int64_t slices)
+{
+    at::ScalarType scalar_type = x.scalar_type();
+    TORCH_CHECK(scalar_type == at::kHalf || scalar_type == at::kBFloat16, "only support half and bf16");
+    TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]");
+    TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4,
+                "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]");
+    TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]");
+    TORCH_CHECK(x.size(1) > y.size(1), "hidden in should be greater than hidden out");
+    // slices is used as a divisor and then narrowed to uint32_t below; zero would be a
+    // division by zero and a negative value would wrap to a huge unsigned count.
+    TORCH_CHECK(slices > 0, "slices should be a positive integer");
+
+    void *x_ptr = x.data_ptr();
+    void *weight_ptr = weight.data_ptr();
+    void *lora_indices_ptr = lora_indices.data_ptr();
+    int lora_indices_size = lora_indices.size(0);
+    void *seq_len_ptr = seq_len.data_ptr();
+    int seq_len_size = seq_len.size(0);
+    void *lora_ranks_ptr = lora_ranks.data_ptr();
+    int lora_ranks_size = lora_ranks.size(0);
+    void *lora_scales_ptr = lora_scales.data_ptr();
+    int lora_scales_size = lora_scales.size(0);
+
+    void *y_ptr = y.data_ptr();
+    int batch_size = x.size(0);
+    int input_hidden_token = x.size(1);
+    // NOTE(review): integer division; a y.size(1) that is not a multiple of slices is
+    // truncated here - confirm whether that case should be rejected instead.
+    uint32_t max_lora_rank = static_cast<uint32_t>(y.size(1) / slices);
+    uint32_t slice_count = static_cast<uint32_t>(slices);
+
+    // Both values are filled in by GenerateTiling.
+    uint32_t block_dim;
+    uint32_t workspace_size;
+
+    at::Tensor tiling_tensor = GenerateTiling(block_dim, workspace_size, batch_size, input_hidden_token, max_lora_rank,
+                                              slice_count, TorchNpuHelper::ConvertDataType(scalar_type));
+
+    auto workspace_tensor =
+        at::empty({workspace_size}, at::TensorOptions().dtype(at::kByte).device(x.options().device()));
+    /* launch the kernel function via torch */
+    EXEC_KERNEL_CMD(sgemmc_shrink, block_dim, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr,
+                    seq_len_size, lora_ranks_ptr, lora_ranks_size, lora_scales_ptr, lora_scales_size, y_ptr, batch_size,
+                    input_hidden_token, max_lora_rank, slice_count, workspace_tensor, tiling_tensor);
+}
+
+}  // namespace npu_kernel
+}  // namespace sglang
diff --git a/csrc/lora/op_host/tiling/sgemmc_tiling.cpp b/csrc/lora/op_host/tiling/sgemmc_tiling.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "common.h"
+#include "sgemmc_tiling.h"
+
+namespace sglang {
+namespace npu_kernel {
+
+/**
+ * Translates a host_utils::DataType into the matching matmul_tiling::DataType.
+ * BF16 and FP32 map to their counterparts; FP16 and every other value yield DT_FLOAT16.
+ */
+matmul_tiling::DataType ConvertToMatMulTypes(host_utils::DataType data_type)
+{
+    if (data_type == host_utils::DataType::DT_BFLOAT16) {
+        return matmul_tiling::DataType::DT_BFLOAT16;
+    }
+    if (data_type == host_utils::DataType::DT_FLOAT) {
+        return matmul_tiling::DataType::DT_FLOAT;
+    }
+
+    // DT_FLOAT16 itself and any unlisted type resolve to FP16.
+    return matmul_tiling::DataType::DT_FLOAT16;
+}
+
+/**
+ * Builds the SGEMMCTilingData for one launch and reports the launch geometry.
+ *
+ * The cube tiling is requested for a matmul of shape M = 1, N = output_size, K = inner_size,
+ * with A in VECTOR format, B in ND format with the transpose flag set, bias disabled and a
+ * float C placed in VECIN.
+ *
+ * @param block_dim      [out] Set to batch_size * slice_count.
+ * @param workspace_size [out] Set to the library API workspace size plus
+ *                       batch_size * baseM * baseN * sizeof(float) bytes.
+ * @param batch_size     Number of input rows.
+ * @param inner_size     K dimension passed to the matmul tiling.
+ * @param output_size    N dimension passed to the matmul tiling.
+ * @param slice_count    Number of slices; only used to derive block_dim.
+ * @param type           Element type of A and B; also selects tilingKey (1 for bf16, 0 otherwise).
+ * @return The tensor produced by TorchNpuHelper::CopyTensorHostToDevice from the host tiling buffer.
+ *
+ * Fails via TORCH_CHECK when the platform instance is unavailable, when tiling generation
+ * fails, or when the workspace size does not fit in uint32_t.
+ */
+at::Tensor GenerateTiling(uint32_t &block_dim, uint32_t &workspace_size, uint32_t batch_size, uint32_t inner_size,
+                          uint32_t output_size, uint32_t slice_count, const host_utils::DataType type)
+{
+    // Guard the dereference: a missing platform instance would otherwise be undefined behaviour.
+    auto platform_manager = platform_ascendc::PlatformAscendCManager::GetInstance();
+    TORCH_CHECK(platform_manager != nullptr, "Failed to get the AscendC platform instance.");
+    auto ascendc_platform = *platform_manager;
+
+    auto tilingBuffer = at::empty({sizeof(SGEMMCTilingData)}, at::TensorOptions().dtype(at::kByte).device(at::kCPU));
+    SGEMMCTilingData *tiling_data = reinterpret_cast<SGEMMCTilingData *>(tilingBuffer.data_ptr());
+
+    matmul_tiling::MatmulApiTiling cubeTiling(ascendc_platform);
+
+    const matmul_tiling::DataType data_type = ConvertToMatMulTypes(type);
+
+    cubeTiling.EnableBias(false);
+    cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::VECTOR, data_type, false);
+    cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, data_type, true);
+    cubeTiling.SetCType(matmul_tiling::TPosition::VECIN, matmul_tiling::CubeFormat::ND,
+                        matmul_tiling::DataType::DT_FLOAT);
+    cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, data_type);
+
+    // Each launch unit handles a single row, hence M = 1.
+    cubeTiling.SetOrgShape(1, output_size, inner_size);
+    cubeTiling.SetShape(1, output_size, inner_size);
+    cubeTiling.SetBufferSpace(-1, -1, -1);
+
+    TORCH_CHECK(cubeTiling.GetTiling(tiling_data->cubeTiling) != -1, "Generate tiling failed.");
+
+    // The key is 1 for bf16 and 0 for every other type.
+    tiling_data->tilingKey = (type == host_utils::DataType::DT_BFLOAT16);
+
+    block_dim = batch_size * slice_count;
+
+    // Accumulate in 64 bits so a large batch cannot silently wrap the 32-bit byte count.
+    const uint64_t cube_out_bytes = static_cast<uint64_t>(batch_size) *
+                                    static_cast<uint64_t>(tiling_data->cubeTiling.baseM) *
+                                    static_cast<uint64_t>(tiling_data->cubeTiling.baseN) * sizeof(float);
+    const uint64_t total_workspace =
+        static_cast<uint64_t>(ascendc_platform.GetLibApiWorkSpaceSize()) + cube_out_bytes;
+    TORCH_CHECK(total_workspace <= UINT32_MAX, "Workspace size exceeds the uint32_t range.");
+    workspace_size = static_cast<uint32_t>(total_workspace);
+
+    return TorchNpuHelper::CopyTensorHostToDevice(tilingBuffer);
+}
+
+}  // namespace npu_kernel
+}  // namespace sglang
diff --git a/csrc/lora/op_host/tiling/sgemmc_tiling.h b/csrc/lora/op_host/tiling/sgemmc_tiling.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#ifndef SGEMMC_TILING_H
+#define SGEMMC_TILING_H
+
+#include <register/tilingdata_base.h>
+#include <tiling/tiling_api.h>
+
+#include "torch_helper.h"
+#include "common_tiling.h"
+#include "sgemmc_tiling_data.h"
+
+namespace sglang {
+namespace npu_kernel {
+
+/**
+ * Generates the SGEMMC tiling data and reports the launch geometry.
+ *
+ * Parameter names mirror the definition in sgemmc_tiling.cpp, where the matmul shape is
+ * set as M = 1, N = output_size, K = inner_size.
+ *
+ * @param block_dim      [out] Number of blocks to launch.
+ * @param workspace_size [out] Workspace size in bytes.
+ * @param batch_size     Number of input rows.
+ * @param inner_size     K (reduction) dimension of the matmul.
+ * @param output_size    N (output) dimension of the matmul.
+ * @param slice_count    Number of slices.
+ * @param type           Element type of the matmul inputs.
+ * @return Tensor holding the tiling data.
+ */
+at::Tensor GenerateTiling(uint32_t &block_dim, uint32_t &workspace_size, uint32_t batch_size, uint32_t inner_size,
+                          uint32_t output_size, uint32_t slice_count, const host_utils::DataType type);
+
+}  // namespace npu_kernel
+}  // namespace sglang
+
+#endif  // SGEMMC_TILING_H
diff --git a/csrc/lora/op_host/tiling/sgemmc_tiling_data.h b/csrc/lora/op_host/tiling/sgemmc_tiling_data.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#ifndef SGEMMC_TILING_DATA_H
+#define SGEMMC_TILING_DATA_H
+
+#include <cstdint>
+
+namespace AscendC {
+namespace tiling {
+
+struct TCubeTiling;
+
+}  // namespace tiling
+}  // namespace AscendC
+
+namespace sglang {
+namespace npu_kernel {
+
+// Tiling payload for the SGEMMC kernels. Packed to 1-byte alignment so the struct
+// contains no padding between its members.
+// NOTE(review): TCubeTiling is only forward-declared in this header but is used by value
+// below, so the full definition must already be visible wherever this header is included -
+// confirm the include order on every path that pulls this header in.
+#pragma pack(push, 1)
+struct SGEMMCTilingData {
+    // Set by GenerateTiling to 1 for bf16 inputs and 0 otherwise.
+    uint32_t tilingKey;
+    // Matmul tiling filled in by MatmulApiTiling::GetTiling on the host.
+    AscendC::tiling::TCubeTiling cubeTiling;
+};
+#pragma pack(pop)
+
+}  // namespace npu_kernel
+}  // namespace sglang
+
+#endif  // SGEMMC_TILING_DATA_H