@@ -47,13 +47,12 @@ class SGEMMCExpand
4747 __aicore__ inline void Init (GM_ADDR x, GM_ADDR weight, GM_ADDR loraIndices, uint32_t loraIndicesSize,
4848 GM_ADDR seqLen, uint32_t seqLenSize, GM_ADDR loraRanks, uint32_t loraRanksSize,
4949 GM_ADDR sliceOffsets, uint32_t sliceOffsetsSize, GM_ADDR yIn, GM_ADDR yOut,
50- uint32_t batchSize, uint32_t numBlocksPerCore , uint32_t maxLoRARank ,
51- uint32_t outputFullDim, GM_ADDR workspace, TCubeTiling &tiling)
50+ uint32_t batchSize, uint32_t maxLoRARank , uint32_t outputFullDim, GM_ADDR workspace ,
51+ TCubeTiling &tiling)
5252 {
5353 this ->tiling = tiling;
5454
5555 batchSize_ = batchSize;
56- numBlocksPerCore_ = numBlocksPerCore;
5756 maxLoRARank_ = maxLoRARank;
5857 sliceCount_ = sliceOffsetsSize - 1 ;
5958 outputFullDim_ = outputFullDim;
@@ -78,15 +77,11 @@ class SGEMMCExpand
7877 int64_t blocks = AscendC::GetBlockNum ();
7978 int64_t blockIdx = AscendC::GetBlockIdx ();
8079
81- int64_t startIdx = blockIdx * numBlocksPerCore_;
82- int64_t endIdx = startIdx + numBlocksPerCore_;
83-
8480 AscendC::WaitPreTaskEnd ();
8581
86- int64_t batchIdx = 0 ;
8782 int64_t requestBlock = 0 ;
8883 lora_common::BlockIterator blockIterator (seqLenGm_);
89- requestBlock = blockIterator.GetBlockIdx (batchIdx );
84+ requestBlock = blockIterator.GetBlockIdx (blockIdx );
9085 if (requestBlock < 0 ) {
9186 return ;
9287 }
@@ -178,7 +173,6 @@ class SGEMMCExpand
178173
179174 uint32_t batchSize_;
180175 uint32_t sliceCount_;
181- uint32_t numBlocksPerCore_;
182176 uint32_t maxLoRARank_;
183177 uint32_t outputHiddenDim_;
184178 uint32_t sliceOffset_;
@@ -197,8 +191,8 @@ extern "C" __global__ __aicore__ void sgemmc_expand(GM_ADDR x, GM_ADDR weight, G
197191 uint32_t loraIndicesSize, GM_ADDR seqLen, uint32_t seqLenSize,
198192 GM_ADDR loraRanks, uint32_t loraRanksSize, GM_ADDR sliceOffsets,
199193 uint32_t sliceOffsetsSize, GM_ADDR yIn, GM_ADDR yOut,
200- uint32_t batchSize, uint32_t numBlocksPerCore , uint32_t maxLoRARank ,
201- uint32_t outputFullDim, GM_ADDR workspace, GM_ADDR tiling)
194+ uint32_t batchSize, uint32_t maxLoRARank , uint32_t outputFullDim ,
195+ GM_ADDR workspace, GM_ADDR tiling)
202196{
203197 KERNEL_TASK_TYPE_DEFAULT (KERNEL_TYPE_MIX_AIC_1_1);
204198
@@ -209,14 +203,12 @@ extern "C" __global__ __aicore__ void sgemmc_expand(GM_ADDR x, GM_ADDR weight, G
209203 if (tilingData.dataType == 1 ) {
210204 SGEMMCExpand<bfloat16_t , float > op (&pipe);
211205 op.Init (x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, loraRanks, loraRanksSize, sliceOffsets,
212- sliceOffsetsSize, yIn, yOut, batchSize, numBlocksPerCore, maxLoRARank, outputFullDim, workspace,
213- tilingData.cubeTiling );
206+ sliceOffsetsSize, yIn, yOut, batchSize, maxLoRARank, outputFullDim, workspace, tilingData.cubeTiling );
214207 op.Process ();
215208 } else {
216209 SGEMMCExpand<half, float > op (&pipe);
217210 op.Init (x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, loraRanks, loraRanksSize, sliceOffsets,
218- sliceOffsetsSize, yIn, yOut, batchSize, numBlocksPerCore, maxLoRARank, outputFullDim, workspace,
219- tilingData.cubeTiling );
211+ sliceOffsetsSize, yIn, yOut, batchSize, maxLoRARank, outputFullDim, workspace, tilingData.cubeTiling );
220212 op.Process ();
221213 }
222214}
0 commit comments