diff --git a/ucm/sparse/gsa_on_device/csrc/ascend/hamming_dist_top_k/op_kernel/hamming_dist_top_k_split_s.h b/ucm/sparse/gsa_on_device/csrc/ascend/hamming_dist_top_k/op_kernel/hamming_dist_top_k_split_s.h index 4d002502e..837080511 100644 --- a/ucm/sparse/gsa_on_device/csrc/ascend/hamming_dist_top_k/op_kernel/hamming_dist_top_k_split_s.h +++ b/ucm/sparse/gsa_on_device/csrc/ascend/hamming_dist_top_k/op_kernel/hamming_dist_top_k_split_s.h @@ -1011,13 +1011,14 @@ class HammingDistTopKSplitSKernel { if ASCEND_IS_AIC { return; } // topK小于32,不使用内部API排序 - if (curKScalar < 32) { useInnerSort = false; } + // (TODO) do not enable inner sort for now as 910B3 has bug + // if (curKScalar < 32) { useInnerSort = false; } - if (useInnerSort) { - CustomSort(topKIndexOutTensor, curKScalar); - SelectBlockTableFromTopK(curBatchIdx, curKScalar, outGmOffset); - return; - } + // if (useInnerSort) { + // CustomSort(topKIndexOutTensor, curKScalar); + // SelectBlockTableFromTopK(curBatchIdx, curKScalar, outGmOffset); + // return; + // } // 将 TopK 的“chunk索引”映射成 block_id,并写回 GM(indices) WriteBlockTableFromTopK(curBatchIdx, topKIndexOutTensor, curKScalar, outGmOffset); @@ -1509,9 +1510,11 @@ class HammingDistTopKSplitSKernel { LocalTensor blockIdUb = topKIndexInnerInQueue_.AllocTensor(); LocalTensor tableBlockTensor = tableBlockBuf_.template Get(); - DataCopyParams copyParams{1, static_cast(param_.blockCount * sizeof(int32_t)), 0, - 0}; - DataCopy(tableBlockTensor, keyBlockTableGm_[curBatchIdx * param_.blockCount], copyParams); + DataCopyExtParams copyParams{1, static_cast(param_.blockCount * sizeof(int32_t)), + 0, 0, 0}; + DataCopyPadExtParams copyPadParams{false, 0, 0, 0}; + DataCopyPad(tableBlockTensor, keyBlockTableGm_[curBatchIdx * param_.blockCount], copyParams, + copyPadParams); ::AscendC::WriteBlockTableFromTopK(curBatchIdx, topKIndexUb, blockIdUb, curKScalar, outGmOffset, tableBlockTensor, indicesGm_,