@@ -969,41 +969,19 @@ void invokeNoAuxTcRedundant(T* scores,
969969 int64_t num_cases = num_tokens * n_group;
970970 int64_t topk_with_k2_num_blocks = (num_cases - 1 ) / NUM_WARPS_PER_BLOCK + 1 ;
971971
972- #ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
973972 topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0 , stream>>>(
974973 group_scores,
975974 scores_with_bias,
976975 num_cases,
977976 n_group,
978977 num_experts / n_group);
979- #else
980- auto * kernel_instance1 = &topk_with_k2_kernel<T>;
981- cudaLaunchConfig_t config;
982- config.gridDim = topk_with_k2_num_blocks;
983- config.blockDim = BLOCK_SIZE;
984- config.dynamicSmemBytes = 0 ;
985- config.stream = stream;
986- cudaLaunchAttribute attrs[1 ];
987- attrs[0 ].id = cudaLaunchAttributeProgrammaticStreamSerialization;
988- attrs[0 ].val .programmaticStreamSerializationAllowed = false ;
989- config.numAttrs = 1 ;
990- config.attrs = attrs;
991- cudaLaunchKernelEx (&config,
992- kernel_instance1,
993- group_scores,
994- scores_with_bias,
995- num_cases,
996- n_group,
997- num_experts / n_group);
998- #endif
999978
1000979 int64_t topk_with_k_group_num_blocks =
1001980 (num_tokens - 1 ) / NUM_WARPS_PER_BLOCK + 1 ;
1002981 size_t dynamic_smem_in_bytes =
1003982 warp_topk::calc_smem_size_for_block_wide<T, int32_t >(NUM_WARPS_PER_BLOCK,
1004983 topk);
1005984
1006- #ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
1007985 group_idx_and_topk_idx_redundant_kernel<T>
1008986 <<<topk_with_k_group_num_blocks,
1009987 BLOCK_SIZE,
@@ -1025,32 +1003,6 @@ void invokeNoAuxTcRedundant(T* scores,
10251003 num_experts / n_group,
10261004 routed_scaling_factor,
10271005 redundant_ep_rank_num_plus_one);
1028- #else
1029- auto * kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
1030- config.gridDim = topk_with_k_group_num_blocks;
1031- config.blockDim = BLOCK_SIZE;
1032- config.dynamicSmemBytes = dynamic_smem_in_bytes;
1033- config.stream = stream;
1034- attrs[0 ].id = cudaLaunchAttributeProgrammaticStreamSerialization;
1035- attrs[0 ].val .programmaticStreamSerializationAllowed = false ;
1036- config.numAttrs = 1 ;
1037- config.attrs = attrs;
1038- cudaLaunchKernelEx (&config,
1039- kernel_instance2,
1040- scores,
1041- group_scores,
1042- topk_values,
1043- topk_indices,
1044- scores_with_bias,
1045- num_tokens,
1046- n_group,
1047- topk_group,
1048- topk,
1049- num_experts,
1050- num_experts / n_group,
1051- renormalize,
1052- routed_scaling_factor);
1053- #endif
10541006}
10551007
10561008#define INSTANTIATE_NOAUX_TC (T, IdxT ) \
0 commit comments