Skip to content

Commit 61fc368

Browse files
[Fix] fix eplb noaux (PaddlePaddle#5239)
* fix eplb noaux * fix eplb noaux
1 parent bc118c3 commit 61fc368

2 files changed

Lines changed: 0 additions & 49 deletions

File tree

custom_ops/gpu_ops/noauxtc_kernel.h

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -969,41 +969,19 @@ void invokeNoAuxTcRedundant(T* scores,
969969
int64_t num_cases = num_tokens * n_group;
970970
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
971971

972-
#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
973972
topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0, stream>>>(
974973
group_scores,
975974
scores_with_bias,
976975
num_cases,
977976
n_group,
978977
num_experts / n_group);
979-
#else
980-
auto* kernel_instance1 = &topk_with_k2_kernel<T>;
981-
cudaLaunchConfig_t config;
982-
config.gridDim = topk_with_k2_num_blocks;
983-
config.blockDim = BLOCK_SIZE;
984-
config.dynamicSmemBytes = 0;
985-
config.stream = stream;
986-
cudaLaunchAttribute attrs[1];
987-
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
988-
attrs[0].val.programmaticStreamSerializationAllowed = false;
989-
config.numAttrs = 1;
990-
config.attrs = attrs;
991-
cudaLaunchKernelEx(&config,
992-
kernel_instance1,
993-
group_scores,
994-
scores_with_bias,
995-
num_cases,
996-
n_group,
997-
num_experts / n_group);
998-
#endif
999978

1000979
int64_t topk_with_k_group_num_blocks =
1001980
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
1002981
size_t dynamic_smem_in_bytes =
1003982
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
1004983
topk);
1005984

1006-
#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
1007985
group_idx_and_topk_idx_redundant_kernel<T>
1008986
<<<topk_with_k_group_num_blocks,
1009987
BLOCK_SIZE,
@@ -1025,32 +1003,6 @@ void invokeNoAuxTcRedundant(T* scores,
10251003
num_experts / n_group,
10261004
routed_scaling_factor,
10271005
redundant_ep_rank_num_plus_one);
1028-
#else
1029-
auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
1030-
config.gridDim = topk_with_k_group_num_blocks;
1031-
config.blockDim = BLOCK_SIZE;
1032-
config.dynamicSmemBytes = dynamic_smem_in_bytes;
1033-
config.stream = stream;
1034-
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
1035-
attrs[0].val.programmaticStreamSerializationAllowed = false;
1036-
config.numAttrs = 1;
1037-
config.attrs = attrs;
1038-
cudaLaunchKernelEx(&config,
1039-
kernel_instance2,
1040-
scores,
1041-
group_scores,
1042-
topk_values,
1043-
topk_indices,
1044-
scores_with_bias,
1045-
num_tokens,
1046-
n_group,
1047-
topk_group,
1048-
topk,
1049-
num_experts,
1050-
num_experts / n_group,
1051-
renormalize,
1052-
routed_scaling_factor);
1053-
#endif
10541006
}
10551007

10561008
#define INSTANTIATE_NOAUX_TC(T, IdxT) \

fastdeploy/eplb/experts_manager.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,6 @@ def allreduce_load_weight_result(self):
448448
if (
449449
self.fd_config.scheduler_config.splitwise_role == "mixed"
450450
or self.fd_config.scheduler_config.splitwise_role == "decode"
451-
or self.fd_config.scheduler_config.splitwise_role == "prefill"
452451
or not self.eplb_config.redundant_expert_enable_schedule_cordon
453452
):
454453
self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")

0 commit comments

Comments
 (0)