[Fix] fix eplb noaux (PaddlePaddle#5239)

xiaoxiaohehe001 · web-flow · commit 61fc368066d0 · 2025-11-26T17:50:51.000+08:00
* fix eplb noaux

* fix eplb noaux
diff --git a/custom_ops/gpu_ops/noauxtc_kernel.h b/custom_ops/gpu_ops/noauxtc_kernel.h
@@ -969,41 +969,19 @@ void invokeNoAuxTcRedundant(T* scores,
   int64_t num_cases = num_tokens * n_group;
   int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
 
-#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
   topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0, stream>>>(
       group_scores,
       scores_with_bias,
       num_cases,
       n_group,
       num_experts / n_group);
-#else
-  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
-  cudaLaunchConfig_t config;
-  config.gridDim = topk_with_k2_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = 0;
-  config.stream = stream;
-  cudaLaunchAttribute attrs[1];
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = false;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config,
-                     kernel_instance1,
-                     group_scores,
-                     scores_with_bias,
-                     num_cases,
-                     n_group,
-                     num_experts / n_group);
-#endif
 
   int64_t topk_with_k_group_num_blocks =
       (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
   size_t dynamic_smem_in_bytes =
       warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
                                                            topk);
 
-#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
   group_idx_and_topk_idx_redundant_kernel<T>
       <<<topk_with_k_group_num_blocks,
          BLOCK_SIZE,
@@ -1025,32 +1003,6 @@ void invokeNoAuxTcRedundant(T* scores,
                    num_experts / n_group,
                    routed_scaling_factor,
                    redundant_ep_rank_num_plus_one);
-#else
-  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
-  config.gridDim = topk_with_k_group_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = dynamic_smem_in_bytes;
-  config.stream = stream;
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = false;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(&config,
-                     kernel_instance2,
-                     scores,
-                     group_scores,
-                     topk_values,
-                     topk_indices,
-                     scores_with_bias,
-                     num_tokens,
-                     n_group,
-                     topk_group,
-                     topk,
-                     num_experts,
-                     num_experts / n_group,
-                     renormalize,
-                     routed_scaling_factor);
-#endif
 }
 
 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                      \
diff --git a/fastdeploy/eplb/experts_manager.py b/fastdeploy/eplb/experts_manager.py
@@ -448,7 +448,6 @@ def allreduce_load_weight_result(self):
             if (
                 self.fd_config.scheduler_config.splitwise_role == "mixed"
                 or self.fd_config.scheduler_config.splitwise_role == "decode"
-                or self.fd_config.scheduler_config.splitwise_role == "prefill"
                 or not self.eplb_config.redundant_expert_enable_schedule_cordon
             ):
                 self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")