-
Notifications
You must be signed in to change notification settings - Fork 742
[OP]Unify MoE op with moe_permute path for bf16 GLM #7164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
8021e7d
2f9f76d
9efe026
4db4771
ef57f4e
442b576
4bd6b95
ad24148
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,11 @@ | |
| from .fused_moe_backend_base import UnquantizedFusedMoEMethod | ||
|
|
||
| if current_platform.is_cuda(): | ||
| from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch, moe_expert_reduce | ||
| from fastdeploy.model_executor.ops.gpu import ( | ||
| count_tokens_per_expert_func, | ||
| moe_expert_dispatch, | ||
| moe_expert_reduce, | ||
| ) | ||
|
|
||
| try: | ||
| from fastdeploy.model_executor.ops.gpu import ( | ||
|
|
@@ -286,6 +290,70 @@ def apply_tp( | |
| layer.gate_correction_bias, | ||
| getattr(layer, "renormalize", True), | ||
| ) | ||
| if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE and self.moe_quant_type == "w16a16": | ||
| # moe_permute path: CUDA-graph safe, no D2H copies | ||
| print("use moe_permute in tp") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 Bug 调试用的 `print` 语句应当移除。生产代码中不应包含调试输出,会影响日志可读性和性能。 建议:删除此行,或改用 `logger.debug("use moe_permute in tp")` |
||
| topk_idx_i32 = topk_idx.astype(paddle.int32) | ||
| override_buffer_size = x.shape[0] * layer.top_k + layer.num_experts * (128 - 1) | ||
| ( | ||
| permute_input, | ||
| permute_indices_per_token, # zipped_expertwise_rowmap | ||
| dst_weights, | ||
| _scale_out, | ||
| _m_indices, | ||
| ) = paddle.nn.functional.moe_permute( | ||
| hidden_states=x, | ||
| scale=None, | ||
| expert_routemap_topk=topk_idx_i32, | ||
| expert_prob_topk=topk_weights, | ||
| num_experts=layer.num_experts, | ||
| tokens_per_expert=[], | ||
| padding_alignment=128, | ||
| return_expert_indices=True, | ||
| override_buffer_size=override_buffer_size, | ||
| ) | ||
|
|
||
| # Compute token_nums_per_expert (prefix sum) on GPU. | ||
| # Use PADDED counts (row 1) because moe_permute with padding_alignment=128 | ||
| # lays out each expert's tokens in 128-aligned blocks. Using actual counts | ||
| # (row 0) would cause moe_expert_ffn to read wrong positions. | ||
| # Use matmul with a cached lower-triangular matrix instead of | ||
| # paddle.cumsum, because CUB inclusive_scan allocates temp memory | ||
| # which is forbidden during CUDA graph capture. | ||
| padded_counts = count_tokens_per_expert_func(topk_idx, layer.num_experts)[ | ||
| 1 | ||
| ] # [num_experts], int32, 128-aligned | ||
| if not hasattr(self, "_cumsum_tril") or self._cumsum_tril.shape[0] != layer.num_experts: | ||
| self._cumsum_tril = paddle.tril( | ||
| paddle.ones([layer.num_experts, layer.num_experts], dtype="float32") | ||
| ) | ||
| token_nums_per_expert = paddle.mv(self._cumsum_tril, padded_counts.cast("float32")).cast(paddle.int64) | ||
|
|
||
| if topk_ids_hookfunc is not None: | ||
| topk_ids_hookfunc(topk_ids=topk_idx) | ||
|
|
||
| ffn_out = self.compute_ffn( | ||
| layer, | ||
| permute_input, | ||
| token_nums_per_expert, | ||
| None, # expert_idx_per_token not needed for w16a16 without bias | ||
| False, | ||
| -1, | ||
| None, # dequant_scale | ||
| None, # max_tokens_per_expert | ||
| ) | ||
|
|
||
| fused_moe_out, _out_probs = paddle.nn.functional.moe_unpermute( | ||
| hidden_states_unzipped=ffn_out, | ||
| zipped_expertwise_rowmap=permute_indices_per_token, | ||
| expert_routemap_topk=topk_idx_i32, | ||
| token_prob_unzipped=dst_weights, | ||
| total_zipped_tokens=x.shape[0], | ||
| num_experts=layer.num_experts, | ||
| using_weighted_combine=True, | ||
| ) | ||
| return fused_moe_out | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ❓ 疑问 在 这是预期行为吗?是否应该:
当前设计意味着当 |
||
|
|
||
| ( | ||
| permute_input, | ||
| token_nums_per_expert, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这段逻辑为什么要放在 layer.topk_method == "noaux_tc" 控制下?