Skip to content

Commit 2b0fd53

Browse files
authored
[Cherry-Pick][Optimization] support fused noauxtc kernel on ep mode (#7936) (#7917)
* support fused noauxtc kernel on ep mode
* fix unit test
1 parent 8a1e71d commit 2b0fd53

2 files changed

Lines changed: 6 additions & 0 deletions

File tree

fastdeploy/model_executor/layers/moe/ep.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
import fastdeploy
2828
from fastdeploy import envs
2929
from fastdeploy.config import MoEPhase
30+
from fastdeploy.platforms import current_platform
3031
from fastdeploy.utils import singleton
3132

3233

@@ -531,6 +532,9 @@ def moe_select(self, layer: nn.Layer, gate_out: paddle.Tensor):
531532
if layer.topk_method == "noaux_tc":
532533
from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
533534

535+
use_fused = (
536+
layer.fd_config.scheduler_config.enable_moe_scores_elementwise_fuse and current_platform.is_cuda()
537+
)
534538
score, topk_weights, topk_idx = get_moe_scores(
535539
gate_out,
536540
layer.n_group,
@@ -540,6 +544,7 @@ def moe_select(self, layer: nn.Layer, gate_out: paddle.Tensor):
540544
layer.gate_correction_bias,
541545
getattr(layer, "renormalize", True),
542546
topk_reduce_func=getattr(layer, "topk_reduce_func", None),
547+
use_fused_cast=use_fused,
543548
)
544549
else:
545550
topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(

tests/model_executor/test_ep.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -419,6 +419,7 @@ def fake_get_moe_scores(*_args, **_kwargs):
419419
routed_scaling_factor=1.0,
420420
gate_correction_bias=None,
421421
renormalize=False,
422+
fd_config=SimpleNamespace(scheduler_config=SimpleNamespace(enable_moe_scores_elementwise_fuse=False)),
422423
)
423424
gate_out = paddle.randn([1, 4], dtype="float32")
424425

0 commit comments

Comments
 (0)