Skip to content

Commit 6b53bbb

Browse files
Remove extra argument from per_token_quant calls in deepgemm MoE backend (#7258)
1 parent 2647d80 commit 6b53bbb

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,7 @@ def m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
120 | 120 |   | # down_proj
121 | 121 |   | if not fastdeploy.envs.FD_USE_PHI_FP8_QUANT:
122 | 122 |   | ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
123 |     | - | ffn_out, quant_config_weight_block_size_0, not disable_ue8m0_cast
    | 123 | + | ffn_out, quant_config_weight_block_size_0
124 | 124 |   | )
125 | 125 |   |
126 | 126 |   | ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous()
@@ -397,7 +397,7 @@ def apply_ep_prefill(
397 | 397 |   | # down_proj
398 | 398 |   | if not fastdeploy.envs.FD_USE_PHI_FP8_QUANT:
399 | 399 |   | ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
400 |     | - | ffn_out, self.quant_config.weight_block_size[0], self.quant_config.deepgemm_scale_ue8m0
    | 400 | + | ffn_out, self.quant_config.weight_block_size[0]
401 | 401 |   | )
402 | 402 |   | ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous().transpose([1, 0])
403 | 403 |   | else:

0 commit comments

Comments
 (0)