Skip to content

Commit 9f3b3ce

Browse files
authored
[Optimization] merge_allreduce (#7039)
1 parent f142b48 commit 9f3b3ce

File tree

3 files changed

+17
-6
lines changed

3 files changed

+17
-6
lines changed

fastdeploy/model_executor/models/glm4_moe.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from paddleformers.utils.log import logger
2727

2828
from fastdeploy.config import FDConfig
29+
from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
2930
from fastdeploy.model_executor.forward_meta import ForwardMeta
3031
from fastdeploy.model_executor.graph_optimization.decorator import (
3132
support_graph_optimization,
@@ -160,8 +161,16 @@ def __init__(
160161
default_initializer=paddle.nn.initializer.Constant(0),
161162
)
162163

164+
# In pure-TP mode (tp>1, ep=1) both branches return partial sums, so we
165+
# defer the all-reduce to after combining them — saving one collective.
166+
# In all other modes (EP, EP+attn-TP, no parallelism) each branch handles
167+
# its own reduction internally (reduce_results default=True), so we must
168+
# NOT add an extra all-reduce here.
169+
self.merge_ffn_tp = self.use_tp and not self.use_ep
170+
163171
self.experts = FusedMoE(
164172
fd_config,
173+
reduce_results=not self.merge_ffn_tp,
165174
renormalize=self.norm_topk_prob,
166175
moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
167176
num_experts=fd_config.model_config.n_routed_experts,
@@ -182,14 +191,16 @@ def __init__(
182191
intermediate_size=shared_experts_intermediate_size,
183192
layer_id=layer_id,
184193
prefix=f"{prefix}.shared_experts",
194+
reduce_results=not self.merge_ffn_tp,
185195
)
186196

187197
def forward(self, x, forward_meta: ForwardMeta = None):
188198
out = self.experts(x, self.gate, forward_meta)
189199
if self.n_shared_experts > 0:
190-
shared_experts_out = self.shared_experts(x)
191-
out = out + shared_experts_out
192-
200+
out = out + self.shared_experts(x)
201+
if self.merge_ffn_tp:
202+
# Both branches produced partial sums; combine first, then single all-reduce.
203+
out = tensor_model_parallel_all_reduce(out, self.tp_group)
193204
return out
194205

195206

tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def test_lm_head_fp32(api_url, headers, consistent_payload):
185185
# 校验返回内容与概率信息
186186
assert (
187187
resp_json["choices"][0]["message"]["content"]
188-
== "\n<think>这个问题是关于牛顿的三大运动定律的。牛顿的三大运动定律是经典"
188+
== "\n<think>我需要回答牛顿的三大运动定律是什么。牛顿的三大运动定律是经典"
189189
), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}."
190190

191191

tests/e2e/utils/rollout_routing_replay_test_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,10 +157,10 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode
157157
model_path = os.getenv("MODEL_PATH")
158158
if model_path:
159159
baseline_path = os.path.join(
160-
model_path, f"R3_BaseLine_dev_uint8_0312/routing_replay_output_baseline_{model_name}"
160+
model_path, f"R3_BaseLine_dev_uint8_0402/routing_replay_output_baseline_{model_name}"
161161
)
162162
else:
163-
baseline_path = f"./R3_BaseLine_dev_uint8_0312/routing_replay_output_baseline_{model_name}"
163+
baseline_path = f"./R3_BaseLine_dev_uint8_0402/routing_replay_output_baseline_{model_name}"
164164
stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream")
165165

166166
nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")

0 commit comments

Comments (0)