diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index ebdfe088cc7..368144c1a21 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -82,10 +82,16 @@ def process_loaded_weights(self, layer, weights) -> None: layer.weight.set_value(weights) def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: - linear_out = paddle.matmul(x, layer.weight) if layer.with_bias: - linear_out = paddle.add(linear_out, layer.bias) - return linear_out + bias = layer.bias + assert bias.dim() == 1 and bias.shape[-1] == layer.weight.shape[-1], ( + f"bias must be 1D with size equal to the last dim of weight, " + f"but got bias.shape={bias.shape}, weight.shape[-1]={layer.weight.shape[-1]}" + ) + out = paddle.nn.functional.linear(x, layer.weight, bias) + else: + out = paddle.matmul(x, layer.weight) + return out class LinearBase(nn.Layer): diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py index 79ba1c66e7c..2a6a7c2fe3a 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py @@ -184,7 +184,7 @@ def test_lm_head_fp32(api_url, headers, consistent_payload): # 校验返回内容与概率信息 assert ( resp_json["choices"][0]["message"]["content"] - == "\n这个问题是关于牛顿的三大运动定律的。牛顿的三大运动定律是经典" + == "\n我需要回答牛顿的三大运动定律是什么。牛顿的三大运动定律是经典" ), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}." diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py index 5701b8d3028..cdfc74e49f4 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py @@ -181,7 +181,7 @@ def test_lm_head_fp32(api_url, headers, consistent_payload): # 校验返回内容与概率信息 assert ( resp_json["choices"][0]["message"]["content"] - == "\n这个问题是关于牛顿的三大运动定律的。牛顿的三大运动定律是经典" + == "\n我需要回答牛顿的三大运动定律是什么。牛顿的三大运动定律是经典" ), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}." diff --git a/tests/e2e/utils/rollout_routing_replay_test_utils.py b/tests/e2e/utils/rollout_routing_replay_test_utils.py index ca63e18fe91..37b5efbe76a 100644 --- a/tests/e2e/utils/rollout_routing_replay_test_utils.py +++ b/tests/e2e/utils/rollout_routing_replay_test_utils.py @@ -156,9 +156,11 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode cur_save_routing_path = f"./R3_tmp/routing_replay_output_{model_name}/" model_path = os.getenv("MODEL_PATH") if model_path: - baseline_path = os.path.join(model_path, f"R3_BaseLine_25_uint8/routing_replay_output_baseline_{model_name}") + baseline_path = os.path.join( + model_path, f"R3_BaseLine_25_uint8_0403/routing_replay_output_baseline_{model_name}" + ) else: - baseline_path = f"./R3_BaseLine_25_uint8/routing_replay_output_baseline_{model_name}" + baseline_path = f"./R3_BaseLine_25_uint8_0403/routing_replay_output_baseline_{model_name}" stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream") nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")