From 6898863b38a66ec1d8e8b6e9ca6769def1fbb45e Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Thu, 7 May 2026 15:04:12 +0800 Subject: [PATCH 01/19] add fleet fallback --- fastdeploy/config.py | 2 +- fastdeploy/engine/args_utils.py | 6 +- .../model_executor/models/model_base.py | 2 + .../models/paddleformers/__init__.py | 13 + .../models/paddleformers/base_fleet.py | 576 ++++++++++++++++++ fastdeploy/worker/worker_process.py | 4 +- requirements.txt | 2 +- 7 files changed, 599 insertions(+), 6 deletions(-) create mode 100644 fastdeploy/model_executor/models/paddleformers/base_fleet.py diff --git a/fastdeploy/config.py b/fastdeploy/config.py index f18a4c6ee0a..c1187231794 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -60,7 +60,7 @@ _ResolvedTask = Literal["generate", "encode", "embed"] # Model implementation backend options -ModelImpl = Literal["auto", "fastdeploy", "paddleformers"] +ModelImpl = Literal["auto", "fastdeploy", "paddleformers", "paddlefleet"] _RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { "generate": [], diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 2d66ec75f42..1d3253ba391 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -139,6 +139,7 @@ class EngineArgs: 'auto': Use native FastDeploy implementation when available, fallback to PaddleFormers. 'fastdeploy': Use only native FastDeploy implementations. 'paddleformers': Use PaddleFormers backend with FastDeploy optimizations. + 'paddlefleet': Use PaddleFleet backend. """ override_pooler_config: Optional[Union[dict, PoolerConfig]] = None """ @@ -636,7 +637,7 @@ def __post_init__(self): "kvcache_storage_backend is only supported when ENABLE_V1_KVCACHE_SCHEDULER=1" ) - valid_model_impls = ["auto", "fastdeploy", "paddleformers"] + valid_model_impls = ["auto", "fastdeploy", "paddleformers", "paddlefleet"] if self.model_impl not in valid_model_impls: raise NotImplementedError( f"not support model_impl: '{self.model_impl}'. " f"Must be one of: {', '.join(valid_model_impls)}" @@ -974,13 +975,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: model_group.add_argument( "--model-impl", type=str, - choices=["auto", "fastdeploy", "paddleformers"], + choices=["auto", "fastdeploy", "paddleformers", "paddlefleet"], default=EngineArgs.model_impl, help=( "Model implementation backend. " "'auto': Use native FastDeploy when available, fallback to PaddleFormers. " "'fastdeploy': Use only native FastDeploy implementations. " "'paddleformers': Use PaddleFormers backend with FastDeploy optimizations." + "'paddlefleet': Use PaddleFleet backend." ), ) diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index d55c88947e7..79ab1ff3ed7 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -194,6 +194,8 @@ def _try_resolve_paddleformers( elif model_impl == "auto" and is_fallback: # Auto mode fallback when no native implementation exists backend_arch = "PaddleFormersForCausalLM" + elif model_impl == "paddlefleet": + backend_arch = "PaddleFleetForCausalLM" elif model_impl == "fastdeploy": return None else: diff --git a/fastdeploy/model_executor/models/paddleformers/__init__.py b/fastdeploy/model_executor/models/paddleformers/__init__.py index 77174269389..e3db52fcae4 100644 --- a/fastdeploy/model_executor/models/paddleformers/__init__.py +++ b/fastdeploy/model_executor/models/paddleformers/__init__.py @@ -21,10 +21,12 @@ ) from .base import PaddleFormersModelBase +from .base_fleet import PaddleFleetModelBase from .causallm import CausalLMMixin __all__ = [ "PaddleFormersForCausalLM", + "PaddleFleetForCausalLM", ] @@ -38,3 +40,14 @@ class PaddleFormersForCausalLM(CausalLMMixin, PaddleFormersModelBase, ModelForCa @classmethod def name(cls): return "PaddleFormersForCausalLM" + + +@ModelRegistry.register_model_class( + architecture="PaddleFleetForCausalLM", + module_name="paddleformers", + category=ModelCategory.TEXT_GENERATION, +) +class PaddleFleetForCausalLM(PaddleFleetModelBase, ModelForCasualLM): + @classmethod + def name(cls): + return "PaddleFleetForCausalLM" diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py new file mode 100644 index 00000000000..056d3e0b9fb --- /dev/null +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -0,0 +1,576 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +"""Generic PaddleFormers modeling backend base class.""" + +import math +from collections.abc import Iterable +from typing import TYPE_CHECKING, Dict + +import paddle +from paddle import nn +from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding +from paddlefleet.models.gpt.lm_head import GPTLMHead +from paddlefleet.transformer.layer import FleetLayer +from paddlefleet.transformer.transformer_config import TransformerConfig +from paddleformers.trainer.trainer_utils import set_random_seed +from paddleformers.transformers import AutoConfig +from paddleformers.transformers.auto.modeling import AutoModelForCausalLM +from paddleformers.utils.log import logger + +from fastdeploy.model_executor.forward_meta import ForwardMeta # noqa: F401 +from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, +) + +if TYPE_CHECKING: + from fastdeploy.config import FDConfig + +from fastdeploy.model_executor.layers.attention.attention import Attention + + +class FastDeployAttention(FleetLayer): + """ + FastDeploy version of DotProductAttention, holding an internal FastDeploy Attention module. + + This class can be used to replace PaddleFleet's DotProductAttention, + using FastDeploy's attention backend for computation. + """ + + def __init__( + self, + config: TransformerConfig, + fd_attention: Attention, + num_attention_heads: int, + num_key_value_heads: int, + softmax_scale: float, + hidden_size_per_attention_head: int, + hidden_size_per_partition: int, + layer_id: int, + ): + """ + Initialize FastDeployAttention. + + Args: + fd_attention: FastDeploy Attention instance + num_attention_heads: Number of attention heads + num_key_value_heads: Number of KV heads + softmax_scale: Softmax scaling factor + hidden_size_per_attention_head: Hidden dimension per attention head + hidden_size_per_partition: Hidden size per partition + layer_id: Current layer ID + """ + super().__init__(config) + self.fd_attention = fd_attention + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.softmax_scale = softmax_scale + self.hidden_size_per_attention_head = hidden_size_per_attention_head + self.hidden_size_per_partition = hidden_size_per_partition + self.layer_id = layer_id + + def forward( + self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + attention_mask: paddle.Tensor, + attn_mask_startend_row_indices: paddle.Tensor = None, + attn_mask_type=None, + attention_bias: paddle.Tensor = None, + packed_seq_params=None, + use_rr_flash_attention: bool = False, + ): + """ + Forward pass. + + Args: + query: Query tensor, supported formats: + - 4D BSHD: [b, sq, np, hn] (PaddleFleet default) + - 3D HSD: [np, sq, hn] + - 3D SHD: [sq, np, hn] + key: Key tensor, same format as above, but head count may differ (GQA) + value: Value tensor, same format as above + attention_mask: Attention mask + attn_mask_startend_row_indices: FlashMask start-end row indices + attn_mask_type: Attention mask type + attention_bias: Attention bias + packed_seq_params: Packed sequence parameters + use_rr_flash_attention: Whether to use RR Flash Attention + + Returns: + Attention output tensor + """ + # Try to get forward_meta from config (PaddleFleet does not pass this parameter when calling) + forward_meta = getattr(self.config, "forward_meta", None) + if forward_meta is None: + raise ValueError("forward_meta must be provided") + # Set scaling factor + original_scale = getattr(self.fd_attention, "scale", None) + if original_scale is None: + self.fd_attention.scale = self.softmax_scale + + try: + # Refer to the processing logic of fastdeploy_append_attention_forward + # Support 3D (SHD) and 4D (BSHD) input + + # 4D input: squeeze to 3D (only supports batch=1) + def squeeze_to_3d(t: paddle.Tensor, name: str) -> paddle.Tensor: + if t.ndim == 4: + if int(t.shape[0]) != 1: + raise ValueError( + f"{name} batch size {int(t.shape[0])} not supported, only batch=1 is supported" + ) + return t.squeeze(0) + if t.ndim == 3: + return t + raise ValueError(f"{name} has unexpected dims {t.ndim}, expect 3 or 4") + + q = squeeze_to_3d(query, "query") + k = squeeze_to_3d(key, "key") + v = squeeze_to_3d(value, "value") + + seq_len = int(q.shape[0]) + + # SHD: [seq, heads, dim] -> flatten to [seq, heads*dim] + q_flat = q.reshape([seq_len, -1]) + k_flat = k.reshape([seq_len, -1]) + v_flat = v.reshape([seq_len, -1]) + + # Concatenate QKV: [seq, (q_heads + kv_heads + kv_heads) * head_dim] + qkv = paddle.concat([q_flat, k_flat, v_flat], axis=-1) + + output = self.fd_attention.forward(qkv=qkv, forward_meta=forward_meta) + # Restore batch dimension: [seq, hidden] -> [b, seq, hidden] + # PaddleFleet expects 3D output format + output = output.unsqueeze(0) + + return output + finally: + # Restore original scale + if original_scale is None: + if hasattr(self.fd_attention, "scale"): + delattr(self.fd_attention, "scale") + else: + self.fd_attention.scale = original_scale + + +@support_graph_optimization +class PaddleFleetModelBase(nn.Layer): + """ + A mixin-style base class to provide PaddleFormers backend logic on top of nn.Layer. + This class subclasses nn.Layer and provides common methods to + initialize and manage a PaddleFormers model. + """ + + def __init__(self, fd_config: "FDConfig", **kwargs): + super().__init__(fd_config) + logger.info("Initializing PaddleFormers backend.") + self.fd_config = fd_config # FastDeploy's top-level FDConfig + self.model_config = fd_config.model_config # FastDeploy's ModelConfig + self.paddleformers_config = AutoConfig.from_pretrained(self.model_config.model) + + # Assign parallel config from fd_config.parallel_config to paddleformers_config + parallel_config = fd_config.parallel_config + self.paddleformers_config.tensor_model_parallel_size = parallel_config.tensor_parallel_size + self.paddleformers_config.sequence_parallel = parallel_config.sequence_parallel + self.paddleformers_config.expert_model_parallel_size = parallel_config.expert_parallel_size + + self.paddleformers_config.max_seq_len = fd_config.model_config.max_model_len + self.paddleformers_config.param_dtype = "bfloat16" + self.paddleformers_config.moe_grouped_gemm = True + # fp32_residual_connection=True causes embedding output to be cast to float32, + # which mismatches bfloat16 model weights (e.g. RMSNorm weight). + # FastDeploy handles dtype consistency itself, so disable this. + self.paddleformers_config.fp32_residual_connection = False + # Initialize PaddleFleet parallel_state so that its TP group is consistent with FastDeploy. + # PaddleFleet's ColumnParallelLinear/RowParallelLinear obtains TP world_size/rank + # via parallel_state. Without initialization, it defaults to 1, causing weights + # to not be TP-sharded, which mismatches FastDeploy's KV cache (allocated per TP). + if parallel_config.tensor_parallel_size > 1: + self._init_paddlefleet_parallel_state(parallel_config) + + # The specific text model config + # Sync important config values from text_config to model_config + # This ensures fallback models use their actual config values instead of FD defaults + self._sync_config_from_text_config() + # For convenience, keep direct access to some FD configs + self.quant_config = self.fd_config.quant_config + + # Load model using from_pretrained to support weight loading + # Pass dtype, config and other options from kwargs + + model_load_kwargs = { + "dtype": self.model_config.dtype, + "config": self.paddleformers_config, + "convert_from_hf": True, + "load_via_cpu": True, + "load_checkpoint_format": "flex_checkpoint", + } + # Set random seed before model construction for reproducibility + set_random_seed(seed_=42) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_config.model, + **model_load_kwargs, + ) + + self.model.eval() + # Patch PaddleFleet core_attention with FastDeploy attention + patched_count = patch_paddlefleet_core_attention( + model=self.model, + fd_config=self.fd_config, + ) + logger.info(f"Patched {patched_count} attention layers with FastDeploy") + + def compute_logits(self, hidden_state, forward_meta=None): + """Compute logits from hidden states using lm_head.""" + lm_head = self.model.get_lm_head() + # ColumnParallelLinear expects input [s, b, h] + hidden_state = hidden_state.unsqueeze(1) # [num_tokens, h] -> [num_tokens, 1, h] + logits = lm_head({"hidden_states": hidden_state}) + # Output [num_tokens, 1, vocab], squeeze back to [num_tokens, vocab] + if logits.ndim == 3: + logits = logits.squeeze(1) + logits = logits.astype(paddle.float32) + logits[:, self.model_config.ori_vocab_size :] = -float("inf") + + return logits + + def _init_paddlefleet_parallel_state(self, parallel_config) -> None: + """ + Initialize PaddleFleet's parallel_state so that ColumnParallelLinear/RowParallelLinear + can correctly obtain TP world_size and rank, and thus correctly shard weights + and build sharded_state_dict. + + References the initialization logic in PaddleFormers' training_args.py, + using the official initialize_fleet API instead of directly manipulating + parallel_state internal variables. + """ + from paddle.distributed import fleet + from paddlefleet.parallel_state import get_tensor_model_parallel_group + from paddlefleet.training import initialize_fleet + + # Only call initialize_fleet when the TP group has not been initialized yet + if get_tensor_model_parallel_group is not None and get_tensor_model_parallel_group(False) is None: + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": parallel_config.data_parallel_size, + "mp_degree": parallel_config.tensor_parallel_size, + "pp_degree": 1, + "sep_degree": 1, + "ep_degree": parallel_config.expert_parallel_size, + } + initialize_fleet(strategy) + logger.info( + f"Initialized PaddleFleet parallel_state via initialize_fleet " + f"(dp={parallel_config.data_parallel_size}, " + f"mp={parallel_config.tensor_parallel_size}, " + f"ep={parallel_config.expert_parallel_size}, " + f"sp={parallel_config.sequence_parallel})" + ) + + def _sync_config_from_text_config(self) -> None: + """ + Sync important config values from text_config (PaddleFormers/HF config) + to model_config. This ensures fallback models use their actual config + values instead of FD's defaults. + + This is crucial for models with unique configs like: + - Gemma3: tie_word_embeddings=True, layer_types, sliding_window + - Mistral: sliding_window + - etc. + """ + mc = self.model_config + tc = self.paddleformers_config + + sync_fields = [ + "tie_word_embeddings", + "sliding_window", + "sliding_window_pattern", + "layer_types", # May be computed as property + "rope_theta", + "rope_scaling", + "head_dim", + "rms_norm_eps", + "rope_local_base_freq", # Gemma3 specific + "query_pre_attn_scalar", # Gemma3 specific + ] + + synced = [] + for field in sync_fields: + text_value = getattr(tc, field, None) + if text_value is not None: + # Only sync if not already set or if FD default differs + current_value = getattr(mc, field, None) if hasattr(mc, field) else None + if current_value is None or current_value != text_value: + setattr(mc, field, text_value) + synced.append(f"{field}={text_value}") + + def embed_input_ids(self, input_ids: paddle.Tensor) -> paddle.Tensor: + """Embed input_ids using the model's embedding layer.""" + # PaddleFleet PipelineLayer does not support get_input_embeddings(). + # Find the GPTEmbedding layer directly from run_function. + embedding_layer = None + if hasattr(self.model, "run_function"): + for layer in self.model.run_function: + if isinstance(layer, GPTEmbedding): + embedding_layer = layer + break + if embedding_layer is None: + raise RuntimeError("Cannot find GPTEmbedding layer in model.run_function") + + original_ndim = input_ids.ndim + if input_ids.ndim == 1: + input_ids = input_ids.unsqueeze(0) # [num_tokens] -> [1, num_tokens] + + model_input = {"input_ids": input_ids} + result = embedding_layer(model_input) + inputs_embeds = result["hidden_states"] + + # Embedding output is [batch, seq, h], squeeze back to [num_tokens, h] + if original_ndim == 1 and inputs_embeds.ndim == 3: + inputs_embeds = inputs_embeds.squeeze(0) + + if hasattr(self, "embed_scale") and self.embed_scale is not None: + inputs_embeds *= self.embed_scale + return inputs_embeds + + @paddle.no_grad() + def forward( + self, + inputs: Dict, + forward_meta: ForwardMeta, + **kwargs, + ): + """Full transformer forward: input_ids -> hidden_states. + + This method is the primary forward pass for the model, computing: + 1. Position IDs based on seq_lens_decoder (absolute positions for RoPE) + 2. Token embeddings via embed_input_ids + 3. Transformer layers via self.model() + + Returns: + hidden_states: [TotalTokens, HiddenDim] + """ + ids_remove_padding = inputs["ids_remove_padding"] + num_tokens = ids_remove_padding.shape[0] + + batch_id_per_token = forward_meta.batch_id_per_token # [num_tokens] + seq_lens_decoder = forward_meta.seq_lens_decoder # [batch_size, 1] + + if batch_id_per_token is not None and seq_lens_decoder is not None: + decoder_offsets = seq_lens_decoder.squeeze(-1) # [batch_size] + # Ensure decoder_offsets is at least 1D tensor + if decoder_offsets.ndim == 0: + decoder_offsets = decoder_offsets.reshape([1]) + token_decoder_offsets = paddle.index_select(decoder_offsets, batch_id_per_token, axis=0) # [num_tokens] + + cu_seqlens = forward_meta.cu_seqlens_q # [batch_size + 1] + if cu_seqlens is not None: + token_global_idx = paddle.arange(num_tokens, dtype="int64") + request_start_idx = paddle.index_select(cu_seqlens[:-1], batch_id_per_token, axis=0) + relative_positions = token_global_idx - request_start_idx.astype("int64") + else: + relative_positions = paddle.zeros([num_tokens], dtype="int64") + position_ids = token_decoder_offsets.astype("int64") + relative_positions + else: + position_ids = paddle.arange(num_tokens, dtype="int64") + if seq_lens_decoder is not None: + position_ids = position_ids + seq_lens_decoder[0, 0].astype("int64") + forward_meta.rope_already_applied = True + + # Also set forward_meta on each TransformerLayer's config + # so that FastDeployAttention can retrieve it from core_attn.config + if hasattr(self.model, "run_function"): + for layer in self.model.run_function: + if not isinstance(layer, (GPTEmbedding, GPTLMHead)): + if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "core_attention"): + core_attn = layer.self_attn.core_attention + if hasattr(core_attn, "config"): + core_attn.config.forward_meta = forward_meta + + inputs_embeds = self.embed_input_ids(ids_remove_padding).unsqueeze(0) + + # Build input dict, PipelineLayer passes data between layers via dict + model_input = { + "input_ids": None, + "position_ids": position_ids, + } + # Add other parameters from kwargs + for k, v in kwargs.items(): + if v is not None: + model_input[k] = v + + # Iterate over run_function, skip GPTLMHead + # Only call TransformerLayer + for layer in self.model.run_function: + if isinstance(layer, GPTLMHead): + continue + if isinstance(layer, (GPTEmbedding)): + model_input = layer(model_input, decoder_input=inputs_embeds) + else: + model_input = layer(model_input) + + hidden_states = model_input["hidden_states"] + # [b, s, h] -> [s, h] (b=1) + hidden_states = hidden_states.squeeze(0) + + return hidden_states + + @paddle.no_grad() + def load_weights(self, weights: Iterable[tuple[str, paddle.Tensor]]): + # use model.from_pretrained to load weight + pass + + def set_state_dict(self, state_dict): + self.model.set_state_dict(state_dict) + + +# ============================================================================ +# PaddleFleet Attention Patch Functions +# ============================================================================ + + +def patch_paddlefleet_core_attention( + model, + fd_config: "FDConfig", + layers_to_patch: list[int] | None = None, +): + """ + Replace core_attention in all TransformerLayers of a PaddleFleet model with FastDeployAttention. + + Args: + model: PaddleFleet model instance (inheriting from PipelineLayer) + fd_config: FastDeploy FDConfig object, used to create Attention instances + layers_to_patch: List of layer indices to patch, None means patch all layers + + Returns: + int: Number of layers successfully patched + + Raises: + ValueError: If the model structure is unexpected or parameters are incorrect + """ + if fd_config is None: + raise ValueError("fd_config must be provided") + + from fastdeploy.model_executor.layers.attention.attention import Attention + + # Iterate over run_function to find TransformerLayers + patched_count = 0 + transformer_layers = [] + + # Collect all TransformerLayers + if hasattr(model, "run_function"): + for layer in model.run_function: + # Try to identify TransformerLayer + layer_type = type(layer).__name__ + if "TransformerLayer" in layer_type or "transformer" in str(type(layer)): + transformer_layers.append(layer) + + if not transformer_layers: + # Try alternative ways to find layers + for name, module in model.named_sublayers(): + if "TransformerLayer" in type(module).__name__: + transformer_layers.append(module) + + if not transformer_layers: + raise ValueError("No TransformerLayer found in model") + + # Patch core_attention for each TransformerLayer + for layer in transformer_layers: + layer_number = getattr(layer, "layer_number", None) + if layer_number is None: + layer_number = getattr(layer, "layer_id", None) + + if layer_number is None: + logger.warning("layer_number not found, skip patching...") + continue # Skip layers where layer_id cannot be obtained + + # Check if this layer needs to be patched + if layers_to_patch is not None and (layer_number) not in layers_to_patch: + continue + + # Get core_attention + if not hasattr(layer, "self_attn"): + logger.warning(f"self_attn not found in layer {layer_number}, skip patching...") + continue + + core_attn = layer.self_attn.core_attention + if core_attn is None: + logger.warning(f"core_attn not found in layer {layer_number}, skip patching...") + continue + + # Get configuration info + # Prefer per-partition values (values after TP sharding), + # because PaddleFleet's QKV output is already per-partition when TP>1 + num_attention_heads = getattr( + core_attn, "num_attention_heads_per_partition", getattr(core_attn.config, "num_attention_heads", None) + ) + num_key_value_heads = getattr( + core_attn, + "num_query_groups_per_partition", + getattr(core_attn.config, "num_key_value_heads", num_attention_heads), + ) + hidden_size_per_attention_head = getattr(core_attn, "hidden_size_per_attention_head", None) + if hidden_size_per_attention_head is not None: + softmax_scale = getattr(core_attn, "softmax_scale", 1.0 / math.sqrt(hidden_size_per_attention_head)) + else: + softmax_scale = 1.0 + + hidden_size_per_partition = getattr(core_attn, "hidden_size_per_partition", None) + if hidden_size_per_partition is None: + head_dim = getattr(core_attn, "hidden_size_per_attention_head", hidden_size_per_attention_head) + hidden_size_per_partition = num_attention_heads * head_dim + + # Get FastDeploy layer ID (0-indexed) + fd_layer_id = layer_number + + # Create Attention instance inside FastDeployAttention + fd_attn_instance = Attention( + fd_config=fd_config, + layer_id=fd_layer_id, + ) + + # Override Attention instance's head config to match PaddleFleet model + # This is necessary because fd_config.model_config may differ from PaddleFleet model config + fd_attn_instance.num_heads = num_attention_heads + fd_attn_instance.kv_num_heads = num_key_value_heads + fd_attn_instance.head_dim = hidden_size_per_attention_head + logger.info( + f"Overriding Attention config: num_heads={num_attention_heads}, kv_num_heads={num_key_value_heads}, head_dim={hidden_size_per_attention_head}" + ) + + # Create FastDeployAttention object and directly replace core_attention + fast_deploy_core_attn = FastDeployAttention( + config=core_attn.config, + fd_attention=fd_attn_instance, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + softmax_scale=softmax_scale, + hidden_size_per_attention_head=hidden_size_per_attention_head, + hidden_size_per_partition=hidden_size_per_partition, + layer_id=fd_layer_id, + ) + + # Replace core_attention object + layer.self_attn.core_attention = fast_deploy_core_attn + + patched_count += 1 + logger.info(f"Replaced core_attention with FastDeployAttention for layer {fd_layer_id}") + + logger.info(f"Successfully replaced {patched_count} core_attention layers with FastDeployAttention") + + return patched_count diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 28a943cf9d4..d34f5e18c47 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -1030,9 +1030,9 @@ def parse_args(): parser.add_argument( "--model-impl", type=str, - choices=["auto", "fastdeploy", "paddleformers"], + choices=["auto", "fastdeploy", "paddleformers", "paddlefleet"], default="auto", - help="Model implementation backend (auto, fastdeploy, paddleformers)", + help="Model implementation backend (auto, fastdeploy, paddleformers, paddlefleet)", ) parser.add_argument( diff --git a/requirements.txt b/requirements.txt index 14aea691f2c..b159ed9b73f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ tqdm pynvml uvicorn>=0.38.0 fastapi -paddleformers>=1.1.1 +paddleformers[paddlefleet] @https://paddle-whl.bj.bcebos.com/nightly/cu126/paddleformers/paddleformers-1.1.0.post20260430-py3-none-any.whl redis etcd3 httpx From 18cc86b7306e8fc3f136a6cc194c09e36949eada Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Fri, 8 May 2026 17:33:49 +0800 Subject: [PATCH 02/19] remove fleet depend --- .../model_executor/models/model_base.py | 13 ++++++++++- .../models/paddleformers/base_fleet.py | 22 ++++++++++++------- fastdeploy/model_executor/utils.py | 4 ++++ requirements.txt | 2 +- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index 79ab1ff3ed7..cbd622b6f7b 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -195,7 +195,18 @@ def _try_resolve_paddleformers( # Auto mode fallback when no native implementation exists backend_arch = "PaddleFormersForCausalLM" elif model_impl == "paddlefleet": - backend_arch = "PaddleFleetForCausalLM" + from fastdeploy.model_executor.utils import is_paddlefleet_available + + if is_paddlefleet_available(): + backend_arch = "PaddleFleetForCausalLM" + else: + raise ImportError( + "paddlefleet backend requires paddlefleet to be installed.\n" + "Please install with [change cuda version if needed ]:\n" + "python -m pip install paddlefleet==0.3.0.dev20260507' " + "--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ " + "--extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/" + ) elif model_impl == "fastdeploy": return None else: diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index 056d3e0b9fb..4cc75e4c7f2 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -22,10 +22,15 @@ import paddle from paddle import nn -from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding -from paddlefleet.models.gpt.lm_head import GPTLMHead -from paddlefleet.transformer.layer import FleetLayer -from paddlefleet.transformer.transformer_config import TransformerConfig + +from fastdeploy.model_executor.utils import is_paddlefleet_available + +if is_paddlefleet_available(): + from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding + from paddlefleet.models.gpt.lm_head import GPTLMHead + from paddlefleet.transformer.layer import FleetLayer + from paddlefleet.transformer.transformer_config import TransformerConfig + from paddleformers.trainer.trainer_utils import set_random_seed from paddleformers.transformers import AutoConfig from paddleformers.transformers.auto.modeling import AutoModelForCausalLM @@ -200,8 +205,7 @@ def __init__(self, fd_config: "FDConfig", **kwargs): # PaddleFleet's ColumnParallelLinear/RowParallelLinear obtains TP world_size/rank # via parallel_state. Without initialization, it defaults to 1, causing weights # to not be TP-sharded, which mismatches FastDeploy's KV cache (allocated per TP). - if parallel_config.tensor_parallel_size > 1: - self._init_paddlefleet_parallel_state(parallel_config) + self._init_paddlefleet_parallel_state(self.paddleformers_config) # The specific text model config # Sync important config values from text_config to model_config @@ -260,8 +264,10 @@ def _init_paddlefleet_parallel_state(self, parallel_config) -> None: parallel_state internal variables. """ from paddle.distributed import fleet - from paddlefleet.parallel_state import get_tensor_model_parallel_group - from paddlefleet.training import initialize_fleet + + if is_paddlefleet_available(): + from paddlefleet.parallel_state import get_tensor_model_parallel_group + from paddlefleet.training import initialize_fleet # Only call initialize_fleet when the TP group has not been initialized yet if get_tensor_model_parallel_group is not None and get_tensor_model_parallel_group(False) is None: diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index c34b697d785..bd115bdf46a 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -559,6 +559,10 @@ def has_flashinfer(): return importlib.util.find_spec("flashinfer") is not None +def is_paddlefleet_available(): + return importlib.util.find_spec("paddlefleet") is not None + + @cache def get_sm_version(): if paddle.cuda.is_available(): diff --git a/requirements.txt b/requirements.txt index b159ed9b73f..14aea691f2c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ tqdm pynvml uvicorn>=0.38.0 fastapi -paddleformers[paddlefleet] @https://paddle-whl.bj.bcebos.com/nightly/cu126/paddleformers/paddleformers-1.1.0.post20260430-py3-none-any.whl +paddleformers>=1.1.1 redis etcd3 httpx From 5e81aafa5bbedff5f3eb4965f1690748d1b7296b Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Fri, 8 May 2026 17:57:04 +0800 Subject: [PATCH 03/19] change import juage --- .../models/paddleformers/__init__.py | 26 +++++++++++-------- .../models/paddleformers/base_fleet.py | 19 +++++--------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/fastdeploy/model_executor/models/paddleformers/__init__.py b/fastdeploy/model_executor/models/paddleformers/__init__.py index e3db52fcae4..315b72b8775 100644 --- a/fastdeploy/model_executor/models/paddleformers/__init__.py +++ b/fastdeploy/model_executor/models/paddleformers/__init__.py @@ -19,14 +19,13 @@ ModelForCasualLM, ModelRegistry, ) +from fastdeploy.model_executor.utils import is_paddlefleet_available from .base import PaddleFormersModelBase -from .base_fleet import PaddleFleetModelBase from .causallm import CausalLMMixin __all__ = [ "PaddleFormersForCausalLM", - "PaddleFleetForCausalLM", ] @@ -42,12 +41,17 @@ def name(cls): return "PaddleFormersForCausalLM" -@ModelRegistry.register_model_class( - architecture="PaddleFleetForCausalLM", - module_name="paddleformers", - category=ModelCategory.TEXT_GENERATION, -) -class PaddleFleetForCausalLM(PaddleFleetModelBase, ModelForCasualLM): - @classmethod - def name(cls): - return "PaddleFleetForCausalLM" +if is_paddlefleet_available(): + from .base_fleet import PaddleFleetModelBase + + __all__ += ["PaddleFleetForCausalLM"] + + @ModelRegistry.register_model_class( + architecture="PaddleFleetForCausalLM", + module_name="paddleformers", + category=ModelCategory.TEXT_GENERATION, + ) + class PaddleFleetForCausalLM(PaddleFleetModelBase, ModelForCasualLM): + @classmethod + def name(cls): + return "PaddleFleetForCausalLM" diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index 4cc75e4c7f2..b6516369c2f 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -22,15 +22,10 @@ import paddle from paddle import nn - -from fastdeploy.model_executor.utils import is_paddlefleet_available - -if is_paddlefleet_available(): - from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding - from paddlefleet.models.gpt.lm_head import GPTLMHead - from paddlefleet.transformer.layer import FleetLayer - from paddlefleet.transformer.transformer_config import TransformerConfig - +from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding +from paddlefleet.models.gpt.lm_head import GPTLMHead +from paddlefleet.transformer.layer import FleetLayer +from paddlefleet.transformer.transformer_config import TransformerConfig from paddleformers.trainer.trainer_utils import set_random_seed from paddleformers.transformers import AutoConfig from paddleformers.transformers.auto.modeling import AutoModelForCausalLM @@ -264,10 +259,8 @@ def _init_paddlefleet_parallel_state(self, parallel_config) -> None: parallel_state internal variables. """ from paddle.distributed import fleet - - if is_paddlefleet_available(): - from paddlefleet.parallel_state import get_tensor_model_parallel_group - from paddlefleet.training import initialize_fleet + from paddlefleet.parallel_state import get_tensor_model_parallel_group + from paddlefleet.training import initialize_fleet # Only call initialize_fleet when the TP group has not been initialized yet if get_tensor_model_parallel_group is not None and get_tensor_model_parallel_group(False) is None: From 4acab59b1463237719bbe66f109ce5187c10ff2d Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Fri, 8 May 2026 18:21:25 +0800 Subject: [PATCH 04/19] change import juage --- .../models/paddleformers/base_fleet.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index b6516369c2f..94ac2e8cbb9 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -16,25 +16,32 @@ """Generic PaddleFormers modeling backend base class.""" -import math -from collections.abc import Iterable -from typing import TYPE_CHECKING, Dict - -import paddle -from paddle import nn -from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding -from paddlefleet.models.gpt.lm_head import GPTLMHead -from paddlefleet.transformer.layer import FleetLayer -from paddlefleet.transformer.transformer_config import TransformerConfig -from paddleformers.trainer.trainer_utils import set_random_seed -from paddleformers.transformers import AutoConfig -from paddleformers.transformers.auto.modeling import AutoModelForCausalLM -from paddleformers.utils.log import logger - -from fastdeploy.model_executor.forward_meta import ForwardMeta # noqa: F401 -from fastdeploy.model_executor.graph_optimization.decorator import ( - support_graph_optimization, -) +import logging + +from fastdeploy.model_executor.utils import is_paddlefleet_available + +if not is_paddlefleet_available(): + logging.warning("paddlefleet is not installed, skipping base_fleet module") +else: + import math + from collections.abc import Iterable + from typing import TYPE_CHECKING, Dict + + import paddle + from paddle import nn + from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding + from paddlefleet.models.gpt.lm_head import GPTLMHead + from paddlefleet.transformer.layer import FleetLayer + from paddlefleet.transformer.transformer_config import TransformerConfig + from paddleformers.trainer.trainer_utils import set_random_seed + from paddleformers.transformers import AutoConfig + from paddleformers.transformers.auto.modeling import AutoModelForCausalLM + from paddleformers.utils.log import logger + + from fastdeploy.model_executor.forward_meta import ForwardMeta # noqa: F401 + from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, + ) if TYPE_CHECKING: from fastdeploy.config import FDConfig From 856ffc4c7f631c9f8ddc2f2bbbaef9d1d272db07 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Fri, 8 May 2026 18:24:43 +0800 Subject: [PATCH 05/19] change import juage --- .../models/paddleformers/base_fleet.py | 1014 ++++++++--------- 1 file changed, 506 insertions(+), 508 deletions(-) diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index 94ac2e8cbb9..cf1e4dcedce 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -43,540 +43,538 @@ support_graph_optimization, ) -if TYPE_CHECKING: - from fastdeploy.config import FDConfig - -from fastdeploy.model_executor.layers.attention.attention import Attention - - -class FastDeployAttention(FleetLayer): - """ - FastDeploy version of DotProductAttention, holding an internal FastDeploy Attention module. - - This class can be used to replace PaddleFleet's DotProductAttention, - using FastDeploy's attention backend for computation. - """ - - def __init__( - self, - config: TransformerConfig, - fd_attention: Attention, - num_attention_heads: int, - num_key_value_heads: int, - softmax_scale: float, - hidden_size_per_attention_head: int, - hidden_size_per_partition: int, - layer_id: int, - ): - """ - Initialize FastDeployAttention. + if TYPE_CHECKING: + from fastdeploy.config import FDConfig - Args: - fd_attention: FastDeploy Attention instance - num_attention_heads: Number of attention heads - num_key_value_heads: Number of KV heads - softmax_scale: Softmax scaling factor - hidden_size_per_attention_head: Hidden dimension per attention head - hidden_size_per_partition: Hidden size per partition - layer_id: Current layer ID - """ - super().__init__(config) - self.fd_attention = fd_attention - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.softmax_scale = softmax_scale - self.hidden_size_per_attention_head = hidden_size_per_attention_head - self.hidden_size_per_partition = hidden_size_per_partition - self.layer_id = layer_id - - def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - attention_mask: paddle.Tensor, - attn_mask_startend_row_indices: paddle.Tensor = None, - attn_mask_type=None, - attention_bias: paddle.Tensor = None, - packed_seq_params=None, - use_rr_flash_attention: bool = False, - ): - """ - Forward pass. + from fastdeploy.model_executor.layers.attention.attention import Attention - Args: - query: Query tensor, supported formats: - - 4D BSHD: [b, sq, np, hn] (PaddleFleet default) - - 3D HSD: [np, sq, hn] - - 3D SHD: [sq, np, hn] - key: Key tensor, same format as above, but head count may differ (GQA) - value: Value tensor, same format as above - attention_mask: Attention mask - attn_mask_startend_row_indices: FlashMask start-end row indices - attn_mask_type: Attention mask type - attention_bias: Attention bias - packed_seq_params: Packed sequence parameters - use_rr_flash_attention: Whether to use RR Flash Attention + class FastDeployAttention(FleetLayer): + """ + FastDeploy version of DotProductAttention, holding an internal FastDeploy Attention module. - Returns: - Attention output tensor + This class can be used to replace PaddleFleet's DotProductAttention, + using FastDeploy's attention backend for computation. """ - # Try to get forward_meta from config (PaddleFleet does not pass this parameter when calling) - forward_meta = getattr(self.config, "forward_meta", None) - if forward_meta is None: - raise ValueError("forward_meta must be provided") - # Set scaling factor - original_scale = getattr(self.fd_attention, "scale", None) - if original_scale is None: - self.fd_attention.scale = self.softmax_scale - - try: - # Refer to the processing logic of fastdeploy_append_attention_forward - # Support 3D (SHD) and 4D (BSHD) input - - # 4D input: squeeze to 3D (only supports batch=1) - def squeeze_to_3d(t: paddle.Tensor, name: str) -> paddle.Tensor: - if t.ndim == 4: - if int(t.shape[0]) != 1: - raise ValueError( - f"{name} batch size {int(t.shape[0])} not supported, only batch=1 is supported" - ) - return t.squeeze(0) - if t.ndim == 3: - return t - raise ValueError(f"{name} has unexpected dims {t.ndim}, expect 3 or 4") - - q = squeeze_to_3d(query, "query") - k = squeeze_to_3d(key, "key") - v = squeeze_to_3d(value, "value") - - seq_len = int(q.shape[0]) - - # SHD: [seq, heads, dim] -> flatten to [seq, heads*dim] - q_flat = q.reshape([seq_len, -1]) - k_flat = k.reshape([seq_len, -1]) - v_flat = v.reshape([seq_len, -1]) - - # Concatenate QKV: [seq, (q_heads + kv_heads + kv_heads) * head_dim] - qkv = paddle.concat([q_flat, k_flat, v_flat], axis=-1) - - output = self.fd_attention.forward(qkv=qkv, forward_meta=forward_meta) - # Restore batch dimension: [seq, hidden] -> [b, seq, hidden] - # PaddleFleet expects 3D output format - output = output.unsqueeze(0) - - return output - finally: - # Restore original scale + + def __init__( + self, + config: TransformerConfig, + fd_attention: Attention, + num_attention_heads: int, + num_key_value_heads: int, + softmax_scale: float, + hidden_size_per_attention_head: int, + hidden_size_per_partition: int, + layer_id: int, + ): + """ + Initialize FastDeployAttention. + + Args: + fd_attention: FastDeploy Attention instance + num_attention_heads: Number of attention heads + num_key_value_heads: Number of KV heads + softmax_scale: Softmax scaling factor + hidden_size_per_attention_head: Hidden dimension per attention head + hidden_size_per_partition: Hidden size per partition + layer_id: Current layer ID + """ + super().__init__(config) + self.fd_attention = fd_attention + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.softmax_scale = softmax_scale + self.hidden_size_per_attention_head = hidden_size_per_attention_head + self.hidden_size_per_partition = hidden_size_per_partition + self.layer_id = layer_id + + def forward( + self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + attention_mask: paddle.Tensor, + attn_mask_startend_row_indices: paddle.Tensor = None, + attn_mask_type=None, + attention_bias: paddle.Tensor = None, + packed_seq_params=None, + use_rr_flash_attention: bool = False, + ): + """ + Forward pass. + + Args: + query: Query tensor, supported formats: + - 4D BSHD: [b, sq, np, hn] (PaddleFleet default) + - 3D HSD: [np, sq, hn] + - 3D SHD: [sq, np, hn] + key: Key tensor, same format as above, but head count may differ (GQA) + value: Value tensor, same format as above + attention_mask: Attention mask + attn_mask_startend_row_indices: FlashMask start-end row indices + attn_mask_type: Attention mask type + attention_bias: Attention bias + packed_seq_params: Packed sequence parameters + use_rr_flash_attention: Whether to use RR Flash Attention + + Returns: + Attention output tensor + """ + # Try to get forward_meta from config (PaddleFleet does not pass this parameter when calling) + forward_meta = getattr(self.config, "forward_meta", None) + if forward_meta is None: + raise ValueError("forward_meta must be provided") + # Set scaling factor + original_scale = getattr(self.fd_attention, "scale", None) if original_scale is None: - if hasattr(self.fd_attention, "scale"): - delattr(self.fd_attention, "scale") - else: - self.fd_attention.scale = original_scale - - -@support_graph_optimization -class PaddleFleetModelBase(nn.Layer): - """ - A mixin-style base class to provide PaddleFormers backend logic on top of nn.Layer. - This class subclasses nn.Layer and provides common methods to - initialize and manage a PaddleFormers model. - """ - - def __init__(self, fd_config: "FDConfig", **kwargs): - super().__init__(fd_config) - logger.info("Initializing PaddleFormers backend.") - self.fd_config = fd_config # FastDeploy's top-level FDConfig - self.model_config = fd_config.model_config # FastDeploy's ModelConfig - self.paddleformers_config = AutoConfig.from_pretrained(self.model_config.model) - - # Assign parallel config from fd_config.parallel_config to paddleformers_config - parallel_config = fd_config.parallel_config - self.paddleformers_config.tensor_model_parallel_size = parallel_config.tensor_parallel_size - self.paddleformers_config.sequence_parallel = parallel_config.sequence_parallel - self.paddleformers_config.expert_model_parallel_size = parallel_config.expert_parallel_size - - self.paddleformers_config.max_seq_len = fd_config.model_config.max_model_len - self.paddleformers_config.param_dtype = "bfloat16" - self.paddleformers_config.moe_grouped_gemm = True - # fp32_residual_connection=True causes embedding output to be cast to float32, - # which mismatches bfloat16 model weights (e.g. RMSNorm weight). - # FastDeploy handles dtype consistency itself, so disable this. - self.paddleformers_config.fp32_residual_connection = False - # Initialize PaddleFleet parallel_state so that its TP group is consistent with FastDeploy. - # PaddleFleet's ColumnParallelLinear/RowParallelLinear obtains TP world_size/rank - # via parallel_state. Without initialization, it defaults to 1, causing weights - # to not be TP-sharded, which mismatches FastDeploy's KV cache (allocated per TP). - self._init_paddlefleet_parallel_state(self.paddleformers_config) - - # The specific text model config - # Sync important config values from text_config to model_config - # This ensures fallback models use their actual config values instead of FD defaults - self._sync_config_from_text_config() - # For convenience, keep direct access to some FD configs - self.quant_config = self.fd_config.quant_config - - # Load model using from_pretrained to support weight loading - # Pass dtype, config and other options from kwargs - - model_load_kwargs = { - "dtype": self.model_config.dtype, - "config": self.paddleformers_config, - "convert_from_hf": True, - "load_via_cpu": True, - "load_checkpoint_format": "flex_checkpoint", - } - # Set random seed before model construction for reproducibility - set_random_seed(seed_=42) - self.model = AutoModelForCausalLM.from_pretrained( - self.model_config.model, - **model_load_kwargs, - ) - - self.model.eval() - # Patch PaddleFleet core_attention with FastDeploy attention - patched_count = patch_paddlefleet_core_attention( - model=self.model, - fd_config=self.fd_config, - ) - logger.info(f"Patched {patched_count} attention layers with FastDeploy") - - def compute_logits(self, hidden_state, forward_meta=None): - """Compute logits from hidden states using lm_head.""" - lm_head = self.model.get_lm_head() - # ColumnParallelLinear expects input [s, b, h] - hidden_state = hidden_state.unsqueeze(1) # [num_tokens, h] -> [num_tokens, 1, h] - logits = lm_head({"hidden_states": hidden_state}) - # Output [num_tokens, 1, vocab], squeeze back to [num_tokens, vocab] - if logits.ndim == 3: - logits = logits.squeeze(1) - logits = logits.astype(paddle.float32) - logits[:, self.model_config.ori_vocab_size :] = -float("inf") - - return logits - - def _init_paddlefleet_parallel_state(self, parallel_config) -> None: + self.fd_attention.scale = self.softmax_scale + + try: + # Refer to the processing logic of fastdeploy_append_attention_forward + # Support 3D (SHD) and 4D (BSHD) input + + # 4D input: squeeze to 3D (only supports batch=1) + def squeeze_to_3d(t: paddle.Tensor, name: str) -> paddle.Tensor: + if t.ndim == 4: + if int(t.shape[0]) != 1: + raise ValueError( + f"{name} batch size {int(t.shape[0])} not supported, only batch=1 is supported" + ) + return t.squeeze(0) + if t.ndim == 3: + return t + raise ValueError(f"{name} has unexpected dims {t.ndim}, expect 3 or 4") + + q = squeeze_to_3d(query, "query") + k = squeeze_to_3d(key, "key") + v = squeeze_to_3d(value, "value") + + seq_len = int(q.shape[0]) + + # SHD: [seq, heads, dim] -> flatten to [seq, heads*dim] + q_flat = q.reshape([seq_len, -1]) + k_flat = k.reshape([seq_len, -1]) + v_flat = v.reshape([seq_len, -1]) + + # Concatenate QKV: [seq, (q_heads + kv_heads + kv_heads) * head_dim] + qkv = paddle.concat([q_flat, k_flat, v_flat], axis=-1) + + output = self.fd_attention.forward(qkv=qkv, forward_meta=forward_meta) + # Restore batch dimension: [seq, hidden] -> [b, seq, hidden] + # PaddleFleet expects 3D output format + output = output.unsqueeze(0) + + return output + finally: + # Restore original scale + if original_scale is None: + if hasattr(self.fd_attention, "scale"): + delattr(self.fd_attention, "scale") + else: + self.fd_attention.scale = original_scale + + @support_graph_optimization + class PaddleFleetModelBase(nn.Layer): """ - Initialize PaddleFleet's parallel_state so that ColumnParallelLinear/RowParallelLinear - can correctly obtain TP world_size and rank, and thus correctly shard weights - and build sharded_state_dict. - - References the initialization logic in PaddleFormers' training_args.py, - using the official initialize_fleet API instead of directly manipulating - parallel_state internal variables. + A mixin-style base class to provide PaddleFormers backend logic on top of nn.Layer. + This class subclasses nn.Layer and provides common methods to + initialize and manage a PaddleFormers model. """ - from paddle.distributed import fleet - from paddlefleet.parallel_state import get_tensor_model_parallel_group - from paddlefleet.training import initialize_fleet - - # Only call initialize_fleet when the TP group has not been initialized yet - if get_tensor_model_parallel_group is not None and get_tensor_model_parallel_group(False) is None: - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": parallel_config.data_parallel_size, - "mp_degree": parallel_config.tensor_parallel_size, - "pp_degree": 1, - "sep_degree": 1, - "ep_degree": parallel_config.expert_parallel_size, + + def __init__(self, fd_config: "FDConfig", **kwargs): + super().__init__(fd_config) + logger.info("Initializing PaddleFormers backend.") + self.fd_config = fd_config # FastDeploy's top-level FDConfig + self.model_config = fd_config.model_config # FastDeploy's ModelConfig + self.paddleformers_config = AutoConfig.from_pretrained(self.model_config.model) + + # Assign parallel config from fd_config.parallel_config to paddleformers_config + parallel_config = fd_config.parallel_config + self.paddleformers_config.tensor_model_parallel_size = parallel_config.tensor_parallel_size + self.paddleformers_config.sequence_parallel = parallel_config.sequence_parallel + self.paddleformers_config.expert_model_parallel_size = parallel_config.expert_parallel_size + + self.paddleformers_config.max_seq_len = fd_config.model_config.max_model_len + self.paddleformers_config.param_dtype = "bfloat16" + self.paddleformers_config.moe_grouped_gemm = True + # fp32_residual_connection=True causes embedding output to be cast to float32, + # which mismatches bfloat16 model weights (e.g. RMSNorm weight). + # FastDeploy handles dtype consistency itself, so disable this. + self.paddleformers_config.fp32_residual_connection = False + # Initialize PaddleFleet parallel_state so that its TP group is consistent with FastDeploy. + # PaddleFleet's ColumnParallelLinear/RowParallelLinear obtains TP world_size/rank + # via parallel_state. Without initialization, it defaults to 1, causing weights + # to not be TP-sharded, which mismatches FastDeploy's KV cache (allocated per TP). + self._init_paddlefleet_parallel_state(self.paddleformers_config) + + # The specific text model config + # Sync important config values from text_config to model_config + # This ensures fallback models use their actual config values instead of FD defaults + self._sync_config_from_text_config() + # For convenience, keep direct access to some FD configs + self.quant_config = self.fd_config.quant_config + + # Load model using from_pretrained to support weight loading + # Pass dtype, config and other options from kwargs + + model_load_kwargs = { + "dtype": self.model_config.dtype, + "config": self.paddleformers_config, + "convert_from_hf": True, + "load_via_cpu": True, + "load_checkpoint_format": "flex_checkpoint", } - initialize_fleet(strategy) - logger.info( - f"Initialized PaddleFleet parallel_state via initialize_fleet " - f"(dp={parallel_config.data_parallel_size}, " - f"mp={parallel_config.tensor_parallel_size}, " - f"ep={parallel_config.expert_parallel_size}, " - f"sp={parallel_config.sequence_parallel})" + # Set random seed before model construction for reproducibility + set_random_seed(seed_=42) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_config.model, + **model_load_kwargs, ) - def _sync_config_from_text_config(self) -> None: - """ - Sync important config values from text_config (PaddleFormers/HF config) - to model_config. This ensures fallback models use their actual config - values instead of FD's defaults. - - This is crucial for models with unique configs like: - - Gemma3: tie_word_embeddings=True, layer_types, sliding_window - - Mistral: sliding_window - - etc. - """ - mc = self.model_config - tc = self.paddleformers_config - - sync_fields = [ - "tie_word_embeddings", - "sliding_window", - "sliding_window_pattern", - "layer_types", # May be computed as property - "rope_theta", - "rope_scaling", - "head_dim", - "rms_norm_eps", - "rope_local_base_freq", # Gemma3 specific - "query_pre_attn_scalar", # Gemma3 specific - ] - - synced = [] - for field in sync_fields: - text_value = getattr(tc, field, None) - if text_value is not None: - # Only sync if not already set or if FD default differs - current_value = getattr(mc, field, None) if hasattr(mc, field) else None - if current_value is None or current_value != text_value: - setattr(mc, field, text_value) - synced.append(f"{field}={text_value}") - - def embed_input_ids(self, input_ids: paddle.Tensor) -> paddle.Tensor: - """Embed input_ids using the model's embedding layer.""" - # PaddleFleet PipelineLayer does not support get_input_embeddings(). - # Find the GPTEmbedding layer directly from run_function. - embedding_layer = None - if hasattr(self.model, "run_function"): + self.model.eval() + # Patch PaddleFleet core_attention with FastDeploy attention + patched_count = patch_paddlefleet_core_attention( + model=self.model, + fd_config=self.fd_config, + ) + logger.info(f"Patched {patched_count} attention layers with FastDeploy") + + def compute_logits(self, hidden_state, forward_meta=None): + """Compute logits from hidden states using lm_head.""" + lm_head = self.model.get_lm_head() + # ColumnParallelLinear expects input [s, b, h] + hidden_state = hidden_state.unsqueeze(1) # [num_tokens, h] -> [num_tokens, 1, h] + logits = lm_head({"hidden_states": hidden_state}) + # Output [num_tokens, 1, vocab], squeeze back to [num_tokens, vocab] + if logits.ndim == 3: + logits = logits.squeeze(1) + logits = logits.astype(paddle.float32) + logits[:, self.model_config.ori_vocab_size :] = -float("inf") + + return logits + + def _init_paddlefleet_parallel_state(self, parallel_config) -> None: + """ + Initialize PaddleFleet's parallel_state so that ColumnParallelLinear/RowParallelLinear + can correctly obtain TP world_size and rank, and thus correctly shard weights + and build sharded_state_dict. + + References the initialization logic in PaddleFormers' training_args.py, + using the official initialize_fleet API instead of directly manipulating + parallel_state internal variables. + """ + from paddle.distributed import fleet + from paddlefleet.parallel_state import get_tensor_model_parallel_group + from paddlefleet.training import initialize_fleet + + # Only call initialize_fleet when the TP group has not been initialized yet + if get_tensor_model_parallel_group is not None and get_tensor_model_parallel_group(False) is None: + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": parallel_config.data_parallel_size, + "mp_degree": parallel_config.tensor_parallel_size, + "pp_degree": 1, + "sep_degree": 1, + "ep_degree": parallel_config.expert_parallel_size, + } + initialize_fleet(strategy) + logger.info( + f"Initialized PaddleFleet parallel_state via initialize_fleet " + f"(dp={parallel_config.data_parallel_size}, " + f"mp={parallel_config.tensor_parallel_size}, " + f"ep={parallel_config.expert_parallel_size}, " + f"sp={parallel_config.sequence_parallel})" + ) + + def _sync_config_from_text_config(self) -> None: + """ + Sync important config values from text_config (PaddleFormers/HF config) + to model_config. This ensures fallback models use their actual config + values instead of FD's defaults. + + This is crucial for models with unique configs like: + - Gemma3: tie_word_embeddings=True, layer_types, sliding_window + - Mistral: sliding_window + - etc. + """ + mc = self.model_config + tc = self.paddleformers_config + + sync_fields = [ + "tie_word_embeddings", + "sliding_window", + "sliding_window_pattern", + "layer_types", # May be computed as property + "rope_theta", + "rope_scaling", + "head_dim", + "rms_norm_eps", + "rope_local_base_freq", # Gemma3 specific + "query_pre_attn_scalar", # Gemma3 specific + ] + + synced = [] + for field in sync_fields: + text_value = getattr(tc, field, None) + if text_value is not None: + # Only sync if not already set or if FD default differs + current_value = getattr(mc, field, None) if hasattr(mc, field) else None + if current_value is None or current_value != text_value: + setattr(mc, field, text_value) + synced.append(f"{field}={text_value}") + + def embed_input_ids(self, input_ids: paddle.Tensor) -> paddle.Tensor: + """Embed input_ids using the model's embedding layer.""" + # PaddleFleet PipelineLayer does not support get_input_embeddings(). + # Find the GPTEmbedding layer directly from run_function. + embedding_layer = None + if hasattr(self.model, "run_function"): + for layer in self.model.run_function: + if isinstance(layer, GPTEmbedding): + embedding_layer = layer + break + if embedding_layer is None: + raise RuntimeError("Cannot find GPTEmbedding layer in model.run_function") + + original_ndim = input_ids.ndim + if input_ids.ndim == 1: + input_ids = input_ids.unsqueeze(0) # [num_tokens] -> [1, num_tokens] + + model_input = {"input_ids": input_ids} + result = embedding_layer(model_input) + inputs_embeds = result["hidden_states"] + + # Embedding output is [batch, seq, h], squeeze back to [num_tokens, h] + if original_ndim == 1 and inputs_embeds.ndim == 3: + inputs_embeds = inputs_embeds.squeeze(0) + + if hasattr(self, "embed_scale") and self.embed_scale is not None: + inputs_embeds *= self.embed_scale + return inputs_embeds + + @paddle.no_grad() + def forward( + self, + inputs: Dict, + forward_meta: ForwardMeta, + **kwargs, + ): + """Full transformer forward: input_ids -> hidden_states. + + This method is the primary forward pass for the model, computing: + 1. Position IDs based on seq_lens_decoder (absolute positions for RoPE) + 2. Token embeddings via embed_input_ids + 3. Transformer layers via self.model() + + Returns: + hidden_states: [TotalTokens, HiddenDim] + """ + ids_remove_padding = inputs["ids_remove_padding"] + num_tokens = ids_remove_padding.shape[0] + + batch_id_per_token = forward_meta.batch_id_per_token # [num_tokens] + seq_lens_decoder = forward_meta.seq_lens_decoder # [batch_size, 1] + + if batch_id_per_token is not None and seq_lens_decoder is not None: + decoder_offsets = seq_lens_decoder.squeeze(-1) # [batch_size] + # Ensure decoder_offsets is at least 1D tensor + if decoder_offsets.ndim == 0: + decoder_offsets = decoder_offsets.reshape([1]) + token_decoder_offsets = paddle.index_select( + decoder_offsets, batch_id_per_token, axis=0 + ) # [num_tokens] + + cu_seqlens = forward_meta.cu_seqlens_q # [batch_size + 1] + if cu_seqlens is not None: + token_global_idx = paddle.arange(num_tokens, dtype="int64") + request_start_idx = paddle.index_select(cu_seqlens[:-1], batch_id_per_token, axis=0) + relative_positions = token_global_idx - request_start_idx.astype("int64") + else: + relative_positions = paddle.zeros([num_tokens], dtype="int64") + position_ids = token_decoder_offsets.astype("int64") + relative_positions + else: + position_ids = paddle.arange(num_tokens, dtype="int64") + if seq_lens_decoder is not None: + position_ids = position_ids + seq_lens_decoder[0, 0].astype("int64") + forward_meta.rope_already_applied = True + + # Also set forward_meta on each TransformerLayer's config + # so that FastDeployAttention can retrieve it from core_attn.config + if hasattr(self.model, "run_function"): + for layer in self.model.run_function: + if not isinstance(layer, (GPTEmbedding, GPTLMHead)): + if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "core_attention"): + core_attn = layer.self_attn.core_attention + if hasattr(core_attn, "config"): + core_attn.config.forward_meta = forward_meta + + inputs_embeds = self.embed_input_ids(ids_remove_padding).unsqueeze(0) + + # Build input dict, PipelineLayer passes data between layers via dict + model_input = { + "input_ids": None, + "position_ids": position_ids, + } + # Add other parameters from kwargs + for k, v in kwargs.items(): + if v is not None: + model_input[k] = v + + # Iterate over run_function, skip GPTLMHead + # Only call TransformerLayer for layer in self.model.run_function: - if isinstance(layer, GPTEmbedding): - embedding_layer = layer - break - if embedding_layer is None: - raise RuntimeError("Cannot find GPTEmbedding layer in model.run_function") - - original_ndim = input_ids.ndim - if input_ids.ndim == 1: - input_ids = input_ids.unsqueeze(0) # [num_tokens] -> [1, num_tokens] - - model_input = {"input_ids": input_ids} - result = embedding_layer(model_input) - inputs_embeds = result["hidden_states"] - - # Embedding output is [batch, seq, h], squeeze back to [num_tokens, h] - if original_ndim == 1 and inputs_embeds.ndim == 3: - inputs_embeds = inputs_embeds.squeeze(0) - - if hasattr(self, "embed_scale") and self.embed_scale is not None: - inputs_embeds *= self.embed_scale - return inputs_embeds - - @paddle.no_grad() - def forward( - self, - inputs: Dict, - forward_meta: ForwardMeta, - **kwargs, + if isinstance(layer, GPTLMHead): + continue + if isinstance(layer, (GPTEmbedding)): + model_input = layer(model_input, decoder_input=inputs_embeds) + else: + model_input = layer(model_input) + + hidden_states = model_input["hidden_states"] + # [b, s, h] -> [s, h] (b=1) + hidden_states = hidden_states.squeeze(0) + + return hidden_states + + @paddle.no_grad() + def load_weights(self, weights: Iterable[tuple[str, paddle.Tensor]]): + # use model.from_pretrained to load weight + pass + + def set_state_dict(self, state_dict): + self.model.set_state_dict(state_dict) + + # ============================================================================ + # PaddleFleet Attention Patch Functions + # ============================================================================ + + def patch_paddlefleet_core_attention( + model, + fd_config: "FDConfig", + layers_to_patch: list[int] | None = None, ): - """Full transformer forward: input_ids -> hidden_states. + """ + Replace core_attention in all TransformerLayers of a PaddleFleet model with FastDeployAttention. - This method is the primary forward pass for the model, computing: - 1. Position IDs based on seq_lens_decoder (absolute positions for RoPE) - 2. Token embeddings via embed_input_ids - 3. Transformer layers via self.model() + Args: + model: PaddleFleet model instance (inheriting from PipelineLayer) + fd_config: FastDeploy FDConfig object, used to create Attention instances + layers_to_patch: List of layer indices to patch, None means patch all layers Returns: - hidden_states: [TotalTokens, HiddenDim] + int: Number of layers successfully patched + + Raises: + ValueError: If the model structure is unexpected or parameters are incorrect """ - ids_remove_padding = inputs["ids_remove_padding"] - num_tokens = ids_remove_padding.shape[0] - - batch_id_per_token = forward_meta.batch_id_per_token # [num_tokens] - seq_lens_decoder = forward_meta.seq_lens_decoder # [batch_size, 1] - - if batch_id_per_token is not None and seq_lens_decoder is not None: - decoder_offsets = seq_lens_decoder.squeeze(-1) # [batch_size] - # Ensure decoder_offsets is at least 1D tensor - if decoder_offsets.ndim == 0: - decoder_offsets = decoder_offsets.reshape([1]) - token_decoder_offsets = paddle.index_select(decoder_offsets, batch_id_per_token, axis=0) # [num_tokens] - - cu_seqlens = forward_meta.cu_seqlens_q # [batch_size + 1] - if cu_seqlens is not None: - token_global_idx = paddle.arange(num_tokens, dtype="int64") - request_start_idx = paddle.index_select(cu_seqlens[:-1], batch_id_per_token, axis=0) - relative_positions = token_global_idx - request_start_idx.astype("int64") - else: - relative_positions = paddle.zeros([num_tokens], dtype="int64") - position_ids = token_decoder_offsets.astype("int64") + relative_positions - else: - position_ids = paddle.arange(num_tokens, dtype="int64") - if seq_lens_decoder is not None: - position_ids = position_ids + seq_lens_decoder[0, 0].astype("int64") - forward_meta.rope_already_applied = True - - # Also set forward_meta on each TransformerLayer's config - # so that FastDeployAttention can retrieve it from core_attn.config - if hasattr(self.model, "run_function"): - for layer in self.model.run_function: - if not isinstance(layer, (GPTEmbedding, GPTLMHead)): - if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "core_attention"): - core_attn = layer.self_attn.core_attention - if hasattr(core_attn, "config"): - core_attn.config.forward_meta = forward_meta - - inputs_embeds = self.embed_input_ids(ids_remove_padding).unsqueeze(0) - - # Build input dict, PipelineLayer passes data between layers via dict - model_input = { - "input_ids": None, - "position_ids": position_ids, - } - # Add other parameters from kwargs - for k, v in kwargs.items(): - if v is not None: - model_input[k] = v - - # Iterate over run_function, skip GPTLMHead - # Only call TransformerLayer - for layer in self.model.run_function: - if isinstance(layer, GPTLMHead): + if fd_config is None: + raise ValueError("fd_config must be provided") + + from fastdeploy.model_executor.layers.attention.attention import Attention + + # Iterate over run_function to find TransformerLayers + patched_count = 0 + transformer_layers = [] + + # Collect all TransformerLayers + if hasattr(model, "run_function"): + for layer in model.run_function: + # Try to identify TransformerLayer + layer_type = type(layer).__name__ + if "TransformerLayer" in layer_type or "transformer" in str(type(layer)): + transformer_layers.append(layer) + + if not transformer_layers: + # Try alternative ways to find layers + for name, module in model.named_sublayers(): + if "TransformerLayer" in type(module).__name__: + transformer_layers.append(module) + + if not transformer_layers: + raise ValueError("No TransformerLayer found in model") + + # Patch core_attention for each TransformerLayer + for layer in transformer_layers: + layer_number = getattr(layer, "layer_number", None) + if layer_number is None: + layer_number = getattr(layer, "layer_id", None) + + if layer_number is None: + logger.warning("layer_number not found, skip patching...") + continue # Skip layers where layer_id cannot be obtained + + # Check if this layer needs to be patched + if layers_to_patch is not None and (layer_number) not in layers_to_patch: continue - if isinstance(layer, (GPTEmbedding)): - model_input = layer(model_input, decoder_input=inputs_embeds) - else: - model_input = layer(model_input) - hidden_states = model_input["hidden_states"] - # [b, s, h] -> [s, h] (b=1) - hidden_states = hidden_states.squeeze(0) - - return hidden_states + # Get core_attention + if not hasattr(layer, "self_attn"): + logger.warning(f"self_attn not found in layer {layer_number}, skip patching...") + continue - @paddle.no_grad() - def load_weights(self, weights: Iterable[tuple[str, paddle.Tensor]]): - # use model.from_pretrained to load weight - pass + core_attn = layer.self_attn.core_attention + if core_attn is None: + logger.warning(f"core_attn not found in layer {layer_number}, skip patching...") + continue - def set_state_dict(self, state_dict): - self.model.set_state_dict(state_dict) + # Get configuration info + # Prefer per-partition values (values after TP sharding), + # because PaddleFleet's QKV output is already per-partition when TP>1 + num_attention_heads = getattr( + core_attn, "num_attention_heads_per_partition", getattr(core_attn.config, "num_attention_heads", None) + ) + num_key_value_heads = getattr( + core_attn, + "num_query_groups_per_partition", + getattr(core_attn.config, "num_key_value_heads", num_attention_heads), + ) + hidden_size_per_attention_head = getattr(core_attn, "hidden_size_per_attention_head", None) + if hidden_size_per_attention_head is not None: + softmax_scale = getattr(core_attn, "softmax_scale", 1.0 / math.sqrt(hidden_size_per_attention_head)) + else: + softmax_scale = 1.0 + hidden_size_per_partition = getattr(core_attn, "hidden_size_per_partition", None) + if hidden_size_per_partition is None: + head_dim = getattr(core_attn, "hidden_size_per_attention_head", hidden_size_per_attention_head) + hidden_size_per_partition = num_attention_heads * head_dim -# ============================================================================ -# PaddleFleet Attention Patch Functions -# ============================================================================ + # Get FastDeploy layer ID (0-indexed) + fd_layer_id = layer_number + # Create Attention instance inside FastDeployAttention + fd_attn_instance = Attention( + fd_config=fd_config, + layer_id=fd_layer_id, + ) -def patch_paddlefleet_core_attention( - model, - fd_config: "FDConfig", - layers_to_patch: list[int] | None = None, -): - """ - Replace core_attention in all TransformerLayers of a PaddleFleet model with FastDeployAttention. + # Override Attention instance's head config to match PaddleFleet model + # This is necessary because fd_config.model_config may differ from PaddleFleet model config + fd_attn_instance.num_heads = num_attention_heads + fd_attn_instance.kv_num_heads = num_key_value_heads + fd_attn_instance.head_dim = hidden_size_per_attention_head + logger.info( + f"Overriding Attention config: num_heads={num_attention_heads}, kv_num_heads={num_key_value_heads}, head_dim={hidden_size_per_attention_head}" + ) - Args: - model: PaddleFleet model instance (inheriting from PipelineLayer) - fd_config: FastDeploy FDConfig object, used to create Attention instances - layers_to_patch: List of layer indices to patch, None means patch all layers + # Create FastDeployAttention object and directly replace core_attention + fast_deploy_core_attn = FastDeployAttention( + config=core_attn.config, + fd_attention=fd_attn_instance, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + softmax_scale=softmax_scale, + hidden_size_per_attention_head=hidden_size_per_attention_head, + hidden_size_per_partition=hidden_size_per_partition, + layer_id=fd_layer_id, + ) - Returns: - int: Number of layers successfully patched + # Replace core_attention object + layer.self_attn.core_attention = fast_deploy_core_attn - Raises: - ValueError: If the model structure is unexpected or parameters are incorrect - """ - if fd_config is None: - raise ValueError("fd_config must be provided") + patched_count += 1 + logger.info(f"Replaced core_attention with FastDeployAttention for layer {fd_layer_id}") - from fastdeploy.model_executor.layers.attention.attention import Attention + logger.info(f"Successfully replaced {patched_count} core_attention layers with FastDeployAttention") - # Iterate over run_function to find TransformerLayers - patched_count = 0 - transformer_layers = [] - - # Collect all TransformerLayers - if hasattr(model, "run_function"): - for layer in model.run_function: - # Try to identify TransformerLayer - layer_type = type(layer).__name__ - if "TransformerLayer" in layer_type or "transformer" in str(type(layer)): - transformer_layers.append(layer) - - if not transformer_layers: - # Try alternative ways to find layers - for name, module in model.named_sublayers(): - if "TransformerLayer" in type(module).__name__: - transformer_layers.append(module) - - if not transformer_layers: - raise ValueError("No TransformerLayer found in model") - - # Patch core_attention for each TransformerLayer - for layer in transformer_layers: - layer_number = getattr(layer, "layer_number", None) - if layer_number is None: - layer_number = getattr(layer, "layer_id", None) - - if layer_number is None: - logger.warning("layer_number not found, skip patching...") - continue # Skip layers where layer_id cannot be obtained - - # Check if this layer needs to be patched - if layers_to_patch is not None and (layer_number) not in layers_to_patch: - continue - - # Get core_attention - if not hasattr(layer, "self_attn"): - logger.warning(f"self_attn not found in layer {layer_number}, skip patching...") - continue - - core_attn = layer.self_attn.core_attention - if core_attn is None: - logger.warning(f"core_attn not found in layer {layer_number}, skip patching...") - continue - - # Get configuration info - # Prefer per-partition values (values after TP sharding), - # because PaddleFleet's QKV output is already per-partition when TP>1 - num_attention_heads = getattr( - core_attn, "num_attention_heads_per_partition", getattr(core_attn.config, "num_attention_heads", None) - ) - num_key_value_heads = getattr( - core_attn, - "num_query_groups_per_partition", - getattr(core_attn.config, "num_key_value_heads", num_attention_heads), - ) - hidden_size_per_attention_head = getattr(core_attn, "hidden_size_per_attention_head", None) - if hidden_size_per_attention_head is not None: - softmax_scale = getattr(core_attn, "softmax_scale", 1.0 / math.sqrt(hidden_size_per_attention_head)) - else: - softmax_scale = 1.0 - - hidden_size_per_partition = getattr(core_attn, "hidden_size_per_partition", None) - if hidden_size_per_partition is None: - head_dim = getattr(core_attn, "hidden_size_per_attention_head", hidden_size_per_attention_head) - hidden_size_per_partition = num_attention_heads * head_dim - - # Get FastDeploy layer ID (0-indexed) - fd_layer_id = layer_number - - # Create Attention instance inside FastDeployAttention - fd_attn_instance = Attention( - fd_config=fd_config, - layer_id=fd_layer_id, - ) - - # Override Attention instance's head config to match PaddleFleet model - # This is necessary because fd_config.model_config may differ from PaddleFleet model config - fd_attn_instance.num_heads = num_attention_heads - fd_attn_instance.kv_num_heads = num_key_value_heads - fd_attn_instance.head_dim = hidden_size_per_attention_head - logger.info( - f"Overriding Attention config: num_heads={num_attention_heads}, kv_num_heads={num_key_value_heads}, head_dim={hidden_size_per_attention_head}" - ) - - # Create FastDeployAttention object and directly replace core_attention - fast_deploy_core_attn = FastDeployAttention( - config=core_attn.config, - fd_attention=fd_attn_instance, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - softmax_scale=softmax_scale, - hidden_size_per_attention_head=hidden_size_per_attention_head, - hidden_size_per_partition=hidden_size_per_partition, - layer_id=fd_layer_id, - ) - - # Replace core_attention object - layer.self_attn.core_attention = fast_deploy_core_attn - - patched_count += 1 - logger.info(f"Replaced core_attention with FastDeployAttention for layer {fd_layer_id}") - - logger.info(f"Successfully replaced {patched_count} core_attention layers with FastDeployAttention") - - return patched_count + return patched_count From e28b10f0e75775878cc82708889ab7eb2f566fb8 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Mon, 25 May 2026 14:20:10 +0800 Subject: [PATCH 06/19] revert print --- fastdeploy/config.py | 3 --- fastdeploy/model_executor/model_loader/default_loader_v1.py | 1 - fastdeploy/model_executor/models/model_base.py | 2 +- fastdeploy/model_executor/models/paddleformers/base_fleet.py | 4 +++- fastdeploy/worker/gpu_model_runner.py | 3 --- 5 files changed, 4 insertions(+), 9 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 2b14b6366a8..a1edc2ba670 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -304,9 +304,6 @@ def _post_init(self): self.is_unified_ckpt = check_unified_ckpt(self.model) self.runner_type = self._get_runner_type(self.architectures, self.runner) self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert) - print("self.architectures ", self.architectures) - print("self.runner_type ", self.runner_type) - print("self.convert_type ", self.convert_type) registry = self.registry is_generative_model = registry.is_text_generation_model(self.architectures, self) is_pooling_model = registry.is_pooling_model(self.architectures, self) diff --git a/fastdeploy/model_executor/model_loader/default_loader_v1.py b/fastdeploy/model_executor/model_loader/default_loader_v1.py index 6a3b2639af6..1d5c19a5e24 100644 --- a/fastdeploy/model_executor/model_loader/default_loader_v1.py +++ b/fastdeploy/model_executor/model_loader/default_loader_v1.py @@ -93,7 +93,6 @@ def load_model(self, fd_config: FDConfig) -> nn.Layer: model_cls = as_embedding_model(model_cls) else: assert_never(convert_type) - print("model_class = ", model_cls) model = model_cls(fd_config) if fd_config.load_config.dynamic_load_weight or fd_config.model_config.enable_cache: process_final_after_loading(model, fd_config) diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index cbd622b6f7b..0acbeebc314 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -203,7 +203,7 @@ def _try_resolve_paddleformers( raise ImportError( "paddlefleet backend requires paddlefleet to be installed.\n" "Please install with [change cuda version if needed ]:\n" - "python -m pip install paddlefleet==0.3.0.dev20260507' " + "python -m pip install paddlefleet==0.3.0.dev20260507" "--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ " "--extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/" ) diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index 54acd6e6b12..5f43389f963 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -157,7 +157,6 @@ def squeeze_to_3d(t: paddle.Tensor, name: str) -> paddle.Tensor: v = squeeze_to_3d(value, "value") if is_mla: - # 同时兼容 CUDA Graph 和非 CUDA Graph 模式 need_do_prefill = forward_meta.max_len_tensor_cpu[1] > 0 need_do_decode = forward_meta.max_len_tensor_cpu[2] > 0 @@ -294,6 +293,9 @@ def __init__(self, fd_config: "FDConfig", **kwargs): self.paddleformers_config.tensor_model_parallel_size = parallel_config.tensor_parallel_size self.paddleformers_config.sequence_parallel = parallel_config.sequence_parallel self.paddleformers_config.expert_model_parallel_size = parallel_config.expert_parallel_size + # if parallel_config.expert_parallel_size > 1 and parallel_config.sequence_parallel == False: + # self.paddleformers_config.tensor_model_parallel_size = 1 + # logger.warning("When using expert parallelism and tensor parallelism, sequence parallelism must be used in fleet set tp=1 .") self.paddleformers_config.parallel_output = self.paddleformers_config.tensor_model_parallel_size == 1 self.paddleformers_config.max_seq_len = self.model_config.max_model_len self.paddleformers_config.params_dtype = "bfloat16" diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 886648e48ef..c607a0e2880 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1296,7 +1296,6 @@ def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_p token_num, real_bsz = self._resolve_current_launch_token_num( cached_token_num, cached_real_bsz, token_num_event, is_dummy_or_profile_run ) - print("self.share_inputs[input_ids] ", self.share_inputs["input_ids"]) ( ids_remove_padding, batch_id_per_token, @@ -1314,7 +1313,6 @@ def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_p self.share_inputs["seq_lens_encoder"], self.share_inputs["seq_lens_decoder"], ) - print("ids_remove_padding", ids_remove_padding) self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) # NOTE: (changwenbin) Initialized to max_num_seq '-1' before copying, marking illegal positions self.share_inputs["batch_id_per_token"][:] = -1 @@ -2502,7 +2500,6 @@ def _preprocess( def _execute(self, model_inputs: Dict[str, paddle.Tensor]) -> None: model_output = None if model_inputs is not None and len(model_inputs) > 0: - print("model_inputs", model_inputs) model_output = self.model( model_inputs, self.forward_meta, From 5de13a6282a9e6b8fa6564f98bb480a4880f7786 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Mon, 25 May 2026 16:06:44 +0800 Subject: [PATCH 07/19] fix fleet import in base_fleet.py --- .../models/paddleformers/base_fleet.py | 1270 +++++++++-------- 1 file changed, 637 insertions(+), 633 deletions(-) diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index 5f43389f963..8631e780e1a 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -16,666 +16,670 @@ """Generic PaddleFormers modeling backend base class.""" -import math -from collections.abc import Iterable -from typing import TYPE_CHECKING, Dict - -import paddle -from paddle import nn -from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding -from paddlefleet.models.gpt.lm_head import GPTLMHead -from paddlefleet.transformer.layer import FleetLayer -from paddlefleet.transformer.transformer_config import TransformerConfig -from paddleformers.trainer.trainer_utils import set_random_seed -from paddleformers.transformers import AutoConfig -from paddleformers.transformers.auto.modeling import AutoModelForCausalLM -from paddleformers.utils.log import logger - -from fastdeploy.model_executor.forward_meta import ForwardMeta # noqa: F401 -from fastdeploy.model_executor.graph_optimization.decorator import ( - support_graph_optimization, -) - -if TYPE_CHECKING: - from fastdeploy.config import FDConfig - - -from fastdeploy.model_executor.layers.attention.attention import Attention - - -class FastDeployAttention(FleetLayer): - """ - FastDeploy version of DotProductAttention, holding an internal FastDeploy Attention module. - - This class can be used to replace PaddleFleet's DotProductAttention, - using FastDeploy's attention backend for computation. - """ - - def __init__( - self, - config: TransformerConfig, - fd_attention: Attention, - num_attention_heads: int, - num_key_value_heads: int, - softmax_scale: float, - hidden_size_per_attention_head: int, - hidden_size_per_partition: int, - layer_id: int, - ): - """ - Initialize FastDeployAttention. +import logging + +from fastdeploy.model_executor.utils import is_paddlefleet_available + +if not is_paddlefleet_available(): + logging.warning("paddlefleet is not installed, skipping base_fleet module") +else: + import math + from collections.abc import Iterable + from typing import TYPE_CHECKING, Dict + + import paddle + from paddle import nn + from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding + from paddlefleet.models.gpt.lm_head import GPTLMHead + from paddlefleet.transformer.layer import FleetLayer + from paddlefleet.transformer.transformer_config import TransformerConfig + from paddleformers.trainer.trainer_utils import set_random_seed + from paddleformers.transformers import AutoConfig + from paddleformers.transformers.auto.modeling import AutoModelForCausalLM + from paddleformers.utils.log import logger + + from fastdeploy.model_executor.forward_meta import ForwardMeta # noqa: F401 + from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, + ) + + if TYPE_CHECKING: + from fastdeploy.config import FDConfig - Args: - fd_attention: FastDeploy Attention instance - num_attention_heads: Number of attention heads - num_key_value_heads: Number of KV heads - softmax_scale: Softmax scaling factor - hidden_size_per_attention_head: Hidden dimension per attention head - hidden_size_per_partition: Hidden size per partition - layer_id: Current layer ID - """ - super().__init__(config) - self.fd_attention = fd_attention - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.softmax_scale = softmax_scale - self.hidden_size_per_attention_head = hidden_size_per_attention_head - self.hidden_size_per_partition = hidden_size_per_partition - self.layer_id = layer_id - - def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - attention_mask: paddle.Tensor, - attn_mask_startend_row_indices: paddle.Tensor = None, - attn_mask_type=None, - attention_bias: paddle.Tensor = None, - packed_seq_params=None, - use_rr_flash_attention: bool = False, - x: paddle.Tensor = None, - qr: paddle.Tensor = None, - kv_compressed: paddle.Tensor = None, - k_pos_emb: paddle.Tensor = None, - q_absorbed: paddle.Tensor = None, - v_b_proj_weight: paddle.Tensor = None, - ): - """ - Forward pass. + from fastdeploy.model_executor.layers.attention.attention import Attention - Args: - query: Query tensor, supported formats: - - 4D BSHD: [b, sq, np, hn] (PaddleFleet default) - - 3D HSD: [np, sq, hn] - - 3D SHD: [sq, np, hn] - key: Key tensor, same format as above, but head count may differ (GQA) - value: Value tensor, same format as above - attention_mask: Attention mask - attn_mask_startend_row_indices: FlashMask start-end row indices - attn_mask_type: Attention mask type - attention_bias: Attention bias - packed_seq_params: Packed sequence parameters - use_rr_flash_attention: Whether to use RR Flash Attention - kv_compressed: Compressed KV tensor for MLA (Multi-Latent Attention) + class FastDeployAttention(FleetLayer): + """ + FastDeploy version of DotProductAttention, holding an internal FastDeploy Attention module. - Returns: - Attention output tensor + This class can be used to replace PaddleFleet's DotProductAttention, + using FastDeploy's attention backend for computation. """ - # Try to get forward_meta from config (PaddleFleet does not pass this parameter when calling) - forward_meta = getattr(self.config, "forward_meta", None) - assert forward_meta is not None, "forward_meta must be provided" - - # Set scaling factor - original_scale = getattr(self.fd_attention, "scale", None) - if original_scale is None: - self.fd_attention.scale = self.softmax_scale - - # Check if MLA mode is enabled - is_mla = getattr(self.config, "multi_latent_attention", False) - - try: - # Refer to the processing logic of fastdeploy_append_attention_forward - # Support 3D (SHD) and 4D (BSHD) input - - # 4D input: squeeze to 3D (only supports batch=1) - def squeeze_to_3d(t: paddle.Tensor, name: str) -> paddle.Tensor: - if t is None: - return None - if t.ndim == 4: - if int(t.shape[0]) != 1: - raise ValueError( - f"{name} batch size {int(t.shape[0])} not supported, only batch=1 is supported" - ) - return t.squeeze(0) - if t.ndim == 3: - return t - raise ValueError(f"{name} has unexpected dims {t.ndim}, expect 3 or 4") - - q = squeeze_to_3d(query, "query") - k = squeeze_to_3d(key, "key") - v = squeeze_to_3d(value, "value") - - if is_mla: - need_do_prefill = forward_meta.max_len_tensor_cpu[1] > 0 - need_do_decode = forward_meta.max_len_tensor_cpu[2] > 0 - - # MLA mode: pass q, k, v, compressed_kv, k_pe separately - # Reference: deepseek_v3.py line 389 - # - # Note: - # - Prefill (flash_attn_func): expects 3D tensors [seq, heads, dim] - # - Decode (multi_head_latent_attention): expects 2D tensors [seq, heads*dim] - # So we need to flatten q for decode phase only - - # Process compressed_kv and k_pe - - assert kv_compressed is not None, "kv_compressed must be provided when use" - compressed_kv = kv_compressed.squeeze(0) - k_pos_emb = k_pos_emb.squeeze(0) - - output = None - fmqa_out = None - if need_do_prefill: - # Prefill: keep 3D tensors for flash_attn_func - output = self.fd_attention.forward( - q=q, - k=k, - v=v, - qkv=None, - compressed_kv=compressed_kv, - k_pe=k_pos_emb, - forward_meta=forward_meta, - ) - output.reshape_([output.shape[0], output.shape[1] * output.shape[2]]) - - if need_do_decode: - # Decode: use absorbed q for multi_head_latent_attention C++ kernel - # q_absorbed: [s, heads, kv_lora_rank + qk_rope_head_dim] (after squeeze_to_3d) - # C++ kernel expects: [token_num, heads * (kv_lora_rank + qk_rope_head_dim)] - q_abs = squeeze_to_3d(q_absorbed, "q_absorbed") if q_absorbed.ndim == 4 else q_absorbed - seq_len = int(q_abs.shape[0]) - q_input = q_abs.reshape([seq_len, -1]) - - fmqa_out = self.fd_attention.forward( - q=q_input, - k=None, - v=None, - qkv=None, - compressed_kv=compressed_kv, - k_pe=k_pos_emb, - forward_meta=forward_meta, - ) - - # V de-absorption: kernel output [token, heads * kv_lora_rank] - # -> [heads, token, kv_lora_rank] @ wv_b [heads, kv_lora_rank, v_head_dim] - # -> [token, heads * v_head_dim] - kv_lora_rank = self.config.kv_lora_rank - v_head_dim = self.config.v_head_dim - num_heads = fmqa_out.shape[-1] // kv_lora_rank - fmqa_out = fmqa_out.reshape([-1, num_heads, kv_lora_rank]).transpose([1, 0, 2]) - fmqa_out = paddle.bmm(fmqa_out, v_b_proj_weight) - fmqa_out = fmqa_out.transpose([1, 0, 2]).reshape([-1, num_heads * v_head_dim]) - # Merge prefill and decode outputs if both are present - if need_do_prefill: - try: - from fastdeploy.model_executor.ops.gpu import ( - merge_prefill_decode_output, - ) - merge_prefill_decode_output( - output, - fmqa_out, - forward_meta.seq_lens_encoder, - forward_meta.seq_lens_decoder, - forward_meta.seq_lens_this_time, - forward_meta.cu_seqlens_q, - num_heads, - v_head_dim, - 1, + def __init__( + self, + config: TransformerConfig, + fd_attention: Attention, + num_attention_heads: int, + num_key_value_heads: int, + softmax_scale: float, + hidden_size_per_attention_head: int, + hidden_size_per_partition: int, + layer_id: int, + ): + """ + Initialize FastDeployAttention. + + Args: + fd_attention: FastDeploy Attention instance + num_attention_heads: Number of attention heads + num_key_value_heads: Number of KV heads + softmax_scale: Softmax scaling factor + hidden_size_per_attention_head: Hidden dimension per attention head + hidden_size_per_partition: Hidden size per partition + layer_id: Current layer ID + """ + super().__init__(config) + self.fd_attention = fd_attention + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.softmax_scale = softmax_scale + self.hidden_size_per_attention_head = hidden_size_per_attention_head + self.hidden_size_per_partition = hidden_size_per_partition + self.layer_id = layer_id + + def forward( + self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + attention_mask: paddle.Tensor, + attn_mask_startend_row_indices: paddle.Tensor = None, + attn_mask_type=None, + attention_bias: paddle.Tensor = None, + packed_seq_params=None, + use_rr_flash_attention: bool = False, + x: paddle.Tensor = None, + qr: paddle.Tensor = None, + kv_compressed: paddle.Tensor = None, + k_pos_emb: paddle.Tensor = None, + q_absorbed: paddle.Tensor = None, + v_b_proj_weight: paddle.Tensor = None, + ): + """ + Forward pass. + + Args: + query: Query tensor, supported formats: + - 4D BSHD: [b, sq, np, hn] (PaddleFleet default) + - 3D HSD: [np, sq, hn] + - 3D SHD: [sq, np, hn] + key: Key tensor, same format as above, but head count may differ (GQA) + value: Value tensor, same format as above + attention_mask: Attention mask + attn_mask_startend_row_indices: FlashMask start-end row indices + attn_mask_type: Attention mask type + attention_bias: Attention bias + packed_seq_params: Packed sequence parameters + use_rr_flash_attention: Whether to use RR Flash Attention + kv_compressed: Compressed KV tensor for MLA (Multi-Latent Attention) + + Returns: + Attention output tensor + """ + # Try to get forward_meta from config (PaddleFleet does not pass this parameter when calling) + forward_meta = getattr(self.config, "forward_meta", None) + assert forward_meta is not None, "forward_meta must be provided" + + # Set scaling factor + original_scale = getattr(self.fd_attention, "scale", None) + if original_scale is None: + self.fd_attention.scale = self.softmax_scale + + # Check if MLA mode is enabled + is_mla = getattr(self.config, "multi_latent_attention", False) + + try: + # Refer to the processing logic of fastdeploy_append_attention_forward + # Support 3D (SHD) and 4D (BSHD) input + + # 4D input: squeeze to 3D (only supports batch=1) + def squeeze_to_3d(t: paddle.Tensor, name: str) -> paddle.Tensor: + if t is None: + return None + if t.ndim == 4: + if int(t.shape[0]) != 1: + raise ValueError( + f"{name} batch size {int(t.shape[0])} not supported, only batch=1 is supported" ) - except (ImportError, AttributeError): - logger.warning("merge_prefill_decode_output not available, using decode output only") - output = fmqa_out - else: - output = fmqa_out - # _log_md5(output, "output") - else: - # Standard mode: concatenate QKV - seq_len = int(q.shape[0]) - - # SHD: [seq, heads, dim] -> flatten to [seq, heads*dim] - q_flat = q.reshape([seq_len, -1]) - k_flat = k.reshape([seq_len, -1]) - v_flat = v.reshape([seq_len, -1]) - - # Concatenate QKV: [seq, (q_heads + kv_heads + kv_heads) * head_dim] - qkv = paddle.concat([q_flat, k_flat, v_flat], axis=-1) - - # _log_md5(qkv, "qkv_out") - output = self.fd_attention.forward(qkv=qkv, forward_meta=forward_meta) - - # _log_md5(output, "atten_out") - # Restore batch dimension: [seq, hidden] -> [b, seq, hidden] - # PaddleFleet expects 3D output format - output = output.unsqueeze(0) + return t.squeeze(0) + if t.ndim == 3: + return t + raise ValueError(f"{name} has unexpected dims {t.ndim}, expect 3 or 4") + + q = squeeze_to_3d(query, "query") + k = squeeze_to_3d(key, "key") + v = squeeze_to_3d(value, "value") + + if is_mla: + need_do_prefill = forward_meta.max_len_tensor_cpu[1] > 0 + need_do_decode = forward_meta.max_len_tensor_cpu[2] > 0 + + # MLA mode: pass q, k, v, compressed_kv, k_pe separately + # Reference: deepseek_v3.py line 389 + # + # Note: + # - Prefill (flash_attn_func): expects 3D tensors [seq, heads, dim] + # - Decode (multi_head_latent_attention): expects 2D tensors [seq, heads*dim] + # So we need to flatten q for decode phase only + + # Process compressed_kv and k_pe + + assert kv_compressed is not None, "kv_compressed must be provided when use" + compressed_kv = kv_compressed.squeeze(0) + k_pos_emb = k_pos_emb.squeeze(0) + + output = None + fmqa_out = None + if need_do_prefill: + # Prefill: keep 3D tensors for flash_attn_func + output = self.fd_attention.forward( + q=q, + k=k, + v=v, + qkv=None, + compressed_kv=compressed_kv, + k_pe=k_pos_emb, + forward_meta=forward_meta, + ) + output.reshape_([output.shape[0], output.shape[1] * output.shape[2]]) + + if need_do_decode: + # Decode: use absorbed q for multi_head_latent_attention C++ kernel + # q_absorbed: [s, heads, kv_lora_rank + qk_rope_head_dim] (after squeeze_to_3d) + # C++ kernel expects: [token_num, heads * (kv_lora_rank + qk_rope_head_dim)] + q_abs = squeeze_to_3d(q_absorbed, "q_absorbed") if q_absorbed.ndim == 4 else q_absorbed + seq_len = int(q_abs.shape[0]) + q_input = q_abs.reshape([seq_len, -1]) + + fmqa_out = self.fd_attention.forward( + q=q_input, + k=None, + v=None, + qkv=None, + compressed_kv=compressed_kv, + k_pe=k_pos_emb, + forward_meta=forward_meta, + ) - return output - finally: - # Restore original scale - if original_scale is None: - if hasattr(self.fd_attention, "scale"): - delattr(self.fd_attention, "scale") - else: - self.fd_attention.scale = original_scale - - -@support_graph_optimization -class PaddleFleetModelBase(nn.Layer): - """ - A mixin-style base class to provide PaddleFormers backend logic on top of nn.Layer. - This class subclasses nn.Layer and provides common methods to - initialize and manage a PaddleFormers model. - """ - - def __init__(self, fd_config: "FDConfig", **kwargs): - super().__init__(fd_config) - logger.info("Initializing PaddleFormers backend.") - self.fd_config = fd_config # FastDeploy's top-level FDConfig - self.model_config = fd_config.model_config # FastDeploy's ModelConfig - self.paddleformers_config = AutoConfig.from_pretrained(self.model_config.model) - - # Assign parallel config from fd_config.parallel_config to paddleformers_config - parallel_config = fd_config.parallel_config - # parallel_config.tensor_parallel_size = 1 - # parallel_config.expert_parallel_size = 2 - self.paddleformers_config.data_parallel_size = parallel_config.data_parallel_size - self.paddleformers_config.tensor_model_parallel_size = parallel_config.tensor_parallel_size - self.paddleformers_config.sequence_parallel = parallel_config.sequence_parallel - self.paddleformers_config.expert_model_parallel_size = parallel_config.expert_parallel_size - # if parallel_config.expert_parallel_size > 1 and parallel_config.sequence_parallel == False: - # self.paddleformers_config.tensor_model_parallel_size = 1 - # logger.warning("When using expert parallelism and tensor parallelism, sequence parallelism must be used in fleet set tp=1 .") - self.paddleformers_config.parallel_output = self.paddleformers_config.tensor_model_parallel_size == 1 - self.paddleformers_config.max_seq_len = self.model_config.max_model_len - self.paddleformers_config.params_dtype = "bfloat16" - # self.paddleformers_config.moe_grouped_gemm = True - self.paddleformers_config.moe_token_dispatcher_type = "deepep" - # self.paddleformers_config.use_cpu_initialization = True - self.paddleformers_config.gated_attention = getattr(self.paddleformers_config, "use_gated_attn", False) - if self.paddleformers_config.multi_latent_attention: - self.paddleformers_config.qk_head_dim = ( - self.paddleformers_config.qk_rope_head_dim + self.paddleformers_config.qk_nope_head_dim - ) - # Initialize PaddleFleet parallel_state so that its TP group is consistent with FastDeploy. - # PaddleFleet's ColumnParallelLinear/RowParallelLinear obtains TP world_size/rank - # via parallel_state. Without initialization, it defaults to 1, causing weights - # to not be TP-sharded, which mismatches FastDeploy's KV cache (allocated per TP). - # if parallel_config.tensor_parallel_size > 1: - self._init_paddlefleet_parallel_state(fd_config) - - # The specific text model config - # Sync important config values from text_config to model_config - # This ensures fallback models use their actual config values instead of FD defaults - self._sync_config_from_text_config() - # For convenience, keep direct access to some FD configs - self.quant_config = self.fd_config.quant_config - - # Load model using from_pretrained to support weight loading - # Pass dtype, config and other options from kwargs - - model_load_kwargs = { - "dtype": self.model_config.dtype, - "config": self.paddleformers_config, - "convert_from_hf": True, - "load_via_cpu": True, - "load_checkpoint_format": "flex_checkpoint", - } - # Set random seed before model construction for reproducibility - set_random_seed(seed_=42) - self.model = AutoModelForCausalLM.from_pretrained( - self.model_config.model, - **model_load_kwargs, - ) - - self.model.eval() - # Patch PaddleFleet core_attention with FastDeploy attention - patched_count = patch_paddlefleet_core_attention( - model=self.model, - fd_config=self.fd_config, - ) - logger.info(f"Patched {patched_count} attention layers with FastDeploy") - - def compute_logits(self, hidden_state, forward_meta=None): - """Compute logits from hidden states using lm_head.""" - lm_head = self.model.get_lm_head() - # ColumnParallelLinear expects input [s, b, h] - hidden_state = hidden_state.unsqueeze(1) # [num_tokens, h] -> [num_tokens, 1, h] - logits = lm_head({"hidden_states": hidden_state}) - # Output [num_tokens, 1, vocab], squeeze back to [num_tokens, vocab] - if logits.ndim == 3: - logits = logits.squeeze(1) - logits = logits.astype(paddle.float32) - logits[:, self.model_config.ori_vocab_size :] = -float("inf") - return logits - - def _init_paddlefleet_parallel_state(self, fd_config) -> None: + # V de-absorption: kernel output [token, heads * kv_lora_rank] + # -> [heads, token, kv_lora_rank] @ wv_b [heads, kv_lora_rank, v_head_dim] + # -> [token, heads * v_head_dim] + kv_lora_rank = self.config.kv_lora_rank + v_head_dim = self.config.v_head_dim + num_heads = fmqa_out.shape[-1] // kv_lora_rank + fmqa_out = fmqa_out.reshape([-1, num_heads, kv_lora_rank]).transpose([1, 0, 2]) + fmqa_out = paddle.bmm(fmqa_out, v_b_proj_weight) + fmqa_out = fmqa_out.transpose([1, 0, 2]).reshape([-1, num_heads * v_head_dim]) + # Merge prefill and decode outputs if both are present + if need_do_prefill: + try: + from fastdeploy.model_executor.ops.gpu import ( + merge_prefill_decode_output, + ) + + merge_prefill_decode_output( + output, + fmqa_out, + forward_meta.seq_lens_encoder, + forward_meta.seq_lens_decoder, + forward_meta.seq_lens_this_time, + forward_meta.cu_seqlens_q, + num_heads, + v_head_dim, + 1, + ) + except (ImportError, AttributeError): + logger.warning("merge_prefill_decode_output not available, using decode output only") + output = fmqa_out + else: + output = fmqa_out + # _log_md5(output, "output") + else: + # Standard mode: concatenate QKV + seq_len = int(q.shape[0]) + + # SHD: [seq, heads, dim] -> flatten to [seq, heads*dim] + q_flat = q.reshape([seq_len, -1]) + k_flat = k.reshape([seq_len, -1]) + v_flat = v.reshape([seq_len, -1]) + + # Concatenate QKV: [seq, (q_heads + kv_heads + kv_heads) * head_dim] + qkv = paddle.concat([q_flat, k_flat, v_flat], axis=-1) + + # _log_md5(qkv, "qkv_out") + output = self.fd_attention.forward(qkv=qkv, forward_meta=forward_meta) + + # _log_md5(output, "atten_out") + # Restore batch dimension: [seq, hidden] -> [b, seq, hidden] + # PaddleFleet expects 3D output format + output = output.unsqueeze(0) + + return output + finally: + # Restore original scale + if original_scale is None: + if hasattr(self.fd_attention, "scale"): + delattr(self.fd_attention, "scale") + else: + self.fd_attention.scale = original_scale + + @support_graph_optimization + class PaddleFleetModelBase(nn.Layer): """ - Initialize PaddleFleet's parallel_state so that ColumnParallelLinear/RowParallelLinear - can correctly obtain TP world_size and rank, and thus correctly shard weights - and build sharded_state_dict. - - References the initialization logic in PaddleFormers' training_args.py, - using the official initialize_fleet API instead of directly manipulating - parallel_state internal variables. + A mixin-style base class to provide PaddleFormers backend logic on top of nn.Layer. + This class subclasses nn.Layer and provides common methods to + initialize and manage a PaddleFormers model. """ - from paddle.distributed import fleet - from paddlefleet.parallel_state import get_tensor_model_parallel_group - from paddlefleet.training import initialize_fleet - - parallel_config = fd_config.parallel_config - - # Only call initialize_fleet when the TP group has not been initialized yet - if get_tensor_model_parallel_group is not None and get_tensor_model_parallel_group(False) is None: - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": parallel_config.data_parallel_size, - "mp_degree": parallel_config.tensor_parallel_size, - "pp_degree": 1, - "sep_degree": 1, - "ep_degree": parallel_config.expert_parallel_size, + + def __init__(self, fd_config: "FDConfig", **kwargs): + super().__init__(fd_config) + logger.info("Initializing PaddleFormers backend.") + self.fd_config = fd_config # FastDeploy's top-level FDConfig + self.model_config = fd_config.model_config # FastDeploy's ModelConfig + self.paddleformers_config = AutoConfig.from_pretrained(self.model_config.model) + + # Assign parallel config from fd_config.parallel_config to paddleformers_config + parallel_config = fd_config.parallel_config + # parallel_config.tensor_parallel_size = 1 + # parallel_config.expert_parallel_size = 2 + self.paddleformers_config.data_parallel_size = parallel_config.data_parallel_size + self.paddleformers_config.tensor_model_parallel_size = parallel_config.tensor_parallel_size + self.paddleformers_config.sequence_parallel = parallel_config.sequence_parallel + self.paddleformers_config.expert_model_parallel_size = parallel_config.expert_parallel_size + # if parallel_config.expert_parallel_size > 1 and parallel_config.sequence_parallel == False: + # self.paddleformers_config.tensor_model_parallel_size = 1 + # logger.warning("When using expert parallelism and tensor parallelism, sequence parallelism must be used in fleet set tp=1 .") + self.paddleformers_config.parallel_output = self.paddleformers_config.tensor_model_parallel_size == 1 + self.paddleformers_config.max_seq_len = self.model_config.max_model_len + self.paddleformers_config.params_dtype = "bfloat16" + # self.paddleformers_config.moe_grouped_gemm = True + self.paddleformers_config.moe_token_dispatcher_type = "deepep" + # self.paddleformers_config.use_cpu_initialization = True + self.paddleformers_config.gated_attention = getattr(self.paddleformers_config, "use_gated_attn", False) + if self.paddleformers_config.multi_latent_attention: + self.paddleformers_config.qk_head_dim = ( + self.paddleformers_config.qk_rope_head_dim + self.paddleformers_config.qk_nope_head_dim + ) + # Initialize PaddleFleet parallel_state so that its TP group is consistent with FastDeploy. + # PaddleFleet's ColumnParallelLinear/RowParallelLinear obtains TP world_size/rank + # via parallel_state. Without initialization, it defaults to 1, causing weights + # to not be TP-sharded, which mismatches FastDeploy's KV cache (allocated per TP). + # if parallel_config.tensor_parallel_size > 1: + self._init_paddlefleet_parallel_state(fd_config) + + # The specific text model config + # Sync important config values from text_config to model_config + # This ensures fallback models use their actual config values instead of FD defaults + self._sync_config_from_text_config() + # For convenience, keep direct access to some FD configs + self.quant_config = self.fd_config.quant_config + + # Load model using from_pretrained to support weight loading + # Pass dtype, config and other options from kwargs + + model_load_kwargs = { + "dtype": self.model_config.dtype, + "config": self.paddleformers_config, + "convert_from_hf": True, + "load_via_cpu": True, + "load_checkpoint_format": "flex_checkpoint", } - initialize_fleet(strategy) - logger.info( - f"Initialized PaddleFleet parallel_state via initialize_fleet " - f"(dp={parallel_config.data_parallel_size}, " - f"mp={parallel_config.tensor_parallel_size}, " - f"ep={parallel_config.expert_parallel_size}, " - f"sp={parallel_config.sequence_parallel})" + # Set random seed before model construction for reproducibility + set_random_seed(seed_=42) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_config.model, + **model_load_kwargs, ) - def _sync_config_from_text_config(self) -> None: - """ - Sync important config values from text_config (PaddleFormers/HF config) - to model_config. This ensures fallback models use their actual config - values instead of FD's defaults. - - This is crucial for models with unique configs like: - - Gemma3: tie_word_embeddings=True, layer_types, sliding_window - - Mistral: sliding_window - - etc. - """ - mc = self.model_config - tc = self.paddleformers_config - - sync_fields = [ - "tie_word_embeddings", - "sliding_window", - "sliding_window_pattern", - "layer_types", # May be computed as property - "rope_theta", - "rope_scaling", - "head_dim", - "v_head_dim", # For MLA (Multi-Latent Attention) support - "qk_head_dim", - "rms_norm_eps", - "rope_local_base_freq", # Gemma3 specific - "query_pre_attn_scalar", # Gemma3 specific - ] - - synced = [] - for field in sync_fields: - text_value = getattr(tc, field, None) - if text_value is not None: - # Only sync if not already set or if FD default differs - current_value = getattr(mc, field, None) if hasattr(mc, field) else None - if current_value is None or current_value != text_value: - setattr(mc, field, text_value) - synced.append(f"{field}={text_value}") - - def embed_input_ids(self, input_ids: paddle.Tensor) -> paddle.Tensor: - """Embed input_ids using the model's embedding layer.""" - embedding_layer = self.model.get_input_embeddings() - - original_ndim = input_ids.ndim - if input_ids.ndim == 1: - input_ids = input_ids.unsqueeze(0) # [num_tokens] -> [1, num_tokens] - - inputs_embeds = embedding_layer(input_ids) - - # Embedding output is [batch, seq, h], squeeze back to [num_tokens, h] - if original_ndim == 1 and inputs_embeds.ndim == 3: - inputs_embeds = inputs_embeds.squeeze(0) - - if hasattr(self, "embed_scale") and self.embed_scale is not None: - inputs_embeds *= self.embed_scale - return inputs_embeds - - @paddle.no_grad() - def forward( - self, - inputs: Dict, - forward_meta: ForwardMeta, - **kwargs, + self.model.eval() + # Patch PaddleFleet core_attention with FastDeploy attention + patched_count = patch_paddlefleet_core_attention( + model=self.model, + fd_config=self.fd_config, + ) + logger.info(f"Patched {patched_count} attention layers with FastDeploy") + + def compute_logits(self, hidden_state, forward_meta=None): + """Compute logits from hidden states using lm_head.""" + lm_head = self.model.get_lm_head() + # ColumnParallelLinear expects input [s, b, h] + hidden_state = hidden_state.unsqueeze(1) # [num_tokens, h] -> [num_tokens, 1, h] + logits = lm_head({"hidden_states": hidden_state}) + # Output [num_tokens, 1, vocab], squeeze back to [num_tokens, vocab] + if logits.ndim == 3: + logits = logits.squeeze(1) + logits = logits.astype(paddle.float32) + logits[:, self.model_config.ori_vocab_size :] = -float("inf") + return logits + + def _init_paddlefleet_parallel_state(self, fd_config) -> None: + """ + Initialize PaddleFleet's parallel_state so that ColumnParallelLinear/RowParallelLinear + can correctly obtain TP world_size and rank, and thus correctly shard weights + and build sharded_state_dict. + + References the initialization logic in PaddleFormers' training_args.py, + using the official initialize_fleet API instead of directly manipulating + parallel_state internal variables. + """ + from paddle.distributed import fleet + from paddlefleet.parallel_state import get_tensor_model_parallel_group + from paddlefleet.training import initialize_fleet + + parallel_config = fd_config.parallel_config + + # Only call initialize_fleet when the TP group has not been initialized yet + if get_tensor_model_parallel_group is not None and get_tensor_model_parallel_group(False) is None: + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": parallel_config.data_parallel_size, + "mp_degree": parallel_config.tensor_parallel_size, + "pp_degree": 1, + "sep_degree": 1, + "ep_degree": parallel_config.expert_parallel_size, + } + initialize_fleet(strategy) + logger.info( + f"Initialized PaddleFleet parallel_state via initialize_fleet " + f"(dp={parallel_config.data_parallel_size}, " + f"mp={parallel_config.tensor_parallel_size}, " + f"ep={parallel_config.expert_parallel_size}, " + f"sp={parallel_config.sequence_parallel})" + ) + + def _sync_config_from_text_config(self) -> None: + """ + Sync important config values from text_config (PaddleFormers/HF config) + to model_config. This ensures fallback models use their actual config + values instead of FD's defaults. + + This is crucial for models with unique configs like: + - Gemma3: tie_word_embeddings=True, layer_types, sliding_window + - Mistral: sliding_window + - etc. + """ + mc = self.model_config + tc = self.paddleformers_config + + sync_fields = [ + "tie_word_embeddings", + "sliding_window", + "sliding_window_pattern", + "layer_types", # May be computed as property + "rope_theta", + "rope_scaling", + "head_dim", + "v_head_dim", # For MLA (Multi-Latent Attention) support + "qk_head_dim", + "rms_norm_eps", + "rope_local_base_freq", # Gemma3 specific + "query_pre_attn_scalar", # Gemma3 specific + ] + + synced = [] + for field in sync_fields: + text_value = getattr(tc, field, None) + if text_value is not None: + # Only sync if not already set or if FD default differs + current_value = getattr(mc, field, None) if hasattr(mc, field) else None + if current_value is None or current_value != text_value: + setattr(mc, field, text_value) + synced.append(f"{field}={text_value}") + + def embed_input_ids(self, input_ids: paddle.Tensor) -> paddle.Tensor: + """Embed input_ids using the model's embedding layer.""" + embedding_layer = self.model.get_input_embeddings() + + original_ndim = input_ids.ndim + if input_ids.ndim == 1: + input_ids = input_ids.unsqueeze(0) # [num_tokens] -> [1, num_tokens] + + inputs_embeds = embedding_layer(input_ids) + + # Embedding output is [batch, seq, h], squeeze back to [num_tokens, h] + if original_ndim == 1 and inputs_embeds.ndim == 3: + inputs_embeds = inputs_embeds.squeeze(0) + + if hasattr(self, "embed_scale") and self.embed_scale is not None: + inputs_embeds *= self.embed_scale + return inputs_embeds + + @paddle.no_grad() + def forward( + self, + inputs: Dict, + forward_meta: ForwardMeta, + **kwargs, + ): + """Full transformer forward: input_ids -> hidden_states. + + This method is the primary forward pass for the model, computing: + 1. Position IDs based on seq_lens_decoder (absolute positions for RoPE) + 2. Token embeddings via embed_input_ids + 3. Transformer layers via self.model() + + Returns: + hidden_states: [TotalTokens, HiddenDim] + """ + ids_remove_padding = inputs["ids_remove_padding"] + num_tokens = ids_remove_padding.shape[0] + + batch_id_per_token = forward_meta.batch_id_per_token # [num_tokens] + seq_lens_decoder = forward_meta.seq_lens_decoder # [batch_size, 1] + + if batch_id_per_token is not None and seq_lens_decoder is not None: + decoder_offsets = seq_lens_decoder.squeeze(-1) # [batch_size] + # Ensure decoder_offsets is at least 1D tensor + if decoder_offsets.ndim == 0: + decoder_offsets = decoder_offsets.reshape([1]) + token_decoder_offsets = paddle.index_select( + decoder_offsets, batch_id_per_token, axis=0 + ) # [num_tokens] + + cu_seqlens = forward_meta.cu_seqlens_q # [batch_size + 1] + if cu_seqlens is not None: + token_global_idx = paddle.arange(num_tokens, dtype="int64") + request_start_idx = paddle.index_select(cu_seqlens[:-1], batch_id_per_token, axis=0) + relative_positions = token_global_idx - request_start_idx.astype("int64") + else: + relative_positions = paddle.zeros([num_tokens], dtype="int64") + position_ids = token_decoder_offsets.astype("int64") + relative_positions + else: + position_ids = paddle.arange(num_tokens, dtype="int64") + if seq_lens_decoder is not None: + position_ids = position_ids + seq_lens_decoder[0, 0].astype("int64") + forward_meta.rope_already_applied = True + # Also set forward_meta on each TransformerLayer's config + # so that FastDeployAttention can retrieve it from core_attn.config + if hasattr(self.model, "run_function"): + for layer in self.model.run_function: + if not isinstance(layer, (GPTEmbedding, GPTLMHead)): + if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "core_attention"): + core_attn = layer.self_attn.core_attention + if hasattr(core_attn, "config"): + core_attn.config.forward_meta = forward_meta + + inputs_embeds = self.embed_input_ids(ids_remove_padding).unsqueeze(0) + + # Build input dict, PipelineLayer passes data between layers via dict + model_input = { + "input_ids": None, + "position_ids": position_ids, + } + # Add other parameters from kwargs + for k, v in kwargs.items(): + if v is not None: + model_input[k] = v + + # Iterate over run_function, skip GPTLMHead + # Only call TransformerLayer + i = -1 + for layer in self.model.run_function: + if isinstance(layer, GPTLMHead): + continue + if isinstance(layer, (GPTEmbedding)): + model_input = layer(model_input, decoder_input=inputs_embeds) + else: + model_input = layer(model_input) + i += 1 + hidden_states = model_input["hidden_states"] + # [b, s, h] -> [s, h] (b=1) + hidden_states = hidden_states.squeeze(0) + + return hidden_states + + @paddle.no_grad() + def load_weights(self, weights: Iterable[tuple[str, paddle.Tensor]]): + # use model.from_pretrained to load weight + pass + + def set_state_dict(self, state_dict): + self.model.set_state_dict(state_dict) + + # ============================================================================ + # PaddleFleet Attention Patch Functions + # ============================================================================ + + def patch_paddlefleet_core_attention( + model, + fd_config: "FDConfig", + layers_to_patch: list[int] | None = None, ): - """Full transformer forward: input_ids -> hidden_states. + """ + Replace core_attention in all TransformerLayers of a PaddleFleet model with FastDeployAttention. - This method is the primary forward pass for the model, computing: - 1. Position IDs based on seq_lens_decoder (absolute positions for RoPE) - 2. Token embeddings via embed_input_ids - 3. Transformer layers via self.model() + Args: + model: PaddleFleet model instance (inheriting from PipelineLayer) + fd_config: FastDeploy FDConfig object, used to create Attention instances + layers_to_patch: List of layer indices to patch, None means patch all layers Returns: - hidden_states: [TotalTokens, HiddenDim] + int: Number of layers successfully patched + + Raises: + ValueError: If the model structure is unexpected or parameters are incorrect """ - ids_remove_padding = inputs["ids_remove_padding"] - num_tokens = ids_remove_padding.shape[0] - - batch_id_per_token = forward_meta.batch_id_per_token # [num_tokens] - seq_lens_decoder = forward_meta.seq_lens_decoder # [batch_size, 1] - - if batch_id_per_token is not None and seq_lens_decoder is not None: - decoder_offsets = seq_lens_decoder.squeeze(-1) # [batch_size] - # Ensure decoder_offsets is at least 1D tensor - if decoder_offsets.ndim == 0: - decoder_offsets = decoder_offsets.reshape([1]) - token_decoder_offsets = paddle.index_select(decoder_offsets, batch_id_per_token, axis=0) # [num_tokens] - - cu_seqlens = forward_meta.cu_seqlens_q # [batch_size + 1] - if cu_seqlens is not None: - token_global_idx = paddle.arange(num_tokens, dtype="int64") - request_start_idx = paddle.index_select(cu_seqlens[:-1], batch_id_per_token, axis=0) - relative_positions = token_global_idx - request_start_idx.astype("int64") - else: - relative_positions = paddle.zeros([num_tokens], dtype="int64") - position_ids = token_decoder_offsets.astype("int64") + relative_positions - else: - position_ids = paddle.arange(num_tokens, dtype="int64") - if seq_lens_decoder is not None: - position_ids = position_ids + seq_lens_decoder[0, 0].astype("int64") - forward_meta.rope_already_applied = True - # Also set forward_meta on each TransformerLayer's config - # so that FastDeployAttention can retrieve it from core_attn.config - if hasattr(self.model, "run_function"): - for layer in self.model.run_function: - if not isinstance(layer, (GPTEmbedding, GPTLMHead)): - if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "core_attention"): - core_attn = layer.self_attn.core_attention - if hasattr(core_attn, "config"): - core_attn.config.forward_meta = forward_meta - - inputs_embeds = self.embed_input_ids(ids_remove_padding).unsqueeze(0) - - # Build input dict, PipelineLayer passes data between layers via dict - model_input = { - "input_ids": None, - "position_ids": position_ids, - } - # Add other parameters from kwargs - for k, v in kwargs.items(): - if v is not None: - model_input[k] = v - - # Iterate over run_function, skip GPTLMHead - # Only call TransformerLayer - i = -1 - for layer in self.model.run_function: - if isinstance(layer, GPTLMHead): + if fd_config is None: + raise ValueError("fd_config must be provided") + + from fastdeploy.model_executor.layers.attention.attention import Attention + + # Iterate over run_function to find TransformerLayers + patched_count = 0 + transformer_layers = [] + + # Collect all TransformerLayers + if hasattr(model, "run_function"): + for layer in model.run_function: + # Try to identify TransformerLayer + layer_type = type(layer).__name__ + if "TransformerLayer" in layer_type or "transformer" in str(type(layer)): + transformer_layers.append(layer) + + if not transformer_layers: + # Try alternative ways to find layers + for name, module in model.named_sublayers(): + if "TransformerLayer" in type(module).__name__: + transformer_layers.append(module) + + if not transformer_layers: + raise ValueError("No TransformerLayer found in model") + + # Patch core_attention for each TransformerLayer + for layer in transformer_layers: + # Get layer_number (PaddleFleet starts from 1) + layer_number = getattr(layer, "layer_number", None) + if layer_number is None: + # Try to get from other attributes + layer_number = getattr(layer, "layer_id", None) + + if layer_number is None: + logger.warning("layer_number not found, skip patching...") + continue # Skip layers where layer_id cannot be obtained + + # Check if this layer needs to be patched + if layers_to_patch is not None and (layer_number) not in layers_to_patch: continue - if isinstance(layer, (GPTEmbedding)): - model_input = layer(model_input, decoder_input=inputs_embeds) - else: - model_input = layer(model_input) - i += 1 - hidden_states = model_input["hidden_states"] - # [b, s, h] -> [s, h] (b=1) - hidden_states = hidden_states.squeeze(0) - return hidden_states + # Get core_attention + if not hasattr(layer, "self_attn"): + logger.warning(f"self_attn not found in layer {layer_number}, skip patching...") + continue - @paddle.no_grad() - def load_weights(self, weights: Iterable[tuple[str, paddle.Tensor]]): - # use model.from_pretrained to load weight - pass + core_attn = layer.self_attn.core_attention + if core_attn is None: + logger.warning(f"core_attn not found in layer {layer_number}, skip patching...") + continue - def set_state_dict(self, state_dict): - self.model.set_state_dict(state_dict) + # Get configuration info + # Prefer per-partition values (values after TP sharding), + # because PaddleFleet's QKV output is already per-partition when TP>1 + num_attention_heads = getattr( + core_attn, "num_attention_heads_per_partition", getattr(core_attn.config, "num_attention_heads", None) + ) + num_key_value_heads = getattr( + core_attn, + "num_query_groups_per_partition", + getattr(core_attn.config, "num_key_value_heads", num_attention_heads), + ) + hidden_size_per_attention_head = getattr(core_attn, "hidden_size_per_attention_head", None) + if hidden_size_per_attention_head is not None: + softmax_scale = getattr(core_attn, "softmax_scale", 1.0 / math.sqrt(hidden_size_per_attention_head)) + else: + softmax_scale = 1.0 + hidden_size_per_partition = getattr(core_attn, "hidden_size_per_partition", None) + if hidden_size_per_partition is None: + head_dim = getattr(core_attn, "hidden_size_per_attention_head", hidden_size_per_attention_head) + hidden_size_per_partition = num_attention_heads * head_dim -# ============================================================================ -# PaddleFleet Attention Patch Functions -# ============================================================================ + # Get FastDeploy layer ID (0-indexed) + fd_layer_id = layer_number + # Create Attention instance inside FastDeployAttention + fd_attn_instance = Attention( + fd_config=fd_config, + layer_id=fd_layer_id, + ) -def patch_paddlefleet_core_attention( - model, - fd_config: "FDConfig", - layers_to_patch: list[int] | None = None, -): - """ - Replace core_attention in all TransformerLayers of a PaddleFleet model with FastDeployAttention. + # Override Attention instance's head config to match PaddleFleet model + # This is necessary because fd_config.model_config may differ from PaddleFleet model config + fd_attn_instance.num_heads = num_attention_heads + fd_attn_instance.kv_num_heads = num_key_value_heads + fd_attn_instance.head_dim = hidden_size_per_attention_head + logger.info( + f"Overriding Attention config: num_heads={num_attention_heads}, kv_num_heads={num_key_value_heads}, head_dim={hidden_size_per_attention_head}" + ) - Args: - model: PaddleFleet model instance (inheriting from PipelineLayer) - fd_config: FastDeploy FDConfig object, used to create Attention instances - layers_to_patch: List of layer indices to patch, None means patch all layers + # Create FastDeployAttention object and directly replace core_attention + fast_deploy_core_attn = FastDeployAttention( + config=core_attn.config, + fd_attention=fd_attn_instance, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + softmax_scale=softmax_scale, + hidden_size_per_attention_head=hidden_size_per_attention_head, + hidden_size_per_partition=hidden_size_per_partition, + layer_id=fd_layer_id, + ) - Returns: - int: Number of layers successfully patched + # Replace core_attention object + layer.self_attn.core_attention = fast_deploy_core_attn - Raises: - ValueError: If the model structure is unexpected or parameters are incorrect - """ - if fd_config is None: - raise ValueError("fd_config must be provided") + patched_count += 1 + logger.info(f"Replaced core_attention with FastDeployAttention for layer {fd_layer_id}") - from fastdeploy.model_executor.layers.attention.attention import Attention + logger.info(f"Successfully replaced {patched_count} core_attention layers with FastDeployAttention") - # Iterate over run_function to find TransformerLayers - patched_count = 0 - transformer_layers = [] - - # Collect all TransformerLayers - if hasattr(model, "run_function"): - for layer in model.run_function: - # Try to identify TransformerLayer - layer_type = type(layer).__name__ - if "TransformerLayer" in layer_type or "transformer" in str(type(layer)): - transformer_layers.append(layer) - - if not transformer_layers: - # Try alternative ways to find layers - for name, module in model.named_sublayers(): - if "TransformerLayer" in type(module).__name__: - transformer_layers.append(module) - - if not transformer_layers: - raise ValueError("No TransformerLayer found in model") - - # Patch core_attention for each TransformerLayer - for layer in transformer_layers: - # Get layer_number (PaddleFleet starts from 1) - layer_number = getattr(layer, "layer_number", None) - if layer_number is None: - # Try to get from other attributes - layer_number = getattr(layer, "layer_id", None) - - if layer_number is None: - logger.warning("layer_number not found, skip patching...") - continue # Skip layers where layer_id cannot be obtained - - # Check if this layer needs to be patched - if layers_to_patch is not None and (layer_number) not in layers_to_patch: - continue - - # Get core_attention - if not hasattr(layer, "self_attn"): - logger.warning(f"self_attn not found in layer {layer_number}, skip patching...") - continue - - core_attn = layer.self_attn.core_attention - if core_attn is None: - logger.warning(f"core_attn not found in layer {layer_number}, skip patching...") - continue - - # Get configuration info - # Prefer per-partition values (values after TP sharding), - # because PaddleFleet's QKV output is already per-partition when TP>1 - num_attention_heads = getattr( - core_attn, "num_attention_heads_per_partition", getattr(core_attn.config, "num_attention_heads", None) - ) - num_key_value_heads = getattr( - core_attn, - "num_query_groups_per_partition", - getattr(core_attn.config, "num_key_value_heads", num_attention_heads), - ) - hidden_size_per_attention_head = getattr(core_attn, "hidden_size_per_attention_head", None) - if hidden_size_per_attention_head is not None: - softmax_scale = getattr(core_attn, "softmax_scale", 1.0 / math.sqrt(hidden_size_per_attention_head)) - else: - softmax_scale = 1.0 - - hidden_size_per_partition = getattr(core_attn, "hidden_size_per_partition", None) - if hidden_size_per_partition is None: - head_dim = getattr(core_attn, "hidden_size_per_attention_head", hidden_size_per_attention_head) - hidden_size_per_partition = num_attention_heads * head_dim - - # Get FastDeploy layer ID (0-indexed) - fd_layer_id = layer_number - - # Create Attention instance inside FastDeployAttention - fd_attn_instance = Attention( - fd_config=fd_config, - layer_id=fd_layer_id, - ) - - # Override Attention instance's head config to match PaddleFleet model - # This is necessary because fd_config.model_config may differ from PaddleFleet model config - fd_attn_instance.num_heads = num_attention_heads - fd_attn_instance.kv_num_heads = num_key_value_heads - fd_attn_instance.head_dim = hidden_size_per_attention_head - logger.info( - f"Overriding Attention config: num_heads={num_attention_heads}, kv_num_heads={num_key_value_heads}, head_dim={hidden_size_per_attention_head}" - ) - - # Create FastDeployAttention object and directly replace core_attention - fast_deploy_core_attn = FastDeployAttention( - config=core_attn.config, - fd_attention=fd_attn_instance, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - softmax_scale=softmax_scale, - hidden_size_per_attention_head=hidden_size_per_attention_head, - hidden_size_per_partition=hidden_size_per_partition, - layer_id=fd_layer_id, - ) - - # Replace core_attention object - layer.self_attn.core_attention = fast_deploy_core_attn - - patched_count += 1 - logger.info(f"Replaced core_attention with FastDeployAttention for layer {fd_layer_id}") - - logger.info(f"Successfully replaced {patched_count} core_attention layers with FastDeployAttention") - - return patched_count + return patched_count From 41fd0cccd804eee86a48ca11b2f76cb4e7d38304 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Thu, 28 May 2026 10:14:03 +0800 Subject: [PATCH 08/19] support ep and add test --- .../models/paddleformers/base_fleet.py | 102 +++++++++--- scripts/coverage_run.sh | 35 ++++- tests/model_executor/conftest.py | 148 ++++++++++++++++++ .../test_fallback_fleet_model.py | 104 ++++++++++++ 4 files changed, 364 insertions(+), 25 deletions(-) create mode 100644 tests/model_executor/conftest.py create mode 100644 tests/model_executor/test_fallback_fleet_model.py diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index 8631e780e1a..47f5045a24a 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -33,7 +33,6 @@ from paddlefleet.models.gpt.lm_head import GPTLMHead from paddlefleet.transformer.layer import FleetLayer from paddlefleet.transformer.transformer_config import TransformerConfig - from paddleformers.trainer.trainer_utils import set_random_seed from paddleformers.transformers import AutoConfig from paddleformers.transformers.auto.modeling import AutoModelForCausalLM from paddleformers.utils.log import logger @@ -336,7 +335,6 @@ def __init__(self, fd_config: "FDConfig", **kwargs): "load_checkpoint_format": "flex_checkpoint", } # Set random seed before model construction for reproducibility - set_random_seed(seed_=42) self.model = AutoModelForCausalLM.from_pretrained( self.model_config.model, **model_load_kwargs, @@ -374,29 +372,82 @@ def _init_paddlefleet_parallel_state(self, fd_config) -> None: parallel_state internal variables. """ from paddle.distributed import fleet - from paddlefleet.parallel_state import get_tensor_model_parallel_group - from paddlefleet.training import initialize_fleet parallel_config = fd_config.parallel_config - # Only call initialize_fleet when the TP group has not been initialized yet - if get_tensor_model_parallel_group is not None and get_tensor_model_parallel_group(False) is None: - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": parallel_config.data_parallel_size, - "mp_degree": parallel_config.tensor_parallel_size, - "pp_degree": 1, - "sep_degree": 1, - "ep_degree": parallel_config.expert_parallel_size, - } - initialize_fleet(strategy) - logger.info( - f"Initialized PaddleFleet parallel_state via initialize_fleet " - f"(dp={parallel_config.data_parallel_size}, " - f"mp={parallel_config.tensor_parallel_size}, " - f"ep={parallel_config.expert_parallel_size}, " - f"sp={parallel_config.sequence_parallel})" - ) + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": parallel_config.tensor_parallel_size, + "pp_degree": 1, + "sep_degree": 1, + "sharding_degree": parallel_config.data_parallel_size, + "ep_degree": parallel_config.expert_parallel_size, + "cp_degree": 1, + "moe_sharding_degree": 1, + "order": [ + "pp", + "moe_sharding", + "ep", + "dp", + "sharding", + "sep", + "cp", + "mp", + ], + } + fleet.init(is_collective=True, strategy=strategy) + logger.info( + f"Initialized PaddleFleet parallel_state via initialize_fleet " + f"(sharddp={parallel_config.data_parallel_size}, " + f"mp={parallel_config.tensor_parallel_size}, " + f"ep={parallel_config.expert_parallel_size}, " + f"sp={parallel_config.sequence_parallel})" + ) + + import paddle.distributed as dist + from paddlefleet import parallel_state + + hcg = fleet.get_hybrid_communicate_group() + expected_tp_size = parallel_config.tensor_parallel_size + + # Check if we need to initialize or reinitialize TP group + need_init = False + if parallel_state._TENSOR_MODEL_PARALLEL_GROUP is None: + need_init = True + reason = "TP group not initialized" + else: + # Check if current TP group size matches expected + current_tp_group = parallel_state._TENSOR_MODEL_PARALLEL_GROUP + current_tp_size = getattr(current_tp_group, "nranks", None) + if current_tp_size is None: + current_tp_size = getattr(current_tp_group, "world_size", None) + if current_tp_size != expected_tp_size: + need_init = True + reason = f"TP group size mismatch: current={current_tp_size}, expected={expected_tp_size}" + + if need_init: + logger.warning(f"{reason}, reinitializing TP group with size={expected_tp_size}") + if expected_tp_size == 1: + # Single process TP group - create manually + current_rank = dist.get_rank() + tp_ranks = [current_rank] + default_pg = dist.new_group(ranks=tp_ranks) + parallel_state._TENSOR_MODEL_PARALLEL_GROUP = default_pg + parallel_state._TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = tp_ranks + logger.info(f"Reinitialized TP group with size=1, rank={current_rank}, ranks={tp_ranks}") + else: + # Multiple processes - use hcg's mp group + parallel_state.initialize_model_parallel(hcg) + + from paddlefleet.tensor_parallel.random import ( + model_parallel_cuda_manual_seed, + ) + + try: + model_parallel_cuda_manual_seed(seed=42) + except AssertionError: + pass def _sync_config_from_text_config(self) -> None: """ @@ -472,6 +523,13 @@ def forward( Returns: hidden_states: [TotalTokens, HiddenDim] """ + # Handle empty batch case (e.g., DP worker with no data in EP mode) + if getattr(forward_meta, "is_zero_size", False) or inputs["ids_remove_padding"].shape[0] == 0: + # Return zero tensor with correct shape: [0, hidden_size] + hidden_size = self.model_config.hidden_size + dtype = self.model_config.dtype + return paddle.empty([0, hidden_size], dtype=dtype) + ids_remove_padding = inputs["ids_remove_padding"] num_tokens = ids_remove_padding.shape[0] diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index 1d44f72eb97..456ef1f4950 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -11,9 +11,18 @@ export COVERAGE_RCFILE=${COVERAGE_RCFILE:-$DIR/../scripts/.coveragerc} # Classify tests into one of the following categories # - multi_gpu: requires multiple GPUs / ports (run sequentially) # - single_gpu: independent tests (can run in parallel) +# - isolated: tests that install dependencies, run last (isolated) # ============================================================ classify_tests() { local test_file=$1 + + # Rule 0: test_fallback_fleet_model.py should run in isolation (last) + # to avoid its dependencies (paddlefleet, paddleformers) affecting other tests + if [[ "$test_file" =~ test_fallback_fleet_model\.py ]]; then + echo "isolated" + return + fi + # Rule 1: distributed tests (explicit multi-GPU launch) if [[ "$test_file" =~ tests/distributed/.*test_.*\.py ]]; then echo "multi_gpu" @@ -247,6 +256,7 @@ fi MULTI_GPU_TESTS=() SINGLE_GPU_TESTS=() +ISOLATED_TESTS=() TOTAL_TESTS=0 for file in $ALL_TEST_FILES; do @@ -260,11 +270,15 @@ for file in $ALL_TEST_FILES; do "single_gpu") SINGLE_GPU_TESTS+=("$file") ;; + "isolated") + ISOLATED_TESTS+=("$file") + ;; esac done echo "Multi-GPU tests: ${#MULTI_GPU_TESTS[@]}" echo "Single-GPU tests: ${#SINGLE_GPU_TESTS[@]}" +echo "Isolated tests: ${#ISOLATED_TESTS[@]}" echo "Total tests: $TOTAL_TESTS" # ============================================================ @@ -327,9 +341,24 @@ else fi # ============================================================ -# Step 4: Summary +# Step 4: Run isolated tests (last, to avoid dependency pollution) +# ============================================================ +echo "Step 4: Running isolated tests (tests with special dependencies)" + +if [ ${#ISOLATED_TESTS[@]} -gt 0 ]; then + echo "Isolated tests will run last to avoid dependency pollution." + for file in "${ISOLATED_TESTS[@]}"; do + echo "Running isolated test: $file" + run_test_with_logging "$file" "$failed_tests_file" + done +else + echo "No isolated tests to run." +fi + +# ============================================================ +# Step 5: Summary # ============================================================ -echo "Step 4: Summary" +echo "Step 5: Summary" # Count failed tests if [ -f "$failed_tests_file" ]; then @@ -369,7 +398,7 @@ if [ "$failed_count" -ne 0 ]; then # Only package logs when there are failures echo "====================================" - echo "Step 5: Packaging logs (only on failure)" + echo "Step 6: Packaging logs (only on failure)" echo "====================================" if [ -d "${run_path}/unittest_logs" ]; then diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py new file mode 100644 index 00000000000..bcf4d8716ec --- /dev/null +++ b/tests/model_executor/conftest.py @@ -0,0 +1,148 @@ +"""Pytest configuration for model_executor tests. + +This conftest handles special dependencies required only by specific tests, +avoiding pollution of the global test environment. +""" + +import os +import subprocess +import sys + +import pytest + + +def get_package_version(package_name): + """Get the version of an installed package. + + Args: + package_name: Name of the package + + Returns: + Version string or "not installed" if package is not found + """ + try: + import importlib.metadata + + version = importlib.metadata.version(package_name) + return version + except Exception: + try: + # Fallback for older Python versions + import pkg_resources + + version = pkg_resources.get_distribution(package_name).version + return version + except Exception: + return "not installed" + + +def print_package_versions(): + """Print versions of key packages (paddlepaddle, paddlefleet, paddleformers).""" + print("\n" + "=" * 70) + print("[conftest] Package Versions:") + print("=" * 70) + + packages = ["paddlepaddle-gpu", "paddlefleet", "paddleformers"] + for pkg in packages: + version = get_package_version(pkg) + status = "✓" if version != "not installed" else "✗" + print(f"[conftest] {status} {pkg:20s}: {version}") + + print("=" * 70 + "\n") + + +def pytest_configure(config): + """Configure pytest before test collection.""" + # Register custom marker for paddlefleet tests + config.addinivalue_line("markers", "paddlefleet: tests that require paddlefleet and paddleformers dependencies") + + +def pytest_collection_modifyitems(config, items): + """Modify test collection to handle paddlefleet dependencies. + + This hook runs after test collection but before test execution. + It checks if any collected tests require paddlefleet dependencies + and installs them in an isolated manner if needed. + """ + # Check if any test in this session requires paddlefleet + has_paddlefleet_tests = any("test_fallback_fleet_model.py" in item.nodeid for item in items) + + if not has_paddlefleet_tests: + return + + # Check if dependencies are already installed + try: + import paddlefleet # noqa: F401 + + print("\n" + "=" * 70) + print("[conftest] paddlefleet already installed, skipping installation") + print("=" * 70) + print_package_versions() + return + except ImportError: + pass + + # Print versions before installation + print("\n" + "=" * 70) + print("[conftest] Package versions BEFORE installing paddlefleet dependencies:") + print("=" * 70) + print_package_versions() + + # Install dependencies only when running paddlefleet tests + print("=" * 70) + print("[conftest] Installing paddlefleet-specific dependencies...") + print("=" * 70) + + try: + # Install paddleformers + paddleformers_url = os.getenv( + "PADDLEFORMERS_WHEEL_URL", + "paddleformers==1.1.0.dev20250507 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/", # fallback to PyPI name + ) + subprocess.check_call([sys.executable, "-m", "pip", "install", paddleformers_url, "--quiet"]) + print(f"[conftest] ✓ Installed paddleformers 1.1.0.dev20250507 from {paddleformers_url}") + + # Install paddlefleet (skip paddlepaddle dependency, use existing version) + paddlefleet_url = os.getenv("PADDLEFLEET_WHEEL_URL", "paddlefleet==0.3.0.dev20260527") # fallback to PyPI name + + # Use --no-deps to avoid reinstalling paddlepaddle + subprocess.check_call([sys.executable, "-m", "pip", "install", paddlefleet_url, "--no-deps", "--quiet"]) + print(f"[conftest] ✓ Installed paddlefleet (--no-deps) from {paddlefleet_url}") + print("[conftest] ℹ Using existing paddlepaddle from environment") + + # Print versions after installation + print("\n" + "=" * 70) + print("[conftest] Package versions AFTER installing paddlefleet dependencies:") + print("=" * 70) + print_package_versions() + + except subprocess.CalledProcessError as e: + print(f"[conftest] ✗ Failed to install dependencies: {e}") + print("[conftest] Tests requiring paddlefleet will be skipped") + + # Mark all paddlefleet tests to skip + skip_marker = pytest.mark.skip(reason="Failed to install paddlefleet dependencies") + for item in items: + if "test_fallback_fleet_model.py" in item.nodeid: + item.add_marker(skip_marker) + + print("=" * 70 + "\n") + + +def pytest_sessionfinish(session, exitstatus): + """Optional: cleanup after test session if needed. + + You can uninstall the dependencies here to keep the environment clean, + but this may slow down subsequent test runs. + """ + # Uncomment the following to auto-cleanup after tests + # if os.getenv("CLEANUP_PADDLEFLEET_DEPS", "false").lower() == "true": + # try: + # subprocess.check_call([ + # sys.executable, "-m", "pip", "uninstall", + # "paddlefleet", "paddleformers", "-y", "--quiet" + # ]) + # print("[conftest] Cleaned up paddlefleet dependencies") + # except Exception: + # pass + pass diff --git a/tests/model_executor/test_fallback_fleet_model.py b/tests/model_executor/test_fallback_fleet_model.py new file mode 100644 index 00000000000..e2a178e2121 --- /dev/null +++ b/tests/model_executor/test_fallback_fleet_model.py @@ -0,0 +1,104 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for LLM inference with real model results validation.""" + +import gc +import os + +import pytest + +from fastdeploy import LLM, SamplingParams + +MODEL_PATH = os.getenv("MODEL_PATH", "/root/paddlejob/share-storage/gpfs/system-public/wangruting/Qwen/Qwen3-4B") + + +@pytest.mark.gpu +class TestLLMInferenceRealModel: + """Test LLM inference with real model results validation.""" + + @classmethod + def setup_class(cls): + """Setup LLM instance once for all tests in this class.""" + cls.llm = LLM( + model=MODEL_PATH, + model_impl="paddlefleet", + max_model_len=32768, + tensor_parallel_size=1, + data_parallel_size=1, + enable_expert_parallel=True, + graph_optimization_config={"use_cudagraph": False}, + ) + + @classmethod + def teardown_class(cls): + """Cleanup LLM instance after all tests.""" + if hasattr(cls, "llm"): + del cls.llm + gc.collect() + + @pytest.fixture + def sampling_params(self): + """Provide sampling parameters for generation.""" + return SamplingParams(max_tokens=64, temperature=0.1) + + def test_generate_with_text_result_check(self, sampling_params): + """Test generate API and validate text result contains expected content.""" + prompt = "We the People of the United States, in Order to" + outputs_generate = self.llm.generate(prompt, sampling_params) + + if isinstance(outputs_generate, list): + res = outputs_generate[0].outputs.text + else: + res = outputs_generate + + expected = ( + "form a more perfect Union, establish Justice, insure domestic Tranquility, " + "provide for the common defence, promote the general Welfare, and secure the " + "Blessings of Liberty to ourselves and our Posterity, do ordain and establish " + "this Constitution for the United States of America." + ) + + assert expected in res, f"Result check failed!\nExpected to contain:\n {expected}\nGot:\n {res}" + + def test_generate_with_deterministic_sampling(self): + """Test generate with deterministic sampling (temperature=0).""" + params = SamplingParams(max_tokens=32, temperature=0.0, top_p=1.0) + prompt = "What is 2 + 2?" + output = self.llm.generate(prompt, params) + + result = output[0].outputs.text if isinstance(output, list) else output.outputs.text + assert len(result) > 0, "Should generate some text" + # Verify result contains answer + assert "4" in result or "four" in result.lower(), f"Expected result to contain answer, got: {result}" + + def test_generate_with_top_p_sampling(self): + """Test generate with top_p sampling.""" + params = SamplingParams(max_tokens=20, temperature=0.8, top_p=0.9) + prompt = "The meaning of life is" + output = self.llm.generate(prompt, params) + + result = output[0].outputs.text if isinstance(output, list) else output.outputs.text + assert len(result) > 0, "Should generate some text with top_p sampling" + + def test_generate_max_tokens_constraint(self): + """Test that max_tokens constraint is respected.""" + max_tokens = 10 + params = SamplingParams(max_tokens=max_tokens, temperature=0.1) + prompt = "Tell me a long story about" + output = self.llm.generate(prompt, params) + + token_ids = output[0].outputs.token_ids if isinstance(output, list) else output.outputs.token_ids + # Generated tokens should not exceed max_tokens by more than 1 (for EOS) + assert len(token_ids) <= max_tokens + 1, f"Expected at most {max_tokens + 1} tokens, got {len(token_ids)}" From 07fdfbe6321c749aaddbe9276a1ebc63d00d90fb Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Thu, 28 May 2026 11:08:38 +0800 Subject: [PATCH 09/19] fix paddlelfeet install --- .../model_executor/models/model_base.py | 2 +- .../models/paddleformers/base_fleet.py | 2 +- tests/model_executor/conftest.py | 99 +++++++++++++++---- 3 files changed, 81 insertions(+), 22 deletions(-) diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index 0acbeebc314..b0358f432f4 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -203,7 +203,7 @@ def _try_resolve_paddleformers( raise ImportError( "paddlefleet backend requires paddlefleet to be installed.\n" "Please install with [change cuda version if needed ]:\n" - "python -m pip install paddlefleet==0.3.0.dev20260507" + "python -m pip install paddlefleet==0.3.0.dev20260527" "--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ " "--extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/" ) diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index 47f5045a24a..c3f326af4a3 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -301,7 +301,7 @@ def __init__(self, fd_config: "FDConfig", **kwargs): # logger.warning("When using expert parallelism and tensor parallelism, sequence parallelism must be used in fleet set tp=1 .") self.paddleformers_config.parallel_output = self.paddleformers_config.tensor_model_parallel_size == 1 self.paddleformers_config.max_seq_len = self.model_config.max_model_len - self.paddleformers_config.params_dtype = "bfloat16" + self.paddleformers_config.params_dtype = self.model_config.dtype or "bfloat16" # self.paddleformers_config.moe_grouped_gemm = True self.paddleformers_config.moe_token_dispatcher_type = "deepep" # self.paddleformers_config.use_cpu_initialization = True diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py index bcf4d8716ec..1c601438435 100644 --- a/tests/model_executor/conftest.py +++ b/tests/model_executor/conftest.py @@ -5,6 +5,7 @@ """ import os +import shlex import subprocess import sys @@ -36,6 +37,35 @@ def get_package_version(package_name): return "not installed" +def check_package_version(package_name, required_version): + """Check if a package is installed with the required version. + + Args: + package_name: Name of the package + required_version: Required version string (e.g., "1.1.0.dev20250507") + + Returns: + bool: True if package is installed with required version, False otherwise + """ + try: + import importlib.metadata + + installed_version = importlib.metadata.version(package_name) + + # For dev versions, do exact match + if installed_version == required_version: + return True + + # Also accept if major.minor.patch matches (ignore post/dev suffixes) + # e.g., "1.1.1.post20260401" matches "1.1.1" + if required_version in installed_version: + return True + + return False + except Exception: + return False + + def print_package_versions(): """Print versions of key packages (paddlepaddle, paddlefleet, paddleformers).""" print("\n" + "=" * 70) @@ -70,17 +100,31 @@ def pytest_collection_modifyitems(config, items): if not has_paddlefleet_tests: return - # Check if dependencies are already installed - try: - import paddlefleet # noqa: F401 + # Check if dependencies are already installed with correct versions + # Define required versions + REQUIRED_PADDLEFLEET_VERSION = "0.3.0.dev20260527" + REQUIRED_PADDLEFORMERS_VERSION = "1.1.0.dev20250507" + + paddlefleet_ok = check_package_version("paddlefleet", REQUIRED_PADDLEFLEET_VERSION) + paddleformers_ok = check_package_version("paddleformers", REQUIRED_PADDLEFORMERS_VERSION) + if paddlefleet_ok and paddleformers_ok: print("\n" + "=" * 70) - print("[conftest] paddlefleet already installed, skipping installation") + print("[conftest] paddlefleet and paddleformers already installed with required versions") print("=" * 70) print_package_versions() return - except ImportError: - pass + + # If versions don't match, show what needs to be installed + if not paddlefleet_ok: + print("\n[conftest] paddlefleet version mismatch or not installed") + print(f" Required: {REQUIRED_PADDLEFLEET_VERSION}") + print(f" Current: {get_package_version('paddlefleet')}") + + if not paddleformers_ok: + print("[conftest] paddleformers version mismatch or not installed") + print(f" Required: {REQUIRED_PADDLEFORMERS_VERSION}") + print(f" Current: {get_package_version('paddleformers')}") # Print versions before installation print("\n" + "=" * 70) @@ -94,20 +138,35 @@ def pytest_collection_modifyitems(config, items): print("=" * 70) try: - # Install paddleformers - paddleformers_url = os.getenv( - "PADDLEFORMERS_WHEEL_URL", - "paddleformers==1.1.0.dev20250507 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/", # fallback to PyPI name - ) - subprocess.check_call([sys.executable, "-m", "pip", "install", paddleformers_url, "--quiet"]) - print(f"[conftest] ✓ Installed paddleformers 1.1.0.dev20250507 from {paddleformers_url}") - - # Install paddlefleet (skip paddlepaddle dependency, use existing version) - paddlefleet_url = os.getenv("PADDLEFLEET_WHEEL_URL", "paddlefleet==0.3.0.dev20260527") # fallback to PyPI name - - # Use --no-deps to avoid reinstalling paddlepaddle - subprocess.check_call([sys.executable, "-m", "pip", "install", paddlefleet_url, "--no-deps", "--quiet"]) - print(f"[conftest] ✓ Installed paddlefleet (--no-deps) from {paddlefleet_url}") + # Install paddleformers if needed + if not paddleformers_ok: + paddleformers_url = os.getenv( + "PADDLEFORMERS_WHEEL_URL", + f"paddleformers=={REQUIRED_PADDLEFORMERS_VERSION} --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/", + ) + + # Split the string into separate arguments (handles --extra-index-url flags) + install_args = ( + [sys.executable, "-m", "pip", "install"] + shlex.split(paddleformers_url) + ["--no-deps", "--quiet"] + ) + subprocess.check_call(install_args) + print(f"[conftest] ✓ Installed paddleformers (--no-deps) from {paddleformers_url}") + else: + print(f"[conftest] ℹ paddleformers {REQUIRED_PADDLEFORMERS_VERSION} already satisfied") + + # Install paddlefleet if needed + if not paddlefleet_ok: + paddlefleet_url = os.getenv("PADDLEFLEET_WHEEL_URL", f"paddlefleet=={REQUIRED_PADDLEFLEET_VERSION}") + + # Use --no-deps to avoid reinstalling paddlepaddle + # Split in case the URL contains spaces or flags + install_args = ( + [sys.executable, "-m", "pip", "install"] + shlex.split(paddlefleet_url) + ["--no-deps", "--quiet"] + ) + subprocess.check_call(install_args) + print(f"[conftest] ✓ Installed paddlefleet (--no-deps) from {paddlefleet_url}") + else: + print(f"[conftest] ℹ paddlefleet {REQUIRED_PADDLEFLEET_VERSION} already satisfied") print("[conftest] ℹ Using existing paddlepaddle from environment") # Print versions after installation From 2063df0663d0cb0bedd0e2e4b7374fc3f5e1cdfe Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Thu, 28 May 2026 14:23:57 +0800 Subject: [PATCH 10/19] fix version --- tests/model_executor/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py index 1c601438435..d9def4dd206 100644 --- a/tests/model_executor/conftest.py +++ b/tests/model_executor/conftest.py @@ -42,7 +42,7 @@ def check_package_version(package_name, required_version): Args: package_name: Name of the package - required_version: Required version string (e.g., "1.1.0.dev20250507") + required_version: Required version string (e.g., "1.1.0.dev20260508") Returns: bool: True if package is installed with required version, False otherwise @@ -103,7 +103,7 @@ def pytest_collection_modifyitems(config, items): # Check if dependencies are already installed with correct versions # Define required versions REQUIRED_PADDLEFLEET_VERSION = "0.3.0.dev20260527" - REQUIRED_PADDLEFORMERS_VERSION = "1.1.0.dev20250507" + REQUIRED_PADDLEFORMERS_VERSION = "1.1.0.dev20260508" paddlefleet_ok = check_package_version("paddlefleet", REQUIRED_PADDLEFLEET_VERSION) paddleformers_ok = check_package_version("paddleformers", REQUIRED_PADDLEFORMERS_VERSION) From 190a39d5e2951df4c298f61b188da493aca92fd0 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Thu, 28 May 2026 15:07:47 +0800 Subject: [PATCH 11/19] change model and fix bug --- .../models/paddleformers/base_fleet.py | 3 +++ .../model_executor/test_fallback_fleet_model.py | 17 +++++------------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index c3f326af4a3..ae298c40b70 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -98,6 +98,9 @@ def forward( attention_bias: paddle.Tensor = None, packed_seq_params=None, use_rr_flash_attention: bool = False, + past_key_values=None, + layer_idx=None, + use_cache=False, x: paddle.Tensor = None, qr: paddle.Tensor = None, kv_compressed: paddle.Tensor = None, diff --git a/tests/model_executor/test_fallback_fleet_model.py b/tests/model_executor/test_fallback_fleet_model.py index e2a178e2121..15838624893 100644 --- a/tests/model_executor/test_fallback_fleet_model.py +++ b/tests/model_executor/test_fallback_fleet_model.py @@ -21,7 +21,11 @@ from fastdeploy import LLM, SamplingParams -MODEL_PATH = os.getenv("MODEL_PATH", "/root/paddlejob/share-storage/gpfs/system-public/wangruting/Qwen/Qwen3-4B") +DEFAULT_MODEL_DIR = "./models" +MODEL_NAME = "Qwen3-0.6B" + +model_dir = os.getenv("MODEL_PATH", DEFAULT_MODEL_DIR) +MODEL_PATH = os.path.join(model_dir, MODEL_NAME) @pytest.mark.gpu @@ -72,17 +76,6 @@ def test_generate_with_text_result_check(self, sampling_params): assert expected in res, f"Result check failed!\nExpected to contain:\n {expected}\nGot:\n {res}" - def test_generate_with_deterministic_sampling(self): - """Test generate with deterministic sampling (temperature=0).""" - params = SamplingParams(max_tokens=32, temperature=0.0, top_p=1.0) - prompt = "What is 2 + 2?" - output = self.llm.generate(prompt, params) - - result = output[0].outputs.text if isinstance(output, list) else output.outputs.text - assert len(result) > 0, "Should generate some text" - # Verify result contains answer - assert "4" in result or "four" in result.lower(), f"Expected result to contain answer, got: {result}" - def test_generate_with_top_p_sampling(self): """Test generate with top_p sampling.""" params = SamplingParams(max_tokens=20, temperature=0.8, top_p=0.9) From efcde85de5603034cd41d047ebe0d8956e12a07a Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Fri, 29 May 2026 09:55:30 +0800 Subject: [PATCH 12/19] recover --- scripts/coverage_run.sh | 5 ++ tests/model_executor/conftest.py | 106 ++++++++----------------------- 2 files changed, 32 insertions(+), 79 deletions(-) diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index 456ef1f4950..fd0f66a483c 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -237,6 +237,11 @@ echo "====================================" echo "Coverage Test Execution with Parallel Single-GPU Tests" echo "====================================" +# Print paddleformers version +echo "Checking paddleformers version:" +python -c "import paddleformers; print('paddleformers.__version__:', paddleformers.__version__)" 2>/dev/null || echo "paddleformers not installed" +python -c "import importlib.metadata; print('paddleformers package version:', importlib.metadata.version('paddleformers'))" 2>/dev/null || echo "Cannot get package version" + # ============================================================ # Step 1: Collect & classify tests # ============================================================ diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py index d9def4dd206..f41a05a9a79 100644 --- a/tests/model_executor/conftest.py +++ b/tests/model_executor/conftest.py @@ -37,42 +37,13 @@ def get_package_version(package_name): return "not installed" -def check_package_version(package_name, required_version): - """Check if a package is installed with the required version. - - Args: - package_name: Name of the package - required_version: Required version string (e.g., "1.1.0.dev20260508") - - Returns: - bool: True if package is installed with required version, False otherwise - """ - try: - import importlib.metadata - - installed_version = importlib.metadata.version(package_name) - - # For dev versions, do exact match - if installed_version == required_version: - return True - - # Also accept if major.minor.patch matches (ignore post/dev suffixes) - # e.g., "1.1.1.post20260401" matches "1.1.1" - if required_version in installed_version: - return True - - return False - except Exception: - return False - - def print_package_versions(): """Print versions of key packages (paddlepaddle, paddlefleet, paddleformers).""" print("\n" + "=" * 70) print("[conftest] Package Versions:") print("=" * 70) - packages = ["paddlepaddle-gpu", "paddlefleet", "paddleformers"] + packages = ["paddlepaddle-gpu", "paddlefleet", "paddleformers", "transformers"] for pkg in packages: version = get_package_version(pkg) status = "✓" if version != "not installed" else "✗" @@ -96,35 +67,21 @@ def pytest_collection_modifyitems(config, items): """ # Check if any test in this session requires paddlefleet has_paddlefleet_tests = any("test_fallback_fleet_model.py" in item.nodeid for item in items) - + print("has_paddlefleet_tests:", has_paddlefleet_tests) if not has_paddlefleet_tests: return # Check if dependencies are already installed with correct versions - # Define required versions - REQUIRED_PADDLEFLEET_VERSION = "0.3.0.dev20260527" - REQUIRED_PADDLEFORMERS_VERSION = "1.1.0.dev20260508" - - paddlefleet_ok = check_package_version("paddlefleet", REQUIRED_PADDLEFLEET_VERSION) - paddleformers_ok = check_package_version("paddleformers", REQUIRED_PADDLEFORMERS_VERSION) + try: + import paddlefleet # noqa: F401 - if paddlefleet_ok and paddleformers_ok: print("\n" + "=" * 70) - print("[conftest] paddlefleet and paddleformers already installed with required versions") + print("[conftest] paddlefleet already installed, skipping installation") print("=" * 70) print_package_versions() return - - # If versions don't match, show what needs to be installed - if not paddlefleet_ok: - print("\n[conftest] paddlefleet version mismatch or not installed") - print(f" Required: {REQUIRED_PADDLEFLEET_VERSION}") - print(f" Current: {get_package_version('paddlefleet')}") - - if not paddleformers_ok: - print("[conftest] paddleformers version mismatch or not installed") - print(f" Required: {REQUIRED_PADDLEFORMERS_VERSION}") - print(f" Current: {get_package_version('paddleformers')}") + except ImportError: + pass # Print versions before installation print("\n" + "=" * 70) @@ -138,35 +95,26 @@ def pytest_collection_modifyitems(config, items): print("=" * 70) try: - # Install paddleformers if needed - if not paddleformers_ok: - paddleformers_url = os.getenv( - "PADDLEFORMERS_WHEEL_URL", - f"paddleformers=={REQUIRED_PADDLEFORMERS_VERSION} --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/", - ) - - # Split the string into separate arguments (handles --extra-index-url flags) - install_args = ( - [sys.executable, "-m", "pip", "install"] + shlex.split(paddleformers_url) + ["--no-deps", "--quiet"] - ) - subprocess.check_call(install_args) - print(f"[conftest] ✓ Installed paddleformers (--no-deps) from {paddleformers_url}") - else: - print(f"[conftest] ℹ paddleformers {REQUIRED_PADDLEFORMERS_VERSION} already satisfied") - - # Install paddlefleet if needed - if not paddlefleet_ok: - paddlefleet_url = os.getenv("PADDLEFLEET_WHEEL_URL", f"paddlefleet=={REQUIRED_PADDLEFLEET_VERSION}") - - # Use --no-deps to avoid reinstalling paddlepaddle - # Split in case the URL contains spaces or flags - install_args = ( - [sys.executable, "-m", "pip", "install"] + shlex.split(paddlefleet_url) + ["--no-deps", "--quiet"] - ) - subprocess.check_call(install_args) - print(f"[conftest] ✓ Installed paddlefleet (--no-deps) from {paddlefleet_url}") - else: - print(f"[conftest] ℹ paddlefleet {REQUIRED_PADDLEFLEET_VERSION} already satisfied") + # Install paddleformers + paddleformers_url = os.getenv( + "PADDLEFORMERS_WHEEL_URL", + "paddleformers==1.1.0.dev20260507 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/", # fallback to PyPI name + ) + install_args = [sys.executable, "-m", "pip", "install"] + shlex.split(paddleformers_url) + ["--quiet"] + subprocess.check_call(install_args) + print(f"[conftest] ✓ Installed paddleformers 1.1.0.dev20250507 from {paddleformers_url}") + + # Install paddlefleet (skip paddlepaddle dependency, use existing version) + paddlefleet_url = os.getenv( + "PADDLEFLEET_WHEEL_URL", + "paddlefleet==0.3.0.dev20260527 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/", # fallback to PyPI name + ) + # Use --no-deps to avoid reinstalling paddlepaddle + install_args = ( + [sys.executable, "-m", "pip", "install"] + shlex.split(paddlefleet_url) + ["--no-deps", "--quiet"] + ) + subprocess.check_call(install_args) + print(f"[conftest] ✓ Installed paddlefleet 0.3.0.dev20260527 (--no-deps) from {paddlefleet_url}") print("[conftest] ℹ Using existing paddlepaddle from environment") # Print versions after installation From 1dd8639c9ae51d786e5b7ba2af2cf19cc343a472 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Fri, 29 May 2026 11:16:58 +0800 Subject: [PATCH 13/19] make new dir to isolated --- .../models/paddleformers/base_fleet.py | 1 - scripts/coverage_run.sh | 17 ++++++++++++++--- .../conftest.py | 11 +++++++---- .../test_fallback_fleet_model.py | 0 4 files changed, 21 insertions(+), 8 deletions(-) rename tests/{model_executor => model_executor_fallback}/conftest.py (95%) rename tests/{model_executor => model_executor_fallback}/test_fallback_fleet_model.py (100%) diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index ae298c40b70..11fd7845e72 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -535,7 +535,6 @@ def forward( ids_remove_padding = inputs["ids_remove_padding"] num_tokens = ids_remove_padding.shape[0] - batch_id_per_token = forward_meta.batch_id_per_token # [num_tokens] seq_lens_decoder = forward_meta.seq_lens_decoder # [batch_size, 1] diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index fd0f66a483c..79466c174e8 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -16,9 +16,8 @@ export COVERAGE_RCFILE=${COVERAGE_RCFILE:-$DIR/../scripts/.coveragerc} classify_tests() { local test_file=$1 - # Rule 0: test_fallback_fleet_model.py should run in isolation (last) - # to avoid its dependencies (paddlefleet, paddleformers) affecting other tests - if [[ "$test_file" =~ test_fallback_fleet_model\.py ]]; then + # Rule 0: model_executor_fallback tests should run in isolation (last) + if [[ "$test_file" =~ tests/model_executor_fallback/.*\.py ]]; then echo "isolated" return fi @@ -67,6 +66,12 @@ run_test_with_logging() { echo "Running pytest file: $test_file" + # Set CUDA_VISIBLE_DEVICES for 4-card tests + if [[ "$test_file" =~ test_fallback_fleet_tp_model\.py ]]; then + export CUDA_VISIBLE_DEVICES="0,1" + echo "Setting CUDA_VISIBLE_DEVICES=0,1 for 2-card test" + fi + # Create isolated log directory for this test to avoid race conditions # Format: unittest_logs///log local test_rel_path="${test_file#tests/}" @@ -170,6 +175,12 @@ run_test_with_logging() { # Unset FD_LOG_DIR to avoid affecting next test unset FD_LOG_DIR + + # Unset CUDA_VISIBLE_DEVICES if it was set for 4-card test + if [[ "$test_file" =~ test_fallback_fleet_tp_model\.py ]]; then + unset CUDA_VISIBLE_DEVICES + fi + return $status } diff --git a/tests/model_executor/conftest.py b/tests/model_executor_fallback/conftest.py similarity index 95% rename from tests/model_executor/conftest.py rename to tests/model_executor_fallback/conftest.py index f41a05a9a79..cd94f342ba3 100644 --- a/tests/model_executor/conftest.py +++ b/tests/model_executor_fallback/conftest.py @@ -65,10 +65,13 @@ def pytest_collection_modifyitems(config, items): It checks if any collected tests require paddlefleet dependencies and installs them in an isolated manner if needed. """ - # Check if any test in this session requires paddlefleet - has_paddlefleet_tests = any("test_fallback_fleet_model.py" in item.nodeid for item in items) - print("has_paddlefleet_tests:", has_paddlefleet_tests) - if not has_paddlefleet_tests: + # IMPORTANT: Skip installation during collection phase (--collect-only) + if config.option.collectonly: + print("[conftest] Skipping dependency installation during collection phase") + return + + # All tests in this directory require paddlefleet + if not items: return # Check if dependencies are already installed with correct versions diff --git a/tests/model_executor/test_fallback_fleet_model.py b/tests/model_executor_fallback/test_fallback_fleet_model.py similarity index 100% rename from tests/model_executor/test_fallback_fleet_model.py rename to tests/model_executor_fallback/test_fallback_fleet_model.py From 095240f2fbab4104fe6347850d6376539827b65a Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Fri, 29 May 2026 14:03:31 +0800 Subject: [PATCH 14/19] add fleet_ops install --- scripts/coverage_run.sh | 5 ----- tests/model_executor_fallback/conftest.py | 9 +++++++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index 79466c174e8..5d77da5d0f3 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -248,11 +248,6 @@ echo "====================================" echo "Coverage Test Execution with Parallel Single-GPU Tests" echo "====================================" -# Print paddleformers version -echo "Checking paddleformers version:" -python -c "import paddleformers; print('paddleformers.__version__:', paddleformers.__version__)" 2>/dev/null || echo "paddleformers not installed" -python -c "import importlib.metadata; print('paddleformers package version:', importlib.metadata.version('paddleformers'))" 2>/dev/null || echo "Cannot get package version" - # ============================================================ # Step 1: Collect & classify tests # ============================================================ diff --git a/tests/model_executor_fallback/conftest.py b/tests/model_executor_fallback/conftest.py index cd94f342ba3..8e8a0cc0be9 100644 --- a/tests/model_executor_fallback/conftest.py +++ b/tests/model_executor_fallback/conftest.py @@ -118,6 +118,15 @@ def pytest_collection_modifyitems(config, items): ) subprocess.check_call(install_args) print(f"[conftest] ✓ Installed paddlefleet 0.3.0.dev20260527 (--no-deps) from {paddlefleet_url}") + + # Install paddlefleet_ops + paddlefleet_ops_url = os.getenv( + "PADDLEFLEET_OPS_WHEEL_URL", + "paddlefleet_ops==0.3.0.dev20260520+2702ba51 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/", + ) + install_args = [sys.executable, "-m", "pip", "install"] + shlex.split(paddlefleet_ops_url) + ["--quiet"] + subprocess.check_call(install_args) + print(f"[conftest] ✓ Installed paddlefleet_ops 0.3.0.dev20260520+2702ba51 from {paddlefleet_ops_url}") print("[conftest] ℹ Using existing paddlepaddle from environment") # Print versions after installation From d945e683ed39ba884488a24eb2b296e35e899b30 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Fri, 29 May 2026 18:33:44 +0800 Subject: [PATCH 15/19] fix reinstall cache bug --- tests/model_executor_fallback/conftest.py | 13 +++++++++++++ .../test_fallback_fleet_model.py | 14 +++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tests/model_executor_fallback/conftest.py b/tests/model_executor_fallback/conftest.py index 8e8a0cc0be9..41c2b277241 100644 --- a/tests/model_executor_fallback/conftest.py +++ b/tests/model_executor_fallback/conftest.py @@ -129,6 +129,19 @@ def pytest_collection_modifyitems(config, items): print(f"[conftest] ✓ Installed paddlefleet_ops 0.3.0.dev20260520+2702ba51 from {paddlefleet_ops_url}") print("[conftest] ℹ Using existing paddlepaddle from environment") + # Clear module cache to ensure fresh imports after version change + # This is critical when transformers version changes during pytest session + import importlib + + keys_to_clear = [ + k for k in sys.modules.keys() if "huggingface_hub" in k or "transformers" in k or "paddleformers" in k + ] + for key in keys_to_clear: + del sys.modules[key] + importlib.invalidate_caches() + if keys_to_clear: + print(f"[conftest] ✓ Cleared {len(keys_to_clear)} cached modules (transformers/paddleformers)") + # Print versions after installation print("\n" + "=" * 70) print("[conftest] Package versions AFTER installing paddlefleet dependencies:") diff --git a/tests/model_executor_fallback/test_fallback_fleet_model.py b/tests/model_executor_fallback/test_fallback_fleet_model.py index 15838624893..28d126c4ea1 100644 --- a/tests/model_executor_fallback/test_fallback_fleet_model.py +++ b/tests/model_executor_fallback/test_fallback_fleet_model.py @@ -22,7 +22,7 @@ from fastdeploy import LLM, SamplingParams DEFAULT_MODEL_DIR = "./models" -MODEL_NAME = "Qwen3-0.6B" +MODEL_NAME = "Qwen3-0.6B-Base" model_dir = os.getenv("MODEL_PATH", DEFAULT_MODEL_DIR) MODEL_PATH = os.path.join(model_dir, MODEL_NAME) @@ -67,14 +67,22 @@ def test_generate_with_text_result_check(self, sampling_params): else: res = outputs_generate - expected = ( + expected1 = ( "form a more perfect Union, establish Justice, insure domestic Tranquility, " "provide for the common defence, promote the general Welfare, and secure the " "Blessings of Liberty to ourselves and our Posterity, do ordain and establish " "this Constitution for the United States of America." ) + expected2 = ( + "form a more perfect Union, establish Justice, insure domestic Tranquility, " + "provide for the common defense, promote the general Welfare, and secure the " + "Blessings of Liberty to ourselves and our Posterity, do ordain and establish " + "this Constitution for the United States of America." + ) - assert expected in res, f"Result check failed!\nExpected to contain:\n {expected}\nGot:\n {res}" + assert ( + expected1 in res or expected2 in res + ), f"Result check failed!\nExpected to contain:\n {expected1}\nGot:\n {res}" def test_generate_with_top_p_sampling(self): """Test generate with top_p sampling.""" From 83d4e63d2958ad2877ac3fbbb4f92ce918bf7329 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Fri, 29 May 2026 18:37:51 +0800 Subject: [PATCH 16/19] auto review --- fastdeploy/model_executor/models/model_base.py | 2 +- fastdeploy/model_executor/models/paddleformers/base_fleet.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index b0358f432f4..9e51a1e6bf7 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -203,7 +203,7 @@ def _try_resolve_paddleformers( raise ImportError( "paddlefleet backend requires paddlefleet to be installed.\n" "Please install with [change cuda version if needed ]:\n" - "python -m pip install paddlefleet==0.3.0.dev20260527" + "python -m pip install paddlefleet==0.3.0.dev20260527 " "--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ " "--extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu126/" ) diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index 11fd7845e72..356d2749ef6 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -602,6 +602,7 @@ def forward( @paddle.no_grad() def load_weights(self, weights: Iterable[tuple[str, paddle.Tensor]]): # use model.from_pretrained to load weight + logger.debug("load_weights called but skipped: weights already loaded via from_pretrained") pass def set_state_dict(self, state_dict): @@ -658,7 +659,6 @@ def patch_paddlefleet_core_attention( # Patch core_attention for each TransformerLayer for layer in transformer_layers: - # Get layer_number (PaddleFleet starts from 1) layer_number = getattr(layer, "layer_number", None) if layer_number is None: # Try to get from other attributes @@ -704,7 +704,6 @@ def patch_paddlefleet_core_attention( head_dim = getattr(core_attn, "hidden_size_per_attention_head", hidden_size_per_attention_head) hidden_size_per_partition = num_attention_heads * head_dim - # Get FastDeploy layer ID (0-indexed) fd_layer_id = layer_number # Create Attention instance inside FastDeployAttention From a453cdb6ccb69e5ea734e339a99ef100c5207b80 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Mon, 1 Jun 2026 11:22:28 +0800 Subject: [PATCH 17/19] no use modify --- fastdeploy/model_executor/model_loader/default_loader_v1.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastdeploy/model_executor/model_loader/default_loader_v1.py b/fastdeploy/model_executor/model_loader/default_loader_v1.py index 1d5c19a5e24..1217f9de28b 100644 --- a/fastdeploy/model_executor/model_loader/default_loader_v1.py +++ b/fastdeploy/model_executor/model_loader/default_loader_v1.py @@ -93,6 +93,7 @@ def load_model(self, fd_config: FDConfig) -> nn.Layer: model_cls = as_embedding_model(model_cls) else: assert_never(convert_type) + model = model_cls(fd_config) if fd_config.load_config.dynamic_load_weight or fd_config.model_config.enable_cache: process_final_after_loading(model, fd_config) From 03b2c629d860eaa1d438fccf62ccfd3fc16606c4 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Mon, 1 Jun 2026 14:11:08 +0800 Subject: [PATCH 18/19] recommit --- scripts/coverage_run.sh | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index 5d77da5d0f3..e1337e97bfd 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -66,12 +66,6 @@ run_test_with_logging() { echo "Running pytest file: $test_file" - # Set CUDA_VISIBLE_DEVICES for 4-card tests - if [[ "$test_file" =~ test_fallback_fleet_tp_model\.py ]]; then - export CUDA_VISIBLE_DEVICES="0,1" - echo "Setting CUDA_VISIBLE_DEVICES=0,1 for 2-card test" - fi - # Create isolated log directory for this test to avoid race conditions # Format: unittest_logs///log local test_rel_path="${test_file#tests/}" @@ -175,12 +169,6 @@ run_test_with_logging() { # Unset FD_LOG_DIR to avoid affecting next test unset FD_LOG_DIR - - # Unset CUDA_VISIBLE_DEVICES if it was set for 4-card test - if [[ "$test_file" =~ test_fallback_fleet_tp_model\.py ]]; then - unset CUDA_VISIBLE_DEVICES - fi - return $status } From 820864af92ff68f90f3f46a4e82e117079bed94c Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 Date: Mon, 1 Jun 2026 14:13:58 +0800 Subject: [PATCH 19/19] remove log --- fastdeploy/model_executor/models/paddleformers/base_fleet.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/fastdeploy/model_executor/models/paddleformers/base_fleet.py b/fastdeploy/model_executor/models/paddleformers/base_fleet.py index 356d2749ef6..1f57aebb75d 100644 --- a/fastdeploy/model_executor/models/paddleformers/base_fleet.py +++ b/fastdeploy/model_executor/models/paddleformers/base_fleet.py @@ -246,7 +246,6 @@ def squeeze_to_3d(t: paddle.Tensor, name: str) -> paddle.Tensor: output = fmqa_out else: output = fmqa_out - # _log_md5(output, "output") else: # Standard mode: concatenate QKV seq_len = int(q.shape[0]) @@ -259,10 +258,8 @@ def squeeze_to_3d(t: paddle.Tensor, name: str) -> paddle.Tensor: # Concatenate QKV: [seq, (q_heads + kv_heads + kv_heads) * head_dim] qkv = paddle.concat([q_flat, k_flat, v_flat], axis=-1) - # _log_md5(qkv, "qkv_out") output = self.fd_attention.forward(qkv=qkv, forward_meta=forward_meta) - # _log_md5(output, "atten_out") # Restore batch dimension: [seq, hidden] -> [b, seq, hidden] # PaddleFleet expects 3D output format output = output.unsqueeze(0)