# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
DeepSeek V3.2 Model Providers for PaddleFleet-based pretraining.

Architecture: MLA (Multi-Latent Attention) + DSA Indexer (DeepSeek Sparse Attention)
              + MoE (Mixture of Experts) + MTP (Multi-Token Prediction)

Reference: DeepSeek-V3.2-Exp/inference/model.py
Config:    DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json

Usage:
    provider = DeepSeekV3_2_671BProvider()
    model = provider.provide(loss_fn=loss_fn)

Pattern follows glm45_provider.py exactly.
"""

import logging
from dataclasses import dataclass, field
from typing import Callable, List, Optional, Union

import paddle
import paddle.nn.functional as F

from paddleformers.transformers.gpt_provider import GPTModelProvider

logger = logging.getLogger(__name__)


@dataclass
class DeepSeekV3_2BaseProvider(GPTModelProvider):
    """
    Base provider for DeepSeek V3.2 architecture.

    Holds the defaults shared by every DeepSeek V3.2 variant; size-specific
    subclasses only override dimensions, layer pattern and parallelism.

    Key components:
    - MLA: Multi-Latent Attention with low-rank KV compression
    - DSA: DeepSeek Sparse Attention (Indexer selects top-2048 tokens per query)
    - MoE: Mixture of Experts with group-limited routing
    - MTP: Multi-Token Prediction auxiliary loss
    """

    # ---- Normalization and activation ----
    normalization: str = "RMSNorm"
    hidden_act: Callable = F.silu
    gated_linear_unit: bool = True
    use_bias: bool = False
    attention_bias: bool = False
    rms_norm_eps: float = 1e-6

    # ---- Precision ----
    autocast_dtype: paddle.dtype = paddle.bfloat16
    params_dtype: paddle.dtype = paddle.bfloat16
    bf16: bool = True

    # ---- Embedding ----
    tie_word_embeddings: bool = False

    # ---- Sequence ----
    seq_length: int = 4096
    max_sequence_length: int = 4096
    hidden_dropout_prob: float = 0.0
    attention_dropout: float = 0.0
    # NOTE: 1/sqrt(7168) is ~0.0118, so 0.006 is about half of 1/sqrt(hidden_size),
    # not ~1/sqrt(hidden_size) as previously commented.
    # NOTE(review): presumably the fixed init std used by the reference recipe -- confirm.
    init_method_std: float = 0.006

    # ---- MLA: Multi-Latent Attention ----
    # Enables the MLA RoPE path in rope_utils.py (interleaved layout)
    multi_latent_attention: bool = True
    num_attention_heads: int = 128
    # head_dim matches v_head_dim=128 so o_proj sizing in Attention base is correct
    head_dim: int = 128
    # num_key_value_heads must be set for Attention base class;
    # in MLA, KV is latent-compressed but we set this equal to num_attention_heads
    # so TP sharding logic in Attention.__init__ works correctly
    num_key_value_heads: int = 128

    # MLA low-rank projection dimensions (matches DeepSeek V3.2 671B config)
    q_lora_rank: int = 1536  # wq_a: hidden -> q_lora_rank
    kv_lora_rank: int = 512  # wkv_a: hidden -> kv_lora_rank + qk_rope_head_dim
    qk_nope_head_dim: int = 128  # per-head non-RoPE Q/K dim
    qk_rope_head_dim: int = 64  # per-head RoPE Q/K dim
    v_head_dim: int = 128  # per-head V dim (= head_dim, so o_proj ok)

    # ---- DSA: DeepSeek Sparse Attention Indexer ----
    # Non-None activates the DeepSeek V3.2 path in gpt_builders.py
    # Field names mirror HuggingFace config.json keys for zero-copy from_config().
    index_n_heads: int = 64  # Indexer scoring heads
    index_head_dim: int = 128  # Indexer Q/K head dim
    index_topk: int = 2048  # Tokens selected per query
    # KL loss trains wq_b/wk/weights_proj via KL(true_attn_dist || indexer_dist)
    # Coefficient ~0.01 matches Megatron-Core default; set to None to disable
    # (hence Optional[float] rather than float).
    indexer_loss_coeff: Optional[float] = 0.01
    indexer_use_sparse_loss: bool = False  # use full-sequence KL (denser gradients)

    # ---- RoPE ----
    position_embedding_type: str = "rope"
    # DeepSeek V3.2 uses YaRN-style RoPE with base 10000
    rotary_base: float = 10000.0
    # MLA uses interleaved RoPE; Indexer uses non-interleaved (handled internally)
    # Setting rotary_interleaved=True here enables the interleaved path for MLA Q/K
    rotary_interleaved: bool = True
    # Disable fused RoPE kernel: MLA applies RoPE only to qk_rope_head_dim subspace,
    # which is incompatible with the fused kernel that expects full head_dim
    apply_rope_fusion: bool = False
    # Use fp32 RoPE for numerical stability (matches reference implementation)
    high_precision_rope: bool = True

    # ---- MoE routing ----
    scoring_func: str = "sigmoid"  # Score experts with sigmoid
    num_experts_per_tok: int = 8  # n_activated_experts
    n_group: int = 8  # n_expert_groups: 256 experts / 8 groups = 32 per group
    topk_group: int = 4  # n_limited_groups: select top-4 groups
    routed_scaling_factor: float = 2.5  # route_scale: scale selected expert weights
    topk_method: str = "group_limited_greedy"  # group-limited top-k routing
    norm_topk_prob: bool = True  # normalize expert weights to sum to 1
    moe_token_dispatcher_type: str = "deepep"
    moe_router_load_balancing_type: str = "seq_aux_loss"
    moe_router_pre_softmax: bool = False
    moe_grouped_gemm: bool = False
    moe_shared_expert_overlap: bool = True
    moe_router_dtype: str = "fp32"
    moe_router_enable_expert_bias: bool = True
    moe_router_bias_update_rate: float = 0.0

    # ---- MTP: Multi-Token Prediction ----
    # 1 MTP layer for auxiliary next-token prediction loss
    num_nextn_predict_layers: Optional[int] = 1
    mtp_loss_scaling_factor: float = 0.1  # MTP loss weight

    # ---- Optimization ----
    persist_layer_norm: bool = True
    bias_activation_fusion: bool = True
    bias_dropout_fusion: bool = True


@dataclass
class DeepSeekV3_2_671BProvider(DeepSeekV3_2BaseProvider):
    """
    Provider for DeepSeek V3.2 671B model (full production config).

    Architecture:
    - 61 transformer layers: first 3 dense MLP + 58 MoE
    - All layers use MLA + DSA Indexer attention
    - 256 routed experts + 1 shared expert per MoE layer

    Config reference: DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json
    """

    # ---- Model dimensions ----
    hidden_size: int = 7168  # dim
    num_hidden_layers: int = 61  # n_layers
    vocab_size: int = 129280

    # ---- FFN dimensions ----
    intermediate_size: int = 18432  # inter_dim: dense MLP hidden size
    moe_intermediate_size: int = 2048  # moe_inter_dim: per-expert MLP hidden size

    # ---- MoE architecture ----
    n_routed_experts: int = 256
    n_shared_experts: int = 1
    # Layer pattern: first 3 layers dense (0), then 58 MoE (1)
    moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] * 3 + [1] * 58)


@dataclass
class DeepSeekV3_2_671BDebugProvider(DeepSeekV3_2_671BProvider):
    """
    Small debug variant of DeepSeek V3.2 for single-card validation.

    Reduces all dimensions to fit on a single GPU for smoke testing.
    Pattern: 1 dense layer + 3 MoE layers.
    """

    # ---- Reduced model dimensions ----
    num_hidden_layers: int = 4
    hidden_size: int = 1024
    vocab_size: int = 129280

    # ---- Reduced attention dimensions ----
    num_attention_heads: int = 16
    num_key_value_heads: int = 16
    head_dim: int = 64
    q_lora_rank: int = 256
    kv_lora_rank: int = 128
    qk_nope_head_dim: int = 64
    qk_rope_head_dim: int = 32
    v_head_dim: int = 64

    # ---- Reduced Indexer dimensions ----
    index_n_heads: int = 8
    index_head_dim: int = 64
    index_topk: int = 128
    indexer_loss_coeff: Optional[float] = 0.01
    indexer_use_sparse_loss: bool = False

    # ---- Reduced FFN dimensions ----
    intermediate_size: int = 2048
    moe_intermediate_size: int = 512

    # ---- Reduced MoE ----
    n_routed_experts: int = 8
    n_shared_experts: int = 1
    moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] * 1 + [1] * 3)

    # ---- Disable MTP for simplicity ----
    num_nextn_predict_layers: Optional[int] = 0

    # ---- Short sequence for debug ----
    seq_length: int = 512
    max_sequence_length: int = 512

    # ---- Single card: no model parallel ----
    sequence_parallel: bool = False
    expert_model_parallel_size: int = 1
    tensor_model_parallel_size: int = 1
    moe_router_force_load_balancing: bool = True


@dataclass
class DeepSeekV3_2_8GPUDebugProvider(DeepSeekV3_2BaseProvider):
    """
    Debug provider for DeepSeek V3.2 on a single node with 8 GPUs.

    Scales up from the single-card DebugProvider to exercise multi-card
    communication paths (all-reduce, all-gather, DeepEP routing) without
    the memory footprint of the full 671B model.

    Key dimension constraints for parallelism:
        num_attention_heads (32) and index_n_heads (16) must be
        divisible by whatever tensor_model_parallel_size is used.
        n_routed_experts (16) must be divisible by expert_model_parallel_size.

    Pattern: 2 dense layers + 6 MoE layers (8 total).
    """

    # ---- Reduced model dimensions ----
    num_hidden_layers: int = 8
    hidden_size: int = 2048
    vocab_size: int = 129280

    # ---- Reduced attention dimensions ----
    num_attention_heads: int = 32  # divisible by TP=1/2/4/8
    num_key_value_heads: int = 32
    head_dim: int = 64
    q_lora_rank: int = 512
    kv_lora_rank: int = 128
    qk_nope_head_dim: int = 64
    qk_rope_head_dim: int = 32
    v_head_dim: int = 64

    # ---- Reduced Indexer dimensions ----
    index_n_heads: int = 16  # divisible by TP=1/2/4/8
    index_head_dim: int = 64
    index_topk: int = 256
    indexer_loss_coeff: Optional[float] = 0.01
    indexer_use_sparse_loss: bool = False

    # ---- Reduced FFN dimensions ----
    intermediate_size: int = 4096
    moe_intermediate_size: int = 1024

    # ---- Reduced MoE ----
    n_routed_experts: int = 16  # divisible by EP=1/2/4/8
    n_shared_experts: int = 1
    moe_layer_freq: Union[int, List[int]] = field(default_factory=lambda: [0] * 2 + [1] * 6)

    # ---- Disable MTP for simplicity ----
    num_nextn_predict_layers: Optional[int] = 0

    # ---- Moderate sequence length ----
    seq_length: int = 1024
    max_sequence_length: int = 1024
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Base class for MoE models' AOA (Auto-Optimized Architecture) config generation.

This module provides a reusable base class for generating weight conversion
configurations in MoE (Mixture of Experts) models, supporting various features
like shared experts, dense-MoE hybrid layers, and MTP (Multi-Token Prediction).
"""

from dataclasses import dataclass, field
from typing import Any, Dict, Iterator, List


@dataclass
class MoEAOAConfigParams:
    """Parameters for MoE AOA config generation.

    This dataclass holds all the configuration parameters needed to generate
    AOA (Auto-Optimized Architecture) statements for weight conversion.
    """

    # Basic model config
    num_hidden_layers: int = 0
    num_attention_heads: int = 0
    num_key_value_heads: int = 0

    # MoE specific config
    num_experts: int = 0
    using_sonic_moe: bool = False
    moe_grouped_gemm: bool = False
    fp8: bool = False
    fd_fallback: bool = False

    # Embedding config
    tie_word_embeddings: bool = False

    # Layer offset config
    num_head_empty_layers: int = 0
    first_k_dense_replace: int = 0
    num_nextn_predict_layers: int = 0

    # Attention config
    attention_bias: bool = False
    multi_latent_attention: bool = False
    use_qk_norm: bool = False

    # Shared experts config
    has_shared_experts: bool = True

    # Runtime config
    model_prefix: str = "model."

    # Extra statements to add
    extra_statements: List[str] = field(default_factory=list)

    # Number of indexer heads; a value > 0 makes the MLA path emit indexer weights
    index_n_heads: int = 0


class MoEAOAConfigGenerator:
    """Base class for MoE AOA config generation.

    This class provides a modular and extensible framework for generating
    weight conversion configurations. Subclasses can override specific methods
    to customize behavior for different model architectures.

    Every statement is a string of the form ``"<source(s)> -> <target>[, options]"``.
    Sources always use the ``model.`` prefix, targets use ``params.model_prefix``.

    Example:
        class GlmMoeDsaAOAGenerator(MoEAOAConfigGenerator):
            def _get_attention_statements(self, params, layer_idx, prefix, prefix_offset):
                if params.multi_latent_attention:
                    return self._get_mla_attention_statements(params, prefix, prefix_offset)
                return super()._get_attention_statements(params, layer_idx, prefix, prefix_offset)
    """

    @classmethod
    def gen_aoa_config(cls, config: Any) -> Dict[str, List[str]]:
        """Main entry point for generating AOA config.

        Args:
            config: Model configuration object with necessary attributes.

        Returns:
            Dictionary with 'aoa_statements' key containing list of conversion statements.
        """
        params = cls._extract_params(config)
        return cls._build_aoa_config(params)

    @classmethod
    def _extract_params(cls, config: Any) -> MoEAOAConfigParams:
        """Extract parameters from config object.

        Subclasses can override this to add custom parameter extraction.

        ``num_hidden_layers``, ``num_attention_heads`` and ``num_key_value_heads``
        are required attributes; everything else falls back to a default.
        """
        # Get num_experts from config (n_routed_experts takes precedence)
        if hasattr(config, "n_routed_experts"):
            num_experts = config.n_routed_experts
        else:
            num_experts = getattr(config, "num_experts", 0)

        return MoEAOAConfigParams(
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            num_key_value_heads=config.num_key_value_heads,
            num_experts=num_experts,
            # `or 0`: an explicit None in the config must not reach the `> 0` check
            index_n_heads=getattr(config, "index_n_heads", 0) or 0,
            using_sonic_moe=getattr(config, "using_sonic_moe", False),
            moe_grouped_gemm=getattr(config, "moe_grouped_gemm", False),
            fp8=getattr(config, "fp8", False),
            # fd_fallback is only read through a dict-style `.get` accessor
            fd_fallback=config.get("fd_fallback", False) if hasattr(config, "get") else False,
            tie_word_embeddings=getattr(config, "tie_word_embeddings", False),
            num_head_empty_layers=(
                config.num_empty_layers_add_in_head
                if hasattr(config, "num_empty_layers_add_in_head") and config.num_empty_layers_add_in_head
                else 0
            ),
            # `or 0`: an explicit None in the config must not reach the `<= 0` check / range()
            first_k_dense_replace=getattr(config, "first_k_dense_replace", 0) or 0,
            num_nextn_predict_layers=getattr(config, "num_nextn_predict_layers", 0) or 0,
            attention_bias=getattr(config, "attention_bias", False),
            multi_latent_attention=getattr(config, "multi_latent_attention", False),
            use_qk_norm=getattr(config, "use_qk_norm", False),
            has_shared_experts=cls._has_shared_experts(config),
            model_prefix=cls._get_model_prefix(config),
        )

    @classmethod
    def _get_model_prefix(cls, config: Any) -> str:
        """Get model prefix based on class type.

        Returns "" when the class declares a ``base_model_class`` attribute equal
        to itself, otherwise "model.".
        """
        if hasattr(cls, "base_model_class") and cls == cls.base_model_class:
            return ""
        return "model."

    @classmethod
    def _has_shared_experts(cls, config: Any) -> bool:
        """Check if model has shared experts. Override for models without shared experts."""
        return True

    @classmethod
    def _build_aoa_config(cls, params: MoEAOAConfigParams) -> Dict[str, List[str]]:
        """Build the complete AOA config from parameters."""
        aoa_statements = []

        # 1. Basic weights (norm, embed_tokens, lm_head)
        aoa_statements.extend(cls._get_basic_weight_statements(params))

        # 2. Dense layers (if any)
        aoa_statements.extend(cls._get_dense_layer_statements(params))

        # 3. MTP layers (if any)
        aoa_statements.extend(cls._get_mtp_layer_statements(params))

        # 4. MoE layers
        aoa_statements.extend(cls._get_moe_layer_statements(params))

        # 5. Grouped GEMM (if enabled)
        aoa_statements.extend(cls._get_grouped_gemm_statements(params))

        # 6. Extra statements from subclasses
        aoa_statements.extend(params.extra_statements)

        return {"aoa_statements": aoa_statements}

    # ==================== Basic Weights ====================

    @classmethod
    def _get_basic_weight_statements(cls, params: MoEAOAConfigParams) -> List[str]:
        """Generate statements for basic weights: norm, embeddings, lm_head."""
        statements = [
            f"model.norm.weight -> {params.model_prefix}norm.weight",
        ]

        # Embeddings
        statements.append(f"model.embed_tokens.weight -> {params.model_prefix}embedding.embed_tokens.weight")

        # lm_head: with tied embeddings the embedding matrix is the source
        if params.tie_word_embeddings:
            statements.append(f"model.embed_tokens.weight -> {params.model_prefix}lm_head.weight")
        else:
            statements.append(f"lm_head.weight -> {params.model_prefix}lm_head.weight")

        return statements

    # ==================== Dense Layers ====================

    @classmethod
    def _get_dense_layer_statements(cls, params: MoEAOAConfigParams) -> List[str]:
        """Generate statements for dense (non-MoE) layers.

        Override this method to customize dense layer handling.
        Default implementation handles first_k_dense_replace layers.
        """
        statements = []

        if params.first_k_dense_replace <= 0:
            return statements

        # Layers are emitted from the highest index down to 0.
        for layer_idx in reversed(range(0, params.first_k_dense_replace)):
            layer_idx_offset = layer_idx + params.num_head_empty_layers
            statements.extend(cls._get_single_dense_layer_statements(params, layer_idx, layer_idx_offset))

        return statements

    @classmethod
    def _get_single_dense_layer_statements(
        cls, params: MoEAOAConfigParams, layer_idx: int, layer_idx_offset: int
    ) -> List[str]:
        """Generate statements for a single dense layer."""
        prefix = f"model.layers.{layer_idx}"
        prefix_offset = f"{params.model_prefix}layers.{layer_idx_offset}"
        statements = []
        # Layer norms and attention output
        statements.extend(
            [
                f"{prefix}.input_layernorm.weight -> {prefix_offset}.input_layernorm.weight",
                f"{prefix}.post_attention_layernorm.weight -> {prefix_offset}.post_attention_layernorm.weight",
                f"{prefix}.self_attn.o_proj.weight^T -> {prefix_offset}.self_attn.o_proj.weight",
            ]
        )

        # Attention QKV (can be standard or MLA)
        statements.extend(cls._get_attention_statements(params, layer_idx, prefix, prefix_offset))

        # MLP
        statements.extend(
            [
                f"{prefix}.mlp.down_proj.weight^T -> {prefix_offset}.mlp.down_proj.weight",
                f"{prefix}.mlp.gate_proj.weight^T, {prefix}.mlp.up_proj.weight^T -> {prefix_offset}.mlp.up_gate_proj.weight, fused_ffn",
            ]
        )

        return statements

    # ==================== MTP (Multi-Token Prediction) ====================

    @classmethod
    def _get_mtp_layer_statements(cls, params: MoEAOAConfigParams) -> List[str]:
        """Generate statements for MTP layers.

        MTP layers occupy indices ``num_hidden_layers`` up to
        ``num_hidden_layers + num_nextn_predict_layers - 1``.
        """
        statements = []

        if params.num_nextn_predict_layers <= 0:
            return statements

        num_hidden_layers = params.num_hidden_layers
        for layer_idx in reversed(range(num_hidden_layers, num_hidden_layers + params.num_nextn_predict_layers)):
            layer_idx_offset = layer_idx + params.num_head_empty_layers
            statements.extend(cls._get_single_mtp_layer_statements(params, layer_idx, layer_idx_offset))

        return statements

    @classmethod
    def _get_single_mtp_layer_statements(
        cls, params: MoEAOAConfigParams, layer_idx: int, layer_idx_offset: int
    ) -> List[str]:
        """Generate statements for a single MTP layer. Override for customization."""
        prefix = f"model.layers.{layer_idx}"
        prefix_offset = f"{params.model_prefix}layers.{layer_idx_offset}"

        return [
            f"{prefix}.eh_proj.weight^T -> {prefix_offset}.eh_proj.weight",
            f"{prefix}.enorm.weight -> {prefix_offset}.enorm.weight",
            f"{prefix}.hnorm.weight -> {prefix_offset}.hnorm.weight",
            f"{prefix}.shared_head.norm.weight -> {prefix_offset}.norm.weight",
        ]

    # ==================== MoE Layers ====================

    @classmethod
    def _get_moe_layer_statements(cls, params: MoEAOAConfigParams) -> List[str]:
        """Generate statements for MoE layers (the MTP layers' transformer blocks included)."""
        statements = []

        # Determine layer range
        start_layer = params.first_k_dense_replace
        end_layer = params.num_hidden_layers + params.num_nextn_predict_layers

        for layer_idx in reversed(range(start_layer, end_layer)):
            layer_idx_offset = layer_idx + params.num_head_empty_layers
            statements.extend(cls._get_single_moe_layer_statements(params, layer_idx, layer_idx_offset))

        return statements

    @classmethod
    def _get_single_moe_layer_statements(
        cls, params: MoEAOAConfigParams, layer_idx: int, layer_idx_offset: int
    ) -> List[str]:
        """Generate statements for a single MoE layer."""
        statements = []

        prefix = f"model.layers.{layer_idx}"
        prefix_offset = f"{params.model_prefix}layers.{layer_idx_offset}"

        # Handle MTP transformer layer: its weights live under `.transformer_layer`
        if layer_idx >= params.num_hidden_layers:
            prefix_offset += ".transformer_layer"

        # Layer norms and attention output
        statements.extend(
            [
                f"{prefix}.input_layernorm.weight -> {prefix_offset}.input_layernorm.weight",
                f"{prefix}.post_attention_layernorm.weight -> {prefix_offset}.post_attention_layernorm.weight",
                f"{prefix}.self_attn.o_proj.weight^T -> {prefix_offset}.self_attn.o_proj.weight",
            ]
        )

        # Attention QKV (can be standard or MLA)
        statements.extend(cls._get_attention_statements(params, layer_idx, prefix, prefix_offset))

        # MoE specific weights
        statements.extend(cls._get_moe_expert_statements(params, prefix, prefix_offset))

        return statements

    # ==================== Attention ====================

    @classmethod
    def _get_attention_statements(
        cls, params: MoEAOAConfigParams, layer_idx: int, prefix: str, prefix_offset: str
    ) -> List[str]:
        """Generate attention-related statements.

        Override this method for different attention types (standard QKV vs MLA).
        """
        if params.multi_latent_attention:
            return cls._get_mla_attention_statements(params, prefix, prefix_offset)
        return cls._get_standard_attention_statements(params, prefix, prefix_offset)

    @classmethod
    def _get_standard_attention_statements(
        cls, params: MoEAOAConfigParams, prefix: str, prefix_offset: str
    ) -> List[str]:
        """Generate standard QKV attention statements (fused into a single qkv_proj)."""
        statements = [
            f"{prefix}.self_attn.q_proj.weight^T, {prefix}.self_attn.k_proj.weight^T, {prefix}.self_attn.v_proj.weight^T -> {prefix_offset}.self_attn.qkv_proj.weight, fused_qkv, num_heads={params.num_attention_heads}, num_key_value_groups={params.num_key_value_heads}",
        ]

        if params.attention_bias:
            statements.append(
                f"{prefix}.self_attn.q_proj.bias, {prefix}.self_attn.k_proj.bias, {prefix}.self_attn.v_proj.bias -> {prefix_offset}.self_attn.qkv_proj.bias, fused_qkv, num_heads={params.num_attention_heads}, num_key_value_groups={params.num_key_value_heads}, axis=0"
            )

        return statements

    @classmethod
    def _get_mla_attention_statements(cls, params: MoEAOAConfigParams, prefix: str, prefix_offset: str) -> List[str]:
        """Generate Multi-Latent Attention (MLA) statements.

        MLA uses compressed KV representation with separate projections.
        Layer-norm weights are added when ``use_qk_norm`` is set and indexer
        weights when ``index_n_heads > 0``.
        """
        statements = [
            f"{prefix}.self_attn.kv_a_proj_with_mqa.weight^T -> {prefix_offset}.self_attn.kv_a_proj_with_mqa.weight",
            f"{prefix}.self_attn.kv_b_proj.weight^T -> {prefix_offset}.self_attn.kv_b_proj.weight",
            f"{prefix}.self_attn.q_a_proj.weight^T -> {prefix_offset}.self_attn.q_a_proj.weight",
            f"{prefix}.self_attn.q_b_proj.weight^T -> {prefix_offset}.self_attn.q_b_proj.weight",
        ]

        if params.use_qk_norm:
            statements.extend(
                [
                    f"{prefix}.self_attn.q_a_layernorm.weight -> {prefix_offset}.self_attn.q_a_layernorm.weight",
                    f"{prefix}.self_attn.kv_a_layernorm.weight -> {prefix_offset}.self_attn.kv_a_layernorm.weight",
                ]
            )

        if params.index_n_heads > 0:
            indexer_weights = [
                "wq_b",
                "wk",
                "weights_proj",
            ]
            statements.extend(
                [
                    f"{prefix}.self_attn.indexer.{weight_name}.weight^T -> {prefix_offset}.self_attn.indexer.{weight_name}.weight"
                    for weight_name in indexer_weights
                ]
            )

        return statements

    # ==================== MoE Expert Weights ====================

    @classmethod
    def _get_moe_expert_statements(cls, params: MoEAOAConfigParams, prefix: str, prefix_offset: str) -> List[str]:
        """Generate MoE expert weight statements (gate, shared experts, routed experts)."""
        statements = []

        # Gate weights
        statements.append(
            f"{prefix}.mlp.gate.e_score_correction_bias -> {prefix_offset}.mlp.gate.e_score_correction_bias"
        )
        statements.append(f"{prefix}.mlp.gate.weight -> {prefix_offset}.mlp.gate.weight, dtype='float32'")

        # Shared experts (if model has them)
        if params.has_shared_experts:
            statements.extend(cls._get_shared_expert_statements(params, prefix, prefix_offset))

        # Routed experts
        statements.extend(cls._get_routed_expert_statements(params, prefix, prefix_offset))

        return statements

    @classmethod
    def _get_shared_expert_statements(cls, params: MoEAOAConfigParams, prefix: str, prefix_offset: str) -> List[str]:
        """Generate shared expert weight statements."""
        return [
            f"{prefix}.mlp.shared_experts.down_proj.weight^T -> {prefix_offset}.mlp.shared_experts.down_proj.weight",
            f"{prefix}.mlp.shared_experts.gate_proj.weight^T, {prefix}.mlp.shared_experts.up_proj.weight^T -> {prefix_offset}.mlp.shared_experts.up_gate_proj.weight, fused_ffn",
        ]

    @classmethod
    def _get_routed_expert_statements(cls, params: MoEAOAConfigParams, prefix: str, prefix_offset: str) -> List[str]:
        """Generate routed expert weight statements.

        ``$EXPERT_ID`` is emitted literally as a placeholder. With
        ``using_sonic_moe`` the weights are not transposed and gate/up are
        joined on axis 0; otherwise they are transposed and joined on axis 1.
        """
        statements = []

        # Down projection
        if params.using_sonic_moe:
            statements.append(
                f"{prefix}.mlp.experts.$EXPERT_ID.down_proj.weight -> {prefix_offset}.mlp.experts.$EXPERT_ID.down_proj.weight"
            )
        else:
            statements.append(
                f"{prefix}.mlp.experts.$EXPERT_ID.down_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.down_proj.weight"
            )

        # Up and gate projection fusion
        if params.using_sonic_moe:
            statements.append(
                f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=0"
            )
        else:
            statements.append(
                f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight^T, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=1"
            )

        return statements

    # ==================== Grouped GEMM ====================

    @classmethod
    def _iter_moe_target_prefixes(cls, params: MoEAOAConfigParams) -> Iterator[str]:
        """Yield the target-side prefix of every MoE layer, in ascending layer order."""
        start_layer = params.first_k_dense_replace
        end_layer = params.num_hidden_layers + params.num_nextn_predict_layers

        for layer_idx in range(start_layer, end_layer):
            prefix_offset = f"{params.model_prefix}layers.{layer_idx + params.num_head_empty_layers}"
            # Layers past num_hidden_layers are MTP layers; their block sits under `.transformer_layer`
            if layer_idx >= params.num_hidden_layers:
                prefix_offset += ".transformer_layer"
            yield prefix_offset

    @classmethod
    def _get_expert_merge_statements(
        cls, params: MoEAOAConfigParams, up_gate_target: str, down_target: str
    ) -> List[str]:
        """Build the per-layer statements that merge all experts' weights along axis 0.

        For each MoE layer two separate statements are produced: one merging
        every expert's ``up_gate_proj.weight`` into ``mlp.<up_gate_target>`` and
        one merging every expert's ``down_proj.weight`` into ``mlp.<down_target>``.
        """
        statements = []
        expert_ids = range(params.num_experts)

        for prefix_offset in cls._iter_moe_target_prefixes(params):
            experts = f"{prefix_offset}.mlp.experts"
            up_gate_sources = ",".join(f"{experts}.{expert_id}.up_gate_proj.weight" for expert_id in expert_ids)
            down_sources = ",".join(f"{experts}.{expert_id}.down_proj.weight" for expert_id in expert_ids)

            # Two distinct list items. The original omitted the comma between the
            # two f-strings, so Python concatenated them into one broken statement.
            statements.append(f"{up_gate_sources} -> {prefix_offset}.mlp.{up_gate_target}, axis=0")
            statements.append(f"{down_sources} -> {prefix_offset}.mlp.{down_target}, axis=0")

        return statements

    @classmethod
    def _get_grouped_gemm_statements(cls, params: MoEAOAConfigParams) -> List[str]:
        """Generate grouped GEMM statements for efficient MoE computation.

        When neither grouped GEMM, sonic MoE nor fp8 is enabled, this defers to
        ``_get_fd_fallback_statements``. Otherwise each MoE layer contributes
        two statements (``grouped_gemm_experts.weight1`` / ``weight2``).
        """
        if not (params.moe_grouped_gemm or params.using_sonic_moe) and not params.fp8:
            return cls._get_fd_fallback_statements(params)

        return cls._get_expert_merge_statements(
            params,
            up_gate_target="grouped_gemm_experts.weight1",
            down_target="grouped_gemm_experts.weight2",
        )

    @classmethod
    def _get_fd_fallback_statements(cls, params: MoEAOAConfigParams) -> List[str]:
        """Generate fallback statements when grouped GEMM is not available.

        Returns an empty list unless ``params.fd_fallback`` is set; otherwise
        each MoE layer contributes two statements (``experts.up_gate_proj`` /
        ``experts.down_proj``).
        """
        if not params.fd_fallback:
            return []

        return cls._get_expert_merge_statements(
            params,
            up_gate_target="experts.up_gate_proj",
            down_target="experts.down_proj",
        )


__all__ = [
    "MoEAOAConfigParams",
    "MoEAOAConfigGenerator",
]
b/paddleformers/transformers/auto/configuration.py @@ -34,6 +34,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ ("deepseek_v3", "DeepseekV3Config"), + ("deepseek_v32", "DeepseekV32Config"), ("ernie4_5", "Ernie4_5Config"), ("ernie4_5_moe", "Ernie4_5_MoeConfig"), ("ernie4_5_moe_vl", "Ernie4_5_VLConfig"), diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py index ad2fdef5b7a..0521aabeac5 100644 --- a/paddleformers/transformers/auto/modeling.py +++ b/paddleformers/transformers/auto/modeling.py @@ -54,6 +54,7 @@ MAPPING_NAMES = OrderedDict( [ ("DeepseekV3", "deepseek_v3"), + ("DeepseekV32", "deepseek_v32"), ("Ernie4_5", "ernie4_5"), ("Ernie4_5_Moe", "ernie4_5_moe"), ("Ernie4_5_VLMoe", "ernie4_5_moe_vl"), From 63acc2bb3191c8c73142ec41ef5a9adab7645c3d Mon Sep 17 00:00:00 2001 From: xingmingyyj Date: Wed, 11 Mar 2026 21:36:44 +0800 Subject: [PATCH 2/5] add deepseek v3.2 model --- .../transformers/deepseek_v32/__init__.py | 37 +++++ .../deepseek_v32/configuration.py | 135 ++++++++++++++++++ .../transformers/deepseek_v32/modeling.py | 131 +++++++++++++++++ 3 files changed, 303 insertions(+) create mode 100644 paddleformers/transformers/deepseek_v32/__init__.py create mode 100644 paddleformers/transformers/deepseek_v32/configuration.py create mode 100644 paddleformers/transformers/deepseek_v32/modeling.py diff --git a/paddleformers/transformers/deepseek_v32/__init__.py b/paddleformers/transformers/deepseek_v32/__init__.py new file mode 100644 index 00000000000..1889bf542a1 --- /dev/null +++ b/paddleformers/transformers/deepseek_v32/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from typing import TYPE_CHECKING + +from ...utils.lazy_import import _LazyModule + +import_structure = { + "configuration": ["DeepseekV32Config"], + "modeling": [ + "DeepseekV32ForCausalLM", + "DeepseekV32ForCausalLMPipe", + ], +} + +if TYPE_CHECKING: + from .configuration import * + from .modeling import * +else: + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + import_structure, + module_spec=__spec__, + ) diff --git a/paddleformers/transformers/deepseek_v32/configuration.py b/paddleformers/transformers/deepseek_v32/configuration.py new file mode 100644 index 00000000000..17ba5cec083 --- /dev/null +++ b/paddleformers/transformers/deepseek_v32/configuration.py @@ -0,0 +1,135 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..configuration_utils import PretrainedConfig + + +class DeepseekV32Config(PretrainedConfig): + r""" + Configuration for DeepSeek V3.2 model. 
+ + Architecture: MLA (Multi-Latent Attention) + DSA Indexer (DeepSeek Sparse Attention) + + MoE (Mixture of Experts) + MTP (Multi-Token Prediction) + + Field names are kept consistent with the HuggingFace config.json so that + ``TransformerConfig.from_config()`` can map them to the PaddleFleet provider + dataclass fields without any manual renaming. + """ + + model_type = "deepseek_v32" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=129280, + hidden_size=7168, + intermediate_size=18432, + moe_intermediate_size=2048, + num_hidden_layers=61, + num_attention_heads=128, + num_key_value_heads=128, + max_position_embeddings=163840, + rms_norm_eps=1e-6, + hidden_act="silu", + initializer_range=0.02, + use_cache=True, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + tie_word_embeddings=False, + # MLA parameters + q_lora_rank=1536, + kv_lora_rank=512, + qk_nope_head_dim=128, + qk_rope_head_dim=64, + v_head_dim=128, + head_dim=None, + # DSA Indexer parameters (field names match HF config.json) + index_n_heads=64, + index_head_dim=128, + index_topk=2048, + # MoE parameters + n_routed_experts=256, + n_shared_experts=1, + num_experts_per_tok=8, + n_group=8, + topk_group=4, + routed_scaling_factor=2.5, + scoring_func="sigmoid", + norm_topk_prob=True, + topk_method="noaux_tc", + first_k_dense_replace=3, + moe_layer_freq=1, + # MTP parameters + num_nextn_predict_layers=1, + # Pipeline parallel segmentation + pp_seg_method="layer:TransformerLayer|EmptyLayer", + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.max_position_embeddings = max_position_embeddings + self.rms_norm_eps = rms_norm_eps + self.hidden_act = 
hidden_act + self.initializer_range = initializer_range + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + # MLA + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + # head_dim must equal v_head_dim for MLA: o_proj input size = num_heads * head_dim, + # and the attention output per head = v_head_dim. + self.head_dim = head_dim if head_dim is not None else v_head_dim + + # DSA Indexer + self.index_n_heads = index_n_heads + self.index_head_dim = index_head_dim + self.index_topk = index_topk + + # MoE + self.n_routed_experts = n_routed_experts + self.n_shared_experts = n_shared_experts + self.num_experts_per_tok = num_experts_per_tok + self.n_group = n_group + self.topk_group = topk_group + self.routed_scaling_factor = routed_scaling_factor + self.scoring_func = scoring_func + self.norm_topk_prob = norm_topk_prob + self.topk_method = topk_method + self.first_k_dense_replace = first_k_dense_replace + self.moe_layer_freq = moe_layer_freq + + # MTP + self.num_nextn_predict_layers = num_nextn_predict_layers + + # PP + self.pp_seg_method = pp_seg_method + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +__all__ = ["DeepseekV32Config"] diff --git a/paddleformers/transformers/deepseek_v32/modeling.py b/paddleformers/transformers/deepseek_v32/modeling.py new file mode 100644 index 00000000000..b87b4d3e113 --- /dev/null +++ b/paddleformers/transformers/deepseek_v32/modeling.py @@ -0,0 +1,131 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +DeepSeek V3.2 PaddleFleet model bridge. + +This module bridges the HuggingFace-style PretrainedConfig/PretrainedModel +interface used by PaddleFormers with the PaddleFleet provider system. + +Pattern follows glm_moe_dsa/modeling.py (GLM5 PR #3940) exactly: + - DeepseekV32ForCausalLM.__new__() calls DeepSeekV3_2BaseProvider.from_config(config) + - provider.provide() calls paddlefleet.gpt_builders.gpt_builder() + - Returns a PaddleFleet GPT model (Megatron-style) +""" + +import os +import sys + +from ..aoa_config_base import MoEAOAConfigGenerator +from ..model_utils import PretrainedModel +from .configuration import DeepseekV32Config + + +class DeepseekV32PreTrainedModel(PretrainedModel): + config_class = DeepseekV32Config + base_model_prefix = "model" + + @classmethod + def _gen_aoa_config(cls, config: DeepseekV32Config): + aoa_config = MoEAOAConfigGenerator.gen_aoa_config(config) + aoa_statements = aoa_config["aoa_statements"] + print("===========> AOA STATEMENTS <==========") + for stmt in aoa_statements: + print(stmt) + return aoa_config + + @classmethod + def _gen_inv_aoa_config(cls, config: DeepseekV32Config): + return None + + +def _build_model(config): + """ + Common __new__ logic shared by ForCausalLM and ForCausalLMPipe. + + Steps: + 1. Normalise parallel config attributes (same as GLM5). + 2. Call DeepSeekV3_2BaseProvider.from_config(config) to populate provider fields. + 3. Call provider.provide() which runs gpt_builder() and returns the PaddleFleet model. 
+ (moe_layer_freq + first_k_dense_replace conversion is handled by + TransformerConfig.__post_init__ automatically.) + """ + # 1. Normalise parallel config (guard against missing attrs from old configs) + config.tensor_model_parallel_size = max(getattr(config, "tensor_model_parallel_size", 1), 1) + config.pipeline_model_parallel_size = max(getattr(config, "pipeline_model_parallel_size", 1), 1) + config.context_parallel_size = max(getattr(config, "context_parallel_size", 1), 1) + config.virtual_pipeline_model_parallel_size = max(getattr(config, "virtual_pipeline_model_parallel_size", 1), 1) + config.expert_model_parallel_size = max(getattr(config, "expert_model_parallel_size", 1), 1) + + # 2. Resolve provider module path. + # The provider lives under examples/experiments/paddlefleet/ which is + # not a proper package. We add it to sys.path if needed. + _provider_dir = os.path.join( + os.path.dirname(__file__), # .../paddleformers/transformers/deepseek_v32/ + "..", + "..", + "..", + "examples", + "experiments", + "paddlefleet", + ) + _provider_dir = os.path.normpath(_provider_dir) + if _provider_dir not in sys.path: + sys.path.insert(0, _provider_dir) + + from deepseek_v3_2_provider import DeepSeekV3_2BaseProvider + + # 3. 
Build model via provider + model_provider = DeepSeekV3_2BaseProvider.from_config(config) + gpt_model = model_provider.provide() + gpt_model.config_to_save = config + return gpt_model + + +class DeepseekV32ForCausalLM(DeepseekV32PreTrainedModel): + """DeepSeek V3.2 model for pipeline_model_parallel_size == 1.""" + + is_fleet = True + + def __new__(cls, config): + gpt_model = _build_model(config) + gpt_model.is_fleet = cls.is_fleet + gpt_model._gen_aoa_config = cls._gen_aoa_config + gpt_model._gen_inv_aoa_config = cls._gen_inv_aoa_config + return gpt_model + + +class DeepseekV32ForCausalLMPipe(DeepseekV32PreTrainedModel): + """DeepSeek V3.2 model for pipeline_model_parallel_size > 1.""" + + is_fleet = True + + def __new__(cls, config): + if not hasattr(config, "architectures"): + config.architectures = ["DeepseekV32ForCausalLM"] + gpt_model = _build_model(config) + gpt_model.is_fleet = cls.is_fleet + gpt_model._gen_aoa_config = cls._gen_aoa_config + gpt_model._gen_inv_aoa_config = cls._gen_inv_aoa_config + print("=======> model config") + print(gpt_model.config) + gpt_model._gen_aoa_config(gpt_model.config) + return gpt_model + + +__all__ = [ + "DeepseekV32PreTrainedModel", + "DeepseekV32ForCausalLM", + "DeepseekV32ForCausalLMPipe", +] From 345633923fd5cdf97b54e4b35b9b4005f1966f1b Mon Sep 17 00:00:00 2001 From: xingmingyyj Date: Sun, 15 Mar 2026 19:04:42 +0800 Subject: [PATCH 3/5] fix --- .../paddlefleet/deepseek_v3_2_provider.py | 6 ++-- paddleformers/transformers/aoa_config_base.py | 29 ++++++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/examples/experiments/paddlefleet/deepseek_v3_2_provider.py b/examples/experiments/paddlefleet/deepseek_v3_2_provider.py index 7fd377ce936..1ca2fde883c 100644 --- a/examples/experiments/paddlefleet/deepseek_v3_2_provider.py +++ b/examples/experiments/paddlefleet/deepseek_v3_2_provider.py @@ -76,8 +76,10 @@ class DeepSeekV3_2BaseProvider(GPTModelProvider): init_method_std: float = 0.006 # ~1/sqrt(7168) 
# ---- MLA: Multi-Latent Attention ---- - # Enables the MLA RoPE path in rope_utils.py (interleaved layout) - multi_latent_attention: bool = True + # MLA de-interleave in rope_utils is NOT needed when rotary_interleaved=True, + # because _rotate_half(interleaved=True) already pairs adjacent dims correctly + # (matching DeepSeek-V3.2 reference apply_rotary_emb(interleaved=True)). + multi_latent_attention: bool = False num_attention_heads: int = 128 # head_dim matches v_head_dim=128 so o_proj sizing in Attention base is correct head_dim: int = 128 diff --git a/paddleformers/transformers/aoa_config_base.py b/paddleformers/transformers/aoa_config_base.py index c9c10d38ce7..a41f5c65577 100644 --- a/paddleformers/transformers/aoa_config_base.py +++ b/paddleformers/transformers/aoa_config_base.py @@ -154,9 +154,6 @@ def _build_aoa_config(cls, params: MoEAOAConfigParams) -> Dict[str, List[str]]: # 1. Basic weights (norm, embed_tokens, lm_head) aoa_statements.extend(cls._get_basic_weight_statements(params)) - # 2. Dense layers (if any) - aoa_statements.extend(cls._get_dense_layer_statements(params)) - # 3. MTP layers (if any) aoa_statements.extend(cls._get_mtp_layer_statements(params)) @@ -169,6 +166,9 @@ def _build_aoa_config(cls, params: MoEAOAConfigParams) -> Dict[str, List[str]]: # 6. Extra statements from subclasses aoa_statements.extend(params.extra_statements) + # 2. 
Dense layers (if any) + aoa_statements.extend(cls._get_dense_layer_statements(params)) + return {"aoa_statements": aoa_statements} # ==================== Basic Weights ==================== @@ -222,8 +222,8 @@ def _get_single_dense_layer_statements( # Layer norms and attention output statements.extend( [ - f"{prefix}.input_layernorm.weight -> {prefix_offset}.input_layernorm.weight", - f"{prefix}.post_attention_layernorm.weight -> {prefix_offset}.post_attention_layernorm.weight", + f"{prefix}.input_layernorm.weight -> {prefix_offset}.input_layernorm.weight, src_dtype='float32', dst_dtype='bfloat16'", + f"{prefix}.post_attention_layernorm.weight -> {prefix_offset}.post_attention_layernorm.weight,src_dtype='float32', dst_dtype='bfloat16'", f"{prefix}.self_attn.o_proj.weight^T -> {prefix_offset}.self_attn.o_proj.weight", ] ) @@ -307,8 +307,8 @@ def _get_single_moe_layer_statements( # Layer norms and attention output statements.extend( [ - f"{prefix}.input_layernorm.weight -> {prefix_offset}.input_layernorm.weight", - f"{prefix}.post_attention_layernorm.weight -> {prefix_offset}.post_attention_layernorm.weight", + f"{prefix}.input_layernorm.weight -> {prefix_offset}.input_layernorm.weight,src_dtype='float32', dst_dtype='bfloat16'", + f"{prefix}.post_attention_layernorm.weight -> {prefix_offset}.post_attention_layernorm.weight,src_dtype='float32', dst_dtype='bfloat16'", f"{prefix}.self_attn.o_proj.weight^T -> {prefix_offset}.self_attn.o_proj.weight", ] ) @@ -367,12 +367,12 @@ def _get_mla_attention_statements(cls, params: MoEAOAConfigParams, prefix: str, if params.use_qk_norm: statements.extend( [ - f"{prefix}.self_attn.q_a_layernorm.weight -> {prefix_offset}.self_attn.q_a_layernorm.weight", - f"{prefix}.self_attn.kv_a_layernorm.weight -> {prefix_offset}.self_attn.kv_a_layernorm.weight", + f"{prefix}.self_attn.q_a_layernorm.weight -> {prefix_offset}.self_attn.q_a_layernorm.weight, src_dtype='float32',dst_dtype='bfloat16'", + f"{prefix}.self_attn.kv_a_layernorm.weight -> 
{prefix_offset}.self_attn.kv_a_layernorm.weight, src_dtype='float32',dst_dtype='bfloat16'", ] ) - if params.index_n_heads > 0: + if params.index_n_heads and params.index_n_heads > 0: indexer_weights = [ "wq_b", "wk", @@ -384,6 +384,11 @@ def _get_mla_attention_statements(cls, params: MoEAOAConfigParams, prefix: str, for weight_name in indexer_weights ] ) + statements += [ + f"{prefix}.self_attn.indexer.k_norm.bias -> {prefix_offset}.self_attn.indexer.k_norm.bias,src_dtype='float32', dst_dtype='bfloat16'", + f"{prefix}.self_attn.indexer.k_norm.weight -> {prefix_offset}.self_attn.indexer.k_norm.weight,src_dtype='float32', dst_dtype='bfloat16'", + "model.norm.weight -> model.norm.weight, src_dtype='float32',dst_dtype='bfloat16'", + ] return statements @@ -398,7 +403,9 @@ def _get_moe_expert_statements(cls, params: MoEAOAConfigParams, prefix: str, pre statements.append( f"{prefix}.mlp.gate.e_score_correction_bias -> {prefix_offset}.mlp.gate.e_score_correction_bias" ) - statements.append(f"{prefix}.mlp.gate.weight -> {prefix_offset}.mlp.gate.weight, dtype='float32'") + statements.append( + f"{prefix}.mlp.gate.weight -> {prefix_offset}.mlp.gate.weight,src_dtype='bfloat16',dst_dtype='float32'" + ) # Shared experts (if model has them) if params.has_shared_experts: From 2c8b687b68f40d845f456425321481ed6e60f1b2 Mon Sep 17 00:00:00 2001 From: xingmingyyj Date: Fri, 27 Mar 2026 19:46:23 +0800 Subject: [PATCH 4/5] tmp --- paddleformers/trainer/trainer.py | 38 ++++++++++++++++++- paddleformers/trainer/training_args.py | 5 +++ paddleformers/transformers/aoa_config_base.py | 2 +- .../transformers/configuration_utils.py | 6 +++ .../deepseek_v32/configuration.py | 9 +++++ paddleformers/transformers/gpt_provider.py | 10 +++++ paddleformers/transformers/model_utils.py | 19 ++++++---- 7 files changed, 79 insertions(+), 10 deletions(-) diff --git a/paddleformers/trainer/trainer.py b/paddleformers/trainer/trainer.py index 673fd0f63c4..1a11e6aa302 100644 --- 
a/paddleformers/trainer/trainer.py +++ b/paddleformers/trainer/trainer.py @@ -1956,6 +1956,24 @@ def _inner_training_loop( _data_load_start_time = time.time() for step, inputs in enumerate(epoch_iterator): + + # # print("=====> inputs: ",inputs) + # input_ids_ = inputs["input_ids"] + # labels_ = inputs["labels"] + # position_ids_ = inputs["position_ids"] + # input_ids_numpy = input_ids_.numpy() + # labels_numpy = labels_.numpy() + # position_ids_numpy = position_ids_.numpy() + # path = r"/root/paddlejob/share-storage/gpfs/system-public/zhuxinming/zhuxinming/paddlefleet_dpskv32/paddlefleet/new_paddlefleet/data_cache/paddlefleet" + # inputs_ids_path = os.path.join(path,f"input_ids_step_{step}.np") + # np.save(inputs_ids_path, input_ids_numpy) + + # labels_path = os.path.join(path,f"labels_step_{step}.np") + # np.save(labels_path, labels_numpy) + + # position_ids_path = os.path.join(path,f"position_ids_step_{step}.np") + # np.save(position_ids_path, position_ids_numpy) + # Record data loading time for this iteration _data_load_end_time = time.time() _data_load_time_for_global_step += _data_load_end_time - _data_load_start_time @@ -1993,7 +2011,7 @@ def _inner_training_loop( steps_trained_progress_bar.update(1) if steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) - self.timers and self.timers("read-data").start() + # self.timers and self.timers("read-data").start() # Reset data loading timer for skipped steps _data_load_start_time = time.time() continue @@ -2560,6 +2578,24 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval, except (ImportError, AttributeError): pass + # Add DSA indexer loss metrics if available + try: + from paddlefleet.transformer.dsa_attention import ( + DSAIndexerLossLoggingHelper, + ) + + if DSAIndexerLossLoggingHelper.tracker.get("values") is not None: + loss_scale = 1.0 / self.args.gradient_accumulation_steps + DSAIndexerLossLoggingHelper.reduce_loss_in_tracker() + tracker = 
DSAIndexerLossLoggingHelper.tracker + indexer_loss_values = tracker["values"] * loss_scale + num_layers = indexer_loss_values.shape[0] + avg_indexer_loss = indexer_loss_values.sum() / num_layers + logs["indexer_loss"] = avg_indexer_loss.item() + DSAIndexerLossLoggingHelper.clean_loss_in_tracker() + except (ImportError, AttributeError): + pass + self._total_loss_scalar += tr_loss_scalar self._globalstep_last_logged = self.state.global_step self._globalstep_last_start_time = time.time() diff --git a/paddleformers/trainer/training_args.py b/paddleformers/trainer/training_args.py index b3d91dd2c6a..8b2aadcb541 100644 --- a/paddleformers/trainer/training_args.py +++ b/paddleformers/trainer/training_args.py @@ -1552,6 +1552,11 @@ class TrainingArguments: }, ) + dsa_indexer_loss_coeff: float = field( + default=0.01, + metadata={"help": "Loss coefficient for the DSA indexer; controls the weight of the indexer loss term."}, + ) + def __post_init__(self): world_size = paddle.distributed.get_world_size() if in_auto_parallel_align_mode(): diff --git a/paddleformers/transformers/aoa_config_base.py b/paddleformers/transformers/aoa_config_base.py index a41f5c65577..b9d7747b787 100644 --- a/paddleformers/transformers/aoa_config_base.py +++ b/paddleformers/transformers/aoa_config_base.py @@ -331,7 +331,7 @@ def _get_attention_statements( Override this method for different attention types (standard QKV vs MLA).
""" + - if params.multi_latent_attention: + if params.multi_latent_attention or (params.index_n_heads and params.index_n_heads > 0): return cls._get_mla_attention_statements(params, prefix, prefix_offset) return cls._get_standard_attention_statements(params, prefix, prefix_offset) diff --git a/paddleformers/transformers/configuration_utils.py b/paddleformers/transformers/configuration_utils.py index 593c99c3bd0..f424f4d45a5 100644 --- a/paddleformers/transformers/configuration_utils.py +++ b/paddleformers/transformers/configuration_utils.py @@ -410,6 +410,12 @@ class LlmMetaConfig: False, "Whether to use SonicMoE as the computation backend for the moelayer.", ), + ( + "dsa_indexer_loss_coeff", + float, + 0.01, + "Loss coefficient for the DSA indexer; controls the weight of the indexer loss term.", + ), ] mtp_attributes = [ diff --git a/paddleformers/transformers/deepseek_v32/configuration.py b/paddleformers/transformers/deepseek_v32/configuration.py index 17ba5cec083..2b5eed21b77 100644 --- a/paddleformers/transformers/deepseek_v32/configuration.py +++ b/paddleformers/transformers/deepseek_v32/configuration.py @@ -60,6 +60,12 @@ def __init__( index_n_heads=64, index_head_dim=128, index_topk=2048, + indexer_loss_coeff=0.0, + indexer_use_sparse_loss=False, + # RoPE format control for DSA Indexer + # False = non-interleaved (default, compatible with MLA's interleaved YaRN) + # True = interleaved (paired frequency format) + indexer_rotary_interleaved=False, # MoE parameters n_routed_experts=256, n_shared_experts=1, @@ -109,6 +115,9 @@ def __init__( self.index_n_heads = index_n_heads self.index_head_dim = index_head_dim self.index_topk = index_topk + self.indexer_loss_coeff = indexer_loss_coeff + self.indexer_use_sparse_loss = indexer_use_sparse_loss + self.indexer_rotary_interleaved = indexer_rotary_interleaved # MoE self.n_routed_experts = n_routed_experts diff --git a/paddleformers/transformers/gpt_provider.py b/paddleformers/transformers/gpt_provider.py index
6eb94df4be6..bd2a3370078 100644 --- a/paddleformers/transformers/gpt_provider.py +++ b/paddleformers/transformers/gpt_provider.py @@ -172,6 +172,16 @@ def provide(self, pre_process=None, post_process=None, vp_stage=None, loss_fn=No if self.init_model_with_meta_device: model_init_device_context = partial(paddle.device, device="meta") + # Flatten rope_parameters + if hasattr(self, "rope_parameters") and self.rope_parameters: + if "rope_type" in self.rope_parameters: + if self.rope_parameters["rope_type"] != "default": + self.rope_type = self.rope_parameters["rope_type"] + if "rope_theta" in self.rope_parameters: + self.rope_theta = self.rope_parameters["rope_theta"] + if hasattr(self, "rope_scaling") and self.rope_scaling is not None and "mscale_all_dim" in self.rope_scaling: + self.mscale_all_dim = self.rope_scaling["mscale_all_dim"] + # Check if mtp_block_spec parameter is supported kwargs = {} if "mtp_block_spec" in inspect.signature(GPTModel.__init__).parameters: diff --git a/paddleformers/transformers/model_utils.py b/paddleformers/transformers/model_utils.py index 4fc63c47132..d629c0fcd2e 100644 --- a/paddleformers/transformers/model_utils.py +++ b/paddleformers/transformers/model_utils.py @@ -2919,14 +2919,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): except Exception as e: logger.error(f"Failed to delete {metadata_path}: {e}") - # change dtype in aoa - if dtype is not None: - for key in model.state_dict().keys(): - # keep fp32 - if model.state_dict()[key].dtype == paddle.float32: - aoa_config["aoa_statements"].append(f"{key} -> {key}, dtype='float32'") - else: - aoa_config["aoa_statements"].append(f"{key} -> {key}, dtype='{dtype}'") + # # change dtype in aoa + # if dtype is not None: + # for key in model.state_dict().keys(): + # # keep fp32 + # if model.state_dict()[key].dtype == paddle.float32: + # aoa_config["aoa_statements"].append(f"{key} -> {key}, dtype='float32'") + # else: + # aoa_config["aoa_statements"].append(f"{key} -> {key}, dtype='{dtype}'")
dist.load_state_dict( sharded_state_dict, @@ -2936,6 +2936,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): offload=load_via_cpu, ) + print("==========> MODEL ARCH") + print(model) + for v in sharded_state_dict.values(): if hasattr(v.local_tensor, "target_tensor"): del v.local_tensor.target_tensor From 0eb38a5516379377f1f842e8a70d2ab6bf61e50d Mon Sep 17 00:00:00 2001 From: xingmingyyj Date: Fri, 22 May 2026 11:25:12 +0800 Subject: [PATCH 5/5] delete comment --- paddleformers/trainer/trainer.py | 20 +--------------- paddleformers/transformers/aoa_config_base.py | 2 +- .../deepseek_v32/configuration.py | 5 ++++ .../transformers/deepseek_v32/modeling.py | 24 ++++++++++++------- paddleformers/transformers/model_utils.py | 3 --- 5 files changed, 23 insertions(+), 31 deletions(-) diff --git a/paddleformers/trainer/trainer.py b/paddleformers/trainer/trainer.py index 117160a1909..38fa493bef8 100644 --- a/paddleformers/trainer/trainer.py +++ b/paddleformers/trainer/trainer.py @@ -2050,24 +2050,6 @@ def _inner_training_loop( _data_load_start_time = time.time() for step, inputs in enumerate(epoch_iterator): - - # # print("=====> inputs: ",inputs) - # input_ids_ = inputs["input_ids"] - # labels_ = inputs["labels"] - # position_ids_ = inputs["position_ids"] - # input_ids_numpy = input_ids_.numpy() - # labels_numpy = labels_.numpy() - # position_ids_numpy = position_ids_.numpy() - # path = r"/root/paddlejob/share-storage/gpfs/system-public/zhuxinming/zhuxinming/paddlefleet_dpskv32/paddlefleet/new_paddlefleet/data_cache/paddlefleet" - # inputs_ids_path = os.path.join(path,f"input_ids_step_{step}.np") - # np.save(inputs_ids_path, input_ids_numpy) - - # labels_path = os.path.join(path,f"labels_step_{step}.np") - # np.save(labels_path, labels_numpy) - - # position_ids_path = os.path.join(path,f"position_ids_step_{step}.np") - # np.save(position_ids_path, position_ids_numpy) - # Record data loading time for this iteration _data_load_end_time = 
time.time() _data_load_time_for_global_step += _data_load_end_time - _data_load_start_time @@ -2106,7 +2088,7 @@ def _inner_training_loop( steps_trained_progress_bar.update(1) if steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) - # self.timers and self.timers("read-data").start() + self.timers and self.timers("read-data").start() # Reset data loading timer for skipped steps _data_load_start_time = time.time() continue diff --git a/paddleformers/transformers/aoa_config_base.py b/paddleformers/transformers/aoa_config_base.py index 2ff6870da09..8414d11fe26 100644 --- a/paddleformers/transformers/aoa_config_base.py +++ b/paddleformers/transformers/aoa_config_base.py @@ -667,7 +667,7 @@ def _get_inv_moe_layer_statements(cls, params: MoEAOAConfigParams) -> List[str]: if layer_idx >= params.num_hidden_layers: prefix_offset += ".transformer_layer" - statements.extend( + statements.extend( [ f"{prefix_offset}.input_layernorm.weight -> {prefix}.input_layernorm.weight", f"{prefix_offset}.post_attention_layernorm.weight -> {prefix}.post_attention_layernorm.weight", diff --git a/paddleformers/transformers/deepseek_v32/configuration.py b/paddleformers/transformers/deepseek_v32/configuration.py index 437f5922adb..121e75d07d2 100644 --- a/paddleformers/transformers/deepseek_v32/configuration.py +++ b/paddleformers/transformers/deepseek_v32/configuration.py @@ -144,5 +144,10 @@ def __init__( super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + # Re-set after super().__init__ because LlmMetaConfig defaults override these + self.multi_latent_attention = multi_latent_attention + self.use_qk_norm = use_qk_norm + self.num_nextn_predict_layers = num_nextn_predict_layers + __all__ = ["DeepseekV32Config"] diff --git a/paddleformers/transformers/deepseek_v32/modeling.py b/paddleformers/transformers/deepseek_v32/modeling.py index 05d2b91c7e1..c14cda3aadb 100644 --- a/paddleformers/transformers/deepseek_v32/modeling.py +++ 
b/paddleformers/transformers/deepseek_v32/modeling.py @@ -36,18 +36,29 @@ class DeepseekV32PreTrainedModel(PretrainedModel): config_class = DeepseekV32Config base_model_prefix = "model" + # Layernorm weight names that need dtype cast (fleet model skips generic dtype mapping) + _NORM_WEIGHT_KEYS = ("input_layernorm.weight", "post_attention_layernorm.weight", + "q_a_layernorm.weight", "kv_a_layernorm.weight", + "k_norm.weight", "k_norm.bias", "norm.weight") + @classmethod def _gen_aoa_config(cls, config: DeepseekV32Config): aoa_config = MoEAOAConfigGenerator.gen_aoa_config(config) - aoa_statements = aoa_config["aoa_statements"] - print("===========> AOA STATEMENTS <==========") - for stmt in aoa_statements: - print(stmt) + cls._inject_norm_dtype(aoa_config["aoa_statements"], "bfloat16") return aoa_config @classmethod def _gen_inv_aoa_config(cls, config: DeepseekV32Config): - return MoEAOAConfigGenerator.gen_inv_aoa_config(config) + inv_aoa_config = MoEAOAConfigGenerator.gen_inv_aoa_config(config) + cls._inject_norm_dtype(inv_aoa_config["aoa_statements"], "float32") + return inv_aoa_config + + @classmethod + def _inject_norm_dtype(cls, aoa_statements, target_dtype): + """Inject dtype into existing layernorm statements generated by base class.""" + for i, stmt in enumerate(aoa_statements): + if any(k in stmt for k in cls._NORM_WEIGHT_KEYS) and "dtype=" not in stmt: + aoa_statements[i] = f"{stmt}, dtype='{target_dtype}'" def _build_model(config): @@ -118,9 +129,6 @@ def __new__(cls, config): gpt_model.is_fleet = cls.is_fleet gpt_model._gen_aoa_config = cls._gen_aoa_config gpt_model._gen_inv_aoa_config = cls._gen_inv_aoa_config - print("=======> model config") - print(gpt_model.config) - gpt_model._gen_aoa_config(gpt_model.config) return gpt_model diff --git a/paddleformers/transformers/model_utils.py b/paddleformers/transformers/model_utils.py index b63de73f24e..a8ec2532867 100644 --- a/paddleformers/transformers/model_utils.py +++ 
b/paddleformers/transformers/model_utils.py @@ -2936,9 +2936,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): offload=load_via_cpu, ) - print("==========> MODEL ARCH") - print(model) - for v in sharded_state_dict.values(): if hasattr(v.local_tensor, "target_tensor"): del v.local_tensor.target_tensor