Commit e35e80c

[Feature] Support share MTP weights. (#1672)
* Refactor MTP configuration to support weight sharing across layers. Updated the MoE and MTPBlock classes to handle shared weights and adjusted layer initialization accordingly. Added a share_weights parameter to MTPConfig for better control over layer behavior.
* Updated the checkpointing mechanism to ensure shared MTP heads are recomputed as necessary.
* Resolve review comments.

1 parent: 714483a

3 files changed

Lines changed: 27 additions & 8 deletions
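
In short, share_weights=True builds a single physical MTP layer and reuses it at every prediction depth, instead of allocating one layer per depth. A minimal sketch of the idea, with nn.Linear standing in for xtuner's much larger MoE-based MTP layers (the stand-in modules and sizes below are illustrative, not the real classes):

    import torch
    from torch import nn

    hidden_size, num_depths = 16, 3

    # share_weights=False: one physical layer per prediction depth.
    independent = nn.ModuleList(nn.Linear(hidden_size, hidden_size) for _ in range(num_depths))

    # share_weights=True: a single physical layer reused at every depth.
    shared = nn.ModuleList([nn.Linear(hidden_size, hidden_size)])

    x = torch.randn(4, hidden_size)
    for step in range(num_depths):
        layer = shared[0]  # every depth indexes the same module when sharing
        x = layer(x)

    print(sum(p.numel() for p in independent.parameters()))  # 816
    print(sum(p.numel() for p in shared.parameters()))       # 272: num_depths x fewer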

xtuner/v1/model/moe/moe.py

Lines changed: 10 additions & 4 deletions
@@ -855,7 +855,8 @@ def build_mtp_block(self, config: MoEConfig) -> MTPBlock:
         else:
             raise ValueError(f"Unsupported layer type {layers_type_list[last_layer_idx]}")
 
-        for i in range(mtp_config.num_layers):
+        num_physical_layer = 1 if mtp_config.share_weights else mtp_config.num_layers
+        for i in range(num_physical_layer):
             # Build MoE decoder layer for MTP
             decoder_layer = MoEDecoderLayer(
                 hidden_size=config.hidden_size,
@@ -894,7 +895,7 @@ def build_mtp_block(self, config: MoEConfig) -> MTPBlock:
             )
             mtp_layers.append(mtp_layer)
 
-        return MTPBlock(mtp_layers=mtp_layers)
+        return MTPBlock(mtp_config=mtp_config, mtp_layers=mtp_layers)
 
     @override
     def from_hf(self, hf_path: str | Path, strict: bool = True) -> tuple:
@@ -1015,7 +1016,9 @@ def fully_shard(
         # Shard MTP block if it exists
         if self.mtp_block is not None:
             for mtp_idx, mtp_layer in enumerate(self.mtp_block.layers):
-                if self._should_recompute(None, mtp_idx=mtp_idx):
+                if self._should_recompute(None, mtp_idx=mtp_idx) or (
+                    self.config.mtp_config is not None and self.config.mtp_config.share_weights
+                ):  # share mtp head must recompute
                     mtp_layer = checkpoint_wrapper(mtp_layer, checkpoint_impl=CheckpointImpl.REENTRANT)
                     self.mtp_block.layers[mtp_idx] = mtp_layer
 
@@ -1234,7 +1237,10 @@ def _should_recompute(
         * Global 9 (MTP 2, last layer): no recompute (forced)
         """
         num_layers = self.config.num_hidden_layers
-        mtp_layers = self.config.mtp_config.num_layers if self.config.mtp_config is not None else 0
+        if self.config.mtp_config is not None:
+            mtp_layers = 1 if self.config.mtp_config.share_weights else self.config.mtp_config.num_layers
+        else:
+            mtp_layers = 0
         recompute_ratio = self.fsdp_config.recompute_ratio if self.fsdp_config is not None else 0.0
 
         total_layers = num_layers + mtp_layers
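
The fully_shard change above forces activation checkpointing whenever the MTP head is shared: the single physical layer runs once per prediction depth, so it is wrapped regardless of the usual recompute policy, and _should_recompute now counts the shared head as one layer. A hedged sketch of the wrapping rule, where model stands in for the MoE model (field names follow the diff; the helper itself is illustrative):

    from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
        CheckpointImpl,
        checkpoint_wrapper,
    )

    def wrap_mtp_layers(model):
        if model.mtp_block is None:
            return
        shared = model.config.mtp_config is not None and model.config.mtp_config.share_weights
        for mtp_idx, mtp_layer in enumerate(model.mtp_block.layers):
            # A shared head runs num_layers times per forward pass, so it is
            # always recomputed; otherwise the ratio-based policy decides.
            if shared or model._should_recompute(None, mtp_idx=mtp_idx):
                model.mtp_block.layers[mtp_idx] = checkpoint_wrapper(
                    mtp_layer, checkpoint_impl=CheckpointImpl.REENTRANT
                )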

xtuner/v1/module/mtp/config.py

Lines changed: 5 additions & 0 deletions
@@ -20,6 +20,9 @@ class MTPConfig(BaseModel):
     Args:
         num_layers (int): Number of MTP layers (prediction depths). Each layer
             predicts tokens at increasing future positions (i+1, i+2, ..., i+D).
+        share_weights (bool): Whether to share the weights of the MTP layers.
+            If True, the weights of the MTP layers are shared across all layers.
+            Default: False.
         loss_scaling_factor (float): Scaling factor for MTP loss. The total MTP loss
             is computed as the average of losses across all depths, multiplied by
             this factor. Default: 0.1.
@@ -30,6 +33,7 @@ class MTPConfig(BaseModel):
     ...     ...,
     ...     mtp_config=MTPConfig(
     ...         num_layers=2,
+    ...         share_weights=True,
     ...         loss_scaling_factor=0.1,
     ...     ),
     ... )
@@ -38,4 +42,5 @@ class MTPConfig(BaseModel):
     model_config = ConfigDict(extra="forbid")
 
     num_layers: Annotated[int, Parameter(group="model")]
+    share_weights: Annotated[bool, Parameter(group="model")] = False
     loss_scaling_factor: Annotated[float, Parameter(group="model")] = 0.1
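
Enabling the feature is a one-line config change. An illustrative usage, assuming MTPConfig is importable from the module path shown above:

    from xtuner.v1.module.mtp.config import MTPConfig

    mtp_config = MTPConfig(num_layers=2, share_weights=True, loss_scaling_factor=0.1)
    # The model still unrolls num_layers prediction steps in the forward pass;
    # share_weights only collapses them onto one physical set of weights.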

xtuner/v1/module/mtp/mtp_block.py

Lines changed: 12 additions & 4 deletions
@@ -7,6 +7,7 @@
 
 from xtuner.v1.data_proto import SequenceContext
 
+from .config import MTPConfig
 from .mtp_layer import MTPLayer
 from .utils import roll_sequence_context
 
@@ -25,6 +26,7 @@ class MTPBlock(nn.Module):
     the predictions of shallower layers.
 
     Args:
+        mtp_config (MTPConfig): MTP configuration.
         mtp_layers (list[MTPLayer]): List of MTP layers. Each layer should be a
             fully constructed MTPLayer instance. The number of layers determines
             the prediction depth (D).
@@ -43,7 +45,7 @@ class MTPBlock(nn.Module):
     ...     mtp_layers.append(mtp_layer)
     >>>
     >>> # Create MTP block
-    >>> mtp_block = MTPBlock(mtp_layers=mtp_layers)
+    >>> mtp_block = MTPBlock(mtp_config=config, mtp_layers=mtp_layers)
     >>>
     >>> # Forward pass
     >>> outputs = mtp_block(
@@ -58,13 +60,17 @@ class MTPBlock(nn.Module):
     >>> # outputs[1]: predictions for i+2
     """
 
-    def __init__(self, *, mtp_layers: list[MTPLayer]):
+    def __init__(self, *, mtp_config: MTPConfig, mtp_layers: list[MTPLayer]):
         super().__init__()
         if not mtp_layers:
             raise ValueError("mtp_layers cannot be empty")
 
+        if mtp_config.share_weights and len(mtp_layers) != 1:
+            raise ValueError(f"share_weights mode requires exactly 1 MTP layer, got {len(mtp_layers)}")
+        if not mtp_config.share_weights and len(mtp_layers) != mtp_config.num_layers:
+            raise ValueError(f"Expected {mtp_config.num_layers} MTP layers, but got {len(mtp_layers)}")
+        self.mtp_config = mtp_config
         self.layers = nn.ModuleList(mtp_layers)
-        self.num_layers = len(mtp_layers)
 
     def forward(
         self,
@@ -97,7 +103,9 @@ def forward(
         current_hidden_states = hidden_states
         current_seq_ctx = seq_ctx
 
-        for layer in self.layers:
+        num_steps = self.mtp_config.num_layers
+        for step in range(num_steps):
+            layer = self.layers[0] if self.mtp_config.share_weights else self.layers[step]
             # Roll sequence context to get future tokens
             # This shifts each packed sequence independently, respecting boundaries
             current_seq_ctx = roll_sequence_context(current_seq_ctx, shifts=-1)
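
The new constructor checks pin the physical layer count to the flag: exactly one layer when sharing, exactly num_layers otherwise, while forward still unrolls num_layers steps. A self-contained sketch of that contract, using simplified stand-ins for MTPConfig and MTPLayer (not the real xtuner classes):

    from dataclasses import dataclass
    from torch import nn

    @dataclass
    class Cfg:  # stand-in for MTPConfig
        num_layers: int
        share_weights: bool

    def validate(cfg, mtp_layers):
        # Mirrors the __init__ validation added in this commit.
        if cfg.share_weights and len(mtp_layers) != 1:
            raise ValueError(f"share_weights mode requires exactly 1 MTP layer, got {len(mtp_layers)}")
        if not cfg.share_weights and len(mtp_layers) != cfg.num_layers:
            raise ValueError(f"Expected {cfg.num_layers} MTP layers, but got {len(mtp_layers)}")

    validate(Cfg(3, share_weights=True), [nn.Identity()])       # ok: one shared layer
    validate(Cfg(3, share_weights=False), [nn.Identity()] * 3)  # ok: one layer per depth
    validate(Cfg(3, share_weights=True), [nn.Identity()] * 3)   # raises ValueError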
