
Commit b928344

fix(merger): re-tie weights to avoid duplicating tied parameters (#157)
* fix(merger): re-tie weights to avoid duplicating tied parameters

  FSDP saves tied parameters (e.g. lm_head <-> embed_tokens) as independent
  shards. After load_state_dict(..., assign=True) they become separate tensors,
  and save_pretrained writes both, bloating the merged checkpoint. Re-tie when
  the model declares tying and the saved tensors agree; otherwise warn and skip.

* style: auto-fix lint (black + isort)

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent ae4513b commit b928344
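
To make the failure mode concrete, here is a minimal, self-contained sketch of how loading a consolidated state dict breaks weight tying. The `TinyLM` module and its key names are hypothetical stand-ins for a real FSDP-consolidated checkpoint, not code from this repository:

```python
# Hypothetical toy model illustrating the bug described above: with weight tying,
# lm_head.weight and embed_tokens.weight share one tensor, but a consolidated FSDP
# state dict stores the value under both keys, and load_state_dict(..., assign=True)
# assigns each key its own tensor.
import torch
from torch import nn


class TinyLM(nn.Module):
    def __init__(self, vocab: int = 10, dim: int = 4):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab, dim)
        self.lm_head = nn.Linear(dim, vocab, bias=False)
        self.lm_head.weight = self.embed_tokens.weight  # tie the weights


model = TinyLM()
assert model.lm_head.weight.data_ptr() == model.embed_tokens.weight.data_ptr()

# A consolidated checkpoint carries an independent tensor for every key.
full_state_dict = {k: v.clone() for k, v in model.state_dict().items()}
model.load_state_dict(full_state_dict, assign=True)

# The tie is broken: each key now holds its own tensor, so a save_pretrained-style
# export would serialize the same weight twice unless it is re-tied first.
assert model.lm_head.weight.data_ptr() != model.embed_tokens.weight.data_ptr()
```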

1 file changed

Lines changed: 32 additions & 1 deletion

File tree

src/lmms_engine/merger/fsdp2.py

@@ -109,7 +109,9 @@ def consolidate(self, shard_state_dicts: list[dict]) -> dict:
             if not isinstance(state_dict[key], list):
                 continue
             # Non-sharded tensors are duplicated across ranks; just take the first one
-            if all(t.shape == state_dict[key][0].shape and torch.equal(t, state_dict[key][0]) for t in state_dict[key][1:]):
+            if all(
+                t.shape == state_dict[key][0].shape and torch.equal(t, state_dict[key][0]) for t in state_dict[key][1:]
+            ):
                 state_dict[key] = state_dict[key][0]
             else:
                 state_dict[key] = torch.cat(state_dict[key], dim=0)
@@ -146,6 +148,34 @@ def _resolve_checkpoint_path(self, path: Path) -> Path:
         latest_checkpoint = checkpoint_folders[-1]
         return latest_checkpoint
 
+    def maybe_tie_weights(self, model: torch.nn.Module, config: object, state_dict: dict) -> None:
+        """Re-tie weights if the model declares weight tying.
+
+        FSDP saves tied parameters (e.g. ``lm_head`` <-> ``embed_tokens``) as
+        independent shards, so after ``load_state_dict(..., assign=True)`` they
+        become separate tensors and ``save_pretrained`` would write both.
+
+        Only re-ties when the model declares tying AND the saved tensors
+        actually agree, to avoid silently dropping divergent weights.
+        """
+        tied_keys_map = getattr(model, "_tied_weights_keys", None)
+        tie_word_embeddings = getattr(config, "tie_word_embeddings", False) or getattr(
+            getattr(config, "text_config", None), "tie_word_embeddings", False
+        )
+        if not (tied_keys_map and tie_word_embeddings):
+            return
+
+        if isinstance(tied_keys_map, dict):
+            for tied_key, source_key in tied_keys_map.items():
+                t1 = state_dict.get(tied_key)
+                t2 = state_dict.get(source_key)
+                if t1 is not None and t2 is not None and not torch.equal(t1, t2):
+                    logger.warning(f"Tied weights mismatch: '{tied_key}' != '{source_key}'. Skipping tie_weights().")
+                    return
+
+        logger.info("Re-tying weights (tie_word_embeddings=True).")
+        model.tie_weights()
+
     def merge(
         self,
         checkpoint_path: Path,
@@ -194,6 +224,7 @@ def merge(
         with init_empty_weights():
             model = model_cls.from_config(config)
         model.load_state_dict(full_state_dict, assign=True)
+        self.maybe_tie_weights(model, config, full_state_dict)
         processor = AutoProcessor.from_pretrained(checkpoint_path)
         processor.save_pretrained(output_path)
         config.save_pretrained(output_path)
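
One way to sanity-check a merged checkpoint after this change is to list the tensors that were actually written to disk. The snippet below is an assumed verification helper, not part of the repository: the output directory path and the `lm_head.weight` key are placeholders, and other models may use different tied keys.

```python
# Hedged verification sketch: with tie_word_embeddings=True and the weights re-tied,
# save_pretrained skips the duplicate tensor, so the tied head key should not appear
# as a separately stored tensor in the exported safetensors shards.
from pathlib import Path

from safetensors import safe_open


def saved_tensor_keys(output_path: str) -> set[str]:
    """Collect every tensor name across all .safetensors shards in a directory."""
    keys: set[str] = set()
    for shard in Path(output_path).glob("*.safetensors"):
        with safe_open(str(shard), framework="pt") as f:
            keys.update(f.keys())
    return keys


keys = saved_tensor_keys("path/to/merged_checkpoint")  # placeholder path
print("lm_head.weight stored separately:", "lm_head.weight" in keys)
```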
