fix: initialize tied wte weights with the output-projection std (1/sqrt(n_embd)) in Llama3Initializer

le1nux · le1nux · commit 49c185c7dd9d · 2026-06-20T17:56:54.000+02:00
diff --git a/src/modalities/models/gpt2/llama3_like_initialization.py b/src/modalities/models/gpt2/llama3_like_initialization.py
@@ -15,7 +15,7 @@
 class Llama3InitializerConfig(BaseModel):
     num_layers: Annotated[int, Field(strict=True, gt=0)]
     n_embd: Annotated[int, Field(strict=True, gt=0)]
-    use_weight_tying: bool
+    use_weight_tying: bool = False
     depth_init: bool = True
 
 
@@ -89,7 +89,7 @@ def __init__(self, num_layers: int, n_embd: int, depth_init: bool, use_weight_ty
             ),
         }
         if not use_weight_tying:
-            # lm head weights
+            # lm head weights (separate output projection matrix)
             self.regex_to_init[r"transformer\.lm_head\.weight"] = (
                 trunc_normal_,
                 {
@@ -99,6 +99,21 @@ def __init__(self, num_layers: int, n_embd: int, depth_init: bool, use_weight_ty
                     "b": 3 / math.sqrt(n_embd),
                 },
             )
+        else:
+            # With weight tying, transformer.wte.weight IS the output projection
+            # (lm_head shares the same tensor), so it must be initialized with the
+            # small output std (1/sqrt(n_embd)) instead of the embedding std of 1.
+            # Otherwise the tied matrix produces logits that are ~sqrt(n_embd)x too
+            # large at init, causing the initial loss/grad norm to explode.
+            self.regex_to_init[r"transformer\.wte\.weight"] = (
+                trunc_normal_,
+                {
+                    "mean": 0.0,
+                    "std": 1 / math.sqrt(n_embd),
+                    "a": -3 / math.sqrt(n_embd),
+                    "b": 3 / math.sqrt(n_embd),
+                },
+            )
 
     def initialize_in_place(self, model: nn.Module):
         self._init_by_fqn_regex(model, self.regex_to_init)