chore: added weight tying tests and Llama3 initialization checks

le1nux · le1nux · commit 0927a2f539e1 · 2026-06-26T14:51:21.000+02:00
diff --git a/tests/fsdp2_parallelization/test_tensor_parallelism.py b/tests/fsdp2_parallelization/test_tensor_parallelism.py
@@ -1,3 +1,4 @@
+import os
 from pathlib import Path
 from typing import Tuple
 
@@ -27,7 +28,7 @@ def patch_config_file(original_config_path: Path, activation_type: str, tmp_dir:
 
     config_dict["model_raw"]["config"]["activation_type"] = activation_type
 
-    tmp_file_path = tmp_dir / original_config_path.name
+    tmp_file_path = tmp_dir / f"{activation_type}_{os.getpid()}_{original_config_path.name}"
     with tmp_file_path.open("w", encoding="utf-8") as f:
         yaml.safe_dump(config_dict, f)
 
diff --git a/tests/test_weight_tying.py b/tests/test_weight_tying.py
@@ -1,10 +1,13 @@
+import math
+
 import pytest
+import torch
 import torch.nn as nn
 from pydantic import ValidationError
 from torch.distributed.device_mesh import DeviceMesh
 
 from modalities.config.config import GPT2ModelTPConfig
-from modalities.models.components.layer_norms import LayerNormConfig
+from modalities.models.components.layer_norms import LayerNormConfig, PytorchRMSLayerNormConfig
 from modalities.models.gpt2.gpt2_model import (
     GPT2LLM,
     AttentionConfig,
@@ -13,6 +16,7 @@
     LayerNormWrapperConfig,
     PositionTypes,
 )
+from modalities.models.gpt2.llama3_like_initialization import Llama3Initializer
 from modalities.models.model import ActivationType
 from modalities.models.parallelism.pipeline_parallelism_configs import StagedPipelineConfig
 from modalities.models.parallelism.stages_generator import GPT2LLMStagesGenerator
@@ -27,7 +31,12 @@ def count_parameters(model: nn.Module) -> int:
     return sum(p.numel() for p in model.parameters())
 
 
-def create_gpt2_model(use_weight_tying: bool) -> GPT2LLM:
+def create_gpt2_model(
+    use_weight_tying: bool,
+    activation_type: ActivationType = ActivationType.GELU,
+    bias: bool = True,
+    norm_type: LayerNorms = LayerNorms.layer_norm,
+) -> GPT2LLM:
     vocab_size = VOCAB_SIZE
     n_embd = EMBEDDING_DIM
     sequence_length = 128
@@ -36,9 +45,7 @@ def create_gpt2_model(use_weight_tying: bool) -> GPT2LLM:
     n_head_kv = 2
     ffn_hidden = 256
     dropout = 0.1
-    bias = True
     poe_type = PositionTypes.NOPE
-    activation_type = ActivationType.GELU
     attention_implementation = AttentionImplementation.PYTORCH_FLASH
     attention_config = AttentionConfig(
         qkv_transforms=[
@@ -53,15 +60,17 @@ def create_gpt2_model(use_weight_tying: bool) -> GPT2LLM:
             )
         ]
     )
-    attention_norm_config = LayerNormWrapperConfig(
-        norm_type=LayerNorms.layer_norm, config=LayerNormConfig(normalized_shape=n_embd)
-    )
-    ffn_norm_config = LayerNormWrapperConfig(
-        norm_type=LayerNorms.layer_norm, config=LayerNormConfig(normalized_shape=n_embd)
-    )
-    lm_head_norm_config = LayerNormWrapperConfig(
-        norm_type=LayerNorms.layer_norm, config=LayerNormConfig(normalized_shape=n_embd)
-    )
+
+    def _make_norm_config() -> LayerNormWrapperConfig:
+        """Return a new norm wrapper config for the ``norm_type`` passed to the enclosing function."""
+        # Only the PyTorch RMS norm gets its own config class; any other type uses LayerNormConfig.
+        is_rms = norm_type == LayerNorms.pytorch_rms_norm
+        inner_cls = PytorchRMSLayerNormConfig if is_rms else LayerNormConfig
+        return LayerNormWrapperConfig(norm_type=norm_type, config=inner_cls(normalized_shape=n_embd))
+
+    attention_norm_config = _make_norm_config()
+    ffn_norm_config = _make_norm_config()
+    lm_head_norm_config = _make_norm_config()
 
     return GPT2LLM(
         sample_key="input_ids",
@@ -140,6 +149,17 @@ def test_has_tied_word_embeddings_requires_model_capability():
         has_tied_word_embeddings(nn.Linear(1, 1))
 
 
+@pytest.mark.parametrize("module_name", ["transformer", "wte", "lm_head"])
+def test_has_tied_word_embeddings_handles_pipeline_stage(module_name: str):
+    """Simulate a pipeline stage by removing one module; tying must then be reported as False."""
+    partial_model = create_gpt2_model(use_weight_tying=True)
+    if module_name != "transformer":
+        del partial_model.transformer[module_name]
+    else:
+        del partial_model.transformer
+    assert has_tied_word_embeddings(partial_model) is False
+
+
 def test_tp_config_rejects_tied_word_embeddings():
     model = create_gpt2_model(use_weight_tying=True)
     device_mesh = create_device_mesh_stub(ParallelismDegrees.TP.value)
@@ -148,6 +168,45 @@ def test_tp_config_rejects_tied_word_embeddings():
         GPT2ModelTPConfig(model=model, device_mesh=device_mesh)
 
 
+@pytest.mark.parametrize("use_weight_tying", [True, False])
+def test_llama3_init_keeps_output_projection_small(use_weight_tying: bool):
+    """Check that Llama3 init leaves the logit projection at std ``1 / sqrt(n_embd)``.
+
+    Guards against the weight-tying init bug: when tied, ``transformer.wte.weight``
+    and ``lm_head`` share one tensor, so that tensor has to receive the small output
+    std rather than the embedding std of 1. Otherwise the logits come out roughly
+    ``sqrt(n_embd)`` times too large at init and the loss/grad norm explode
+    (observed: initial loss ~1685 instead of ~ln(vocab_size)).
+    """
+    embd_dim = EMBEDDING_DIM
+    # SwiGLU, RMSNorm and no bias are chosen so that the Llama3Initializer's FQN
+    # regexes cover every parameter of the model and none is rejected.
+    model = create_gpt2_model(
+        use_weight_tying=use_weight_tying,
+        activation_type=ActivationType.SWIGLU,
+        bias=False,
+        norm_type=LayerNorms.pytorch_rms_norm,
+    )
+    llama3_init = Llama3Initializer(num_layers=2, n_embd=embd_dim, depth_init=True, use_weight_tying=use_weight_tying)
+    # Same flow as production: model_factory runs the initializer under no_grad.
+    with torch.no_grad():
+        llama3_init.initialize_in_place(model)
+
+    def _weight_std(module: nn.Module) -> float:
+        return module.weight.detach().float().std().item()
+
+    embedding, lm_head = model.transformer.wte, model.transformer.lm_head
+    # Tied or not, the matrix that produces the logits has to stay small.
+    assert _weight_std(lm_head) == pytest.approx(1 / math.sqrt(embd_dim), rel=0.15)
+
+    if not use_weight_tying:
+        # Untied: the embedding keeps the Llama3/TorchTitan std of 1.
+        assert _weight_std(embedding) == pytest.approx(1.0, rel=0.15)
+    else:
+        # Tied: embedding and output projection are the very same (small) tensor.
+        assert embedding.weight is lm_head.weight
+
+
 def test_tp_config_allows_untied_word_embeddings():
     model = create_gpt2_model(use_weight_tying=False)
     device_mesh = create_device_mesh_stub(ParallelismDegrees.TP.value)