feat: implemented llama3 weight init tests

le1nux · le1nux · commit 4dea4965acdb · 2026-03-04T18:37:04.000+01:00
diff --git a/tests/end2end_tests/configs/gpt2_warm_start_from_step_4_fsdp2_grad_accu.yaml b/tests/end2end_tests/configs/gpt2_warm_start_from_step_4_fsdp2_grad_accu.yaml
@@ -177,7 +177,7 @@ app_state_raw:
   component_key: app_state
   variant_key: raw
   config:
-    model: 
+    model:
       instance_key: initialized_model
       pass_type: BY_REFERENCE
     optimizer:
@@ -288,7 +288,7 @@ optimizer:
     eps: 1e-8
     weight_decay: 1e-1
     weight_decay_groups_excluded: [embedding, layernorm]
-    wrapped_model: 
+    wrapped_model:
       instance_key: initialized_model
       pass_type: BY_REFERENCE
 
diff --git a/tests/test_initialization_fsdpx.py b/tests/test_initialization_fsdpx.py
@@ -18,8 +18,16 @@
 from torch.distributed.fsdp import StateDictType
 
 from modalities.__main__ import Main
-from modalities.config.config import ProcessGroupBackendType
-from modalities.config.pydantic_if_types import PydanticFSDP1ModuleType, PydanticFSDP2ModuleType
+from modalities.config.component_factory import ComponentFactory
+from modalities.config.config import ProcessGroupBackendType, load_app_config_dict
+from modalities.config.pydantic_if_types import (
+    PydanticFSDP1ModuleType,
+    PydanticFSDP2ModuleType,
+    PydanticPytorchModuleType,
+)
+from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block
+from modalities.registry.components import COMPONENTS
+from modalities.registry.registry import Registry
 from tests.end2end_tests.custom_components import MultiProcessingCudaEnv
 
 
@@ -493,3 +501,86 @@ def _get_fdsp2_state_dict(model: FSDP2) -> dict[str, Any]:
             model=model, optimizers=[], options=StateDictOptions(full_state_dict=True, broadcast_from_rank0=True)
         )[0]
         return model_state
+
+
+class TestLlama3LikeInitialization:
+    @pytest.mark.parametrize("has_bias", [True, False])
+    def test_llama3_like_initialization(self, has_bias: bool):
+        config_file_path = Path(__file__).parent / "test_yaml_configs/llama3_config_initalization.yaml"
+        n_layer = 4
+        model = self._get_components(config_file_path=config_file_path, has_bias=has_bias)
+        self._test_wte(model=model)
+        self._test_lm_head(model=model)
+
+        for block in model.transformer.h:
+            self._test_qkv_proj(gpt2_block=block, has_bias=has_bias)
+            self._test_c_proj(gpt2_block=block, has_bias=has_bias, n_layer=n_layer)
+            self._test_swiglu_proj(gpt2_block=block, has_bias=has_bias, n_layer=n_layer)
+
+    def _get_components(self, config_file_path: Path, has_bias: bool) -> GPT2LLM:
+        config_dict = load_app_config_dict(
+            config_file_path=config_file_path,
+        )
+        config_dict["model_raw"]["config"]["bias"] = has_bias
+        registry = Registry(COMPONENTS)
+        component_factory = ComponentFactory(registry=registry)
+
+        class ComponentsInstantiationModel(BaseModel):
+            initialized_model: PydanticPytorchModuleType
+
+        components: ComponentsInstantiationModel = component_factory.build_components(
+            config_dict=config_dict, components_model_type=ComponentsInstantiationModel
+        )
+        return components.initialized_model
+
+    def _test_wte(self, model: GPT2LLM):
+        assert model.transformer.wte.weight.std().detach().cpu() == pytest.approx(1, abs=1e-3)
+        assert model.transformer.wte.weight.mean().detach().cpu() == pytest.approx(0, abs=1e-3)
+
+    def _test_lm_head(self, model: GPT2LLM, n_emb: int):
+        assert model.transformer.lm_head.weight.std().detach().cpu() == pytest.approx(1 / math.sqrt(n_emb), abs=1e-3)
+        assert model.transformer.lm_head.weight.max().detach().cpu() <= 3 / math.sqrt(n_emb)
+        assert model.transformer.lm_head.weight.min().detach().cpu() >= -3 / math.sqrt(n_emb)
+        assert model.transformer.lm_head.weight.mean().detach().cpu() == pytest.approx(0, abs=1e-3)
+
+    def _test_qkv_proj(self, gpt2_block: GPT2Block, has_bias: bool):
+        layers = (gpt2_block.attn.q_attn, gpt2_block.attn.k_attn, gpt2_block.attn.v_attn)
+        for layer in layers:
+            assert layer.weight.std().detach().cpu() == pytest.approx(0.02, abs=1e-3)
+            assert layer.weight.max().detach().cpu() <= 2
+            assert layer.weight.min().detach().cpu() >= -2
+            if has_bias:
+                assert layer.bias is not None
+                assert layer.bias.std().detach().cpu() == pytest.approx(0.02, abs=1e-3)
+                assert layer.bias.max().detach().cpu() <= 2
+                assert layer.bias.min().detach().cpu() >= -2
+
+    def _test_c_proj(self, gpt2_block: GPT2Block, has_bias: bool, n_layer: int):
+        layer = gpt2_block.attn.c_proj
+        assert layer.weight.std().detach().cpu() == pytest.approx(0.02 / math.sqrt(2 * n_layer), abs=1e-3)
+        assert layer.weight.max().detach().cpu() <= 2
+        assert layer.weight.min().detach().cpu() >= -2
+
+        if has_bias:
+            assert layer.bias is not None
+            assert layer.bias.std().detach().cpu() == pytest.approx(0.02 / math.sqrt(2 * n_layer), abs=1e-3)
+            assert layer.bias.max().detach().cpu() <= 2
+            assert layer.bias.min().detach().cpu() >= -2
+
+    def _test_swiglu_proj(self, gpt2_block: GPT2Block, has_bias: bool, n_layer: int):
+        layers = (gpt2_block.mlp.V, gpt2_block.mlp.W_2)
+        for layer in layers:
+            assert layer.weight.std().detach().cpu() == pytest.approx(0.02 / math.sqrt(2 * n_layer), abs=1e-3)
+            assert layer.weight.max().detach().cpu() <= 2
+            assert layer.weight.min().detach().cpu() >= -2
+
+            if has_bias:
+                # all zero bias
+                assert layer.bias is not None and torch.all(layer.bias == 0)
+
+        layer = gpt2_block.mlp.W
+        assert layer.weight.std().detach().cpu() == pytest.approx(0.02, abs=1e-3)
+        assert layer.weight.max().detach().cpu() <= 2
+        assert layer.weight.min().detach().cpu() >= -2
+        if has_bias:
+            assert layer.bias is not None and torch.all(layer.bias == 0)