NVIDIA-BioNeMo
diff --git a/bionemo-recipes/models/esm2/src/esm/modeling_esm_te.py b/bionemo-recipes/models/esm2/src/esm/modeling_esm_te.py
Lines changed: 26 additions & 2 deletions
diff --git a/bionemo-recipes/models/esm2/tests/test_meta_device_init.py b/bionemo-recipes/models/esm2/tests/test_meta_device_init.py
Lines changed: 180 additions & 0 deletions
diff --git a/bionemo-recipes/models/llama3/modeling_llama_te.py b/bionemo-recipes/models/llama3/modeling_llama_te.py
Lines changed: 26 additions & 2 deletions
@@ -259,9 +259,13 @@ class NVEsmPreTrainedModel(PreTrainedModel):
         "EsmEmbeddings",
     )
 
-    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
     def _init_weights(self, module: nn.Module):
-        """Initialize the weights.
+        """Initialize model weights.
+
+        This method ensures that models with randomly-initialized weights get the correct initial value distribution,
+        which can be critical for training stability. We also call this method directly when using meta-device init, as
+        the `to_empty` method does not initialize the weights. While the base Transformers model has a similar method,
+        we need to extend it to handle TE-specific modules.
 
         Args:
             module (nn.Module): The module to initialize the weights for.
@@ -282,9 +286,29 @@ def _init_weights(self, module: nn.Module):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
         if isinstance(module, transformer_engine.pytorch.LayerNormLinear):
             module.layer_norm_weight.data.fill_(1.0)
             if module.layer_norm_bias is not None:
                 module.layer_norm_bias.data.zero_()
+        if isinstance(module, transformer_engine.pytorch.LayerNormMLP):
+            # Fused norm + MLP: reset the norm to scale 1 / shift 0, draw both linear weights from
+            # N(0, initializer_range), and zero every bias that exists and is non-empty.
+            if module.layer_norm_bias is not None:
+                module.layer_norm_bias.data.zero_()
+            module.layer_norm_weight.data.fill_(1.0)
+            for name in ("fc1_weight", "fc2_weight"):
+                weight = getattr(module, name, None)
+                if weight is not None:
+                    weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            for name in ("fc1_bias", "fc2_bias"):
+                bias = getattr(module, name, None)
+                if bias is not None and bias.numel() > 0:
+                    bias.data.zero_()
+        if isinstance(module, RotaryPositionEmbedding) and hasattr(module, "inv_freq"):
+            # When we initialize the model with `to_empty`, the `inv_freq` attribute is not initialized, so we need to
+            # re-initialize it here with the correct values.
+            module.inv_freq = RotaryPositionEmbedding(
+                self.config.hidden_size // self.config.num_attention_heads
+            ).inv_freq.to(module.inv_freq.device)
 
     @classmethod
     def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):
 
@@ -0,0 +1,180 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Test that parameter distributions are identical with and without meta device initialization.
+
+These tests verify that when using meta device initialization (creating the model on meta device, then calling
+`to_empty` and `_init_weights`), the resulting parameter distributions (mean and std) match those from normal
+initialization. This is important because we previously observed differences in convergence between meta-device-init and
+non-meta-device-init training, which suggested that the initialization was not being applied correctly after `to_empty`.
+By explicitly calling `_init_weights` after `to_empty`, we ensure that parameters are properly initialized, leading to
+consistent training behavior regardless of whether meta device initialization is used.
+"""
+
+import os
+import subprocess
+
+import pytest
+import torch
+from torch.distributed.fsdp import fully_shard
+from torch.distributed.tensor import DTensor
+from transformers import AutoConfig, set_seed
+
+from esm.modeling_esm_te import NVEsmConfig, NVEsmForMaskedLM
+
+
+# Skip marker for tests that need CUDA and at least two visible devices.
+requires_multi_gpu = pytest.mark.skipif(
+    not (torch.cuda.is_available() and torch.cuda.device_count() >= 2),
+    reason="Test requires at least 2 GPUs",
+)
+
+
+def test_meta_device_init():
+    """Check that meta-device init plus `to_empty` and `_init_weights` matches normal init statistically.
+
+    Both models are built from the same config after seeding with the same value. The two state dicts must expose
+    the same keys, and every floating-point or complex entry must agree in mean and standard deviation within
+    tolerance.
+    """
+    comparable_dtypes = (
+        torch.float16,
+        torch.float32,
+        torch.float64,
+        torch.bfloat16,
+        torch.complex64,
+        torch.complex128,
+    )
+
+    config = NVEsmConfig(**AutoConfig.from_pretrained("facebook/esm2_t6_8M_UR50D").to_dict())
+
+    set_seed(42)
+    with torch.device("meta"):
+        model_meta_init = NVEsmForMaskedLM(config)
+
+    # `to_empty` does not initialize the weights, so `_init_weights` is applied to every submodule afterwards.
+    model_meta_init.to_empty(device="cuda")
+    model_meta_init.apply(model_meta_init._init_weights)
+
+    set_seed(42)
+    model_normal_init = NVEsmForMaskedLM(config)
+    model_normal_init.to("cuda")
+
+    state_dict_meta_init = model_meta_init.state_dict()
+    state_dict_normal_init = model_normal_init.state_dict()
+
+    # Looking entries up by the meta-init keys alone would silently ignore keys that only the other model has.
+    assert set(state_dict_meta_init) == set(state_dict_normal_init), "State dict keys differ between init paths"
+
+    for key, meta_tensor in state_dict_meta_init.items():
+        normal_tensor = state_dict_normal_init[key]
+        # Skip non-numeric tensors (e.g., Byte/uint8 tensors like _extra_state)
+        if meta_tensor.dtype not in comparable_dtypes:
+            continue
+        for label, statistic in (("Mean", torch.mean), ("Std", torch.std)):
+            torch.testing.assert_close(
+                statistic(normal_tensor),
+                statistic(meta_tensor),
+                atol=1e-3,
+                rtol=1e-4,
+                msg=lambda x, key=key, label=label: f"{label} mismatch for parameter {key}: {x}",
+            )
+
+
+@pytest.mark.parametrize("num_gpus", [1, pytest.param(2, marks=requires_multi_gpu)])
+def test_meta_device_init_after_fully_shard(num_gpus: int):
+    """Re-run this file under `torchrun` so that its `__main__` section executes in `num_gpus` processes.
+
+    Args:
+        num_gpus: Value passed to torchrun's `--nproc_per_node` option.
+    """
+    script_path = os.path.relpath(__file__)
+    completed = subprocess.run(
+        ["torchrun", f"--nproc_per_node={num_gpus}", script_path],
+        capture_output=True,
+        text=True,
+        check=False,
+        timeout=240,
+    )
+
+    if completed.returncode == 0:
+        return
+
+    # Echo the worker output before failing so the reason for the non-zero exit code is visible.
+    print(f"STDOUT:\n{completed.stdout}")
+    print(f"STDERR:\n{completed.stderr}")
+    pytest.fail(f"Command failed with exit code {completed.returncode}")
+
+
+if __name__ == "__main__":
+    # Worker entry point: `test_meta_device_init_after_fully_shard` re-runs this file under `torchrun`, which
+    # exports LOCAL_RANK for every process. Selecting the GPU by local rank (rather than the global rank) keeps the
+    # device index valid when the launch spans more than one node.
+    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+    torch.distributed.init_process_group(backend="cuda:nccl")
+
+    def _gathered(tensor):
+        """Return the full (unsharded) tensor for a DTensor; return any other tensor unchanged."""
+        return tensor.full_tensor() if isinstance(tensor, DTensor) else tensor
+
+    comparable_dtypes = (
+        torch.float16,
+        torch.float32,
+        torch.float64,
+        torch.bfloat16,
+        torch.complex64,
+        torch.complex128,
+    )
+
+    config = NVEsmConfig(**AutoConfig.from_pretrained("facebook/esm2_t6_8M_UR50D").to_dict())
+
+    set_seed(42)
+
+    with torch.device("meta"):
+        model_meta_init = NVEsmForMaskedLM(config)
+
+    # Shard each encoder layer, then the root module, while the parameters still live on the meta device.
+    for layer in model_meta_init.esm.encoder.layers:
+        fully_shard(layer)
+    fully_shard(model_meta_init)
+
+    # `to_empty` does not initialize the weights, so `_init_weights` is applied to every submodule afterwards.
+    model_meta_init.to_empty(device="cuda")
+    model_meta_init.apply(model_meta_init._init_weights)
+
+    set_seed(42)
+    model_normal_init = NVEsmForMaskedLM(config)
+
+    for layer in model_normal_init.esm.encoder.layers:
+        fully_shard(layer)
+    fully_shard(model_normal_init)
+
+    state_dict_meta_init = model_meta_init.state_dict()
+    state_dict_normal_init = model_normal_init.state_dict()
+
+    # Looking entries up by the meta-init keys alone would silently ignore keys that only the other model has.
+    if set(state_dict_meta_init) != set(state_dict_normal_init):
+        raise AssertionError("State dict keys differ between meta-device and normal initialization")
+
+    for key, meta_tensor in state_dict_meta_init.items():
+        normal_tensor = state_dict_normal_init[key]
+        # Skip non-numeric tensors (e.g., Byte/uint8 tensors like _extra_state)
+        if meta_tensor.dtype not in comparable_dtypes:
+            continue
+
+        # Compare statistics of the gathered tensors so that mean and std are both computed over the whole
+        # parameter and every rank checks the same values.
+        full_normal = _gathered(normal_tensor)
+        full_meta = _gathered(meta_tensor)
+        for label, statistic in (("Mean", torch.mean), ("Std", torch.std)):
+            torch.testing.assert_close(
+                statistic(full_normal),
+                statistic(full_meta),
+                atol=1e-3,
+                rtol=1e-4,
+                msg=lambda x, key=key, label=label: f"{label} mismatch for parameter {key}: {x}",
+            )
+
+    # Release the process group on the success path. A failing rank still exits immediately via the raised
+    # AssertionError, exactly as before, so torchrun reports the failure without waiting on a teardown.
+    torch.distributed.destroy_process_group()
@@ -48,13 +48,22 @@ class NVLlamaConfig(LlamaConfig):
 class NVLlamaPreTrainedModel(PreTrainedModel):
     """Base class for NVLlama models."""
 
-    config: NVLlamaConfig
+    config_class = NVLlamaConfig
     base_model_prefix = "model"
     _no_split_modules = ("TransformerLayer",)
     _skip_keys_device_placement = ("past_key_values",)
 
     def _init_weights(self, module):
-        """TE-specific weight initialization."""
+        """Initialize module weights.
+
+        This method ensures that models with randomly-initialized weights get the correct initial value distribution,
+        which can be critical for training stability. We also call this method directly when using meta-device init, as
+        the `to_empty` method does not initialize the weights. While the base Transformers model has a similar method,
+        we need to extend it to handle TE-specific modules.
+
+        Args:
+            module (nn.Module): The module to initialize the weights for.
+        """
         super()._init_weights(module)
 
         # Copied from transformers.modeling_utils.PreTrainedModel._init_weights
@@ -75,10 +84,25 @@ def _init_weights(self, module):
                 module.weight.data.fill_(1.0)
             if hasattr(module, "bias") and module.bias is not None:
                 module.bias.data.zero_()
+        if isinstance(module, transformer_engine.pytorch.RMSNorm):
+            # Reset the RMSNorm scale to 1.
+            if getattr(module, "weight", None) is not None:
+                module.weight.data.fill_(1.0)
         if isinstance(module, transformer_engine.pytorch.LayerNormLinear):
             module.layer_norm_weight.data.fill_(1.0)
             if module.layer_norm_bias is not None:
                 module.layer_norm_bias.data.zero_()
+        if isinstance(module, transformer_engine.pytorch.LayerNormMLP):
+            # Fused norm + MLP: reset the norm scale to 1 (and its bias to 0 when one exists, mirroring the
+            # LayerNormLinear branch above), draw both linear weights from N(0, std), and zero every bias that
+            # exists and is non-empty.
+            module.layer_norm_weight.data.fill_(1.0)
+            if getattr(module, "layer_norm_bias", None) is not None:
+                module.layer_norm_bias.data.zero_()
+            for name in ("fc1_weight", "fc2_weight"):
+                weight = getattr(module, name, None)
+                if weight is not None:
+                    weight.data.normal_(mean=0.0, std=std)
+            for name in ("fc1_bias", "fc2_bias"):
+                bias = getattr(module, name, None)
+                if bias is not None and bias.numel() > 0:
+                    bias.data.zero_()
+        if isinstance(module, RotaryPositionEmbedding) and hasattr(module, "inv_freq"):
+            # `to_empty` does not initialize `inv_freq`, so rebuild it from the config and keep it on the device the
+            # module currently uses.
+            module.inv_freq = LlamaRotaryEmbedding(config=self.config).inv_freq.to(module.inv_freq.device)
 
 
 class NVLlamaModel(NVLlamaPreTrainedModel):