NVIDIA-BioNeMo
diff --git a/.devcontainer/recipes/requirements.txt b/.devcontainer/recipes/requirements.txt
Lines changed: 1 addition & 0 deletions
diff --git a/bionemo-recipes/models/esm2/src/esm/modeling_esm_te.py b/bionemo-recipes/models/esm2/src/esm/modeling_esm_te.py
Lines changed: 50 additions & 67 deletions
diff --git a/bionemo-recipes/models/esm2/tests/conftest.py b/bionemo-recipes/models/esm2/tests/conftest.py
Lines changed: 62 additions & 0 deletions
diff --git a/bionemo-recipes/models/esm2/tests/test_convert.py b/bionemo-recipes/models/esm2/tests/test_convert.py
Lines changed: 42 additions & 0 deletions
@@ -2,6 +2,7 @@ accelerate
 datasets
 deepspeed
 hydra-core
+lm-eval
 # TOT megatron-mfsdp until NVIDIA/Megatron-LM#2575 is in a release.
 megatron-fsdp @ git+https://github.com/NVIDIA/Megatron-LM.git@main#subdirectory=megatron/core/distributed/fsdp/src
 peft
 
@@ -1,4 +1,3 @@
-# coding=utf-8
 # noqa: license-check
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-Apache2
@@ -38,9 +37,8 @@
     MaskedLMOutput,
     TokenClassifierOutput,
 )
-from transformers.modeling_utils import PreTrainedModel
 from transformers.models.esm.configuration_esm import EsmConfig
-from transformers.models.esm.modeling_esm import EsmPooler
+from transformers.models.esm.modeling_esm import EsmPooler, EsmPreTrainedModel
 from transformers.utils import logging
 from transformers.utils.generic import TransformersKwargs
 
@@ -135,6 +133,10 @@ def __init__(self, config: NVEsmConfig):
         """
         super().__init__()
         self.config = config
+
+        def _init_method(x):
+            torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range)
+
         self.layers = nn.ModuleList(
             [
                 transformer_engine.pytorch.TransformerLayer(
@@ -156,12 +158,18 @@ def __init__(self, config: NVEsmConfig):
                     fuse_qkv_params=config.fuse_qkv_params,
                     params_dtype=config.dtype,
                     window_size=(-1, -1),
+                    device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
+                    init_method=_init_method,
+                    output_layer_init_method=_init_method,
                 )
                 for i in range(config.num_hidden_layers)
             ]
         )
         self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(
-            config.hidden_size, eps=config.layer_norm_eps, params_dtype=config.dtype
+            config.hidden_size,
+            eps=config.layer_norm_eps,
+            params_dtype=config.dtype,
+            device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
         )
         if config.position_embedding_type == "rotary":
             self.rotary_embeddings = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
@@ -247,7 +255,7 @@ def forward(
         )
 
 
-class NVEsmPreTrainedModel(PreTrainedModel):
+class NVEsmPreTrainedModel(EsmPreTrainedModel):
     """An abstract class to handle weights initialization and pretrained model loading."""
 
     config_class = NVEsmConfig
@@ -259,61 +267,22 @@ class NVEsmPreTrainedModel(PreTrainedModel):
         "EsmEmbeddings",
     )
 
-    def _init_weights(self, module: nn.Module):
-        """Initialize model weights.
+    def init_empty_weights(self):
+        """Handles moving the model from the meta device to the cuda device and initializing the weights."""
+        # For TE layers, calling `reset_parameters` is sufficient to move them to the cuda device and apply the weight
+        # initialization we passed them during module creation.
+        for module in self.modules():
+            if hasattr(module, "reset_parameters"):
+                module.reset_parameters()
 
-        This method ensures that models with randomly-initialized weights get the correct initial value distribution,
-        which can be critical for training stability. We also call this method directly when using meta-device init, as
-        the `to_empty` method does not initialize the weights. While the base Transformers model has a similar method,
-        we need to extend it to handle TE-specific modules.
+        # The esm.embeddings layer is the only non-TE layer in this model we need to deal with. We use
+        # `model._init_weights` rather than `reset_parameters` to ensure we honor the original config standard
+        # deviation.
+        self.esm.embeddings.word_embeddings.to_empty(device="cuda")
+        self.esm.embeddings.apply(self._init_weights)
 
-        Args:
-            module (nn.Module): The module to initialize the weights for.
-        """
-        if isinstance(
-            module, (nn.Linear, transformer_engine.pytorch.Linear, transformer_engine.pytorch.LayerNormLinear)
-        ):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        if isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        if isinstance(module, (nn.LayerNorm, transformer_engine.pytorch.LayerNorm)):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, transformer_engine.pytorch.LayerNormLinear):
-            if module.layer_norm_bias is not None:
-                module.layer_norm_bias.data.zero_()
-            module.layer_norm_weight.data.fill_(1.0)
-            if module.layer_norm_bias is not None:
-                module.layer_norm_bias.data.zero_()
-        if isinstance(module, transformer_engine.pytorch.LayerNormMLP):
-            if module.layer_norm_bias is not None:
-                module.layer_norm_bias.data.zero_()
-            module.layer_norm_weight.data.fill_(1.0)
-            if hasattr(module, "fc1_weight") and module.fc1_weight is not None:
-                module.fc1_weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if hasattr(module, "fc2_weight") and module.fc2_weight is not None:
-                module.fc2_weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if hasattr(module, "fc1_bias") and module.fc1_bias is not None and module.fc1_bias.numel() > 0:
-                module.fc1_bias.data.zero_()
-            if hasattr(module, "fc2_bias") and module.fc2_bias is not None and module.fc2_bias.numel() > 0:
-                module.fc2_bias.data.zero_()
-        if isinstance(module, RotaryPositionEmbedding) and hasattr(module, "inv_freq"):
-            # When we initialize the model with `to_empty`, the `inv_freq` attribute is not initialized, so we need to
-            # re-initialize it here with the correct values.
-            module.inv_freq = RotaryPositionEmbedding(
-                self.config.hidden_size // self.config.num_attention_heads
-            ).inv_freq.to(module.inv_freq.device)
-
-    @classmethod
-    def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):
-        """Override the default get_init_context method to allow for fp8 model initialization."""
-        return []
+        # Meta-device init seems to break weight tying, so we re-tie the weights here.
+        self.tie_weights()
 
 
 class NVEsmModel(NVEsmPreTrainedModel):
@@ -516,15 +485,20 @@ def __init__(self, config: NVEsmConfig):
             config.hidden_size,
             config.hidden_size,
             params_dtype=config.dtype,
+            device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
+            init_method=lambda x: torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range),
         )
 
-        self.decoder = transformer_engine.pytorch.LayerNormLinear(
-            config.hidden_size,
-            config.padded_vocab_size if config.padded_vocab_size is not None else config.vocab_size,
-            bias=True,
-            eps=config.layer_norm_eps,
-            params_dtype=config.dtype,
-        )
+        with transformer_engine.pytorch.fp8_model_init(enabled=False):
+            self.decoder = transformer_engine.pytorch.LayerNormLinear(
+                config.hidden_size,
+                config.padded_vocab_size if config.padded_vocab_size is not None else config.vocab_size,
+                bias=True,
+                eps=config.layer_norm_eps,
+                params_dtype=config.dtype,
+                device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
+                init_method=lambda x: torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range),
+            )
 
     def forward(self, features, **kwargs):
         """Forward pass of the NVEsmLMHead.
@@ -553,7 +527,12 @@ def __init__(self, config):
         )
 
         self.layer_norm = (
-            transformer_engine.pytorch.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+            transformer_engine.pytorch.LayerNorm(
+                config.hidden_size,
+                eps=config.layer_norm_eps,
+                params_dtype=config.dtype,
+                device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
+            )
             if config.emb_layer_norm_before
             else None
         )
@@ -648,7 +627,11 @@ def __init__(self, config):
         self.esm = NVEsmModel(config, add_pooling_layer=False)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = transformer_engine.pytorch.Linear(
-            config.hidden_size, config.num_labels, params_dtype=config.dtype
+            config.hidden_size,
+            config.num_labels,
+            params_dtype=config.dtype,
+            device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
+            init_method=lambda x: torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range),
         )
 
         self.init_weights()
 
@@ -18,6 +18,8 @@
 
 import pytest
 import transformer_engine.pytorch
+from transformer_engine.common import recipe as recipe_module
+from transformer_engine.pytorch import fp8
 from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling
 
 from esm.convert import convert_esm_hf_to_te
@@ -88,3 +90,63 @@ def te_model_checkpoint(tmp_path):
     model_te = convert_esm_hf_to_te(model_hf)
     model_te.save_pretrained(tmp_path / "te_model_checkpoint")
     return tmp_path / "te_model_checkpoint"
+
+
+# Quantization recipes that the `fp8_recipe` fixture below is parametrized over. Each entry is instantiated with
+# its default arguments; entries the current environment cannot run are skipped rather than removed (see
+# `parametrize_recipes_with_support`).
+ALL_RECIPES = [
+    recipe_module.DelayedScaling(),
+    recipe_module.Float8CurrentScaling(),
+    recipe_module.Float8BlockScaling(),
+    recipe_module.MXFP8BlockScaling(),
+    # NOTE(review): NVFP4 is left disabled here and the reason is not recorded -- confirm before enabling.
+    # recipe_module.NVFP4BlockScaling(disable_rht=True, disable_stochastic_rounding=True),
+]
+
+
+def _check_recipe_support(recipe: recipe_module.Recipe):
+    """Report whether ``recipe`` can be used in the current environment.
+
+    Args:
+        recipe: The quantization recipe instance to look up.
+
+    Returns:
+        A ``(supported, reason)`` pair taken from the ``fp8.check_*_support`` helper that matches the recipe's
+        class, or ``(False, "Unsupported recipe")`` when the recipe is none of the known classes.
+    """
+    # Pick the support probe first, then call it in exactly one place below.
+    if isinstance(recipe, recipe_module.DelayedScaling):
+        probe = fp8.check_fp8_support
+    elif isinstance(recipe, recipe_module.Float8CurrentScaling):
+        # Current scaling is gated on the same baseline FP8 check as delayed scaling.
+        probe = fp8.check_fp8_support
+    elif isinstance(recipe, recipe_module.Float8BlockScaling):
+        probe = fp8.check_fp8_block_scaling_support
+    elif isinstance(recipe, recipe_module.MXFP8BlockScaling):
+        probe = fp8.check_mxfp8_support
+    elif isinstance(recipe, recipe_module.NVFP4BlockScaling):
+        probe = fp8.check_nvfp4_support
+    else:
+        return False, "Unsupported recipe"
+
+    supported, why = probe()
+    return supported, why
+
+
+def requires_recipe_support(recipe: recipe_module.Recipe):
+    """Build a decorator that skips the decorated test when ``recipe`` is unsupported.
+
+    Args:
+        recipe: The quantization recipe the decorated test depends on.
+
+    Returns:
+        A decorator that applies a ``pytest.mark.skipif`` mark carrying the reason reported by
+        ``_check_recipe_support``.
+    """
+
+    def _apply_skip_mark(test_func):
+        # The support check runs when the decorator is applied to the test, not when the test body executes.
+        supported, why = _check_recipe_support(recipe)
+        skip_mark = pytest.mark.skipif(not supported, reason=why)
+        return skip_mark(test_func)
+
+    return _apply_skip_mark
+
+
+def parametrize_recipes_with_support(recipes):
+    """Wrap each recipe in a ``pytest.param`` that is skipped when the recipe is unsupported.
+
+    Args:
+        recipes: Iterable of recipe instances to wrap.
+
+    Returns:
+        A list with one ``pytest.param`` per input recipe, in input order. Each param uses the recipe's class
+        name as its id and carries a ``skipif`` mark built from ``_check_recipe_support``.
+    """
+
+    def _to_param(recipe):
+        supported, why = _check_recipe_support(recipe)
+        skip_mark = pytest.mark.skipif(not supported, reason=why)
+        return pytest.param(recipe, id=recipe.__class__.__name__, marks=skip_mark)
+
+    return [_to_param(recipe) for recipe in recipes]
+
+
+@pytest.fixture(params=parametrize_recipes_with_support(ALL_RECIPES))
+def fp8_recipe(request):
+    """Provide each recipe in ``ALL_RECIPES`` in turn; unsupported recipes carry a ``skipif`` mark."""
+    return request.param
@@ -135,3 +135,45 @@ def test_padding_unpadding_operations():
     if te_embeddings.shape[0] > original_embeddings.shape[0]:
         padding_rows = te_embeddings[original_embeddings.shape[0] :]
         torch.testing.assert_close(padding_rows, torch.zeros_like(padding_rows), atol=1e-6, rtol=1e-6)
+
+
+def test_weight_initialization_matches_hf():
+    """Check that a directly constructed TE model is initialized with the same statistics as HF.
+
+    A randomly initialized HF ``EsmForMaskedLM`` is converted with ``convert_esm_hf_to_te`` and serves as the
+    reference. A second model is built straight from ``NVEsmForMaskedLM`` with the same config. For every entry
+    of the reference state dict (``*_extra_state`` entries excluded), the mean and standard deviation of the two
+    tensors must agree within the tolerances used below.
+    """
+    from transformers import AutoConfig, set_seed
+    from transformers.models.esm.modeling_esm import EsmForMaskedLM
+
+    from esm.convert import convert_esm_hf_to_te
+    from esm.modeling_esm_te import NVEsmConfig, NVEsmForMaskedLM
+
+    set_seed(42)
+
+    # Reference: HF random initialization, carried over into the TE parameter layout.
+    hf_config = AutoConfig.from_pretrained("facebook/esm2_t6_8M_UR50D", vocab_size=64)
+    hf_model = EsmForMaskedLM(hf_config)
+    converted_model = convert_esm_hf_to_te(hf_model)
+
+    # Candidate: TE model constructed directly, relying on its own weight initialization.
+    fresh_model = NVEsmForMaskedLM(NVEsmConfig(**hf_model.config.to_dict()))
+    fresh_model.to("cuda")
+    converted_model.to("cuda")
+
+    expected_state = converted_model.state_dict()
+    actual_state = fresh_model.state_dict()
+
+    comparable_names = [name for name in expected_state if not name.endswith("_extra_state")]
+    for name in comparable_names:
+        actual = actual_state[name]
+        expected = expected_state[name]
+
+        torch.testing.assert_close(
+            actual.mean(),
+            expected.mean(),
+            atol=1e-3,
+            rtol=1e-4,
+            msg=lambda x: f"Mean mismatch for parameter {name}: {x}",
+        )
+
+        torch.testing.assert_close(
+            actual.std(),
+            expected.std(),
+            atol=1e-3,
+            rtol=1e-4,
+            msg=lambda x: f"Std mismatch for parameter {name}: {x}",
+        )