ESM-2 mfsdp recipe expanded tests (#1101)

pstjohn · web-flow · commit b79422469e3a · 2025-09-03T18:58:27.000Z
Adds additional tests to the ESM-2 mfsdp recipe to characterize where
convergence issues are occurring. Also sets the seed for the THD recipe
to hopefully avoid the flaky errors seen on nightly runs.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- New Features
  - Added a config toggle to enable/disable meta-device initialization.
  - Introduced new sharded-training options (overlap reduce/param gather,
    per-step sync, collective averaging).
  - Enabled use of Nvidia-hosted ESM2 models.

- Refactor
  - Progress bar now shows a precomputed loss value for consistency.
  - Updated recipe defaults, including reduced micro-batch sizes for 650M
    and 3B variants.

- Tests
  - Reorganized suite around convergence checks; added meta-device and
    eager scenarios, multi-GPU cases, and xfail markers.
  - Centralized seeding via fixture for reproducibility.

- Chores
  - Removed redundant optimizer settings.
  - Updated run naming conventions.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/recipes/esm2_native_te_mfsdp/hydra_config/L0_sanity.yaml b/recipes/esm2_native_te_mfsdp/hydra_config/L0_sanity.yaml
@@ -14,6 +14,3 @@ wandb_init_args:
 # Learning rate scheduler config
 lr_scheduler_kwargs:
   num_warmup_steps: 0
-
-adamw_kwargs:
-  lr: 1e-2
diff --git a/recipes/esm2_native_te_mfsdp/hydra_config/L1_3B.yaml b/recipes/esm2_native_te_mfsdp/hydra_config/L1_3B.yaml
@@ -2,8 +2,8 @@ defaults:
   - defaults
 
 # Training config
-model_name: esm2_t33_650M_UR50D
-micro_batch_size: 32
+model_name: nvidia/esm2_t36_3B_UR50D
+micro_batch_size: 16
 num_train_steps: 10_000
 
 # WandB config
diff --git a/recipes/esm2_native_te_mfsdp/hydra_config/L1_650M.yaml b/recipes/esm2_native_te_mfsdp/hydra_config/L1_650M.yaml
@@ -3,12 +3,12 @@ defaults:
   - _self_
 
 # Training config
-model_name: esm2_t33_650M_UR50D
-micro_batch_size: 16
+model_name: nvidia/esm2_t33_650M_UR50D
+micro_batch_size: 4
 num_train_steps: 200
 
 # WandB config
 wandb_init_args:
-  name: "esm2_t33_650M_UR50D_nvfsdp"
+  name: "esm2_t33_650M_UR50D_mfsdp"
   project: "bionemo-recipes-pstjohn"
   mode: "offline"
diff --git a/recipes/esm2_native_te_mfsdp/hydra_config/defaults.yaml b/recipes/esm2_native_te_mfsdp/hydra_config/defaults.yaml
@@ -5,15 +5,19 @@ max_seq_length: 1024
 data_path: .
 num_train_steps: ???
 
+# TODO: Once BIONEMO-2583 and BIONEMO-2719 are fixed, enable this by default and simplify training scripts to remove the
+# meta-device conditional.
+use_meta_device: false
+
 # WandB config
 wandb_init_args:
   name: ???
 
-# nvFSDP config
+# mFSDP config
 fully_shard_kwargs:
   zero_dp_strategy: "optim_grads_params"
   calculate_per_token_loss: false
-  init_model_with_meta_device: false
+  init_model_with_meta_device: ${use_meta_device}
   check_for_nan_in_grad: true
   grad_reduce_in_fp32: false
   preserve_fp32_weights: true
diff --git a/recipes/esm2_native_te_mfsdp/test_train.py b/recipes/esm2_native_te_mfsdp/test_train.py
@@ -28,10 +28,12 @@
 from train_mfsdp import main as main_mfsdp
 
 
-random.seed(42)
-torch.manual_seed(42)
-if torch.cuda.is_available():
-    torch.cuda.manual_seed_all(42)
+@pytest.fixture(autouse=True)
+def set_seed():
+    random.seed(42)
+    torch.manual_seed(42)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(42)
 
 
 requires_multi_gpu = pytest.mark.skipif(
@@ -85,7 +87,7 @@ def mock_distributed_config(monkeypatch):
     _mesh_resources.mesh_dim_group_options.clear()
 
 
-def test_main_invocation_mfsdp(mock_distributed_config, tmp_path):
+def test_sanity_convergence_mfsdp(mock_distributed_config, tmp_path):
     """Test that the main function can be invoked with the correct arguments."""
 
     # Run the training script with Hydra configuration overrides
@@ -96,8 +98,8 @@ def test_main_invocation_mfsdp(mock_distributed_config, tmp_path):
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-@pytest.mark.xfail(reason="MFSDP meta-device init seems to be failing with this model (BIONEMO-2583)")
-def test_main_invocation_mfsdp_meta_device(mock_distributed_config, tmp_path):
+@pytest.mark.xfail(reason="MFSDP meta-device init seems to be failing with both TE and eager models (BIONEMO-2583)")
+def test_sanity_convergence_mfsdp_meta_device(mock_distributed_config, tmp_path):
     """Test that the main function can be invoked with the correct arguments."""
 
     # Run the training script with Hydra configuration overrides
@@ -106,15 +108,34 @@ def test_main_invocation_mfsdp_meta_device(mock_distributed_config, tmp_path):
             config_name="L0_sanity",
             overrides=[
                 f"+wandb_init_args.dir={tmp_path}",
-                "fully_shard_kwargs.init_model_with_meta_device=true",
+                "use_meta_device=true",
             ],
         )
 
     final_loss = main_mfsdp(sanity_config)
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-def test_main_invocation_ddp(mock_distributed_config, tmp_path):
+@pytest.mark.xfail(reason="MFSDP meta-device init seems to be failing with both TE and eager models (BIONEMO-2583)")
+def test_sanity_convergence_mfsdp_eager_meta_device(mock_distributed_config, tmp_path):
+    """Test that the main function can be invoked with the correct arguments."""
+
+    # Run the training script with Hydra configuration overrides
+    with initialize_config_dir(config_dir=str(recipe_dir / "hydra_config"), version_base="1.2"):
+        sanity_config = compose(
+            config_name="L0_sanity",
+            overrides=[
+                f"+wandb_init_args.dir={tmp_path}",
+                "model_name=facebook/esm2_t6_8M_UR50D",
+                "use_meta_device=true",
+            ],
+        )
+
+    final_loss = main_mfsdp(sanity_config)
+    assert final_loss < 3.0, f"Final loss {final_loss} is too high"
+
+
+def test_sanity_convergence_ddp(mock_distributed_config, tmp_path):
     """Test that the main function can be invoked wrapping the model in DDP."""
 
     # Run the training script with Hydra configuration overrides
@@ -125,18 +146,41 @@ def test_main_invocation_ddp(mock_distributed_config, tmp_path):
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-def test_main_invocation_fsdp2(mock_distributed_config, tmp_path):
+def test_sanity_convergence_fsdp2(mock_distributed_config, tmp_path):
     """Test that the main function can be invoked wrapping the model in FSDP2."""
 
     # Run the training script with Hydra configuration overrides
     with initialize_config_dir(config_dir=str(recipe_dir / "hydra_config"), version_base="1.2"):
-        sanity_config = compose(config_name="L0_sanity", overrides=[f"+wandb_init_args.dir={tmp_path}"])
+        sanity_config = compose(
+            config_name="L0_sanity",
+            overrides=[
+                f"+wandb_init_args.dir={tmp_path}",
+            ],
+        )
 
     final_loss = main_fsdp2(sanity_config)
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-def test_main_invocation_mfsdp_eager(mock_distributed_config, tmp_path):
+@pytest.mark.xfail(reason="FSDP2 meta-device init seems doesn't have the same convergence (BIONEMO-2719)")
+def test_sanity_convergence_fsdp2_meta_device(mock_distributed_config, tmp_path):
+    """Test that the main function can be invoked wrapping the model in FSDP2."""
+
+    # Run the training script with Hydra configuration overrides
+    with initialize_config_dir(config_dir=str(recipe_dir / "hydra_config"), version_base="1.2"):
+        sanity_config = compose(
+            config_name="L0_sanity",
+            overrides=[
+                f"+wandb_init_args.dir={tmp_path}",
+                "use_meta_device=true",
+            ],
+        )
+
+    final_loss = main_fsdp2(sanity_config)
+    assert final_loss < 3.0, f"Final loss {final_loss} is too high"
+
+
+def test_sanity_convergence_mfsdp_eager(mock_distributed_config, tmp_path):
     """Test that the main function can be invoked with the correct arguments."""
 
     # Run the training script with Hydra configuration overrides
@@ -150,7 +194,7 @@ def test_main_invocation_mfsdp_eager(mock_distributed_config, tmp_path):
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-def test_main_invocation_ddp_eager(mock_distributed_config, tmp_path):
+def test_sanity_convergence_ddp_eager(mock_distributed_config, tmp_path):
     """Test that the main function can be invoked wrapping the model in DDP."""
 
     # Run the training script with Hydra configuration overrides
@@ -164,7 +208,7 @@ def test_main_invocation_ddp_eager(mock_distributed_config, tmp_path):
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
-def test_main_invocation_fsdp2_eager(mock_distributed_config, tmp_path):
+def test_sanity_convergence_fsdp2_eager(mock_distributed_config, tmp_path):
     """Test that the main function can be invoked wrapping the model in FSDP2."""
 
     # Run the training script with Hydra configuration overrides
@@ -178,6 +222,28 @@ def test_main_invocation_fsdp2_eager(mock_distributed_config, tmp_path):
     assert final_loss < 3.0, f"Final loss {final_loss} is too high"
 
 
+@pytest.mark.xfail(reason="This passes on my local 5090 but fails on CI (L4) (BIONEMO-2719)")
+def test_sanity_convergence_fsdp2_eager_meta_device(mock_distributed_config, tmp_path):
+    """Test that the main function can be invoked wrapping the model in FSDP2 and using meta-device init."""
+
+    # Run the training script with Hydra configuration overrides
+    with initialize_config_dir(config_dir=str(recipe_dir / "hydra_config"), version_base="1.2"):
+        sanity_config = compose(
+            config_name="L0_sanity",
+            overrides=[
+                f"+wandb_init_args.dir={tmp_path}",
+                "model_name=facebook/esm2_t6_8M_UR50D",
+                "use_meta_device=true",
+            ],
+        )
+
+    final_loss = main_fsdp2(sanity_config)
+    assert final_loss < 3.0, f"Final loss {final_loss} is too high"
+
+
+# These tests don't check convergence, they just check that the training script runs successfully on multiple GPUs.
+
+
 @requires_multi_gpu
 def test_multi_gpu_train_te_ddp(tmp_path):
     # Run 'accelerate launch train.py' as a subprocess
@@ -197,7 +263,7 @@ def test_multi_gpu_train_te_ddp(tmp_path):
 
 
 @requires_multi_gpu
-def test_multi_gpu_train_te_mfsdp_no_meta_device(tmp_path):
+def test_multi_gpu_train_te_mfsdp(tmp_path):
     # Run 'accelerate launch train.py' as a subprocess
     run_train_cmd(
         [
@@ -209,14 +275,13 @@ def test_multi_gpu_train_te_mfsdp_no_meta_device(tmp_path):
             "train_mfsdp.py",
             "--config-name",
             "L0_sanity",
-            "fully_shard_kwargs.init_model_with_meta_device=false",
             "num_train_steps=4",
         ]
     )
 
 
 @requires_multi_gpu
-def test_multi_gpu_train_eager_mfsdp(tmp_path):
+def test_multi_gpu_train_te_fsdp2(tmp_path):
     # Run 'accelerate launch train.py' as a subprocess
     run_train_cmd(
         [
@@ -225,17 +290,16 @@ def test_multi_gpu_train_eager_mfsdp(tmp_path):
             "2",
             "--master_port",
             f"{random.randint(20000, 40000)}",
-            "train_mfsdp.py",
+            "train_fsdp2.py",
             "--config-name",
             "L0_sanity",
-            "model_name=facebook/esm2_t6_8M_UR50D",
             "num_train_steps=4",
         ]
     )
 
 
 @requires_multi_gpu
-def test_multi_gpu_train_te_fsdp2(tmp_path):
+def test_multi_gpu_train_eager_fsdp2_meta_device(tmp_path):
     # Run 'accelerate launch train.py' as a subprocess
     run_train_cmd(
         [
@@ -247,6 +311,8 @@ def test_multi_gpu_train_te_fsdp2(tmp_path):
             "train_fsdp2.py",
             "--config-name",
             "L0_sanity",
+            "model_name=facebook/esm2_t6_8M_UR50D",
+            "use_meta_device=true",
             "num_train_steps=4",
         ]
     )
diff --git a/recipes/esm2_native_te_mfsdp/train_ddp.py b/recipes/esm2_native_te_mfsdp/train_ddp.py
@@ -170,7 +170,7 @@ def main(args: DictConfig) -> float | None:
             )
 
             progress_bar.update(1)
-            progress_bar.set_postfix({"loss": loss.item()})
+            progress_bar.set_postfix({"loss": loss_value})
 
     # Clean up distributed training
     if dist_config.is_main_process():
diff --git a/recipes/esm2_native_te_mfsdp/train_fsdp2.py b/recipes/esm2_native_te_mfsdp/train_fsdp2.py
@@ -16,6 +16,7 @@
 import logging
 import os
 import time
+from contextlib import nullcontext
 from dataclasses import dataclass, field
 
 import hydra
@@ -85,7 +86,7 @@ def main(args: DictConfig) -> float | None:  # noqa: C901
     # Create an empty ESM-2 model with a masked language model head.
     if "facebook" in args.model_name:
         config = AutoConfig.from_pretrained(args.model_name, dtype=torch.bfloat16)
-        with torch.device("meta"):
+        with torch.device("meta") if args.use_meta_device else nullcontext():
             model = AutoModelForMaskedLM.from_config(config, attn_implementation="flash_attention_2")
         del model.esm.contact_head
         transformer_stack = model.esm.encoder.layer
@@ -94,7 +95,7 @@ def main(args: DictConfig) -> float | None:  # noqa: C901
         config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True, dtype=torch.bfloat16)
         config.max_seq_length = args.max_seq_length
         config.micro_batch_size = args.micro_batch_size
-        with torch.device("meta"):
+        with torch.device("meta") if args.use_meta_device else nullcontext():
             model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True)
         transformer_stack = model.esm.encoder.layers
 
@@ -111,10 +112,11 @@ def main(args: DictConfig) -> float | None:  # noqa: C901
     optimizer = AdamW(model.parameters(), **args.adamw_kwargs)
     scheduler = get_linear_schedule_with_warmup(optimizer, **args.lr_scheduler_kwargs)
 
-    model.to_empty(device=device)
-    for module in model.modules():
-        if hasattr(module, "reset_parameters"):
-            module.reset_parameters()
+    if args.use_meta_device:
+        model.to_empty(device=device)
+        for module in model.modules():
+            if hasattr(module, "reset_parameters"):
+                module.reset_parameters()
 
     # Training loop.
     model.train()
diff --git a/recipes/esm2_native_te_mfsdp/train_mfsdp.py b/recipes/esm2_native_te_mfsdp/train_mfsdp.py
@@ -16,6 +16,7 @@
 import logging
 import os
 import time
+from contextlib import nullcontext
 from dataclasses import dataclass, field
 
 import hydra
@@ -86,15 +87,19 @@ def main(args: DictConfig) -> float | None:
         config = AutoConfig.from_pretrained(args.model_name, dtype=torch.bfloat16)
         from transformers.models.esm.modeling_esm import EsmForMaskedLM  # noqa: F401
 
-        with torch.device("meta" if args.fully_shard_kwargs.get("init_model_with_meta_device", True) else device):
+        with (
+            torch.device("meta") if args.fully_shard_kwargs.get("init_model_with_meta_device", True) else nullcontext()
+        ):
             model = AutoModelForMaskedLM.from_config(config, attn_implementation="flash_attention_2")
-            del model.esm.contact_head
+        del model.esm.contact_head
 
     else:
         config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True, dtype=torch.bfloat16)
         config.max_seq_length = args.max_seq_length
         config.micro_batch_size = args.micro_batch_size
-        with torch.device("meta" if args.fully_shard_kwargs.get("init_model_with_meta_device", True) else device):
+        with (
+            torch.device("meta") if args.fully_shard_kwargs.get("init_model_with_meta_device", True) else nullcontext()
+        ):
             model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True)
 
     # Log model and number of parameters on main process.
@@ -188,7 +193,7 @@ def main(args: DictConfig) -> float | None:
             )
 
             progress_bar.update(1)
-            progress_bar.set_postfix({"loss": loss.item()})
+            progress_bar.set_postfix({"loss": loss_value})
 
     # Clean up distributed training
     if dist_config.is_main_process():
diff --git a/recipes/esm2_native_te_nvfsdp_thd/test_thd_format.py b/recipes/esm2_native_te_nvfsdp_thd/test_thd_format.py

Original file line number	Diff line number	Diff line change
`@@ -170,7 +170,7 @@ def main(args: DictConfig) -> float \| None:`
`170`	`170`	`)`
`171`	`171`
`172`	`172`	`progress_bar.update(1)`
`173`		`- progress_bar.set_postfix({"loss": loss.item()})`
	`173`	`+ progress_bar.set_postfix({"loss": loss_value})`
`174`	`174`
`175`	`175`	`# Clean up distributed training`
`176`	`176`	`if dist_config.is_main_process():`