NVIDIA
diff --git a/bionemo-recipes.md b/bionemo-recipes.md
Lines changed: 4 additions & 4 deletions
diff --git a/models/.ruff.toml b/models/.ruff.toml
Lines changed: 0 additions & 1 deletion
diff --git a/recipes/README.md b/recipes/README.md
Lines changed: 5 additions & 5 deletions
diff --git a/recipes/esm2_native_te_mfsdp/hydra_config/L0_sanity.yaml b/recipes/esm2_native_te_mfsdp/hydra_config/L0_sanity.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/recipes/esm2_native_te_mfsdp/hydra_config/L1_15B_perf_test.yaml b/recipes/esm2_native_te_mfsdp/hydra_config/L1_15B_perf_test.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/recipes/esm2_native_te_mfsdp/train_ddp.py b/recipes/esm2_native_te_mfsdp/train_ddp.py
Lines changed: 2 additions & 2 deletions
diff --git a/recipes/esm2_native_te_mfsdp/train_mfsdp.py b/recipes/esm2_native_te_mfsdp/train_mfsdp.py
Lines changed: 2 additions & 2 deletions
diff --git a/recipes/esm2_native_te_nvfsdp_thd/.devcontainer/devcontainer.json b/recipes/esm2_native_te_mfsdp_thd/.devcontainer/devcontainer.json
rename from recipes/esm2_native_te_nvfsdp_thd/.devcontainer/devcontainer.json
rename to recipes/esm2_native_te_mfsdp_thd/.devcontainer/devcontainer.json
diff --git a/recipes/esm2_native_te_nvfsdp_thd/.dockerignore b/recipes/esm2_native_te_mfsdp_thd/.dockerignore
rename from recipes/esm2_native_te_nvfsdp_thd/.dockerignore
rename to recipes/esm2_native_te_mfsdp_thd/.dockerignore
diff --git a/recipes/esm2_native_te_nvfsdp_thd/.ruff.toml b/recipes/esm2_native_te_mfsdp_thd/.ruff.toml
rename from recipes/esm2_native_te_nvfsdp_thd/.ruff.toml
rename to recipes/esm2_native_te_mfsdp_thd/.ruff.toml
@@ -8,7 +8,7 @@ The biological AI community is actively prototyping model architectures and need
 
 - **Flexible scaling**: Scale from single-GPU prototyping to multi-node training without complex parallelism configurations
 - **Framework compatibility**: Works with popular frameworks like HuggingFace Accelerate, PyTorch Lightning, and vanilla PyTorch
-- **Performance optimization**: Leverages TransformerEngine and nvFSDP for state-of-the-art training efficiency
+- **Performance optimization**: Leverages TransformerEngine and megatron-fsdp for state-of-the-art training efficiency
 - **Research-friendly**: Hackable, readable code that researchers can easily adapt for their experiments
 
 ### Use Cases
@@ -35,7 +35,7 @@ Example models include ESM-2, Geneformer, and AMPLIFY.
 Self-contained training examples demonstrating best practices for scaling biological foundation models. Each recipe is a complete Docker container with:
 
 - **Framework examples**: Vanilla PyTorch, HuggingFace Accelerate, PyTorch Lightning
-- **Feature demonstrations**: FP8 training, nvFSDP, context parallelism, sequence packing
+- **Feature demonstrations**: FP8 training, megatron-fsdp, context parallelism, sequence packing
 - **Scaling strategies**: Single-GPU to multi-node training patterns
 - **Benchmarked performance**: Validated throughput and convergence metrics
 
@@ -57,7 +57,7 @@ tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_120M")
 
 ```bash
 # Navigate to a recipe
-cd recipes/esm2_native_te_nvfsdp
+cd recipes/esm2_native_te_mfsdp
 
 # Build and run
 docker build -t esm2_recipe .
@@ -191,4 +191,4 @@ For technical support and questions:
 
 - Check existing issues before opening a new one
 - Review our training recipes for implementation examples
-- Consult the TransformerEngine and nvFSDP documentation for underlying technologies
+- Consult the TransformerEngine and megatron-fsdp documentation for underlying technologies
@@ -46,7 +46,6 @@ exclude = [
     "dist",
     "node_modules",
     "venv",
-    "packages/nvFSDP/",
 ]
 
 # Ignore import violations in all `__init__.py` files.
 
@@ -2,7 +2,7 @@
 
 This directory contains self-contained training examples that demonstrate best practices for scaling
 biological foundation models using [TransformerEngine](https://github.com/NVIDIA/TransformerEngine)
-and [nvFSDP](https://github.com/NVIDIA-NeMo/nvFSDP). Each recipe is a complete Docker environment with
+and [megatron-fsdp](https://pypi.org/project/megatron-fsdp/). Each recipe is a complete Docker environment with
 benchmarked training scripts that users can learn from and adapt for their own research.
 
 ## Philosophy
@@ -49,7 +49,7 @@ Follow this naming pattern to clearly communicate what your recipe demonstrates:
 
 Examples:
 
-- `esm2_native_te_nvfsdp/` - ESM-2 with vanilla PyTorch, TransformerEngine, and nvFSDP
+- `esm2_native_te_mfsdp/` - ESM-2 with vanilla PyTorch, TransformerEngine, and megatron-fsdp
 - `amplify_accelerate_fp8/` - AMPLIFY with HuggingFace Accelerate and FP8 training
 - `geneformer_lightning_context_parallel/` - Geneformer with PyTorch Lightning and context parallelism
 
@@ -115,16 +115,16 @@ Your `train.py` should be educational and self-explanatory:
 ```python
 #!/usr/bin/env python3
 """
-ESM-2 training with TransformerEngine and nvFSDP.
+ESM-2 training with TransformerEngine and megatron-fsdp.
 
 This script demonstrates how to:
 1. Load and prepare biological sequence data
 2. Initialize ESM-2 with TransformerEngine layers
-3. Configure nvFSDP for memory-efficient multi-GPU training
+3. Configure megatron-fsdp for memory-efficient multi-GPU training
 4. Implement a training loop with proper checkpointing
 
 Key design decisions:
-- We use nvFSDP ZeRO-3 for maximum memory efficiency
+- We use megatron-fsdp ZeRO-3 for maximum memory efficiency
 - TransformerEngine FP8 is enabled for H100+ hardware
 - Context parallelism handles long biological sequences
 """
 
@@ -8,7 +8,7 @@ num_train_steps: 250
 
 # WandB config
 wandb_init_args:
-  name: "esm2_t6_8M_UR50D_nvfsdp_sanity"
+  name: "esm2_t6_8M_UR50D_mfsdp_sanity"
   mode: "offline"
 
 # Learning rate scheduler config
 
@@ -8,7 +8,7 @@ num_train_steps: 500
 
 # WandB config
 wandb_init_args:
-  name: "esm2_t48_15B_UR50D_nvfsdp_L1_perf"
+  name: "esm2_t48_15B_UR50D_mfsdp_L1_perf"
   project: "bionemo-recipes"
 
 # Optimizer config
 
@@ -51,7 +51,7 @@ def is_main_process(self) -> bool:
 
 @hydra.main(config_path="hydra_config", config_name="L0_sanity", version_base="1.2")
 def main(args: DictConfig) -> float | None:
-    """Train ESM-2 with TE layers using nvFSDP.
+    """Train ESM-2 with TE layers using mfsdp.
 
     Model names are valid ESM-2 model sizes, e.g.:
     - "esm2_t6_8M_UR50D"
@@ -63,7 +63,7 @@ def main(args: DictConfig) -> float | None:
     """
     # Initialize distributed training and create a device mesh for FSDP.
     # We have to create a dummy mesh dimension for context parallel and tensor parallel for things
-    # to work correctly with nvFSDP.
+    # to work correctly with mfsdp.
     dist.init_process_group(backend="nccl")
     dist_config = DistributedConfig()
     torch.cuda.set_device(dist_config.local_rank)
 
@@ -55,7 +55,7 @@ def is_main_process(self) -> bool:
 
 @hydra.main(config_path="hydra_config", config_name="L0_sanity", version_base="1.2")
 def main(args: DictConfig) -> float | None:
-    """Train ESM-2 with TE layers using nvFSDP.
+    """Train ESM-2 with TE layers using mfsdp.
 
     Model names are valid ESM-2 model sizes, e.g.:
     - "esm2_t6_8M_UR50D"
@@ -67,7 +67,7 @@ def main(args: DictConfig) -> float | None:
     """
     # Initialize distributed training and create a device mesh for FSDP.
     # We have to create a dummy mesh dimension for context parallel and tensor parallel for things
-    # to work correctly with nvFSDP.
+    # to work correctly with mfsdp.
     dist.init_process_group(backend="nccl")
     dist_config = DistributedConfig()
     torch.cuda.set_device(dist_config.local_rank)
Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,6 @@ exclude = [`
`46`	`46`	`"dist",`
`47`	`47`	`"node_modules",`
`48`	`48`	`"venv",`
`49`		`- "packages/nvFSDP/",`
`50`	`49`	`]`
`51`	`50`
`52`	`51`	# Ignore import violations in all `__init__.py` files.