NVIDIA
diff --git a/.devcontainer/recipes/requirements.txt b/.devcontainer/recipes/requirements.txt
Lines changed: 1 addition & 1 deletion
diff --git a/bionemo-recipes.md b/bionemo-recipes.md
Lines changed: 4 additions & 4 deletions
diff --git a/ci/benchmarks/partial-conv/evo2_finetuning.yaml b/ci/benchmarks/partial-conv/evo2_finetuning.yaml
Lines changed: 3 additions & 2 deletions
diff --git a/ci/benchmarks/partial-conv/evo2_pretrain.yaml b/ci/benchmarks/partial-conv/evo2_pretrain.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/ci/benchmarks/perf/esm2_pretrain.yaml b/ci/benchmarks/perf/esm2_pretrain.yaml
Lines changed: 16 additions & 4 deletions
diff --git a/ci/benchmarks/perf/geneformer_pretrain.yaml b/ci/benchmarks/perf/geneformer_pretrain.yaml
Lines changed: 14 additions & 2 deletions
diff --git a/docs/docs/index.md b/docs/docs/index.md
Lines changed: 3 additions & 3 deletions
diff --git a/models/.ruff.toml b/models/.ruff.toml
Lines changed: 0 additions & 1 deletion
diff --git a/recipes/README.md b/recipes/README.md
Lines changed: 8 additions & 8 deletions
diff --git a/recipes/esm2_accelerate/hydra_config/L0_sanity.yaml b/recipes/esm2_accelerate/hydra_config/L0_sanity.yaml
Lines changed: 7 additions & 4 deletions
@@ -7,6 +7,6 @@ megatron-fsdp==0.1.0rc0
 torchmetrics
 tqdm
 transformer_engine
-transformers @ git+https://github.com/huggingface/transformers.git
+transformers
 typer
 wandb
@@ -8,7 +8,7 @@ The biological AI community is actively prototyping model architectures and need
 
 - **Flexible scaling**: Scale from single-GPU prototyping to multi-node training without complex parallelism configurations
 - **Framework compatibility**: Works with popular frameworks like HuggingFace Accelerate, PyTorch Lightning, and vanilla PyTorch
-- **Performance optimization**: Leverages TransformerEngine and nvFSDP for state-of-the-art training efficiency
+- **Performance optimization**: Leverages TransformerEngine and megatron-fsdp for state-of-the-art training efficiency
 - **Research-friendly**: Hackable, readable code that researchers can easily adapt for their experiments
 
 ### Use Cases
@@ -35,7 +35,7 @@ Example models include ESM-2, Geneformer, and AMPLIFY.
 Self-contained training examples demonstrating best practices for scaling biological foundation models. Each recipe is a complete Docker container with:
 
 - **Framework examples**: Vanilla PyTorch, HuggingFace Accelerate, PyTorch Lightning
-- **Feature demonstrations**: FP8 training, nvFSDP, context parallelism, sequence packing
+- **Feature demonstrations**: FP8 training, megatron-fsdp, context parallelism, sequence packing
 - **Scaling strategies**: Single-GPU to multi-node training patterns
 - **Benchmarked performance**: Validated throughput and convergence metrics
 
@@ -57,7 +57,7 @@ tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_120M")
 
 ```bash
 # Navigate to a recipe
-cd recipes/esm2_native_te_nvfsdp
+cd recipes/esm2_native_te_mfsdp
 
 # Build and run
 docker build -t esm2_recipe .
@@ -191,4 +191,4 @@ For technical support and questions:
 
 - Check existing issues before opening a new one
 - Review our training recipes for implementation examples
-- Consult the TransformerEngine and nvFSDP documentation for underlying technologies
+- Consult the TransformerEngine and megatron-fsdp documentation for underlying technologies
@@ -89,10 +89,11 @@ script: |-
     --devices=${gpus} \
     --num-nodes=${nodes} \
     --val-check-interval=${val_check_interval} \
-    --wandb-project=${wandb_project_name} \
-    --wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
     --create-tensorboard-logger \
     --activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \
     --disable-checkpointing \
     --early-stop-on-step=${stop_steps} \
+    --wandb-project=${wandb_project_name} \
+    --wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
+    --wandb-job-type=${pipeline_label} \
     --garbage-collect-at-inference;
@@ -7,15 +7,15 @@ key_segments:
   lr: False
   min_lr: False
   wu_steps: False
-  artefacts_url: False
+  pckg_url: False
   file_name_wheel: False
 script_args:
   # All arguments referenced in the script string must be specified here.
   # Arguments not referenced in the script string must have the 'arg' field specified.
   # See jet/core/configs.py for the specification of the configuration class
   workspace: /workspace/bionemo2
   data_path: /data/evo2
-  artefacts_url: https://__token__:${{JET_GITLAB_TOKEN}}@gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple
+  pckg_url: gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple/
   file_name_wheel: subquadratic-ops
   model: evo2
   variant: train
@@ -40,7 +40,7 @@ script_args:
 script: |-
   INSTALL_FLAG="/tmp/install_done_${{SLURMD_NODENAME}}";
   if [ "$SLURM_LOCALID" = "0" ]; then
-      pip install ${file_name_wheel} --index-url ${artefacts_url}
+      pip install ${file_name_wheel} --index-url https://oauth2:$JET_GITLAB_TOKEN@${pckg_url} --extra-index-url https://pypi.org/simple/
       touch $INSTALL_FLAG
   fi
   # All ranks wait until install flag file appears
 
@@ -41,11 +41,23 @@ script_args:
       tp: 1
       dfpnl: ""
 script: |-
+  COPY_FLAG="/tmp/copy_done_${{SLURMD_NODENAME}}";
+  NEW_DATA_PATH="/dev/shm/data_path_${{SLURMD_NODENAME}}";
+  if [ "$SLURM_LOCALID" = "0" ]; then
+      df -h;
+      echo $NEW_DATA_PATH;
+      time cp -r ${data_path}/ $NEW_DATA_PATH;
+      touch $COPY_FLAG
+  fi
+  # All ranks wait until the copy flag file appears
+  while [ ! -f $COPY_FLAG ]; do
+      sleep 1
+  done
   WANDB_API_KEY=$BIONEMO_WANDB_API_KEY ${variant}_${model} \
-    --train-cluster-path=${data_path}/train_clusters.parquet \
-    --train-database-path=${data_path}/train.db \
-    --valid-cluster-path=${data_path}/valid_clusters.parquet \
-    --valid-database-path=${data_path}/validation.db \
+    --train-cluster-path=$NEW_DATA_PATH/train_clusters.parquet \
+    --train-database-path=$NEW_DATA_PATH/train.db \
+    --valid-cluster-path=$NEW_DATA_PATH/valid_clusters.parquet \
+    --valid-database-path=$NEW_DATA_PATH/validation.db \
     --micro-batch-size=${batch_size} \
     --num-nodes=${nodes} \
     --num-gpus=${gpus} \
 
@@ -27,8 +27,20 @@ script_args:
       batch_size: 32
 
 script: |-
-   WANDB_API_KEY=$BIONEMO_WANDB_API_KEY ${variant}_${model} \
-    --data-dir ${data_path} \
+  COPY_FLAG="/tmp/copy_done_${{SLURMD_NODENAME}}";
+  NEW_DATA_PATH="/dev/shm/data_path_${{SLURMD_NODENAME}}";
+  if [ "$SLURM_LOCALID" = "0" ]; then
+      df -h;
+      echo $NEW_DATA_PATH;
+      time cp -r ${data_path}/ $NEW_DATA_PATH;
+      touch $COPY_FLAG
+  fi
+  # All ranks wait until the copy flag file appears
+  while [ ! -f $COPY_FLAG ]; do
+      sleep 1
+  done
+  WANDB_API_KEY=$BIONEMO_WANDB_API_KEY ${variant}_${model} \
+    --data-dir $NEW_DATA_PATH \
     --experiment-name ${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s_${precision}prec \
     --num-gpus ${gpus} \
     --save-last-checkpoint \
 
@@ -22,7 +22,7 @@ hide:
         </span>
       </div>
       <div class="card-title" style="margin: 0;">
-        <strong>Datasets</strong>
+        <strong>User Guide</strong>
       </div>
     </div>
     <hr />
@@ -42,7 +42,7 @@ hide:
         </span>
       </div>
       <div class="card-title" style="margin: 0;">
-        <strong>Datasets</strong>
+        <strong>API Reference</strong>
       </div>
     </div>
     <hr />
@@ -62,7 +62,7 @@ hide:
         </span>
       </div>
       <div class="card-title" style="margin: 0;">
-        <strong>Datasets</strong>
+        <strong>Models</strong>
       </div>
     </div>
     <hr />
 
@@ -46,7 +46,6 @@ exclude = [
     "dist",
     "node_modules",
     "venv",
-    "packages/nvFSDP/",
 ]
 
 # Ignore import violations in all `__init__.py` files.
 
@@ -2,7 +2,7 @@
 
 This directory contains self-contained training examples that demonstrate best practices for scaling
 biological foundation models using [TransformerEngine](https://github.com/NVIDIA/TransformerEngine)
-and [nvFSDP](https://github.com/NVIDIA-NeMo/nvFSDP). Each recipe is a complete Docker environment with
+and [megatron-fsdp](https://pypi.org/project/megatron-fsdp/). Each recipe is a complete Docker environment with
 benchmarked training scripts that users can learn from and adapt for their own research.
 
 ## Philosophy
@@ -49,7 +49,7 @@ Follow this naming pattern to clearly communicate what your recipe demonstrates:
 
 Examples:
 
-- `esm2_native_te_nvfsdp/` - ESM-2 with vanilla PyTorch, TransformerEngine, and nvFSDP
+- `esm2_native_te_mfsdp/` - ESM-2 with vanilla PyTorch, TransformerEngine, and megatron-fsdp
 - `amplify_accelerate_fp8/` - AMPLIFY with HuggingFace Accelerate and FP8 training
 - `geneformer_lightning_context_parallel/` - Geneformer with PyTorch Lightning and context parallelism
 
@@ -115,16 +115,16 @@ Your `train.py` should be educational and self-explanatory:
 ```python
 #!/usr/bin/env python3
 """
-ESM-2 training with TransformerEngine and nvFSDP.
+ESM-2 training with TransformerEngine and megatron-fsdp.
 
 This script demonstrates how to:
 1. Load and prepare biological sequence data
 2. Initialize ESM-2 with TransformerEngine layers
-3. Configure nvFSDP for memory-efficient multi-GPU training
+3. Configure megatron-fsdp for memory-efficient multi-GPU training
 4. Implement a training loop with proper checkpointing
 
 Key design decisions:
-- We use nvFSDP ZeRO-3 for maximum memory efficiency
+- We use megatron-fsdp ZeRO-3 for maximum memory efficiency
 - TransformerEngine FP8 is enabled for H100+ hardware
 - Context parallelism handles long biological sequences
 """
@@ -197,7 +197,7 @@ optimizer:
 # Distributed training
 distributed:
   backend: nccl
-  nvfsdp:
+  mfsdp:
     enable: true
     sharding_strategy: zero3
 
@@ -242,7 +242,7 @@ training:
   num_train_steps: 100    # Enough steps for stable metrics
 
 wandb:
-  name: "esm2_nvfsdp_benchmark"
+  name: "esm2_mfsdp_benchmark"
   tags: ["L1", "benchmark", "performance"]
 ```
 
@@ -411,7 +411,7 @@ docker run --rm -it --gpus all my_recipe pytest -v .
 
 For reference implementations, examine existing recipes:
 
-- **`esm2_native_te_nvfsdp/`**: Comprehensive example showing vanilla PyTorch with TE and nvFSDP
+- **`esm2_native_te_mfsdp/`**: Comprehensive example showing vanilla PyTorch with TE and megatron-fsdp
 - **`amplify_accelerate_fp8/`**: HuggingFace Accelerate integration with FP8 training
 - **`geneformer_lightning_context_parallel/`**: PyTorch Lightning with context parallelism for long sequences
 
 
@@ -1,14 +1,17 @@
 defaults:
   - defaults
+  - _self_
 
 model_tag: "nvidia/esm2_t6_8M_UR50D"
-stop_after_n_steps: 4
+stop_after_n_steps: 250
+
 trainer:
   run_name: "esm2_t6_8M_UR50D_sanity"
   per_device_train_batch_size: 2
   per_device_eval_batch_size: 2
-  save_steps: 2
-  eval_steps: 2
-  logging_steps: 1
+  save_steps: 1000
+  eval_steps: 1000
+  logging_steps: 10
   report_to: "none"
   dataloader_num_workers: 0
+  warmup_steps: 0
Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,6 @@ exclude = [`
`46`	`46`	`"dist",`
`47`	`47`	`"node_modules",`
`48`	`48`	`"venv",`
`49`		`- "packages/nvFSDP/",`
`50`	`49`	`]`
`51`	`50`
`52`	`51`	# Ignore import violations in all `__init__.py` files.