NVIDIA-BioNeMo
diff --git a/‎README.md‎
Lines changed: 2 additions & 0 deletions b/‎README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎bionemo-recipes.md‎
Lines changed: 4 additions & 4 deletions b/‎bionemo-recipes.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎ci/benchmarks/partial-conv/evo2_finetuning.yaml‎
Lines changed: 3 additions & 2 deletions b/‎ci/benchmarks/partial-conv/evo2_finetuning.yaml‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎ci/benchmarks/perf/esm2_pretrain.yaml‎
Lines changed: 16 additions & 4 deletions b/‎ci/benchmarks/perf/esm2_pretrain.yaml‎
Lines changed: 16 additions & 4 deletions
diff --git a/‎ci/benchmarks/perf/geneformer_pretrain.yaml‎
Lines changed: 14 additions & 2 deletions b/‎ci/benchmarks/perf/geneformer_pretrain.yaml‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎models/.ruff.toml‎
Lines changed: 0 additions & 1 deletion b/‎models/.ruff.toml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎models/amplify/.devcontainer/devcontainer.json‎
Lines changed: 2 additions & 5 deletions b/‎models/amplify/.devcontainer/devcontainer.json‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎models/amplify/Dockerfile‎
Lines changed: 1 addition & 2 deletions b/‎models/amplify/Dockerfile‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎models/amplify/export.py‎
Lines changed: 1 addition & 1 deletion b/‎models/amplify/export.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎models/amplify/src/amplify/amplify_te.py‎
Lines changed: 9 additions & 13 deletions b/‎models/amplify/src/amplify/amplify_te.py‎
Lines changed: 9 additions & 13 deletions
@@ -82,6 +82,8 @@ With a locally cloned repository and initialized submodules, build the BioNeMo c
 docker buildx build . -t my-container-tag
 ```
 
+If the build fails with an error message like `No file descriptors available (os error 24)`, add the option `--ulimit nofile=65535:65535` to the `docker buildx build` command.
+
 #### VSCode Devcontainer for Interactive Debugging
 
 We distribute a [development container](https://devcontainers.github.io/) configuration for vscode
 
@@ -8,7 +8,7 @@ The biological AI community is actively prototyping model architectures and need
 
 - **Flexible scaling**: Scale from single-GPU prototyping to multi-node training without complex parallelism configurations
 - **Framework compatibility**: Works with popular frameworks like HuggingFace Accelerate, PyTorch Lightning, and vanilla PyTorch
-- **Performance optimization**: Leverages TransformerEngine and nvFSDP for state-of-the-art training efficiency
+- **Performance optimization**: Leverages TransformerEngine and megatron-fsdp for state-of-the-art training efficiency
 - **Research-friendly**: Hackable, readable code that researchers can easily adapt for their experiments
 
 ### Use Cases
@@ -35,7 +35,7 @@ Example models include ESM-2, Geneformer, and AMPLIFY.
 Self-contained training examples demonstrating best practices for scaling biological foundation models. Each recipe is a complete Docker container with:
 
 - **Framework examples**: Vanilla PyTorch, HuggingFace Accelerate, PyTorch Lightning
-- **Feature demonstrations**: FP8 training, nvFSDP, context parallelism, sequence packing
+- **Feature demonstrations**: FP8 training, megatron-fsdp, context parallelism, sequence packing
 - **Scaling strategies**: Single-GPU to multi-node training patterns
 - **Benchmarked performance**: Validated throughput and convergence metrics
 
@@ -57,7 +57,7 @@ tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_120M")
 
 ```bash
 # Navigate to a recipe
-cd recipes/esm2_native_te_nvfsdp
+cd recipes/esm2_native_te_mfsdp
 
 # Build and run
 docker build -t esm2_recipe .
@@ -191,4 +191,4 @@ For technical support and questions:
 
 - Check existing issues before opening a new one
 - Review our training recipes for implementation examples
-- Consult the TransformerEngine and nvFSDP documentation for underlying technologies
+- Consult the TransformerEngine and megatron-fsdp documentation for underlying technologies
@@ -89,10 +89,11 @@ script: |-
     --devices=${gpus} \
     --num-nodes=${nodes} \
     --val-check-interval=${val_check_interval} \
-    --wandb-project=${wandb_project_name} \
-    --wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
     --create-tensorboard-logger \
     --activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \
     --disable-checkpointing \
     --early-stop-on-step=${stop_steps} \
+    --wandb-project=${wandb_project_name} \
+    --wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
+    --wandb-job-type=${pipeline_label} \
     --garbage-collect-at-inference;
@@ -41,11 +41,23 @@ script_args:
       tp: 1
       dfpnl: ""
 script: |-
+  COPY_FLAG="/tmp/copy_done_${{SLURMD_NODENAME}}";
+  NEW_DATA_PATH="/dev/shm/data_path_${{SLURMD_NODENAME}}";
+  if [ "$SLURM_LOCALID" = "0" ]; then
+      df -h;
+      echo $NEW_DATA_PATH;
+      time cp -r ${data_path}/ $NEW_DATA_PATH;
+      touch $COPY_FLAG
+  fi
+  # All ranks wait until the copy-done flag file ($COPY_FLAG) appears
+  while [ ! -f $COPY_FLAG ]; do
+      sleep 1
+  done
   WANDB_API_KEY=$BIONEMO_WANDB_API_KEY ${variant}_${model} \
-    --train-cluster-path=${data_path}/train_clusters.parquet \
-    --train-database-path=${data_path}/train.db \
-    --valid-cluster-path=${data_path}/valid_clusters.parquet \
-    --valid-database-path=${data_path}/validation.db \
+    --train-cluster-path=$NEW_DATA_PATH/train_clusters.parquet \
+    --train-database-path=$NEW_DATA_PATH/train.db \
+    --valid-cluster-path=$NEW_DATA_PATH/valid_clusters.parquet \
+    --valid-database-path=$NEW_DATA_PATH/validation.db \
     --micro-batch-size=${batch_size} \
     --num-nodes=${nodes} \
     --num-gpus=${gpus} \
 
@@ -27,8 +27,20 @@ script_args:
       batch_size: 32
 
 script: |-
-   WANDB_API_KEY=$BIONEMO_WANDB_API_KEY ${variant}_${model} \
-    --data-dir ${data_path} \
+  COPY_FLAG="/tmp/copy_done_${{SLURMD_NODENAME}}";
+  NEW_DATA_PATH="/dev/shm/data_path_${{SLURMD_NODENAME}}";
+  if [ "$SLURM_LOCALID" = "0" ]; then
+      df -h;
+      echo $NEW_DATA_PATH;
+      time cp -r ${data_path}/ $NEW_DATA_PATH;
+      touch $COPY_FLAG
+  fi
+  # All ranks wait until the copy-done flag file ($COPY_FLAG) appears
+  while [ ! -f $COPY_FLAG ]; do
+      sleep 1
+  done
+  WANDB_API_KEY=$BIONEMO_WANDB_API_KEY ${variant}_${model} \
+    --data-dir $NEW_DATA_PATH \
     --experiment-name ${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s_${precision}prec \
     --num-gpus ${gpus} \
     --save-last-checkpoint \
 
@@ -46,7 +46,6 @@ exclude = [
     "dist",
     "node_modules",
     "venv",
-    "packages/nvFSDP/",
 ]
 
 # Ignore import violations in all `__init__.py` files.
 
@@ -2,14 +2,11 @@
 // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
 {
     "name": "Existing Dockerfile",
-    "build": {
-        "context": "..",
-        "dockerfile": "Dockerfile.dev"
-    },
+    "image": "svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025",
     "mounts": [
         "source=${localEnv:HOME}/.cache,target=/home/ubuntu/.cache,type=bind,consistency=cached"
     ],
-    "postCreateCommand": "pip install -e .[convert,test]",
+    "postCreateCommand": "PIP_CONSTRAINT= pip install -e .",
     "remoteUser": "ubuntu",
     "runArgs": [
         "--gpus=all",
 
@@ -4,8 +4,7 @@ FROM nvcr.io/nvidia/pytorch:25.01-py3
 RUN MAX_JOBS=4 pip --disable-pip-version-check --no-cache-dir install -v git+https://github.com/facebookresearch/xformers.git@v0.0.29.post1#egg=xformers
 RUN PIP_CONSTRAINT= NVTE_FRAMEWORK=pytorch MAX_JOBS=4 pip --disable-pip-version-check --no-cache-dir install -v git+https://github.com/nvidia/TransformerEngine.git@v2.4
 
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 WORKDIR /workspace/bionemo
 COPY . .
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
     PIP_CONSTRAINT= pip install -e .
@@ -36,7 +36,7 @@
     # Smoke test that the model can be loaded.
     model_te = AutoModelForMaskedLM.from_pretrained(
         f"./checkpoint_export/{tag}",
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         trust_remote_code=True,
     )
     del model_te
 
@@ -147,17 +147,15 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
             config.padded_vocab_size,
             config.hidden_size,
             padding_idx=config.pad_token_id,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )
 
         if config.layer_norm_after_embedding:
             self.layer_norm_1 = (
-                transformer_engine.pytorch.RMSNorm(
-                    config.hidden_size, config.norm_eps, params_dtype=config.torch_dtype
-                )
+                transformer_engine.pytorch.RMSNorm(config.hidden_size, config.norm_eps, params_dtype=config.dtype)
                 if config.rms_norm
                 else transformer_engine.pytorch.LayerNorm(
-                    config.hidden_size, config.norm_eps, params_dtype=config.torch_dtype
+                    config.hidden_size, config.norm_eps, params_dtype=config.dtype
                 )
             )
 
@@ -169,6 +167,9 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
             intermediate_size = int(2 * config.intermediate_size / 3)
             intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)
 
+        else:
+            intermediate_size = config.intermediate_size
+
         self.transformer_encoder = nn.ModuleList()
         for layer_num in range(config.num_hidden_layers):
             self.transformer_encoder.append(
@@ -194,7 +195,7 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
                     window_size=(-1, -1),
                     rotary_pos_interleaved=True,
                     seq_length=config.max_length,
-                    params_dtype=config.torch_dtype,
+                    params_dtype=config.dtype,
                 )
             )
 
@@ -212,7 +213,6 @@ def forward(
         output_hidden_states=False,
         output_attentions=False,
         labels=None,
-        **kwargs,
     ) -> BaseModelOutput:
         """Forward pass of the AMPLIFY model.
 
@@ -222,7 +222,6 @@ def forward(
             output_hidden_states (bool): Whether to output the hidden states.
             output_attentions (bool): Whether to output the attention weights.
             labels (torch.Tensor): The labels.
-            **kwargs: Additional arguments.
 
         Returns:
             BaseModelOutput: The output of the model.
@@ -277,7 +276,7 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
                 config.hidden_size,
                 config.padded_vocab_size,
                 config.norm_eps,
-                params_dtype=config.torch_dtype,
+                params_dtype=config.dtype,
                 normalization="RMSNorm" if config.rms_norm else "LayerNorm",
                 init_method=lambda x: torch.nn.init.uniform_(
                     x, -self.config.decoder_init_range, self.config.decoder_init_range
@@ -286,7 +285,7 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
 
         else:
             self.decoder = transformer_engine.pytorch.Linear(
-                config.hidden_size, config.vocab_size, params_dtype=config.torch_dtype
+                config.hidden_size, config.vocab_size, params_dtype=config.dtype
             )
 
     def forward(
@@ -296,7 +295,6 @@ def forward(
         output_hidden_states=False,
         output_attentions=False,
         labels=None,
-        **kwargs,
     ) -> MaskedLMOutput:
         """Forward pass of the AMPLIFYForMaskedLM model.
 
@@ -306,7 +304,6 @@ def forward(
             output_hidden_states (bool): Whether to output the hidden states.
             output_attentions (bool): Whether to output the attention weights.
             labels (torch.Tensor): The labels.
-            **kwargs: Additional arguments.
 
         Returns:
             MaskedLMOutput: The output of the model.
@@ -317,7 +314,6 @@ def forward(
             output_hidden_states,
             output_attentions,
             labels,
-            **kwargs,
         )
 
         # Classification head with layer norm
Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,6 @@ exclude = [`
`46`	`46`	`"dist",`
`47`	`47`	`"node_modules",`
`48`	`48`	`"venv",`
`49`		`- "packages/nvFSDP/",`
`50`	`49`	`]`
`51`	`50`
`52`	`51`	# Ignore import violations in all `__init__.py` files.
Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@`
`36`	`36`	`# Smoke test that the model can be loaded.`
`37`	`37`	`model_te = AutoModelForMaskedLM.from_pretrained(`
`38`	`38`	`f"./checkpoint_export/{tag}",`
`39`		`- torch_dtype=torch.bfloat16,`
	`39`	`+ dtype=torch.bfloat16,`
`40`	`40`	`trust_remote_code=True,`
`41`	`41`	`)`
`42`	`42`	`del model_te`