NVIDIA
diff --git a/README.md b/README.md
Lines changed: 2 additions & 0 deletions
diff --git a/models/amplify/tests/test_amplify_model.py b/models/amplify/tests/test_amplify_model.py
Lines changed: 0 additions & 49 deletions
diff --git a/recipes/esm2_accelerate/accelerate_config/default.yaml b/recipes/esm2_accelerate/accelerate_config/default.yaml
Lines changed: 0 additions & 3 deletions
diff --git a/recipes/esm2_accelerate/accelerate_config/dynamo.yaml b/recipes/esm2_accelerate/accelerate_config/dynamo.yaml
Lines changed: 15 additions & 0 deletions
diff --git a/recipes/esm2_accelerate/accelerate_config/fp8.yaml b/recipes/esm2_accelerate/accelerate_config/fp8.yaml
Lines changed: 25 additions & 0 deletions
diff --git a/recipes/esm2_accelerate/accelerate_config/fsdp1_hf.yaml b/recipes/esm2_accelerate/accelerate_config/fsdp1_hf.yaml
Lines changed: 0 additions & 3 deletions
diff --git a/recipes/esm2_accelerate/accelerate_config/fsdp1_te.yaml b/recipes/esm2_accelerate/accelerate_config/fsdp1_te.yaml
Lines changed: 0 additions & 3 deletions
diff --git a/recipes/esm2_accelerate/accelerate_config/fsdp2_hf.yaml b/recipes/esm2_accelerate/accelerate_config/fsdp2_hf.yaml
Lines changed: 0 additions & 3 deletions
diff --git a/recipes/esm2_accelerate/accelerate_config/fsdp2_te.yaml b/recipes/esm2_accelerate/accelerate_config/fsdp2_te.yaml
Lines changed: 0 additions & 3 deletions
diff --git a/recipes/esm2_accelerate/dataset.py b/recipes/esm2_accelerate/dataset.py
Lines changed: 35 additions & 27 deletions
@@ -82,6 +82,8 @@ With a locally cloned repository and initialized submodules, build the BioNeMo c
 docker buildx build . -t my-container-tag
 ```
 
+If you see an error message like `No file descriptors available (os error 24)`, add the option `--ulimit nofile=65535:65535` to the `docker buildx build` command to raise the open-file limit.
+
 #### VSCode Devcontainer for Interactive Debugging
 
 We distribute a [development container](https://devcontainers.github.io/) configuration for vscode
 
@@ -168,52 +168,3 @@ def test_convert_state_dict():
     te_state_dict_keys.remove("decoder.bias")
 
     assert len(te_state_dict_keys) == 0
-
-
-def test_hf_trained_model_loss(input_data):
-    model = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
-    model.to("cuda", dtype=torch.bfloat16)
-    input_data = {k: v.to("cuda") for k, v in input_data.items()}
-    model.eval()
-    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-        output = model(**input_data)
-
-    torch.testing.assert_close(output.loss.detach().cpu(), torch.tensor(2.4), atol=1e-1, rtol=1e-2)
-
-
-def test_te_trained_model_loss(input_data):
-    model_hf = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
-    model = convert_amplify_hf_to_te(model_hf)
-    model.to("cuda", dtype=torch.bfloat16)
-    input_data = {k: v.to("cuda") for k, v in input_data.items()}
-    model.eval()
-    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-        output = model(**input_data)
-
-    torch.testing.assert_close(output.loss.detach().cpu(), torch.tensor(2.4), atol=1e-1, rtol=1e-2)
-
-
-def test_hf_reinitialized_model_loss(input_data):
-    config = amp_hf.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
-    model = amp_hf.AMPLIFY(config)
-    model.to("cuda", dtype=torch.bfloat16)
-    input_data = {k: v.to("cuda") for k, v in input_data.items()}
-    model.eval()
-    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-        output = model(**input_data)
-
-    loss = output.loss.detach().cpu()
-    assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"
-
-
-def test_te_reinitialized_model_loss(input_data):
-    config = amp_te.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
-    model = amp_te.AMPLIFYForMaskedLM(config)
-    model.to("cuda", dtype=torch.bfloat16)
-    input_data = {k: v.to("cuda") for k, v in input_data.items()}
-    model.eval()
-    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-        output = model(**input_data)
-
-    loss = output.loss.detach().cpu()
-    assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"
@@ -10,9 +10,6 @@ num_machines: 1
 num_processes: 1
 rdzv_backend: c10d
 same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
 use_cpu: false
 dynamo_config:
   dynamo_backend: "NO"
@@ -0,0 +1,15 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 1
+rdzv_backend: c10d
+same_network: true
+use_cpu: false
+dynamo_config:
+  dynamo_backend: INDUCTOR
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: fp8
+fp8_config:
+  amax_compute_algorithm: max
+  amax_history_length: 1024
+  backend: TE
+  fp8_format: HYBRID
+  interval: 1
+  margin: 0
+  override_linear_precision:
+  - false
+  - false
+  - false
+  use_autocast_during_eval: false
+num_machines: 1
+num_processes: 1
+rdzv_backend: c10d
+same_network: true
+use_cpu: false
@@ -15,7 +15,4 @@ num_machines: 1
 num_processes: 1
 rdzv_backend: c10d
 same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
 use_cpu: false
@@ -15,7 +15,4 @@ num_machines: 1
 num_processes: 1
 rdzv_backend: c10d
 same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
 use_cpu: false
@@ -16,7 +16,4 @@ num_machines: 1
 num_processes: 1
 rdzv_backend: c10d
 same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
 use_cpu: false
@@ -16,7 +16,4 @@ num_machines: 1
 num_processes: 1
 rdzv_backend: c10d
 same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
 use_cpu: false
@@ -16,39 +16,40 @@
 # Create the datasets -- the data source is whatever the caller passes to `load_dataset` (for
 # example, a simple parquet file with some raw protein sequences stored in the repo itself to
 # avoid external dependencies).
 
-from pathlib import Path
-
-from datasets import load_dataset
+from datasets import IterableDataset, load_dataset
 from transformers import AutoTokenizer
 from transformers.data.data_collator import DataCollatorForLanguageModeling
 
 
-def infinite_dataloader(dataloader, sampler):
-    """Create an infinite iterator that automatically restarts at the end of each epoch."""
-    epoch = 0
-    while True:
-        sampler.set_epoch(epoch)  # Update epoch for proper shuffling
-        for batch in dataloader:
-            yield batch
-        epoch += 1  # Increment epoch counter after completing one full pass
-
-
-def create_datasets_and_collator(tokenizer_name: str, max_length: int = 1024):
-    """Create a dataloader for the dataset.
+def create_datasets_and_collator(
+    tokenizer_name: str,
+    train_load_dataset_kwargs: dict,
+    eval_load_dataset_kwargs: dict,
+    max_seq_length: int = 1024,
+    truncate_eval_dataset: int | None = None,
+):
+    """Create datasets and a data collator to pass to the huggingface trainer.
 
     Args:
         tokenizer_name: The name of the tokenizer to pull from the HuggingFace Hub.
-        max_length: The maximum length of the protein sequences.
+        train_load_dataset_kwargs: Keyword arguments to pass to `load_dataset` for the train dataset.
+        eval_load_dataset_kwargs: Keyword arguments to pass to `load_dataset` for the eval dataset.
+        max_seq_length: The maximum length of the protein sequences.
+        truncate_eval_dataset: If not `None`, the eval dataset will be truncated to this number of examples.
+
+    This assumes that the dataset has a "sequence" column that will be tokenized.
 
     Returns:
         Tuple of (train_dataset, eval_dataset, data_collator).
     """
-    # We copy this parquet file to the container to avoid external dependencies, modify if you're
-    # using a local dataset. If you're reading this and scaling up the dataset to a larger size,
-    # look into `set_transform` and other streaming options from the `datasets` library.
-    data_path = Path(__file__).parent / "train.parquet"
-    train_dataset = load_dataset("parquet", data_files=data_path.as_posix(), split="train")
-    eval_dataset = train_dataset.select(range(10))
+    train_dataset = load_dataset(**train_load_dataset_kwargs)
+    eval_dataset = load_dataset(**eval_load_dataset_kwargs)
+    if truncate_eval_dataset is not None:
+        if isinstance(eval_dataset, IterableDataset):
+            raise ValueError(
+                "Cannot truncate an IterableDataset, don't use streaming datasets for eval if you want to truncate."
+            )
+        eval_dataset = eval_dataset.select(range(truncate_eval_dataset))
 
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
 
@@ -58,17 +59,24 @@ def tokenize_function(examples):
             examples["sequence"],
             truncation=True,
             padding="max_length",
-            max_length=max_length,
-            return_tensors="pt",
+            max_length=max_seq_length,
         )
 
-    for dataset in [train_dataset, eval_dataset]:
-        dataset.set_transform(tokenize_function)
+    train_dataset = train_dataset.map(
+        tokenize_function,
+        batched=True,
+        remove_columns=train_dataset.column_names,
+    )
+    eval_dataset = eval_dataset.map(
+        tokenize_function,
+        batched=True,
+        remove_columns=eval_dataset.column_names,
+    )
 
     data_collator = DataCollatorForLanguageModeling(
         tokenizer=tokenizer,
         mlm_probability=0.15,
-        pad_to_multiple_of=max_length,
+        pad_to_multiple_of=max_seq_length,
     )
 
     return train_dataset, eval_dataset, data_collator