Skip to content

Commit bf50543

Browse files
committed
Merge branch 'main' of github.com:NVIDIA/bionemo-framework into jstjohn/cp_support_predict
2 parents 5b4108a + 375a44b commit bf50543

23 files changed

Lines changed: 582 additions & 407 deletions

README.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,8 @@ With a locally cloned repository and initialized submodules, build the BioNeMo c
8282
docker buildx build . -t my-container-tag
8383
```
8484

85+
If the build fails with an error message like `No file descriptors available (os error 24)`, raise the open-file limit by adding the option `--ulimit nofile=65535:65535` to the `docker buildx build` command.
86+
8587
#### VSCode Devcontainer for Interactive Debugging
8688

8789
We distribute a [development container](https://devcontainers.github.io/) configuration for vscode

models/amplify/tests/test_amplify_model.py

Lines changed: 0 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -168,52 +168,3 @@ def test_convert_state_dict():
168168
te_state_dict_keys.remove("decoder.bias")
169169

170170
assert len(te_state_dict_keys) == 0
171-
172-
173-
def test_hf_trained_model_loss(input_data):
174-
model = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
175-
model.to("cuda", dtype=torch.bfloat16)
176-
input_data = {k: v.to("cuda") for k, v in input_data.items()}
177-
model.eval()
178-
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
179-
output = model(**input_data)
180-
181-
torch.testing.assert_close(output.loss.detach().cpu(), torch.tensor(2.4), atol=1e-1, rtol=1e-2)
182-
183-
184-
def test_te_trained_model_loss(input_data):
185-
model_hf = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
186-
model = convert_amplify_hf_to_te(model_hf)
187-
model.to("cuda", dtype=torch.bfloat16)
188-
input_data = {k: v.to("cuda") for k, v in input_data.items()}
189-
model.eval()
190-
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
191-
output = model(**input_data)
192-
193-
torch.testing.assert_close(output.loss.detach().cpu(), torch.tensor(2.4), atol=1e-1, rtol=1e-2)
194-
195-
196-
def test_hf_reinitialized_model_loss(input_data):
197-
config = amp_hf.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
198-
model = amp_hf.AMPLIFY(config)
199-
model.to("cuda", dtype=torch.bfloat16)
200-
input_data = {k: v.to("cuda") for k, v in input_data.items()}
201-
model.eval()
202-
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
203-
output = model(**input_data)
204-
205-
loss = output.loss.detach().cpu()
206-
assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"
207-
208-
209-
def test_te_reinitialized_model_loss(input_data):
210-
config = amp_te.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
211-
model = amp_te.AMPLIFYForMaskedLM(config)
212-
model.to("cuda", dtype=torch.bfloat16)
213-
input_data = {k: v.to("cuda") for k, v in input_data.items()}
214-
model.eval()
215-
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
216-
output = model(**input_data)
217-
218-
loss = output.loss.detach().cpu()
219-
assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"

recipes/esm2_accelerate/accelerate_config/default.yaml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,6 @@ num_machines: 1
1010
num_processes: 1
1111
rdzv_backend: c10d
1212
same_network: true
13-
tpu_env: []
14-
tpu_use_cluster: false
15-
tpu_use_sudo: false
1613
use_cpu: false
1714
dynamo_config:
1815
dynamo_backend: "NO"
Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,15 @@
1+
compute_environment: LOCAL_MACHINE
2+
debug: false
3+
distributed_type: MULTI_GPU
4+
downcast_bf16: 'no'
5+
enable_cpu_affinity: false
6+
machine_rank: 0
7+
main_training_function: main
8+
mixed_precision: bf16
9+
num_machines: 1
10+
num_processes: 1
11+
rdzv_backend: c10d
12+
same_network: true
13+
use_cpu: false
14+
dynamo_config:
15+
dynamo_backend: INDUCTOR
Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,25 @@
1+
compute_environment: LOCAL_MACHINE
2+
debug: false
3+
distributed_type: MULTI_GPU
4+
downcast_bf16: 'no'
5+
enable_cpu_affinity: false
6+
machine_rank: 0
7+
main_training_function: main
8+
mixed_precision: fp8
9+
fp8_config:
10+
amax_compute_algorithm: max
11+
amax_history_length: 1024
12+
backend: TE
13+
fp8_format: HYBRID
14+
interval: 1
15+
margin: 0
16+
override_linear_precision:
17+
- false
18+
- false
19+
- false
20+
use_autocast_during_eval: false
21+
num_machines: 1
22+
num_processes: 1
23+
rdzv_backend: c10d
24+
same_network: true
25+
use_cpu: false

recipes/esm2_accelerate/accelerate_config/fsdp1_hf.yaml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,4 @@ num_machines: 1
1515
num_processes: 1
1616
rdzv_backend: c10d
1717
same_network: true
18-
tpu_env: []
19-
tpu_use_cluster: false
20-
tpu_use_sudo: false
2118
use_cpu: false

recipes/esm2_accelerate/accelerate_config/fsdp1_te.yaml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,4 @@ num_machines: 1
1515
num_processes: 1
1616
rdzv_backend: c10d
1717
same_network: true
18-
tpu_env: []
19-
tpu_use_cluster: false
20-
tpu_use_sudo: false
2118
use_cpu: false

recipes/esm2_accelerate/accelerate_config/fsdp2_hf.yaml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,4 @@ num_machines: 1
1616
num_processes: 1
1717
rdzv_backend: c10d
1818
same_network: true
19-
tpu_env: []
20-
tpu_use_cluster: false
21-
tpu_use_sudo: false
2219
use_cpu: false

recipes/esm2_accelerate/accelerate_config/fsdp2_te.yaml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,4 @@ num_machines: 1
1616
num_processes: 1
1717
rdzv_backend: c10d
1818
same_network: true
19-
tpu_env: []
20-
tpu_use_cluster: false
21-
tpu_use_sudo: false
2219
use_cpu: false

recipes/esm2_accelerate/dataset.py

Lines changed: 35 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -16,39 +16,40 @@
1616
# Create the dataset -- here, we just use a simple parquet file with some raw protein sequences
1717
# stored in the repo itself to avoid external dependencies.
1818

19-
from pathlib import Path
20-
21-
from datasets import load_dataset
19+
from datasets import IterableDataset, load_dataset
2220
from transformers import AutoTokenizer
2321
from transformers.data.data_collator import DataCollatorForLanguageModeling
2422

2523

26-
def infinite_dataloader(dataloader, sampler):
27-
"""Create an infinite iterator that automatically restarts at the end of each epoch."""
28-
epoch = 0
29-
while True:
30-
sampler.set_epoch(epoch) # Update epoch for proper shuffling
31-
for batch in dataloader:
32-
yield batch
33-
epoch += 1 # Increment epoch counter after completing one full pass
34-
35-
36-
def create_datasets_and_collator(tokenizer_name: str, max_length: int = 1024):
37-
"""Create a dataloader for the dataset.
24+
def create_datasets_and_collator(
25+
tokenizer_name: str,
26+
train_load_dataset_kwargs: dict,
27+
eval_load_dataset_kwargs: dict,
28+
max_seq_length: int = 1024,
29+
truncate_eval_dataset: int | None = None,
30+
):
31+
"""Create datasets and a data collator to pass to the huggingface trainer.
3832
3933
Args:
4034
tokenizer_name: The name of the tokenizer to pull from the HuggingFace Hub.
41-
max_length: The maximum length of the protein sequences.
35+
train_load_dataset_kwargs: Keyword arguments to pass to `load_dataset` for the train dataset.
36+
eval_load_dataset_kwargs: Keyword arguments to pass to `load_dataset` for the eval dataset.
37+
max_seq_length: The maximum length of the protein sequences.
38+
truncate_eval_dataset: If not `None`, the eval dataset will be truncated to this number of examples.
39+
40+
This assumes that the dataset has a "sequence" column that will be tokenized.
4241
4342
Returns:
4443
Tuple of (train_dataset, eval_dataset, data_collator).
4544
"""
46-
# We copy this parquet file to the container to avoid external dependencies, modify if you're
47-
# using a local dataset. If you're reading this and scaling up the dataset to a larger size,
48-
# look into `set_transform` and other streaming options from the `datasets` library.
49-
data_path = Path(__file__).parent / "train.parquet"
50-
train_dataset = load_dataset("parquet", data_files=data_path.as_posix(), split="train")
51-
eval_dataset = train_dataset.select(range(10))
45+
train_dataset = load_dataset(**train_load_dataset_kwargs)
46+
eval_dataset = load_dataset(**eval_load_dataset_kwargs)
47+
if truncate_eval_dataset is not None:
48+
if isinstance(eval_dataset, IterableDataset):
49+
raise ValueError(
50+
"Cannot truncate an IterableDataset, don't use streaming datasets for eval if you want to truncate."
51+
)
52+
eval_dataset = eval_dataset.select(range(truncate_eval_dataset))
5253

5354
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
5455

@@ -58,17 +59,24 @@ def tokenize_function(examples):
5859
examples["sequence"],
5960
truncation=True,
6061
padding="max_length",
61-
max_length=max_length,
62-
return_tensors="pt",
62+
max_length=max_seq_length,
6363
)
6464

65-
for dataset in [train_dataset, eval_dataset]:
66-
dataset.set_transform(tokenize_function)
65+
train_dataset = train_dataset.map(
66+
tokenize_function,
67+
batched=True,
68+
remove_columns=train_dataset.column_names,
69+
)
70+
eval_dataset = eval_dataset.map(
71+
tokenize_function,
72+
batched=True,
73+
remove_columns=eval_dataset.column_names,
74+
)
6775

6876
data_collator = DataCollatorForLanguageModeling(
6977
tokenizer=tokenizer,
7078
mlm_probability=0.15,
71-
pad_to_multiple_of=max_length,
79+
pad_to_multiple_of=max_seq_length,
7280
)
7381

7482
return train_dataset, eval_dataset, data_collator

0 commit comments

Comments
 (0)