Skip to content

Commit 52dacd6

Browse files
committed
Merge remote-tracking branch 'origin/main' into evo2_gpt_inference
2 parents 8d22da6 + 4a47964 commit 52dacd6

61 files changed

Lines changed: 1865 additions & 319 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.devcontainer/recipes/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ megatron-fsdp==0.1.0rc0
77
torchmetrics
88
tqdm
99
transformer_engine
10-
transformers
10+
transformers @ git+https://github.com/huggingface/transformers.git
1111
typer
1212
wandb
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
name: "BioNeMo Model Convergence Tests"
2+
3+
on:
4+
workflow_dispatch:
5+
6+
# run lepton tests
7+
# update dashboard
8+
9+
jobs:
10+
submit-lepton-jobs:
11+
runs-on: ubuntu-latest
12+
steps:
13+
- name: Checkout
14+
uses: actions/checkout@v4
15+
16+
- name: Submit Lepton Jobs
17+
run: |
18+
python ci/lepton/model_convergence/scripts/launch_job.py --config-name "evo2_finetune_lora"

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33
url = https://github.com/NVIDIA/Megatron-LM.git
44
[submodule "3rdparty/NeMo"]
55
path = 3rdparty/NeMo
6-
url = https://github.com/NVIDIA/NeMo.git
6+
url = https://github.com/NVIDIA-NeMo/NeMo.git
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
scope: partial-conv
2+
time_limit: 14400
3+
key_segments:
4+
# Modify keys to be renamed (str) or excluded (False) from the run identifier. By default, all args under script_args are included.
5+
dataset_config: False
6+
dataset_dir: False
7+
data_base_path: False
8+
num_workers: False
9+
limit_val_batches: False
10+
val_check_interval: False
11+
experiment_name: False
12+
workspace: False
13+
restore_from_checkpoint_path: False
14+
activation_checkpoint_layers: False
15+
lora_enabled: False
16+
lr: False
17+
min_lr: False
18+
warmup_steps: False
19+
accumulate_grad_batches: False
20+
clip_grad: False
21+
weight_decay: False
22+
attention_dropout: False
23+
hidden_dropout: False
24+
precision: False
25+
seq_length: False
26+
script_args:
27+
# All arguments referenced in the script string must be specified here.
28+
# Arguments not referenced in the script string must have the 'arg' field specified.
29+
# See jet/core/configs.py for the specification of the configuration class
30+
workspace: /workspace/bionemo2
31+
data_base_path: /data/evo2
32+
restore_from_checkpoint_path: checkpoints/nemo2_evo2_1b_8k
33+
nodes: 1
34+
model: evo2
35+
config_name: 1b
36+
num_workers: 1
37+
limit_val_batches: 20
38+
dataset_config: training_data_config.yaml
39+
dataset_dir: preprocessed_data
40+
val_check_interval: 5
41+
seq_length: 8192
42+
warmup_steps: 10
43+
activation_checkpoint_layers: 2
44+
lr: 0.000015
45+
min_lr: 0.0000149
46+
accumulate_grad_batches: 4
47+
max_steps: 1000
48+
gpus: 1
49+
clip_grad: 250
50+
weight_decay: 0.001
51+
attention_dropout: 0.01
52+
hidden_dropout: 0.01
53+
stop_steps: 100
54+
batch_size: 2
55+
variant: finetune
56+
precision: fp8
57+
products:
58+
- variant: finetune
59+
lora_enabled: ""
60+
task: finetune_from_ckpt
61+
experiment_name: evo2-finetune
62+
- variant: lora_finetune
63+
lora_enabled: "--lora-finetune"
64+
task: lora_finetune_from_ckpt
65+
experiment_name: evo2-lora-finetune
66+
script: |-
67+
WANDB_API_KEY=$BIONEMO_WANDB_API_KEY train_${model} \
68+
-d ${data_base_path}/${dataset_config} \
69+
--dataset-dir=${data_base_path}/${dataset_dir} \
70+
--ckpt-dir=${data_base_path}/${restore_from_checkpoint_path} \
71+
${lora_enabled} \
72+
--model-size=${config_name} \
73+
--max-steps=${max_steps} \
74+
--experiment-name=${experiment_name}_${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s \
75+
--lr=${lr} \
76+
--min-lr=${min_lr} \
77+
--warmup-steps=${warmup_steps} \
78+
--result-dir=${tensorboard_dir} \
79+
--micro-batch-size=${batch_size} \
80+
--grad-acc-batches=${accumulate_grad_batches} \
81+
--limit-val-batches=${limit_val_batches} \
82+
--seq-length=${seq_length} \
83+
--clip-grad=${clip_grad} \
84+
--wd=${weight_decay} \
85+
--attention-dropout=${attention_dropout} \
86+
--hidden-dropout=${hidden_dropout} \
87+
--num-layers 4 \
88+
--hybrid-override-pattern 'SDH*' \
89+
--devices=${gpus} \
90+
--num-nodes=${nodes} \
91+
--val-check-interval=${val_check_interval} \
92+
--wandb-project=${wandb_project_name} \
93+
--wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
94+
--create-tensorboard-logger \
95+
--activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \
96+
--disable-checkpointing \
97+
--early-stop-on-step=${stop_steps} \
98+
--garbage-collect-at-inference;

ci/benchmarks/partial-conv/evo2_pretrain.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ script_args:
1515
# See jet/core/configs.py for the specification of the configuration class
1616
workspace: /workspace/bionemo2
1717
data_path: /data/evo2
18-
artefacts_url: https://__token__:${JET_GITLAB_TOKEN}@gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple
18+
artefacts_url: https://__token__:${{JET_GITLAB_TOKEN}}@gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple
1919
file_name_wheel: subquadratic-ops
2020
model: evo2
2121
variant: train

models/amplify/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
88
WORKDIR /workspace/bionemo
99
COPY . .
1010
RUN --mount=type=cache,target=/root/.cache/uv \
11-
uv pip install --system --break-system-packages -e .
11+
PIP_CONSTRAINT= pip install -e .

models/esm2/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
33
WORKDIR /workspace/bionemo
44
COPY . .
55
RUN --mount=type=cache,target=/root/.cache/uv \
6-
uv pip install --system --break-system-packages -e .
6+
PIP_CONSTRAINT= pip install -e .

models/esm2/pyproject.toml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,14 @@ dependencies = [
1313
"fiddle",
1414
"hydra-core",
1515
"lightning",
16-
"megatron-core",
17-
"nemo_toolkit[lightning]==2.3.1",
16+
"megatron-core@git+https://github.com/NVIDIA/Megatron-LM.git", # Currently at ToT until mfsdp is in a release.
17+
"megatron-fsdp",
18+
"nemo_toolkit[lightning]", # tested with 2.3.1
1819
"omegaconf",
1920
"pytest",
2021
"torch",
21-
# "transformer_engine[pytorch]",
22-
"transformers",
22+
"transformer_engine[pytorch]",
23+
"transformers<4.56", # TODO: fix me, currently failing with a modelopt import from nemo.
2324
]
2425

2526

models/esm2/src/esm/modeling_esm_te.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,12 @@ def __init__(self, config: NVEsmConfig):
138138
self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
139139
if config.position_embedding_type == "rotary":
140140
self.rotary_embeddings = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
141-
self.te_rope_emb = self.rotary_embeddings(max_seq_len=config.max_position_embeddings).cuda()
141+
# Keep on CPU, pin for faster non_blocking H2D; don't persist in state_dict.
142+
self.register_buffer(
143+
"te_rope_emb",
144+
self.rotary_embeddings(max_seq_len=config.max_position_embeddings).cpu().pin_memory(),
145+
persistent=False,
146+
)
142147
else:
143148
self.te_rope_emb = None
144149

@@ -157,14 +162,28 @@ def forward(
157162
"""
158163
all_hidden_states = () if output_hidden_states else None
159164

165+
if self.te_rope_emb is not None:
166+
te_rope_emb = self.te_rope_emb.to(
167+
device=hidden_states.device, dtype=hidden_states.dtype, non_blocking=True
168+
)
169+
seq_len = hidden_states.shape[1]
170+
if te_rope_emb.size(0) < seq_len:
171+
raise RuntimeError(
172+
f"ROPE length {te_rope_emb.size(0)} < input seq length {seq_len}. "
173+
f"Increase max_position_embeddings."
174+
)
175+
te_rope_emb = te_rope_emb[:seq_len]
176+
else:
177+
te_rope_emb = None
178+
160179
for layer_module in self.layers:
161180
if output_hidden_states:
162181
all_hidden_states = (*all_hidden_states, hidden_states)
163182

164183
hidden_states = layer_module(
165184
hidden_states,
166185
attention_mask,
167-
rotary_pos_emb=self.te_rope_emb,
186+
rotary_pos_emb=te_rope_emb,
168187
)
169188

170189
hidden_states = self.emb_layer_norm_after(hidden_states)

models/esm2/tests/conftest.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@ def tokenizer():
3434
return AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
3535

3636

37-
@pytest.fixture
38-
def input_data(tokenizer):
37+
def get_input_data(tokenizer):
3938
torch.manual_seed(42)
4039

4140
test_proteins = [
@@ -87,3 +86,8 @@ def tokenize_function(examples):
8786

8887
batch = next(iter(dataloader))
8988
return batch
89+
90+
91+
@pytest.fixture
92+
def input_data(tokenizer):
93+
return get_input_data(tokenizer)

0 commit comments

Comments
 (0)