Skip to content

Commit cca7577

Browse files
authored
pin transformers, increase tolerance in gradient check (#1441)
Fix a few issues in recipes ci Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent 7c74523 commit cca7577

3 files changed

Lines changed: 6 additions & 3 deletions

File tree

bionemo-recipes/models/amplify/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ dependencies = [
1818
"pytest",
1919
"torch==2.6.0a0+ecf3bae40a.nv25.01",
2020
"transformer_engine[pytorch]",
21-
"transformers",
21+
"transformers<5.0",
2222
"xformers",
2323
]
2424

bionemo-recipes/models/esm2/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ dependencies = [
1919
"torch",
2020
"torchao!=0.14.0",
2121
"transformer_engine[pytorch]",
22-
"transformers",
22+
"transformers<5.0",
2323
]
2424

2525

bionemo-recipes/recipes/esm2_native_te/tests/test_stop_and_go.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ def is_main_process(self):
3939

4040

4141
def test_stop_and_go_checkpointing_and_dataloader_restoration_single_gpu(tmp_path):
42+
# Set the seed for reproducibility
43+
torch.manual_seed(42)
44+
4245
# Setup the dataloader
4346
tokenizer_name = "facebook/esm2_t6_8M_UR50D"
4447
load_dataset_kwargs = {
@@ -266,7 +269,7 @@ def test_stop_and_go_checkpointing_and_dataloader_restoration_single_gpu(tmp_pat
266269

267270
reference_grads_step_10 = torch.load(f"{step10_path_reference}_grads.pt")
268271
reloaded_grads_step_5 = torch.load(f"{step5_path_reloaded}_grads.pt")
269-
torch.testing.assert_close(reference_grads_step_10, reloaded_grads_step_5, atol=1e-2, rtol=1e-2)
272+
torch.testing.assert_close(reference_grads_step_10, reloaded_grads_step_5, atol=1e-2, rtol=2e-2)
270273

271274
shutil.rmtree(step5_path_reference)
272275
shutil.rmtree(step10_path_reference)

0 commit comments

Comments
 (0)