Skip to content

Commit 9c956cd

Browse files
committed
coderabbit suggestions
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent e985338 commit 9c956cd

1 file changed

Lines changed: 2 additions & 5 deletions

File tree

recipes/esm2_accelerate/dataset.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -53,26 +53,23 @@ def create_datasets_and_collator(
5353

5454
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
5555

56-
def tokenize_function(sequence):
56+
def tokenize_function(examples):
5757
"""Tokenize the protein sequences."""
5858
return tokenizer(
59-
sequence,
59+
examples["sequence"],
6060
truncation=True,
6161
padding="max_length",
6262
max_length=max_seq_length,
63-
return_tensors="pt",
6463
)
6564

6665
train_dataset = train_dataset.map(
6766
tokenize_function,
6867
batched=True,
69-
input_columns=["sequence"],
7068
remove_columns=train_dataset.column_names,
7169
)
7270
eval_dataset = eval_dataset.map(
7371
tokenize_function,
7472
batched=True,
75-
input_columns=["sequence"],
7673
remove_columns=eval_dataset.column_names,
7774
)
7875

0 commit comments

Comments
 (0)