We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent e985338 commit 9c956cdCopy full SHA for 9c956cd
1 file changed
recipes/esm2_accelerate/dataset.py
@@ -53,26 +53,23 @@ def create_datasets_and_collator(
53
54
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
55
56
- def tokenize_function(sequence):
+ def tokenize_function(examples):
57
"""Tokenize the protein sequences."""
58
return tokenizer(
59
- sequence,
+ examples["sequence"],
60
truncation=True,
61
padding="max_length",
62
max_length=max_seq_length,
63
- return_tensors="pt",
64
)
65
66
train_dataset = train_dataset.map(
67
tokenize_function,
68
batched=True,
69
- input_columns=["sequence"],
70
remove_columns=train_dataset.column_names,
71
72
eval_dataset = eval_dataset.map(
73
74
75
76
remove_columns=eval_dataset.column_names,
77
78
0 commit comments