Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 30 additions & 8 deletions asmtransformers/scripts/tokenize_dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import sys
import argparse

import datasets

Expand All @@ -17,19 +17,41 @@ def do_tokenize(function):
return dataset.map(do_tokenize, **map_kwargs)


def main(tokenizer, input_data, output_folder, split=None):
    """Tokenize a dataset stored on disk and save the tokenized result.

    Args:
        tokenizer: Value handed to ``ASMTokenizer.from_pretrained``.
        input_data: Path handed to ``datasets.load_from_disk``.
        output_folder: Path the tokenized dataset is saved to.
        split: Optional test-set size forwarded as ``test_size`` to
            ``train_test_split``. A falsy value (``None`` or ``0``) leaves
            the loaded data unsplit.
    """
    tokenizer = ASMTokenizer.from_pretrained(tokenizer)

    print('loading dataset')
    dataset = datasets.load_from_disk(input_data)
    if split:
        # NOTE(review): this is called on whatever load_from_disk returned;
        # presumably only a single Dataset supports it -- confirm.
        dataset = dataset.train_test_split(test_size=split)

    print('tokenizing dataset')
    # At this point the data is either a single Dataset or a mapping of
    # named subsets; each subset of a mapping is tokenized on its own.
    if isinstance(dataset, datasets.Dataset):
        dataset = tokenize(tokenizer, dataset)
    else:
        for subset in dataset:
            dataset[subset] = tokenize(tokenizer, dataset[subset])

    dataset.save_to_disk(output_folder)


def _split_fraction(text):
    """Convert the ``--split`` value to a float and require 0 <= value <= 1."""
    try:
        value = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f'invalid float value: {text!r}') from None
    # The negated chained comparison also rejects NaN.
    if not 0.0 <= value <= 1.0:
        raise argparse.ArgumentTypeError(f'must be a number between 0 and 1, got {text!r}')
    return value


def get_parser():
    """Build the command-line parser for the tokenization script.

    Returns:
        An ``argparse.ArgumentParser`` with the required string options
        ``--tokenizer``, ``--input-data`` and ``--output-folder`` and the
        optional float option ``--split`` (``None`` when omitted). Values
        of ``--split`` outside the documented 0..1 range are rejected at
        parse time instead of failing after the data has been loaded.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--tokenizer', type=str, required=True, help='folder with tokenizer')
    parser.add_argument('--input-data', type=str, required=True, help='data to be used for training')
    parser.add_argument('--output-folder', type=str, required=True, help='folder to leave the tokenized data')
    parser.add_argument(
        '--split',
        type=_split_fraction,
        required=False,
        help='split between train and test; define percentage of test data as number between 0 and 1',
    )

    return parser


if __name__ == '__main__':
    # Parse the command line and forward every option to main() by keyword.
    cli_options = vars(get_parser().parse_args())
    main(**cli_options)
Loading