Skip to content

Commit 4757718

Browse files
committed
lstm convert unknown to pad token
1 parent 94af466 commit 4757718

1 file changed

Lines changed: 1 addition & 2 deletions

File tree

torch_molecule/predictor/lstm/token_from_smiles.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#Dictionary mapping each character to a corresponding token (integer).
2-
# TODO: convert unknown elements to the pad token
32
char_dic = {
43
'<pad>':0,
54
'#': 1, # Triple bond
@@ -74,7 +73,7 @@ def create_tensor_dataset(string_list, input_len, pad_token=0):
7473

7574
for s in string_list:
7675
# Convert each character in the string to a token
77-
tokens = [char_dic[char] for char in s]
76+
tokens = [char_dic.get(char, pad_token) for char in s]
7877
# Pad the token sequence if it's shorter than input_len; otherwise, truncate it
7978
if len(tokens) < input_len:
8079
tokens = tokens + [pad_token] * (input_len - len(tokens))

0 commit comments

Comments
 (0)