Skip to content

Commit 9a887a3

Browse files
authored
Add median token length as limit (#47)
1 parent 6b8a1be commit 9a887a3

1 file changed

Lines changed: 5 additions & 0 deletions

File tree

model2vec/model.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@ def __init__(
5656
else:
5757
self.unk_token_id = None
5858

59+
self.median_token_length = int(np.median([len(token) for token in self.tokens]))
5960
self.config = config
6061
self.base_model_name = base_model_name
6162
self.language = language
@@ -123,6 +124,10 @@ def tokenize(self, sentences: list[str], max_length: int | None = None) -> tuple
123124
:param max_length: The maximum length of the sentence.
124125
:return: The tokens.
125126
"""
127+
if max_length is not None:
128+
m = max_length * self.median_token_length
129+
sentences = [sentence[:m] for sentence in sentences]
130+
126131
encodings: list[Encoding] = self.tokenizer.encode_batch(sentences, add_special_tokens=False)
127132
encodings_ids = [encoding.ids for encoding in encodings]
128133

0 commit comments

Comments (0)