Skip to content

Commit 13a95dc

Browse files
committed
Adjust arcane formulae
1 parent e06c5d9 commit 13a95dc

1 file changed

Lines changed: 8 additions & 3 deletions

File tree

model2vec/distill/tokenizer.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
```diff
@@ -2,6 +2,7 @@
 import json
 import logging
+import math
 import re
 from typing import Any
```

```diff
@@ -111,7 +112,7 @@ def _process_tokenizer(
     """Process the WordPiece tokenizer JSON."""
     tokenizer_json["model"]["type"] = "Unigram"
     tokenizer_json["model"]["unk_id"] = pre_tokenized_tokens.index(unk_token) if unk_token else None
-    tokenizer_json["model"]["vocab"] = [(token, 0.0) for token in pre_tokenized_tokens]
+    tokenizer_json["model"]["vocab"] = [(token, min(0.0, math.log(0.1 * len(token)))) for token in pre_tokenized_tokens]

     return tokenizer_json
```
```diff
@@ -170,7 +171,7 @@ def replace_vocabulary(
 def _rename_added_token(
     form: str | None, new_form: str, added_tokens: list[dict[str, Any]], vocabulary: list[str]
 ) -> list[dict[str, Any]]:
-    """Rename special tokens in the tokenizer."""
+    """Rename added tokens in the tokenizer."""
     if form is None:
         return added_tokens
```
```diff
@@ -228,7 +229,11 @@ def clean_and_create_vocabulary(
             n_duplicates += 1
             continue

-        # After checking the token exists, we need to future-normalize it.
+        # After checking the token exists, we need to normalize it into the token
+        # it will become. For byte tokens, this means we don't do anything. For
+        # other types of tokens, we will insert a metaspace.
+        # In the case of multiword tokens, we replace any spaces with the metaspace
+        # or byte prefix token.
         if not normalized_token.startswith(("▁", "Ġ")):
             normalized_token = normalized_token.replace(" ", "▁")
             normalized_token = f"▁{normalized_token}"
```

0 commit comments

Comments (0)