@@ -2,6 +2,7 @@
 
 import json
 import logging
+import math
 import re
 from typing import Any
 
@@ -111,7 +112,7 @@ def _process_tokenizer(
     """Process the WordPiece tokenizer JSON."""
     tokenizer_json["model"]["type"] = "Unigram"
     tokenizer_json["model"]["unk_id"] = pre_tokenized_tokens.index(unk_token) if unk_token else None
-    tokenizer_json["model"]["vocab"] = [(token, 0.0) for token in pre_tokenized_tokens]
+    tokenizer_json["model"]["vocab"] = [(token, min(0.0, math.log(0.1 * len(token)))) for token in pre_tokenized_tokens]
 
     return tokenizer_json
 
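For intuition, here is a minimal sketch (not part of the diff) of the scores the new formula assigns. Instead of a flat 0.0 for every entry, longer tokens get less negative log-scores, capped at 0.0 from ten characters on, which nudges Unigram decoding toward longer vocabulary matches:

```python
import math

# Scores under min(0.0, math.log(0.1 * len(token))); example tokens are made up.
for token in ("a", "ab", "hello", "vocabulary", "internationalization"):
    score = min(0.0, math.log(0.1 * len(token)))
    print(f"{token!r:24} len={len(token):2} score={score:.4f}")
# 'a'                      len= 1 score=-2.3026
# 'ab'                     len= 2 score=-1.6094
# 'hello'                  len= 5 score=-0.6931
# 'vocabulary'             len=10 score=0.0000
# 'internationalization'   len=20 score=0.0000  (log(2.0) > 0, so the cap applies)
```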
@@ -170,7 +171,7 @@ def replace_vocabulary(
 def _rename_added_token(
     form: str | None, new_form: str, added_tokens: list[dict[str, Any]], vocabulary: list[str]
 ) -> list[dict[str, Any]]:
-    """Rename special tokens in the tokenizer."""
+    """Rename added tokens in the tokenizer."""
     if form is None:
         return added_tokens
 
@@ -228,7 +229,11 @@ def clean_and_create_vocabulary(
             n_duplicates += 1
             continue
 
-        # After checking the token exists, we need to future-normalize it.
+        # After checking the token exists, we need to normalize it into the token
+        # it will become. For byte tokens, this means we don't do anything. For
+        # other types of tokens, we will insert a metaspace.
+        # In the case of multiword tokens, we replace any spaces with the metaspace
+        # or byte prefix token.
         if not normalized_token.startswith(("▁", "Ġ")):
             normalized_token = normalized_token.replace(" ", "▁")
             normalized_token = f"▁{normalized_token}"
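A standalone sketch of the normalization step above, using a hypothetical `normalize` helper and made-up example tokens (`▁` is the metaspace, `Ġ` the byte-level space prefix):

```python
def normalize(token: str) -> str:
    # Hypothetical helper mirroring the diff: byte tokens and tokens that
    # already carry a metaspace are left untouched.
    if token.startswith(("▁", "Ġ")):
        return token
    # Multiword tokens: internal spaces become metaspaces, then the whole
    # token is prefixed with one.
    token = token.replace(" ", "▁")
    return f"▁{token}"

print(normalize("hello"))     # ▁hello
print(normalize("new york"))  # ▁new▁york
print(normalize("Ġhello"))    # Ġhello  (byte token, unchanged)
print(normalize("▁world"))    # ▁world  (already metaspace-prefixed)
```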