Skip to content

Commit 3a5fe70

Browse files
committed
comments/docstring
1 parent 70cf1f8 commit 3a5fe70

1 file changed

Lines changed: 7 additions & 7 deletions

File tree

model2vec/distill/tokenizer.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,15 +71,15 @@ def _make_new_merges_from_vocab(
7171
merges: list[tuple[str, str]], tokens: list[str], special_tokens: set[str | None]
7272
) -> list[tuple[str, str]]:
7373
"""
74-
Generate new merges (bigrams) from a vocabulary.
74+
Generate new merges from a vocabulary.
7575
76-
This function creates new merge pairs (bigrams) from a given vocabulary of tokens.
76+
This function creates new merge pairs from a given vocabulary of tokens.
7777
The merges are used to build or extend a tokenizer's merge table.
7878
79-
:param merges: The list of existing merges in the form "first second" where first and second are tokens.
79+
:param merges: The list of existing merges in the form (first, second) where first and second are tokens.
8080
:param tokens: The list of tokens (vocabulary) from which to generate new merges.
81-
:param special_tokens: Tokens that are not merged.
82-
:return: The list of new merges in the form "first second" where first and second are tokens.
81+
:param special_tokens: Tokens that should not be merged.
82+
:return: The list of new merges in the form (first, second) where first and second are tokens.
8383
"""
8484
new_merges = merges.copy()
8585
current_vocab = set(tokens) - special_tokens
@@ -93,8 +93,8 @@ def _make_new_merges_from_vocab(
9393
if len(token) == 1:
9494
continue
9595
merges = []
96-
for needle in range(1, len(token)):
97-
first, second = token[:needle], token[needle:]
96+
for index in range(1, len(token)):
97+
first, second = token[:index], token[index:]
9898
if first in current_vocab and second in current_vocab:
9999
merges.append((first, second))
100100
if not merges:

0 commit comments

Comments
 (0)