@@ -71,15 +71,15 @@ def _make_new_merges_from_vocab(
7171 merges : list [tuple [str , str ]], tokens : list [str ], special_tokens : set [str | None ]
7272) -> list [tuple [str , str ]]:
7373 """
74- Generate new merges (bigrams) from a vocabulary.
74+ Generate new merges from a vocabulary.
7575
76- This function creates new merge pairs (bigrams) from a given vocabulary of tokens.
76+ This function creates new merge pairs from a given vocabulary of tokens.
7777 The merges are used to build or extend a tokenizer's merge table.
7878
79- :param merges: The list of existing merges in the form " first second" where first and second are tokens.
79+ :param merges: The list of existing merges in the form ( first, second) where first and second are tokens.
8080 :param tokens: The list of tokens (vocabulary) from which to generate new merges.
81- :param special_tokens: Tokens that are not merged.
82- :return: The list of new merges in the form " first second" where first and second are tokens.
81+ :param special_tokens: Tokens that should not be merged.
82+ :return: The list of new merges in the form ( first, second) where first and second are tokens.
8383 """
8484 new_merges = merges .copy ()
8585 current_vocab = set (tokens ) - special_tokens
@@ -93,8 +93,8 @@ def _make_new_merges_from_vocab(
9393 if len (token ) == 1 :
9494 continue
9595 merges = []
96- for needle in range (1 , len (token )):
97- first , second = token [:needle ], token [needle :]
96+ for index in range (1 , len (token )):
97+ first , second = token [:index ], token [index :]
9898 if first in current_vocab and second in current_vocab :
9999 merges .append ((first , second ))
100100 if not merges :
0 commit comments