@@ -12,18 +12,25 @@ def __init__(self, model_dir_path: str):
1212 )
1313
def __call__(self, prompt: str, return_tensors: str = None, **kwargs) -> dict:
    """Tokenize an already chat-templated prompt without re-adding special tokens.

    Args:
        prompt: Text to tokenize. Expected to be the string rendered by
            ``apply_chat_template(tokenize=False)``, which already begins
            with the ``<|begin_of_text|>`` (BOS) token.
        return_tensors: ``None`` to return the wrapped tokenizer's plain
            output, ``"infini"`` to return a dict of infinicore tensors, or
            a framework code handled by the wrapped tokenizer (``"pt"``,
            ``"np"`` or ``"tf"``).
        **kwargs: Accepted for call-site compatibility; currently not
            forwarded to the wrapped tokenizer.

    Returns:
        The wrapped tokenizer's encoding. For ``"infini"``, a plain dict
        mapping each encoding key to a tensor converted with
        ``infinicore.from_torch``.
    """
    # add_special_tokens=False prevents a duplicate BOS token for
    # Llama-3/3.1 models. The prompt rendered by the chat template already
    # contains BOS, and `LlamaTokenizerFast` defaults to `add_bos_token=True`,
    # so tokenizing with the default `add_special_tokens=True` would prepend
    # a second BOS. That shifts the RoPE positional encodings by 1 and makes
    # greedy decoding outputs diverge significantly from HuggingFace.
    encode_options = {"add_special_tokens": False}

    if return_tensors is None:
        return self.tokenizer(prompt, **encode_options)

    if return_tensors == "infini":
        # Imported lazily so the other code paths do not require infinicore.
        import infinicore

        # Tokenize to torch tensors first, then convert each one.
        encoded = self.tokenizer(prompt, return_tensors="pt", **encode_options)
        return {
            key: infinicore.from_torch(tensor) for key, tensor in encoded.items()
        }

    # "pt", "np" or "tf": honor the framework the caller asked for instead of
    # always producing torch tensors.
    return self.tokenizer(prompt, return_tensors=return_tensors, **encode_options)
2734
2835 def apply_chat_template (
2936 self ,
0 commit comments