Skip to content

Commit 1b44040

Browse files
authored
Merge pull request #389 from rubik-hua/double_bos
issue/388 [BugFix](basic_llm_processor): prevent duplicate BOS token in Llama-3/3.1 chat
2 parents 160894e + e0110e1 commit 1b44040

1 file changed

Lines changed: 10 additions & 3 deletions

File tree

python/infinilm/processors/basic_llm_processor.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,25 @@ def __init__(self, model_dir_path: str):
1212
)
1313

1414
def __call__(self, prompt: str, return_tensors: str = None, **kwargs) -> dict:
15+
# Pass add_special_tokens=False to prevent a duplicate BOS token for Llama-3/3.1 models.
16+
# The `prompt` string here is already rendered by `apply_chat_template(tokenize=False)`,
17+
# which explicitly includes the `<|begin_of_text|>` (BOS) token at the start.
18+
# Since `LlamaTokenizerFast` defaults to `add_bos_token=True`, calling the tokenizer
19+
# with the default `add_special_tokens=True` would prepend a second BOS token.
20+
# This shifts the RoPE positional encodings by 1 and causes greedy decoding outputs
21+
# to diverge significantly from HuggingFace, so automatic special-token insertion must be explicitly disabled.
1522
if return_tensors is None:
16-
return self.tokenizer(prompt)
23+
return self.tokenizer(prompt, add_special_tokens=False)
1724
elif return_tensors == "infini":
1825
import infinicore
1926

2027
result = {}
21-
for key, tensor in self.tokenizer(prompt, return_tensors="pt").items():
28+
for key, tensor in self.tokenizer(prompt, return_tensors="pt", add_special_tokens=False).items():
2229
result[key] = tensor.from_torch(tensor)
2330
return result
2431

2532
# "pt" or "np" or "tf".
26-
return self.tokenizer(prompt, return_tensors="pt")
33+
return self.tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
2734

2835
def apply_chat_template(
2936
self,

0 commit comments

Comments
 (0)