Skip to content

Commit 9777256

Browse files
zhangtao2-1 and zhangtao authored
convert: add MiniCPM5 tokenizer support (ggml-org#23384)
Add the minicpm5 pre-tokenizer hash via convert_hf_to_gguf_update.py and implement hardcoded regex handling in llama-vocab.cpp, consistent with other BPE pre-tokenizers.
Co-authored-by: zhangtao <zhangtao2@modelbest.cn>
1 parent 7085492 commit 9777256

4 files changed

Lines changed: 16 additions & 0 deletions

File tree

conversion/base.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1625,6 +1625,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
16251625
if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
16261626
# ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
16271627
res = "talkie"
1628+
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
1629+
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
1630+
res = "minicpm5"
16281631

16291632
if res is None:
16301633
logger.warning("\n")

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ class TOKENIZER_TYPE(IntEnum):
157157
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
158158
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
159159
{"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
160+
{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
160161
]
161162

162163
# some models are known to be broken upstream, so we will skip them as exceptions

src/llama-vocab.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
511511
};
512512
byte_encode = false;
513513
break;
514+
case LLAMA_VOCAB_PRE_TYPE_MINICPM5:
515+
regex_exprs = {
516+
// original regex from tokenizer.json (openbmb/MiniCPM5-1B)
517+
"\\p{N}{1,3}",
518+
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
519+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
520+
};
521+
break;
514522
default:
515523
// default regex for BPE tokenization pre-processing
516524
regex_exprs = {
@@ -2039,6 +2047,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
20392047
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
20402048
} else if (tokenizer_pre == "default") {
20412049
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2050+
} else if (tokenizer_pre == "minicpm5") {
2051+
pre_type = LLAMA_VOCAB_PRE_TYPE_MINICPM5;
2052+
ignore_merges = true;
20422053
} else if (
20432054
tokenizer_pre == "llama3" ||
20442055
tokenizer_pre == "llama-v3" ||

src/llama-vocab.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ enum llama_vocab_pre_type {
6060
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
6161
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
6262
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
63+
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
6364
};
6465

6566
struct LLM_KV;

0 commit comments

Comments
 (0)