Skip to content

Commit 4e50689

Browse files
pwilkinicex
authored and committed
vocab: add gemma4 tokenizer tests, fix edge case (ggml-org#21534)
* YATF (Yet Another Tokenizer Fix) for Gemma 4. With tests!
* Remove unnecessary hash from update script.
* minor: move constant
1 parent 7a22c32 commit 4e50689

File tree

5 files changed

+169
-2
lines changed

5 files changed

+169
-2
lines changed

models/ggml-vocab-gemma-4.gguf

15 MB
Binary file not shown.

models/ggml-vocab-gemma-4.gguf.inp

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Äpfel
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
__ggml_vocab_test__
13+
14+
__ggml_vocab_test__
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
__ggml_vocab_test__
22+
23+
24+
25+
26+
__ggml_vocab_test__
27+
28+
29+
__ggml_vocab_test__
30+
Hello world
31+
__ggml_vocab_test__
32+
Hello world
33+
__ggml_vocab_test__
34+
Hello World
35+
__ggml_vocab_test__
36+
Hello World
37+
__ggml_vocab_test__
38+
Hello World!
39+
__ggml_vocab_test__
40+
Hello, world!
41+
__ggml_vocab_test__
42+
Hello, world!
43+
__ggml_vocab_test__
44+
this is 🦙.cpp
45+
__ggml_vocab_test__
46+
w048 7tuijk dsdfhu
47+
__ggml_vocab_test__
48+
нещо на Български
49+
__ggml_vocab_test__
50+
កាន់តែពិសេសអាចខលចេញ
51+
__ggml_vocab_test__
52+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
__ggml_vocab_test__
62+
Hello
63+
__ggml_vocab_test__
64+
Hello
65+
Hello
66+
__ggml_vocab_test__
67+
(
68+
__ggml_vocab_test__
69+
70+
=
71+
__ggml_vocab_test__
72+
' era
73+
__ggml_vocab_test__
74+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
75+
__ggml_vocab_test__
76+
!!!!!!
77+
__ggml_vocab_test__
78+
3
79+
__ggml_vocab_test__
80+
33
81+
__ggml_vocab_test__
82+
333
83+
__ggml_vocab_test__
84+
3333
85+
__ggml_vocab_test__
86+
33333
87+
__ggml_vocab_test__
88+
333333
89+
__ggml_vocab_test__
90+
3333333
91+
__ggml_vocab_test__
92+
33333333
93+
__ggml_vocab_test__
94+
333333333
95+
__ggml_vocab_test__
96+
Cửa Việt
97+
__ggml_vocab_test__
98+
discards
99+
__ggml_vocab_test__
100+
101+
102+
103+
104+
105+
106+
107+
108+
109+
110+
111+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL

models/ggml-vocab-gemma-4.gguf.out

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
1178 236743 236812 47041 3794
2+
239122 22744 535
3+
4+
236743
5+
138
6+
139
7+
255968
8+
107
9+
108
10+
109
11+
255968 107
12+
9259 1902
13+
26352 1902
14+
9259 4109
15+
26352 4109
16+
26352 4109 236888
17+
9259 236764 1902 236888
18+
26352 236764 1902 236888
19+
672 563 236743 478 397 404 391 236761 12362
20+
236765 236771 236812 236828 236743 236832 11372 12065 31806 3405 9360
21+
1337 12515 1333 4632 165543 3830
22+
234889 63031 219876 66212 239077 237907 144494
23+
242015 568 7382 236768 236743 247717 237243 248989 238178 568 43819 111730 150567 236768 113452 568 8960 64334 600 815 1061 1852 8369 236768
24+
9259
25+
26352
26+
138 9259
27+
139 9259
28+
140 9259
29+
140 9259 107 140 9259
30+
568
31+
107 578
32+
236789 6933
33+
9259 236764 570 236789 712 236888 2088 659 611 170124 2360 62133 237075 17641 11700 236770 236800 236770 236812 236770 236810 236770 237471 238352
34+
123947
35+
236800
36+
236800 236800
37+
236800 236800 236800
38+
236800 236800 236800 236800
39+
236800 236800 236800 236800 236800
40+
236800 236800 236800 236800 236800 236800
41+
236800 236800 236800 236800 236800 236800 236800
42+
236800 236800 236800 236800 236800 236800 236800 236800
43+
236800 236800 236800 236800 236800 236800 236800 236800 236800
44+
236780 29719 33154
45+
2243 2206
46+
107 236743 108 236743 109 236743 255968 236743 255969 236743 255968 107 138 107 139 107 140 107 141 107 242015 568 7382 236768 236743 247717 237243 248989 238178 568 43819 111730 150567 236768 113452 236743 478 397 404 391 478 397 404 391 236743 236800 236743 236800 236800 236743 236800 236800 236800 236743 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236800 236800 236743 236800 236800 236800 236800 236800 236800 236800 236800 236743 236800 236761 236800 236743 236800 856 236800 236743 236800 1390 236800 90986 92814 63031 219876 66212 241702 2360 62133 237075 17641 11700 236770 236800 236770 236812 236770 236810 236770 237471 238352 80448 120697 210119 1333 4632 165543 3830 9451 159561 2629 2629 2717 84491 19938 123947 38950 10371 564 236789 560 1010 756 151812 668 236789 236751 993 236764 756 1357 611 2889 236881 756 236792 711 2889 564 236789 859 1386 625 236764 756 236796 611 1133 1070 11115 236881 1191 236789 32541 496 236789 95635

src/llama-vocab.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -659,8 +659,17 @@ struct llm_tokenizer_bpe_session {
659659

660660
if (token == LLAMA_TOKEN_NULL) {
661661
for (auto j = str.begin(); j != str.end(); ++j) {
662-
std::string byte_str(1, *j);
663-
auto token_multibyte = vocab.text_to_token(byte_str);
662+
llama_token token_multibyte = LLAMA_TOKEN_NULL;
663+
if (tokenizer.byte_encode) {
664+
std::string byte_str(1, *j);
665+
token_multibyte = vocab.text_to_token(byte_str);
666+
} else {
667+
// For non-byte-encoded BPE (e.g. gemma-4), byte tokens use <0xXX> format
668+
static const char * hex = "0123456789ABCDEF";
669+
const uint8_t ch = (uint8_t)*j;
670+
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
671+
token_multibyte = vocab.text_to_token(buf);
672+
}
664673
if (token_multibyte != LLAMA_TOKEN_NULL) {
665674
output.push_back(token_multibyte);
666675
}

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${PROJE
124124
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf)
125125
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf)
126126
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
127+
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gemma-4 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gemma-4.gguf)
127128
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
128129
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
129130
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)

0 commit comments

Comments (0)