@@ -1018,36 +1018,84 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text,
10181018
10191019 int n_tokens = 0 ;
10201020
1021- /* Qwen tokenizer has no BOS token, but handle the flag gracefully */
1021+ /* Add BOS token if requested.
1022+ * Gemma: BOS=2, Qwen: no BOS (uses <|im_start|> instead) */
10221023 if (add_bos ) {
1023- /* Qwen uses <|im_start|> for conversation start, not a generic BOS.
1024- * For raw text generation, we skip BOS. */
1024+ /* Look up <bos> in vocab; fall back to <|im_start|>. If neither exists, no BOS token is emitted. */
1025+ int bos_id = str_lookup (tok , "<bos>" );
1026+ if (bos_id < 0 ) { bos_id = str_lookup (tok , "<|im_start|>" ); }
1027+ if (bos_id >= 0 ) {
1028+ tokens [n_tokens ++ ] = bos_id ;
1029+ }
10251030 }
10261031
10271032 if (* text == '\0' ) return n_tokens ;
10281033
1029- /* Convert each byte of the input text to its BPE character token.
1030- * For byte-level BPE, each input byte maps to a single BPE character
1031- * which should exist in the vocab as a single-char token. */
1034+ /* Detect tokenizer style: Gemma uses ▁ (U+2581) for spaces in vocab,
1035+ * GPT2/Qwen uses byte-level BPE with Ġ/ĉ encoding.
1036+ * Check if '▁' exists in vocab as a simple heuristic. */
1037+ int is_sentencepiece = (str_lookup (tok , "\xe2\x96\x81" ) >= 0 ); /* ▁ = U+2581 = 0xE2 0x96 0x81 */
1038+
10321039 int text_len = (int )strlen (text );
10331040
1034- for (int i = 0 ; i < text_len && n_tokens < max_tokens ; i ++ ) {
1035- unsigned char byte = (unsigned char )text [i ];
1036- char bpe_char [4 ];
1037- encode_byte_to_bpe_char (byte , bpe_char );
1041+ if (is_sentencepiece ) {
1042+ /* SentencePiece-style: replace spaces with ▁, then split into UTF-8 characters.
1043+ * Each character is looked up in vocab directly. */
1044+ /* First, build normalized text with ▁ replacing spaces, and ▁ prepended */
1045+ int norm_cap = text_len * 4 + 16 ;
1046+ char * norm = (char * )malloc ((size_t )norm_cap );
1047+ if (!norm ) return n_tokens ;
1048+ int ni = 0 ;
1049+ /* Prepend ▁ (space before first word, SentencePiece convention) */
1050+ norm [ni ++ ] = (char )0xE2 ; norm [ni ++ ] = (char )0x96 ; norm [ni ++ ] = (char )0x81 ;
1051+ for (int i = 0 ; i < text_len ; i ++ ) {
1052+ if (text [i ] == ' ' ) {
1053+ norm [ni ++ ] = (char )0xE2 ; norm [ni ++ ] = (char )0x96 ; norm [ni ++ ] = (char )0x81 ;
1054+ } else {
1055+ norm [ni ++ ] = text [i ];
1056+ }
1057+ }
1058+ norm [ni ] = '\0' ;
1059+
1060+ /* Split into individual UTF-8 characters */
1061+ for (int i = 0 ; i < ni && n_tokens < max_tokens ; ) {
1062+ /* Determine UTF-8 character length */
1063+ unsigned char c = (unsigned char )norm [i ];
1064+ int clen = 1 ;
1065+ if (c >= 0xF0 ) { clen = 4 ; }
1066+ else if (c >= 0xE0 ) { clen = 3 ; }
1067+ else if (c >= 0xC0 ) { clen = 2 ; }
1068+ if (i + clen > ni ) break ;
1069+
1070+ char ch_str [8 ];
1071+ memcpy (ch_str , norm + i , (size_t )clen );
1072+ ch_str [clen ] = '\0' ;
1073+
1074+ int id = str_lookup (tok , ch_str );
1075+ if (id >= 0 ) {
1076+ tokens [n_tokens ++ ] = id ;
1077+ }
1078+ /* If not found, skip (byte fallback tokens handle this in merges) */
1079+ i += clen ;
1080+ }
1081+ free (norm );
1082+ } else {
1083+ /* GPT2/Qwen byte-level BPE: each byte maps to a BPE character token */
1084+ for (int i = 0 ; i < text_len && n_tokens < max_tokens ; i ++ ) {
1085+ unsigned char byte = (unsigned char )text [i ];
1086+ char bpe_char [4 ];
1087+ encode_byte_to_bpe_char (byte , bpe_char );
10381088
1039- int id = str_lookup (tok , bpe_char );
1040- if (id >= 0 ) {
1041- tokens [n_tokens ++ ] = id ;
1042- } else {
1043- /* Should not happen for valid byte-level BPE vocab */
1044- /* Try direct byte as single-char string fallback */
1045- char direct [2 ] = { (char )byte , '\0' };
1046- id = str_lookup (tok , direct );
1089+ int id = str_lookup (tok , bpe_char );
10471090 if (id >= 0 ) {
10481091 tokens [n_tokens ++ ] = id ;
1092+ } else {
1093+ char direct [2 ] = { (char )byte , '\0' };
1094+ id = str_lookup (tok , direct );
1095+ if (id >= 0 ) {
1096+ tokens [n_tokens ++ ] = id ;
1097+ }
10491098 }
1050- /* If still not found, skip the byte */
10511099 }
10521100 }
10531101
@@ -1112,6 +1160,24 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
11121160 return "" ; /* Don't output special tokens as text */
11131161 }
11141162
1115- /* Decode BPE byte representation to actual UTF-8 */
1163+ /* SentencePiece: replace ▁ (U+2581) with space */
1164+ if (strstr (piece , "\xe2\x96\x81" ) != NULL ) {
1165+ static __thread char sp_buf [1024 ];
1166+ int j = 0 ;
1167+ for (int i = 0 ; piece [i ] && j < (int )sizeof (sp_buf ) - 1 ; ) {
1168+ if ((unsigned char )piece [i ] == 0xE2 &&
1169+ (unsigned char )piece [i + 1 ] == 0x96 &&
1170+ (unsigned char )piece [i + 2 ] == 0x81 ) {
1171+ sp_buf [j ++ ] = ' ' ;
1172+ i += 3 ;
1173+ } else {
1174+ sp_buf [j ++ ] = piece [i ++ ];
1175+ }
1176+ }
1177+ sp_buf [j ] = '\0' ;
1178+ return sp_buf ;
1179+ }
1180+
1181+ /* GPT2/Qwen: decode BPE byte representation to actual UTF-8 */
11161182 return decode_bpe_token (piece );
11171183}
0 commit comments