Skip to content

Commit bdd2f93

Browse files
aldehir authored and OsamaMazhar committed
vocab : add byte token handling to BPE detokenizer for Gemma4 (ggml-org#21488)
1 parent 64c41bf commit bdd2f93

1 file changed

Lines changed: 7 additions & 1 deletion

File tree

src/llama-vocab.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2804,7 +2804,9 @@ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
28042804
return strtol(buf.c_str(), NULL, 16);
28052805
}
28062806
case LLAMA_VOCAB_TYPE_BPE: {
2807-
GGML_ABORT("fatal error");
2807+
// Gemma4 uses BPE with SPM-style byte fallback tokens (<0xXX>)
2808+
auto buf = token_data.text.substr(3, 2);
2809+
return strtol(buf.c_str(), NULL, 16);
28082810
}
28092811
case LLAMA_VOCAB_TYPE_WPM: {
28102812
GGML_ABORT("fatal error");
@@ -3285,6 +3287,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
32853287
std::string result = llama_decode_text(token_text);
32863288
return _try_copy(result.data(), result.size());
32873289
}
3290+
if (attr & LLAMA_TOKEN_ATTR_BYTE) {
3291+
char byte = (char) token_to_byte(token);
3292+
return _try_copy((char*) &byte, 1);
3293+
}
32883294
break;
32893295
}
32903296
case LLAMA_VOCAB_TYPE_RWKV: {

0 commit comments

Comments (0)