Skip to content

Commit 3ce5332

Browse files
committed
Merge remote-tracking branch 'upstream/master' into parakeet-support
2 parents a8cc3fa + f24588a commit 3ce5332

75 files changed

Lines changed: 7386 additions & 2225 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

examples/talk-llama/llama-arch.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
133133
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
134134
{ LLM_ARCH_MAINCODER, "maincoder" },
135135
{ LLM_ARCH_KIMI_LINEAR, "kimi-linear" },
136+
{ LLM_ARCH_TALKIE, "talkie" },
136137
{ LLM_ARCH_UNKNOWN, "(unknown)" },
137138
};
138139

@@ -767,8 +768,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
767768
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
768769
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
769770
// Nemotron 3 Super
770-
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
771-
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
771+
// latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
772+
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
773+
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
772774
};
773775

774776
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

examples/talk-llama/llama-arch.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -137,6 +137,7 @@ enum llm_arch {
137137
LLM_ARCH_LLAMA_EMBED,
138138
LLM_ARCH_MAINCODER,
139139
LLM_ARCH_KIMI_LINEAR,
140+
LLM_ARCH_TALKIE,
140141
LLM_ARCH_UNKNOWN,
141142
};
142143

examples/talk-llama/llama-chat.cpp

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
6262
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
6363
{ "granite", LLM_CHAT_TEMPLATE_GRANITE_3_X },
6464
{ "granite-4.0", LLM_CHAT_TEMPLATE_GRANITE_4_0 },
65+
{ "granite-4.1", LLM_CHAT_TEMPLATE_GRANITE_4_1 },
6566
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
6667
{ "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
6768
{ "yandex", LLM_CHAT_TEMPLATE_YANDEX },
@@ -194,7 +195,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
194195
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
195196
} else if (tmpl_contains("<|start_of_role|>")) {
196197
if (tmpl_contains("<tool_call>") || tmpl_contains("<tools>")) {
197-
return LLM_CHAT_TEMPLATE_GRANITE_4_0;
198+
if (tmpl_contains("g4_default_system_message")) {
199+
return LLM_CHAT_TEMPLATE_GRANITE_4_0;
200+
}
201+
return LLM_CHAT_TEMPLATE_GRANITE_4_1;
198202
}
199203
return LLM_CHAT_TEMPLATE_GRANITE_3_X;
200204
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
@@ -651,6 +655,20 @@ int32_t llm_chat_apply_template(
651655
if (add_ass) {
652656
ss << "<|start_of_role|>assistant<|end_of_role|>";
653657
}
658+
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE_4_1) {
659+
// IBM Granite 4.1 template
660+
for (const auto & message : chat) {
661+
std::string role(message->role);
662+
if (role == "assistant_tool_call") {
663+
ss << "<|start_of_role|>assistant<|end_of_role|><|tool_call|>";
664+
} else {
665+
ss << "<|start_of_role|>" << role << "<|end_of_role|>";
666+
}
667+
ss << message->content << "<|end_of_text|>\n";
668+
}
669+
if (add_ass) {
670+
ss << "<|start_of_role|>assistant<|end_of_role|>";
671+
}
654672
} else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
655673
// GigaChat template
656674
bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";

examples/talk-llama/llama-chat.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ enum llm_chat_template {
4141
LLM_CHAT_TEMPLATE_RWKV_WORLD,
4242
LLM_CHAT_TEMPLATE_GRANITE_3_X,
4343
LLM_CHAT_TEMPLATE_GRANITE_4_0,
44+
LLM_CHAT_TEMPLATE_GRANITE_4_1,
4445
LLM_CHAT_TEMPLATE_GIGACHAT,
4546
LLM_CHAT_TEMPLATE_MEGREZ,
4647
LLM_CHAT_TEMPLATE_YANDEX,

examples/talk-llama/llama-model.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
4444
return new llama_model_llama_embed(params);
4545
case LLM_ARCH_MAINCODER:
4646
return new llama_model_maincoder(params);
47+
case LLM_ARCH_TALKIE:
48+
return new llama_model_talkie(params);
4749
case LLM_ARCH_DECI:
4850
return new llama_model_deci(params);
4951
case LLM_ARCH_BAICHUAN:
@@ -2353,6 +2355,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
23532355
case LLM_ARCH_QWEN3NEXT:
23542356
case LLM_ARCH_MIMO2:
23552357
case LLM_ARCH_STEP35:
2358+
case LLM_ARCH_TALKIE:
23562359
return LLAMA_ROPE_TYPE_NEOX;
23572360

23582361
case LLM_ARCH_QWEN2VL:

examples/talk-llama/llama-model.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -488,7 +488,7 @@ struct llama_layer {
488488
struct ggml_tensor * indexer_attn_k = nullptr;
489489
struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
490490

491-
// gemma4 layer output scale
491+
// gemma4 layer output scale, reused for talkie embedding skip scale
492492
struct ggml_tensor * out_scale = nullptr;
493493

494494
struct llama_layer_posnet posnet;

examples/talk-llama/llama-vocab.cpp

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -511,6 +511,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
511511
};
512512
byte_encode = false;
513513
break;
514+
case LLAMA_VOCAB_PRE_TYPE_MINICPM5:
515+
regex_exprs = {
516+
// original regex from tokenizer.json (openbmb/MiniCPM5-1B)
517+
"\\p{N}{1,3}",
518+
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
519+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
520+
};
521+
break;
514522
default:
515523
// default regex for BPE tokenization pre-processing
516524
regex_exprs = {
@@ -2039,6 +2047,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
20392047
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
20402048
} else if (tokenizer_pre == "default") {
20412049
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2050+
} else if (tokenizer_pre == "minicpm5") {
2051+
pre_type = LLAMA_VOCAB_PRE_TYPE_MINICPM5;
2052+
ignore_merges = true;
20422053
} else if (
20432054
tokenizer_pre == "llama3" ||
20442055
tokenizer_pre == "llama-v3" ||
@@ -2196,7 +2207,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
21962207
} else if (
21972208
tokenizer_pre == "gpt-4o" ||
21982209
tokenizer_pre == "llama4" ||
2199-
tokenizer_pre == "kanana2") {
2210+
tokenizer_pre == "kanana2" ||
2211+
tokenizer_pre == "talkie") {
22002212
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
22012213
clean_spaces = false;
22022214
} else if (

examples/talk-llama/llama-vocab.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@ enum llama_vocab_pre_type {
6060
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
6161
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
6262
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
63+
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
6364
};
6465

6566
struct LLM_KV;

examples/talk-llama/models/mistral3.cpp

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -177,9 +177,9 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa
177177
cb(cur, "ffn_norm", il);
178178

179179
cur = build_ffn(cur,
180-
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
181-
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
182-
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
180+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
181+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate_s,
182+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
183183
NULL,
184184
LLM_FFN_SILU, LLM_FFN_PAR, il);
185185
cb(cur, "ffn_out", il);
@@ -200,7 +200,11 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa
200200
LLM_FFN_SILU, true,
201201
hparams.expert_weights_scale,
202202
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
203-
il);
203+
il,
204+
nullptr, nullptr,
205+
model.layers[il].ffn_up_exps_s,
206+
model.layers[il].ffn_gate_exps_s,
207+
model.layers[il].ffn_down_exps_s);
204208
cb(cur, "ffn_moe_out", il);
205209
}
206210
cur = ggml_add(ctx0, cur, ffn_inp);

examples/talk-llama/models/models.h

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -186,6 +186,19 @@ struct llama_model_maincoder : public llama_model_base {
186186
};
187187

188188

189+
// Model class for the "talkie" architecture, derived from llama_model_base.
// In this commit, llama_model_mapping() constructs it for LLM_ARCH_TALKIE.
// Only declarations appear here; the member definitions live elsewhere.
struct llama_model_talkie : public llama_model_base {
190+
// Forwards the model parameters unchanged to the llama_model_base constructor.
llama_model_talkie(const struct llama_model_params & params) : llama_model_base(params) {}
191+
// Override of the base-class hook; presumably reads the talkie-specific
// hyperparameters from the loader (definition not visible here, TODO confirm).
void load_arch_hparams(llama_model_loader & ml) override;
192+
// Override of the base-class hook; presumably creates/binds the talkie tensors
// from the loader (definition not visible here, TODO confirm).
void load_arch_tensors(llama_model_loader & ml) override;
193+
194+
// Nested graph context type for this architecture, constructed from the model
// and the graph parameters.
struct graph : public llm_graph_context {
195+
graph(const llama_model & model, const llm_graph_params & params);
196+
};
197+
198+
// Const override returning an owning pointer to an llm_graph_context;
// NOTE(review): presumably returns an instance of the nested `graph` type, verify in the definition.
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
199+
};
200+
201+
189202
struct llama_model_deci : public llama_model_base {
190203
llama_model_deci(const struct llama_model_params & params) : llama_model_base(params) {}
191204
void load_arch_hparams(llama_model_loader & ml) override;

0 commit comments

Comments
 (0)