Skip to content

Commit 5ed76e9

Browse files
committed
talk-llama : sync llama.cpp
1 parent 41cf127 commit 5ed76e9

16 files changed

Lines changed: 555 additions & 58 deletions

examples/talk-llama/llama-arch.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
6666
{ LLM_ARCH_XVERSE, "xverse" },
6767
{ LLM_ARCH_COMMAND_R, "command-r" },
6868
{ LLM_ARCH_COHERE2, "cohere2" },
69+
{ LLM_ARCH_COHERE2MOE, "cohere2moe" },
6970
{ LLM_ARCH_DBRX, "dbrx" },
7071
{ LLM_ARCH_OLMO, "olmo" },
7172
{ LLM_ARCH_OLMO2, "olmo2" },

examples/talk-llama/llama-arch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ enum llm_arch {
7171
LLM_ARCH_XVERSE,
7272
LLM_ARCH_COMMAND_R,
7373
LLM_ARCH_COHERE2,
74+
LLM_ARCH_COHERE2MOE,
7475
LLM_ARCH_DBRX,
7576
LLM_ARCH_OLMO,
7677
LLM_ARCH_OLMO2,

examples/talk-llama/llama-context.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1382,7 +1382,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
13821382
const auto & hparams = model.hparams;
13831383

13841384
// eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
1385-
const int64_t n_embd = hparams.n_embd_inp();
1385+
const int64_t n_embd = hparams.n_embd_inp_enc();
13861386
const int64_t n_vocab = model.vocab.n_tokens();
13871387

13881388
// note: during encode, we always pass the full sequence starting from pos = 0

examples/talk-llama/llama-ext.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
// this is a staging header for new llama.cpp API
44
// breaking changes and C++ are allowed. everything here should be considered WIP
5+
// try as much as possible to not include this header in the rest of the codebase
56

67
#include "llama.h"
78

examples/talk-llama/llama-graph.cpp

Lines changed: 58 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1088,6 +1088,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
10881088
ggml_tensor * w_s) const {
10891089
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
10901090

1091+
if (w_s) {
1092+
res = ggml_mul(ctx0, res, w_s);
1093+
}
1094+
10911095
for (const auto & lora : *loras) {
10921096
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
10931097
if (lw == nullptr) {
@@ -1106,18 +1110,24 @@ ggml_tensor * llm_graph_context::build_lora_mm(
11061110
res = ggml_add(ctx0, res, ab_cur);
11071111
}
11081112

1109-
if (w_s) {
1110-
res = ggml_mul(ctx0, res, w_s);
1111-
}
1112-
11131113
return res;
11141114
}
11151115

11161116
ggml_tensor * llm_graph_context::build_lora_mm_id(
11171117
ggml_tensor * w, // ggml_tensor * as
11181118
ggml_tensor * cur, // ggml_tensor * b
1119-
ggml_tensor * ids) const {
1119+
ggml_tensor * ids,
1120+
ggml_tensor * w_s) const {
11201121
ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
1122+
1123+
if (w_s) {
1124+
const int64_t n_expert = w_s->ne[0];
1125+
const int64_t n_tokens = cur->ne[2];
1126+
ggml_tensor * s = ggml_reshape_3d(ctx0, w_s, 1, n_expert, 1);
1127+
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
1128+
s = ggml_get_rows(ctx0, s, ids);
1129+
res = ggml_mul(ctx0, res, s);
1130+
}
11211131
for (const auto & lora : *loras) {
11221132
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
11231133
if (lw == nullptr) {
@@ -1269,6 +1279,29 @@ ggml_tensor * llm_graph_context::build_ffn(
12691279
llm_ffn_op_type type_op,
12701280
llm_ffn_gate_type type_gate,
12711281
int il) const {
1282+
// NVFP4 support is currently restricted to
1283+
// 1) LORA absence (*_s would be applied after LORA residual, which is incorrect)
1284+
// 2) bias absence (*_s would be applied after bias addition, which is incorrect)
1285+
// TODO: disambiguate LLM-architectural scales (which use *_s) from NVFP4 scale_2 (which also uses *_s currently)
1286+
auto has_lora = [this](ggml_tensor * w) {
1287+
if (!w) {
1288+
return false;
1289+
}
1290+
for (const auto & lora : *loras) {
1291+
if (lora.first->get_weight(w) != nullptr) {
1292+
return true;
1293+
}
1294+
}
1295+
return false;
1296+
};
1297+
1298+
GGML_ASSERT(!up_s || !up_b || !up || up->type != GGML_TYPE_NVFP4);
1299+
GGML_ASSERT(!gate_s || !gate_b || !gate || gate->type != GGML_TYPE_NVFP4);
1300+
GGML_ASSERT(!down_s || !down_b || !down || down->type != GGML_TYPE_NVFP4);
1301+
GGML_ASSERT(!up_s || !up || up->type != GGML_TYPE_NVFP4 || !has_lora(up));
1302+
GGML_ASSERT(!gate_s || !gate || gate->type != GGML_TYPE_NVFP4 || !has_lora(gate));
1303+
GGML_ASSERT(!down_s || !down || down->type != GGML_TYPE_NVFP4 || !has_lora(down));
1304+
12721305
ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
12731306
cb(tmp, "ffn_up", il);
12741307

@@ -1627,67 +1660,52 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
16271660

16281661
if (gate_up_exps) {
16291662
// merged gate_up path: one mul_mat_id, then split into gate and up views
1630-
ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens]
1663+
ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts, up_exps_s); // [n_ff*2, n_expert_used, n_tokens]
16311664
cb(gate_up, "ffn_moe_gate_up", il);
16321665

1666+
if (up_exps_s) {
1667+
cb(gate_up, "ffn_moe_gate_up_scaled", il);
1668+
}
1669+
16331670
if (gate_up_exps_b) {
16341671
gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts);
16351672
cb(gate_up, "ffn_moe_gate_up_biased", il);
16361673
}
16371674

1638-
// apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
1639-
if (up_exps_s) {
1640-
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
1641-
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
1642-
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
1643-
gate_up = ggml_mul(ctx0, gate_up, s);
1644-
cb(gate_up, "ffn_moe_gate_up_scaled", il);
1645-
}
1646-
16471675
const int64_t n_ff = gate_up->ne[0] / 2;
16481676
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
16491677
cb(cur, "ffn_moe_gate", il);
16501678
up = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], n_ff * gate_up->nb[0]);
16511679
cb(up, "ffn_moe_up", il);
16521680
} else {
16531681
// separate gate and up path
1654-
up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
1682+
up = build_lora_mm_id(up_exps, cur, selected_experts, up_exps_s); // [n_ff, n_expert_used, n_tokens]
16551683
cb(up, "ffn_moe_up", il);
16561684

1685+
if (up_exps_s) {
1686+
cb(up, "ffn_moe_up_scaled", il);
1687+
}
1688+
16571689
if (up_exps_b) {
16581690
up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
16591691
cb(up, "ffn_moe_up_biased", il);
16601692
}
16611693

1662-
// apply per-expert scale2 to up
1663-
if (up_exps_s) {
1664-
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
1665-
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
1666-
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
1667-
up = ggml_mul(ctx0, up, s);
1668-
cb(up, "ffn_moe_up_scaled", il);
1669-
}
1670-
16711694
if (gate_exps) {
1672-
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
1695+
cur = build_lora_mm_id(gate_exps, cur, selected_experts, gate_exps_s); // [n_ff, n_expert_used, n_tokens]
16731696
cb(cur, "ffn_moe_gate", il);
16741697
} else {
16751698
cur = up;
16761699
}
16771700

1701+
if (gate_exps_s) {
1702+
cb(cur, "ffn_moe_gate_scaled", il);
1703+
}
1704+
16781705
if (gate_exps_b) {
16791706
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
16801707
cb(cur, "ffn_moe_gate_biased", il);
16811708
}
1682-
1683-
// apply per-expert scale2 to gate
1684-
if (gate_exps_s) {
1685-
ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
1686-
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
1687-
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
1688-
cur = ggml_mul(ctx0, cur, s);
1689-
cb(cur, "ffn_moe_gate_scaled", il);
1690-
}
16911709
}
16921710

16931711
const bool has_gate = gate_exps || gate_up_exps;
@@ -1759,23 +1777,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
17591777
GGML_ABORT("fatal error");
17601778
}
17611779

1762-
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
1780+
experts = build_lora_mm_id(down_exps, cur, selected_experts, down_exps_s); // [n_embd, n_expert_used, n_tokens]
17631781
cb(experts, "ffn_moe_down", il);
17641782

1783+
if (down_exps_s) {
1784+
cb(experts, "ffn_moe_down_scaled", il);
1785+
}
1786+
17651787
if (down_exps_b) {
17661788
experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
17671789
cb(experts, "ffn_moe_down_biased", il);
17681790
}
17691791

1770-
// apply per-expert scale2 to down
1771-
if (down_exps_s) {
1772-
ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
1773-
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
1774-
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
1775-
experts = ggml_mul(ctx0, experts, s);
1776-
cb(experts, "ffn_moe_down_scaled", il);
1777-
}
1778-
17791792
if (!weight_before_ffn) {
17801793
experts = ggml_mul(ctx0, experts, weights);
17811794
cb(experts, "ffn_moe_weighted", il);

examples/talk-llama/llama-graph.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -853,11 +853,12 @@ struct llm_graph_context {
853853
ggml_tensor * cur,
854854
ggml_tensor * w_s = nullptr) const;
855855

856-
// do mat_mul_id, while optionally apply lora
856+
// do mul_mat_id, optionally applying lora and a per-expert scale
857857
ggml_tensor * build_lora_mm_id(
858858
ggml_tensor * w, // ggml_tensor * as
859859
ggml_tensor * cur, // ggml_tensor * b
860-
ggml_tensor * ids) const;
860+
ggml_tensor * ids,
861+
ggml_tensor * w_s = nullptr) const;
861862

862863
ggml_tensor * build_norm(
863864
ggml_tensor * cur,

examples/talk-llama/llama-hparams.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,10 @@ uint32_t llama_hparams::n_embd_inp() const {
104104
return n_embd_inp;
105105
}
106106

107+
uint32_t llama_hparams::n_embd_inp_enc() const {
108+
return n_embd_inp_enc_impl > 0 ? n_embd_inp_enc_impl : n_embd_inp();
109+
}
110+
107111
uint32_t llama_hparams::n_embd_out() const {
108112
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
109113
}

examples/talk-llama/llama-hparams.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,10 @@ struct llama_hparams {
189189
// input embedding dimension (0 = use n_embd)
190190
uint32_t n_embd_inp_impl = 0;
191191

192+
// encoder input embedding dimension (0 = use n_embd_inp())
193+
// e.g. the eagle3 encoder fuses target_layers * target_hidden features
194+
uint32_t n_embd_inp_enc_impl = 0;
195+
192196
// output embedding dimension (0 = use n_embd)
193197
uint32_t n_embd_out_impl = 0;
194198

@@ -305,6 +309,9 @@ struct llama_hparams {
305309
// dimension of main + auxiliary input embeddings
306310
uint32_t n_embd_inp() const;
307311

312+
// dimension of the encoder input embeddings
313+
uint32_t n_embd_inp_enc() const;
314+
308315
// dimension of output embeddings
309316
uint32_t n_embd_out() const;
310317

examples/talk-llama/llama-model-saver.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ bool llama_model_saver_supports_arch(llm_arch arch) {
1818
case LLM_ARCH_GEMMA3:
1919
case LLM_ARCH_GEMMA3N:
2020
case LLM_ARCH_COHERE2:
21+
case LLM_ARCH_COHERE2MOE:
2122
case LLM_ARCH_OLMO2:
2223
case LLM_ARCH_BITNET:
2324
case LLM_ARCH_T5:

examples/talk-llama/llama-model.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
157157
return new llama_model_command_r(params);
158158
case LLM_ARCH_COHERE2:
159159
return new llama_model_cohere2(params);
160+
case LLM_ARCH_COHERE2MOE:
161+
return new llama_model_cohere2moe(params);
160162
case LLM_ARCH_DBRX:
161163
return new llama_model_dbrx(params);
162164
case LLM_ARCH_OLMO:
@@ -1467,9 +1469,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
14671469
}
14681470
ml.done_getting_tensors();
14691471

1472+
// Tied NVFP4 output is valid when no separate LM-head scale tensors are present.
1473+
// If sidecar scales exist, the output weight must be an actual output tensor.
14701474
GGML_ASSERT(!(output && tok_embd &&
14711475
strcmp(output->name, tok_embd->name) == 0 &&
1472-
output->type == GGML_TYPE_NVFP4));
1476+
output->type == GGML_TYPE_NVFP4 &&
1477+
(output_s || output_in_s)));
14731478
// populate tensors_by_name
14741479
for (auto & [_, ctx_ptr] : ml.ctx_map) {
14751480
for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
@@ -1844,6 +1849,7 @@ void llama_model::print_info() const {
18441849
}
18451850

18461851
if (arch == LLM_ARCH_MELLUM ||
1852+
arch == LLM_ARCH_COHERE2MOE ||
18471853
arch == LLM_ARCH_QWEN3MOE ||
18481854
arch == LLM_ARCH_OPENAI_MOE ||
18491855
arch == LLM_ARCH_QWEN3VLMOE ||
@@ -2389,6 +2395,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
23892395
case LLM_ARCH_XVERSE:
23902396
case LLM_ARCH_COMMAND_R:
23912397
case LLM_ARCH_COHERE2:
2398+
case LLM_ARCH_COHERE2MOE:
23922399
case LLM_ARCH_OLMO:
23932400
case LLM_ARCH_ARCTIC:
23942401
case LLM_ARCH_DEEPSEEK:

0 commit comments

Comments
 (0)