Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
9e3f81e
mtmd, arg: fix utf8 handling on windows (llama/24779)
ngxson Jun 19, 2026
7163f68
ggml-webgpu: add adapter toggles for F16 on Vulkan + NVIDIA
yomaytk Jun 19, 2026
a32ed1c
ggml : optimize AMX (llama/24806)
angt Jun 20, 2026
44aff5f
fix(hexagon): use padded stride for ssm-conv weights (llama/24470)
BiReRa Jun 20, 2026
bf0da8c
support bf16 on bin_bcast OP and unary OPs (llama/24838)
arthw Jun 22, 2026
2c3dd7b
opencl: q8_0 gemv precision improvement (llama/24923)
shawngu-quic Jun 23, 2026
44d75b6
ggml-webgpu: improve MTP inference by using mat-vec path for small ba…
yomaytk Jun 23, 2026
c793db0
vulkan: link ggml-cpu when GGML_VULKAN_CHECK_RESULTS / RUN_TESTS are …
Detensable Jun 23, 2026
67fd5c7
vulkan: make mul_mm ALIGNED a spec constant (llama/24689)
jeffbolznv Jun 23, 2026
f8f62c7
vulkan: support CONV_3D (llama/24612)
jeffbolznv Jun 23, 2026
d0aaddd
vulkan: Support GET_ROWS_BACK (llama/24883)
jeffbolznv Jun 23, 2026
f55f4eb
vulkan: support all backend tests for SQR/SQRT/SIN/COS/CLAMP/LEAKY_RE…
jeffbolznv Jun 23, 2026
376093c
vulkan: Apply bias before softmax in FA, to avoid overflow (llama/24909)
jeffbolznv Jun 24, 2026
7f3f9fd
vulkan: fail the build when a shader fails to compile (llama/24450)
liminfei-amd Jun 24, 2026
21bedb3
vulkan: allow reducing the graph submission batches to avoid timeouts…
wbruna Jun 24, 2026
393a2c8
hexagon: MUL_MAT and MUL_MAT_ID rework : 32x32 tiled weight repack, k…
max-krasnyansky Jun 24, 2026
3d57322
opencl: support non-contig rows in norm (llama/24965)
lhez Jun 25, 2026
bb6b2ae
sycl : fix the failed UT cases of conv_3d (llama/24900)
arthw Jun 25, 2026
c1e9f24
sycl : support --split-mode tensor (llama/24152)
Spruill-1 Jun 25, 2026
26fef3f
ggml : address integer overflows in binary ops CUDA implementation (l…
fairydreaming Jun 25, 2026
f8b9ba4
CUDA: Various fixes to `cpy.cu` (llama/25000)
ORippler Jun 25, 2026
8005ef8
opencl: flush profiling batch at shutdown for incomplete batches (lla…
shaofeiqi Jun 26, 2026
8be1e63
CUDA: batch out_prod broadcast (dps2>1) path with cublasSgemmBatched …
leonardHONG Jun 26, 2026
9f0a6b6
sycl : clamp softmax input to avoid underflow (llama/24941)
Jassieluo Jun 26, 2026
96e90a8
ggml-cpu: fix SVE leftover path in ggml_vec_dot_f32 (llama/24699)
tdakhran Jun 26, 2026
c3281af
CUDA: add cublasSgemmBatched mapping for HIP/MUSA vendor headers (lla…
leonardHONG Jun 26, 2026
325c37a
vulkan: Workaround compiler bug in conv2d coopmat2 path (llama/24924)
jeffbolznv Jun 26, 2026
8b2bb5c
ggml : bump version to 0.15.3 (ggml/1550)
ggerganov Jun 26, 2026
b5f276c
sync : ggml
ggerganov Jun 26, 2026
fa343d2
talk-llama : sync llama.cpp
ggerganov Jun 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions examples/talk-llama/llama-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,10 @@ void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
sched_need_reserve = true;
}

void llama_context::set_nextn_layer_offset(int32_t offset) {
cparams.nextn_layer_offset = offset;
}

void llama_context::set_causal_attn(bool value) {
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);

Expand Down Expand Up @@ -3699,6 +3703,10 @@ void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool valu
ctx->set_embeddings_layer_inp(lid, value);
}

// Public API wrapper: forwards the NextN layer offset to the context object.
// `ctx` is dereferenced without a null check, so callers must pass a valid
// context.
void llama_set_nextn_layer_offset(llama_context * ctx, int32_t offset) {
    ctx->set_nextn_layer_offset(offset);
}

llama_memory_t llama_get_memory(const struct llama_context * ctx) {
if (!ctx) {
return nullptr;
Expand Down
1 change: 1 addition & 0 deletions examples/talk-llama/llama-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ struct llama_context {
void set_embeddings (bool value);
void set_embeddings_nextn(bool value, bool masked);
void set_embeddings_layer_inp(uint32_t lid, bool enable);
void set_nextn_layer_offset(int32_t offset);
void set_causal_attn(bool value);
void set_warmup(bool value);

Expand Down
2 changes: 2 additions & 0 deletions examples/talk-llama/llama-cparams.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ struct llama_cparams {
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing

int32_t nextn_layer_offset = 0;

float rope_freq_base;
float rope_freq_scale;

Expand Down
5 changes: 5 additions & 0 deletions examples/talk-llama/llama-ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_c
// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked);

// Select which appended NextN block the DECODER_MTP graph runs (offset past
// the trunk: il = n_layer() + offset). Used by the speculative NextN driver to
// chain multiple trained NextN heads. Default 0 (first head).
LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset);

// mirrors:
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
Expand Down
11 changes: 9 additions & 2 deletions examples/talk-llama/llama-graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -682,9 +682,16 @@ struct llm_graph_params {
}
}

// TODO: https://github.com/ggml-org/llama.cpp/pull/24340#discussion_r3448035248
if (cparams.nextn_layer_offset != other.cparams.nextn_layer_offset) {
return false;
}

return
cparams.embeddings == other.cparams.embeddings &&
cparams.causal_attn == other.cparams.causal_attn &&
cparams.embeddings == other.cparams.embeddings &&
cparams.embeddings_nextn == other.cparams.embeddings_nextn &&
cparams.embeddings_nextn_masked == other.cparams.embeddings_nextn_masked &&
cparams.causal_attn == other.cparams.causal_attn &&
arch == other.arch &&
gtype == other.gtype &&
cvec == other.cvec &&
Expand Down
5 changes: 5 additions & 0 deletions examples/talk-llama/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_160M: return "160M";
case LLM_TYPE_190M: return "190M";
case LLM_TYPE_220M: return "220M";
case LLM_TYPE_230M: return "230M";
case LLM_TYPE_250M: return "250M";
case LLM_TYPE_256M: return "256M";
case LLM_TYPE_270M: return "270M";
Expand Down Expand Up @@ -2312,6 +2313,10 @@ int32_t llama_model_n_layer(const llama_model * model) {
return model->hparams.n_layer();
}

// Returns the model's `hparams.n_layer_nextn` value as an int32_t.
// `model` is dereferenced without a null check.
int32_t llama_model_n_layer_nextn(const llama_model * model) {
    const auto & hp = model->hparams;
    return hp.n_layer_nextn;
}

// Returns the result of the model's `hparams.n_head()` as an int32_t.
// `model` is dereferenced without a null check.
int32_t llama_model_n_head(const llama_model * model) {
    const auto & hp = model->hparams;
    return hp.n_head();
}
Expand Down
1 change: 1 addition & 0 deletions examples/talk-llama/llama-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ enum llm_type {
LLM_TYPE_160M,
LLM_TYPE_190M,
LLM_TYPE_220M,
LLM_TYPE_230M,
LLM_TYPE_250M,
LLM_TYPE_256M,
LLM_TYPE_270M,
Expand Down
6 changes: 3 additions & 3 deletions examples/talk-llama/llama-quant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -847,7 +847,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<t
qs.has_tied_embeddings = false;
}
}
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer();
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer_all;
}

//
Expand Down Expand Up @@ -932,8 +932,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

// copy the KV pairs from the input file
gguf_set_kv (ctx_out.get(), ml.metadata);
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION).c_str(), GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_FILE_TYPE).c_str(), ftype);

// Remove split metadata
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
Expand Down
2 changes: 0 additions & 2 deletions examples/talk-llama/llama-sampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2813,8 +2813,6 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_t
cur_p->data[i].logit = -INFINITY;
}
}

llama_sampler_softmax_impl(cur_p, true);
}

static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) {
Expand Down
17 changes: 9 additions & 8 deletions examples/talk-llama/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -558,14 +558,15 @@ extern "C" {
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);

LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_ctx_train (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);

// Get the model's RoPE frequency scaling factor
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
Expand Down
10 changes: 5 additions & 5 deletions examples/talk-llama/models/glm-dsa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,11 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);

// DSA indexer
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags);
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags);
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags);
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags);
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
if (i < (int) hparams.n_layer_dense_lead) {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
Expand Down
19 changes: 15 additions & 4 deletions examples/talk-llama/models/lfm2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
hparams.n_layer_dense_lead = hparams.n_layer();

switch (hparams.n_ff()) {
case 2560: type = LLM_TYPE_230M; break;
case 4608: type = LLM_TYPE_350M; break;
case 6912: type = LLM_TYPE_700M; break;
case 8192: type = LLM_TYPE_1_2B; break;
Expand Down Expand Up @@ -190,7 +191,15 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);

bx = ggml_concat(ctx0, conv, bx, 0);
// causal prepends the state, non-causal pads symmetrically for a centered window
if (hparams.causal_attn) {
bx = ggml_concat(ctx0, conv, bx, 0);
} else {
const int64_t pad = (hparams.n_shortconv_l_cache - 1) / 2;
auto * left = ggml_cont(ctx0,
ggml_view_3d(ctx0, conv, pad, hparams.n_embd, n_seqs, conv->nb[1], conv->nb[2], (d_conv - pad) * conv->nb[0]));
bx = ggml_pad_ext(ctx0, ggml_concat(ctx0, left, bx, 0), 0, pad, 0, 0, 0, 0, 0, 0);
}
GGML_ASSERT(bx->ne[0] > conv->ne[0]);

// last d_conv columns is a new conv state
Expand Down Expand Up @@ -266,10 +275,12 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
if (!cparams.embeddings) {
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);

res->t_logits = cur;
res->t_logits = cur;
}

ggml_build_forward_expand(gf, cur);
}
Expand Down
1 change: 0 additions & 1 deletion examples/talk-llama/models/mamba-base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,6 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
GGML_ASSERT(d_inner % n_head == 0);
GGML_ASSERT(d_inner % d_state == 0);
GGML_ASSERT(d_inner % n_group == 0);

ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
Expand Down
13 changes: 7 additions & 6 deletions examples/talk-llama/models/mamba2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,11 @@ void llama_model_mamba2::load_arch_tensors(llama_model_loader &) {
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t d_state = hparams.ssm_d_state;
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
const int64_t dt_rank = hparams.ssm_dt_rank;

const int64_t conv_dim = d_inner + 2 * n_group * d_state;
const int64_t d_in_proj = d_inner + conv_dim + dt_rank;

// only an expansion factor of 2 is supported for now
GGML_ASSERT(2 * n_embd == d_inner);

tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

Expand All @@ -68,11 +69,11 @@ void llama_model_mamba2::load_arch_tensors(llama_model_loader &) {
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);

layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {dt_rank}, 0);

// no "weight" suffix for these
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, dt_rank}, 0);
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, dt_rank}, 0);

layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);

Expand Down
2 changes: 2 additions & 0 deletions examples/talk-llama/models/qwen35.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para

// MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;

ggml_tensor * inpSA = inpL;

cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
Expand Down
2 changes: 2 additions & 0 deletions examples/talk-llama/models/qwen35moe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p

// MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;

ggml_tensor * inpSA = inpL;

cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
Expand Down
Loading
Loading