Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
2e773e4
cuda: reset cuda context after reading memory size (llama/23935)
0cc4m Jun 8, 2026
d55525b
vulkan: Use cm2 decode_vector for mul_mat_id B matrix loads (llama/23…
jeffbolznv Jun 8, 2026
61995c1
Implement 2D workgroups for scale, binary, and unary ops (llama/24044)
nikhilJain17 Jun 8, 2026
22fc9bf
Handle buffer overlap / buffer aliasing for concat operator (llama/24…
nikhilJain17 Jun 8, 2026
302cf1a
ggml-webgpu: Improve prefill speeds for k-quants + refactor matmul fo…
yomaytk Jun 8, 2026
df2c42e
ggml-webgpu: Add clang-format job (llama/24308)
reeselevine Jun 9, 2026
eba2b80
Remove case for GGML_TYPE_Q4_K in mvvq.cu (llama/23528)
ravel7524 Jun 9, 2026
333eca5
ggml-cpu : fix rms_norm_back wrong output under in-place aliasing (ll…
devYRPauli Jun 9, 2026
f352e1b
ggml : add GGML_OP_COL2IM_1D (llama/24206)
ServeurpersoCom Jun 9, 2026
b1f4a88
vulkan: add `v_dot2_f32_f16` support in matrix-matrix multiplication …
0cc4m Jun 9, 2026
4bebb16
vulkan: reduce iq1 shared memory usage for mul_mm (llama/24287)
jeffbolznv Jun 9, 2026
714b533
CUDA: Fix ssm_scan_f32 data-races (llama/24360)
ORippler Jun 10, 2026
adc1965
Remove padding and multiple D2D copies for MTP (llama/24086)
gaugarg-nv Jun 10, 2026
eb551d7
vulkan: use medium matmul tile on Asahi Linux (llama/24306)
xingjianll Jun 11, 2026
288faae
vulkan: add fast path for contiguous buffer transfers (llama/23973)
winstonma Jun 11, 2026
8283d92
ggml : bump version to 0.15.0 (ggml/1539)
ggerganov Jun 11, 2026
4ba780d
vulkan: ifdef eMesaHoneykrisp (build fix) (llama/24479)
jeffbolznv Jun 11, 2026
11ea233
opencl: add q5_0/q5_1 gemm and gemv kernels for Adreno (llama/24319)
shaofeiqi Jun 12, 2026
33c889f
ggml: support concat for scalar types at cuda backend (llama/24011)
zihaomu Jun 12, 2026
217f5cf
ggml : bump version to 0.15.1 (ggml/1541)
ggerganov Jun 12, 2026
5bd6e8d
sync : ggml
ggerganov Jun 15, 2026
ed8738e
talk-llama : sync llama.cpp
ggerganov Jun 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 52 additions & 38 deletions examples/talk-llama/llama-arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include "llama-impl.h"

#include <map>
#include <set>
#include <vector>

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
Expand Down Expand Up @@ -128,6 +127,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_EAGLE3, "eagle3" },
{ LLM_ARCH_MISTRAL4, "mistral4" },
{ LLM_ARCH_PADDLEOCR, "paddleocr" },
{ LLM_ARCH_MIMO2, "mimo2" },
Expand Down Expand Up @@ -292,46 +292,51 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

{ LLM_KV_TARGET_LAYERS, "%s.target_layers" },
{ LLM_KV_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
{ LLM_KV_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" },

{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
// sentence-transformers dense modules feature dims
{ LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },

{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
{ LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
{ LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
{ LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" },
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },

{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
{ LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
{ LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, "tokenizer.ggml.normalizer.strip_accents" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
{ LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
{ LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" },

{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
Expand Down Expand Up @@ -559,6 +564,10 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
{ LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
{ LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
{ LLM_TENSOR_FC, "fc" },
{ LLM_TENSOR_D2T, "d2t" },
};

// declare information about the model weight tensors:
Expand Down Expand Up @@ -783,6 +792,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// latent projections feed ggml_mul_mat, so the buft probe must use MUL_MAT to keep them on GPU
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
{LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
// eagle3
{LLM_TENSOR_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
};

// Stores the given architecture and suffix in the members of the same name.
// All work happens in the member initializer list; the body is intentionally empty.
LLM_KV::LLM_KV(llm_arch arch, const char * suffix)
    : arch(arch)
    , suffix(suffix)
{
}
Expand Down
11 changes: 11 additions & 0 deletions examples/talk-llama/llama-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ enum llm_arch {
LLM_ARCH_KIMI_LINEAR,
LLM_ARCH_TALKIE,
LLM_ARCH_MELLUM,
LLM_ARCH_EAGLE3,
LLM_ARCH_UNKNOWN,
};

Expand Down Expand Up @@ -314,6 +315,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_RWKV,
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS,
LLM_KV_TOKENIZER_FIM_PRE_ID,
LLM_KV_TOKENIZER_FIM_SUF_ID,
LLM_KV_TOKENIZER_FIM_MID_ID,
Expand All @@ -336,6 +338,10 @@ enum llm_kv {

LLM_KV_CLASSIFIER_OUTPUT_LABELS,

LLM_KV_TARGET_LAYERS,
LLM_KV_TARGET_HIDDEN_SIZE,
LLM_KV_NORM_BEFORE_RESIDUAL,

LLM_KV_SHORTCONV_L_CACHE,

LLM_KV_XIELU_ALPHA_N,
Expand Down Expand Up @@ -566,8 +572,13 @@ enum llm_tensor {
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
LLM_TENSOR_MASKED_EMBD_CENTROIDS,
LLM_TENSOR_MASKED_EMBD_ORDERING,
LLM_TENSOR_FC,
LLM_TENSOR_D2T,
};


enum llm_tensor_layer {
LLM_TENSOR_LAYER_INPUT,
LLM_TENSOR_LAYER_REPEATING,
Expand Down
Loading
Loading