diff --git a/.github/workflows/ui-publish.yml b/.github/workflows/ui-publish.yml index 33d7415c9c4..0df358557ef 100644 --- a/.github/workflows/ui-publish.yml +++ b/.github/workflows/ui-publish.yml @@ -13,8 +13,13 @@ on: required: true jobs: + build: + name: Build static output + uses: ./.github/workflows/ui-build.yml + publish: name: Publish UI Static Output + needs: build runs-on: ubuntu-24.04-arm permissions: diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index e83056557bf..528e4c9c069 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -393,6 +393,8 @@ void llama_model_saver::add_tensors_from_model() { add_tensor(model->output); add_tensor(model->output_b); add_tensor(model->output_norm_enc); + add_tensor(model->output_s); + add_tensor(model->output_in_s); add_tensor(model->cls); add_tensor(model->cls_b); add_tensor(model->cls_out); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ff30a2ae7a6..46ae010f800 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1394,10 +1394,23 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); } } + // output scales + if (output && output->type == GGML_TYPE_NVFP4) { + // weight scale + if (!output_s) { + output_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "scale"), {1}, TENSOR_NOT_REQUIRED); + } + // input scale + if (!output_in_s) { + output_in_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "input_scale"), {1}, TENSOR_NOT_REQUIRED); + } + } } - ml.done_getting_tensors(); + GGML_ASSERT(!(output && tok_embd && + strcmp(output->name, tok_embd->name) == 0 && + output->type == GGML_TYPE_NVFP4)); // populate tensors_by_name for (auto & [_, ctx_ptr] : ml.ctx_map) { for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) { diff --git a/src/llama-model.h b/src/llama-model.h index d63c689185a..01c87a75271 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -533,6 +533,11 @@ struct llama_model { struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; + + // NVFP4 per-tensor scale2, input_scale for LM head + struct ggml_tensor * output_s = nullptr; + struct ggml_tensor * output_in_s = nullptr; + // classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index 602e3176afd..a7c77ee5d28 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -277,7 +277,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp index 136ff702957..bec7136521c 100644 --- a/src/models/apertus.cpp +++ b/src/models/apertus.cpp @@ -160,7 +160,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp index 70e86d41130..d086c4717ff 100644 --- a/src/models/arcee.cpp +++ b/src/models/arcee.cpp @@ -148,7 +148,7 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp index d8653a44639..27deadffeb7 100644 --- a/src/models/arctic.cpp +++ b/src/models/arctic.cpp @@ -171,7 +171,7 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp index 79aa8c90899..9bd04127b25 100644 --- a/src/models/arwkv7.cpp +++ b/src/models/arwkv7.cpp @@ -193,7 +193,7 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp index 4e55290e4e5..4d26081cd5d 100644 --- a/src/models/baichuan.cpp +++ b/src/models/baichuan.cpp @@ -146,7 +146,7 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp index 030dd4f42a4..fe1ae10864b 100644 --- a/src/models/bailingmoe.cpp +++ b/src/models/bailingmoe.cpp @@ -171,7 +171,7 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp index e7fe3d5b45a..2f0d44a6259 100644 --- a/src/models/bailingmoe2.cpp +++ b/src/models/bailingmoe2.cpp @@ -210,7 +210,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp index b600fb0c954..30b0f3d07d0 100644 --- a/src/models/bloom.cpp +++ b/src/models/bloom.cpp @@ -142,7 +142,7 @@ llama_model_bloom::graph::graph(const llama_model & model, const llm_graph_param cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp index 8510b9e29f8..4bceaefd63b 100644 --- a/src/models/chameleon.cpp +++ b/src/models/chameleon.cpp @@ -181,7 +181,7 @@ llama_model_chameleon::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output_with_img_logits", -1); // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp index e898eff7939..6766fa71c15 100644 --- a/src/models/chatglm.cpp +++ b/src/models/chatglm.cpp @@ -151,7 +151,7 @@ llama_model_chatglm::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp index e9e85d96713..274dd3342a7 100644 --- a/src/models/codeshell.cpp +++ b/src/models/codeshell.cpp @@ -143,7 +143,7 @@ llama_model_codeshell::graph::graph(const llama_model & model, const llm_graph_p cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp index 79236121bd5..2e231bb3f93 100644 --- a/src/models/cogvlm.cpp +++ b/src/models/cogvlm.cpp @@ -150,7 +150,7 @@ llama_model_cogvlm::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; ggml_build_forward_expand(gf, cur); diff --git a/src/models/cohere2.cpp b/src/models/cohere2.cpp index 12edbae1094..a514cf88fc6 100644 --- a/src/models/cohere2.cpp +++ b/src/models/cohere2.cpp @@ -146,7 +146,7 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp index decb89f547b..adf7fcaa20f 100644 --- a/src/models/command-r.cpp +++ b/src/models/command-r.cpp @@ -131,7 +131,7 @@ llama_model_command_r::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp index bce6b04bcf9..af71c775365 100644 --- a/src/models/dbrx.cpp +++ b/src/models/dbrx.cpp @@ -145,7 +145,7 @@ llama_model_dbrx::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/deci.cpp b/src/models/deci.cpp index 9f1a959c32c..567e3535276 100644 --- a/src/models/deci.cpp +++ b/src/models/deci.cpp @@ -181,7 +181,7 @@ llama_model_deci::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/deepseek.cpp b/src/models/deepseek.cpp index c7946059662..f52ec9518b6 100644 --- a/src/models/deepseek.cpp +++ b/src/models/deepseek.cpp @@ -185,7 +185,7 @@ llama_model_deepseek::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp index 93cbcf9d931..435d27281c6 100644 --- a/src/models/dots1.cpp +++ b/src/models/dots1.cpp @@ -183,7 +183,7 @@ llama_model_dots1::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/dream.cpp b/src/models/dream.cpp index 60a3f0ec285..12ac6f1ce88 100644 --- a/src/models/dream.cpp +++ b/src/models/dream.cpp @@ -128,7 +128,7 @@ llama_model_dream::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/ernie4-5-moe.cpp b/src/models/ernie4-5-moe.cpp index 2bd01a2c512..8d9ff138676 100644 --- a/src/models/ernie4-5-moe.cpp +++ b/src/models/ernie4-5-moe.cpp @@ -124,7 +124,7 @@ llama_model_ernie4_5_moe::graph::graph(const llama_model & model, const llm_grap res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp index fa989fe92cd..9b39c605e35 100644 --- a/src/models/ernie4-5.cpp +++ b/src/models/ernie4-5.cpp @@ -155,7 +155,7 @@ llama_model_ernie4_5::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index 54bb3ca86b3..76d91982fc5 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -237,7 +237,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp index 75d5f60631c..c7e9960d718 100644 --- a/src/models/exaone.cpp +++ b/src/models/exaone.cpp @@ -127,7 +127,7 @@ llama_model_exaone::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp index 5506e76424d..499e22dde81 100644 --- a/src/models/exaone4.cpp +++ b/src/models/exaone4.cpp @@ -163,7 +163,7 @@ llama_model_exaone4::graph::graph(const llama_model & model, const llm_gra res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp index d353befdb8e..94b65a3c7c9 100644 --- a/src/models/falcon-h1.cpp +++ b/src/models/falcon-h1.cpp @@ -200,7 +200,7 @@ llama_model_falcon_h1::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp index 75f2cfef560..ad546ef2db5 100644 --- a/src/models/falcon.cpp +++ b/src/models/falcon.cpp @@ -152,7 +152,7 @@ llama_model_falcon::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp index 06731670007..1519682fdf6 100644 --- a/src/models/gemma.cpp +++ b/src/models/gemma.cpp @@ -130,7 +130,7 @@ llama_model_gemma::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gemma2.cpp b/src/models/gemma2.cpp index 6255bf740fc..ae3f9ffb530 100644 --- a/src/models/gemma2.cpp +++ b/src/models/gemma2.cpp @@ -163,7 +163,7 @@ llama_model_gemma2::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp index ee510fe38b0..63a2b380e71 100644 --- a/src/models/gemma3.cpp +++ b/src/models/gemma3.cpp @@ -207,7 +207,7 @@ llama_model_gemma3::graph::graph(const llama_model & model, const llm_grap res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (hparams.f_final_logit_softcapping) { cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); diff --git a/src/models/gemma3n.cpp b/src/models/gemma3n.cpp index 881499b0ca7..6ec3a006081 100644 --- a/src/models/gemma3n.cpp +++ b/src/models/gemma3n.cpp @@ -296,7 +296,7 @@ llama_model_gemma3n::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); { // final logit soft-capping diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index f45ae4cad59..4f9d8b18bc7 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -380,7 +380,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (hparams.f_final_logit_softcapping) { cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 45886b51ac1..27654b8cba3 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -275,7 +275,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index d6ef76e26d6..7c242fed298 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -185,7 +185,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // Output projection - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp index ba49c31b56b..e2dcc8b1521 100644 --- a/src/models/gpt2.cpp +++ b/src/models/gpt2.cpp @@ -138,7 +138,7 @@ llama_model_gpt2::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp index 33ebe2d8800..443e35addf2 100644 --- a/src/models/gptneox.cpp +++ b/src/models/gptneox.cpp @@ -209,7 +209,7 @@ llama_model_gptneox::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index 12e4790ae24..27f6706ea10 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -186,7 +186,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); // For Granite architectures - scale logits if (hparams.f_logit_scale) { diff --git a/src/models/granite.cpp b/src/models/granite.cpp index 5e7c7b68181..cda4aa231fa 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -145,7 +145,7 @@ llama_model_granite::graph::graph( res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); // For Granite architectures - scale logits cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); diff --git a/src/models/grok.cpp b/src/models/grok.cpp index 0bc49d00206..7c46ec1c0f2 100644 --- a/src/models/grok.cpp +++ b/src/models/grok.cpp @@ -206,7 +206,7 @@ llama_model_grok::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cur = ggml_scale(ctx0, cur, hparams.f_logit_scale); diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp index feef815165b..1cab75adc7f 100644 --- a/src/models/grovemoe.cpp +++ b/src/models/grovemoe.cpp @@ -184,7 +184,7 @@ llama_model_grovemoe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp index 44af42412f7..deb3c9671f3 100644 --- a/src/models/hunyuan-moe.cpp +++ b/src/models/hunyuan-moe.cpp @@ -179,7 +179,7 @@ llama_model_hunyuan_moe::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/hunyuan-vl.cpp b/src/models/hunyuan-vl.cpp index 5fb9154bec0..da9bb74de7e 100644 --- a/src/models/hunyuan-vl.cpp +++ b/src/models/hunyuan-vl.cpp @@ -181,7 +181,7 @@ llama_model_hunyuan_vl::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp index f0c5580a6f4..f9ee37a24b6 100644 --- a/src/models/internlm2.cpp +++ b/src/models/internlm2.cpp @@ -129,7 +129,7 @@ llama_model_internlm2::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/jais.cpp b/src/models/jais.cpp index a6451dca095..2ba162605f1 100644 --- a/src/models/jais.cpp +++ b/src/models/jais.cpp @@ -123,7 +123,7 @@ llama_model_jais::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp index ad59b953e8d..8966131441c 100644 --- a/src/models/jais2.cpp +++ b/src/models/jais2.cpp @@ -152,7 +152,7 @@ llama_model_jais2::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // Output projection - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index e1b8d137e38..84ea63c3136 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -189,7 +189,7 @@ llama_model_jamba::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index df6a8028736..29081344b24 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -262,7 +262,7 @@ llama_model_lfm2::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp index b60f67f6c4b..9722dde9f17 100644 --- a/src/models/llada-moe.cpp +++ b/src/models/llada-moe.cpp @@ -153,7 +153,7 @@ llama_model_llada_moe::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llada.cpp b/src/models/llada.cpp index fa21c5fe32c..58b2c466e17 100644 --- a/src/models/llada.cpp +++ b/src/models/llada.cpp @@ -147,7 +147,7 @@ llama_model_llada::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llama.cpp b/src/models/llama.cpp index 8ddb5936820..cef66d054b0 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -235,7 +235,7 @@ llama_model_llama::graph::graph(const llama_model & model, const llm_grap if constexpr (!embed) { // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp index 899611d53f6..0ff5376d571 100644 --- a/src/models/llama4.cpp +++ b/src/models/llama4.cpp @@ -260,7 +260,7 @@ llama_model_llama4::graph::graph(const llama_model & model, const llm_grap res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp index 3dbd82fd362..84cfe399027 100644 --- a/src/models/maincoder.cpp +++ b/src/models/maincoder.cpp @@ -141,7 +141,7 @@ llama_model_maincoder::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mamba.cpp b/src/models/mamba.cpp index b7708d7fdd1..887a1fa509a 100644 --- a/src/models/mamba.cpp +++ b/src/models/mamba.cpp @@ -128,7 +128,7 @@ llama_model_mamba::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mimo2.cpp b/src/models/mimo2.cpp index 71996616611..d0295ec116f 100644 --- a/src/models/mimo2.cpp +++ b/src/models/mimo2.cpp @@ -231,7 +231,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp index ff5eb6ffa5f..1ffc54fa7c6 100644 --- a/src/models/minicpm3.cpp +++ b/src/models/minicpm3.cpp @@ -251,7 +251,7 @@ llama_model_minicpm3::graph::graph(const llama_model & model, const llm_graph_pa cb(cur, "lmhead_scaling", -1); // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp index 0dee8934692..22e291d73a3 100644 --- a/src/models/minimax-m2.cpp +++ b/src/models/minimax-m2.cpp @@ -158,7 +158,7 @@ llama_model_minimax_m2::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp index 708da49af1f..4e6ebef82cb 100644 --- a/src/models/mistral3.cpp +++ b/src/models/mistral3.cpp @@ -222,7 +222,7 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp index cfc60e8de29..0229d20ed36 100644 --- a/src/models/mpt.cpp +++ b/src/models/mpt.cpp @@ -161,7 +161,7 @@ llama_model_mpt::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index 865461f61db..a82f9c170b4 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -174,7 +174,7 @@ llama_model_nemotron_h::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/nemotron.cpp b/src/models/nemotron.cpp index 0c72ed297aa..5d4a3b5c69e 100644 --- a/src/models/nemotron.cpp +++ b/src/models/nemotron.cpp @@ -140,7 +140,7 @@ llama_model_nemotron::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp index 161035e72bc..cfcf17bcb03 100644 --- a/src/models/olmo.cpp +++ b/src/models/olmo.cpp @@ -133,7 +133,7 @@ llama_model_olmo::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp index 9633f269965..7cc262f5504 100644 --- a/src/models/olmo2.cpp +++ b/src/models/olmo2.cpp @@ -198,7 +198,7 @@ llama_model_olmo2::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp index 4bb9013054c..7976ae44a51 100644 --- a/src/models/olmoe.cpp +++ b/src/models/olmoe.cpp @@ -164,7 +164,7 @@ llama_model_olmoe::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp index 13a590ce646..15b6c8c1205 100644 --- a/src/models/openai-moe.cpp +++ b/src/models/openai-moe.cpp @@ -160,7 +160,7 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp index b4128e116e7..9f76350fd4d 100644 --- a/src/models/openelm.cpp +++ b/src/models/openelm.cpp @@ -162,7 +162,7 @@ llama_model_openelm::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/orion.cpp b/src/models/orion.cpp index 7ace0a5139d..bcb4bbba4b1 100644 --- a/src/models/orion.cpp +++ b/src/models/orion.cpp @@ -132,7 +132,7 @@ llama_model_orion::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/paddleocr.cpp b/src/models/paddleocr.cpp index 1c0eadefa98..d39220bd778 100644 --- a/src/models/paddleocr.cpp +++ b/src/models/paddleocr.cpp @@ -98,7 +98,7 @@ llama_model_paddleocr::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/pangu-embed.cpp b/src/models/pangu-embed.cpp index 41b7e2ac23e..7593f879b24 100644 --- a/src/models/pangu-embed.cpp +++ b/src/models/pangu-embed.cpp @@ -148,7 +148,7 @@ llama_model_pangu_embed::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (model.output_b != nullptr) { cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp index a333602c72d..8f3ed5f7b7d 100644 --- a/src/models/phi2.cpp +++ b/src/models/phi2.cpp @@ -130,7 +130,7 @@ llama_model_phi2::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp index 0a65e91fefa..f8a4a4d5aa5 100644 --- a/src/models/phi3.cpp +++ b/src/models/phi3.cpp @@ -179,7 +179,7 @@ llama_model_phi3::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (model.output_b != nullptr) { cb(cur, "result_output_no_bias", -1); diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp index 4c16c20a0d4..c7ed1211c31 100644 --- a/src/models/plamo.cpp +++ b/src/models/plamo.cpp @@ -127,7 +127,7 @@ llama_model_plamo::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp index 29c8702606a..b713889fe72 100644 --- a/src/models/plamo2.cpp +++ b/src/models/plamo2.cpp @@ -185,7 +185,7 @@ llama_model_plamo2::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); // Explicitly mark as output tensor to ensure proper backend assignment diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 849f1579e63..29f3e803d68 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -186,7 +186,7 @@ llama_model_plamo3::graph::graph(const llama_model & model, const llm_grap cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); res->t_logits = cur; ggml_build_forward_expand(gf, cur); diff --git a/src/models/plm.cpp b/src/models/plm.cpp index 57f5995103b..ce050919e6a 100644 --- a/src/models/plm.cpp +++ b/src/models/plm.cpp @@ -204,7 +204,7 @@ llama_model_plm::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp index cdc076cdf77..00467dbad7d 100644 --- a/src/models/qwen.cpp +++ b/src/models/qwen.cpp @@ -131,7 +131,7 @@ llama_model_qwen::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp index 6320458a13b..a5147460bae 100644 --- a/src/models/qwen2.cpp +++ b/src/models/qwen2.cpp @@ -141,7 +141,7 @@ llama_model_qwen2::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (model.output_b != nullptr) { cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp index 7587c802c68..7cb03859deb 100644 --- a/src/models/qwen2moe.cpp +++ b/src/models/qwen2moe.cpp @@ -184,7 +184,7 @@ llama_model_qwen2moe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen2vl.cpp b/src/models/qwen2vl.cpp index 1a40fa89be4..d79db682cd4 100644 --- a/src/models/qwen2vl.cpp +++ b/src/models/qwen2vl.cpp @@ -134,7 +134,7 @@ llama_model_qwen2vl::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index fa656c84ea0..41b97fed956 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -147,7 +147,7 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index f276be61ba8..b188810f931 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -167,7 +167,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // LM head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index cf05dc9d61c..8ec9b8c6f7d 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -180,7 +180,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // LM head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index 4440b83aa45..a4f8e1379c9 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -168,7 +168,7 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index cb1b4814caf..bdc3026c1de 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -176,7 +176,7 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // LM head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 7871f8f7952..5defd893944 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -163,7 +163,7 @@ llama_model_qwen3vl::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3vlmoe.cpp b/src/models/qwen3vlmoe.cpp index b99143c8908..5b77df57122 100644 --- a/src/models/qwen3vlmoe.cpp +++ b/src/models/qwen3vlmoe.cpp @@ -180,7 +180,7 @@ llama_model_qwen3vlmoe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/refact.cpp b/src/models/refact.cpp index f14f10917ff..bf3949a9092 100644 --- a/src/models/refact.cpp +++ b/src/models/refact.cpp @@ -150,7 +150,7 @@ llama_model_refact::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp index 325ee73ba5c..ca8e009615e 100644 --- a/src/models/rnd1.cpp +++ b/src/models/rnd1.cpp @@ -167,7 +167,7 @@ llama_model_rnd1::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rwkv6.cpp b/src/models/rwkv6.cpp index 2944711acec..ba2a9dfa0db 100644 --- a/src/models/rwkv6.cpp +++ b/src/models/rwkv6.cpp @@ -176,7 +176,7 @@ llama_model_rwkv6::graph::graph(const llama_model & model, const llm_graph_param cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rwkv6qwen2.cpp b/src/models/rwkv6qwen2.cpp index 6f7d1f5722f..566b8cdcb54 100644 --- a/src/models/rwkv6qwen2.cpp +++ b/src/models/rwkv6qwen2.cpp @@ -158,7 +158,7 @@ llama_model_rwkv6qwen2::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rwkv7.cpp b/src/models/rwkv7.cpp index b205e3935e1..7574b252621 100644 --- a/src/models/rwkv7.cpp +++ b/src/models/rwkv7.cpp @@ -202,7 +202,7 @@ llama_model_rwkv7::graph::graph(const llama_model & model, const llm_graph_param cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp index 83e114740b6..806cba574be 100644 --- a/src/models/seed-oss.cpp +++ b/src/models/seed-oss.cpp @@ -141,7 +141,7 @@ llama_model_seed_oss::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 3214e7cbad3..4231cccc666 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -178,7 +178,7 @@ llama_model_smallthinker::graph::graph(const llama_model & model, const ll res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp index 7adaf34c534..90e7d473eaf 100644 --- a/src/models/smollm3.cpp +++ b/src/models/smollm3.cpp @@ -143,7 +143,7 @@ llama_model_smollm3::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp index 8f613e55947..4da7f7aefcf 100644 --- a/src/models/stablelm.cpp +++ b/src/models/stablelm.cpp @@ -163,7 +163,7 @@ llama_model_stablelm::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp index 58cf0ac0edc..e131af058bc 100644 --- a/src/models/starcoder.cpp +++ b/src/models/starcoder.cpp @@ -135,7 +135,7 @@ llama_model_starcoder::graph::graph(const llama_model & model, const llm_graph_p cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp index 45dae0602d4..9c207c02885 100644 --- a/src/models/starcoder2.cpp +++ b/src/models/starcoder2.cpp @@ -148,7 +148,7 @@ llama_model_starcoder2::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/step35.cpp b/src/models/step35.cpp index c4789752d21..3b68e68707a 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -261,7 +261,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/t5.cpp b/src/models/t5.cpp index 27a0711ba41..73e32741406 100644 --- a/src/models/t5.cpp +++ b/src/models/t5.cpp @@ -265,7 +265,7 @@ llama_model_t5::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/wavtokenizer-dec.cpp b/src/models/wavtokenizer-dec.cpp index a873e5d2e8f..214fed99bad 100644 --- a/src/models/wavtokenizer-dec.cpp +++ b/src/models/wavtokenizer-dec.cpp @@ -253,7 +253,7 @@ llama_model_wavtokenizer_dec::graph::graph(const llama_model & model, const llm_ LLM_NORM, -1); // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp index e4d111e622a..d6d1c7a2e5d 100644 --- a/src/models/xverse.cpp +++ b/src/models/xverse.cpp @@ -126,7 +126,7 @@ llama_model_xverse::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte index ce001499245..e03d13fefda 100644 --- a/tools/ui/src/routes/+layout.svelte +++ b/tools/ui/src/routes/+layout.svelte @@ -153,7 +153,9 @@ const serverProps = serverStore.props; if (serverProps) { - settingsStore.syncWithServerDefaults(); + untrack(() => { + settingsStore.syncWithServerDefaults(); + }); } });