Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/ui-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@ on:
required: true

jobs:
build:
name: Build static output
uses: ./.github/workflows/ui-build.yml

publish:
name: Publish UI Static Output
needs: build
runs-on: ubuntu-24.04-arm

permissions:
Expand Down
2 changes: 2 additions & 0 deletions src/llama-model-saver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,8 @@ void llama_model_saver::add_tensors_from_model() {
add_tensor(model->output);
add_tensor(model->output_b);
add_tensor(model->output_norm_enc);
add_tensor(model->output_s);
add_tensor(model->output_in_s);
add_tensor(model->cls);
add_tensor(model->cls_b);
add_tensor(model->cls_out);
Expand Down
15 changes: 14 additions & 1 deletion src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1394,10 +1394,23 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
}
}
// output scales
if (output && output->type == GGML_TYPE_NVFP4) {
// weight scale
if (!output_s) {
output_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "scale"), {1}, TENSOR_NOT_REQUIRED);
}
// input scale
if (!output_in_s) {
output_in_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "input_scale"), {1}, TENSOR_NOT_REQUIRED);
}
}
}

ml.done_getting_tensors();

GGML_ASSERT(!(output && tok_embd &&
strcmp(output->name, tok_embd->name) == 0 &&
output->type == GGML_TYPE_NVFP4));
// populate tensors_by_name
for (auto & [_, ctx_ptr] : ml.ctx_map) {
for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
Expand Down
5 changes: 5 additions & 0 deletions src/llama-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,11 @@ struct llama_model {
struct ggml_tensor * output_b = nullptr;
struct ggml_tensor * output_norm_enc = nullptr;


// NVFP4 LM head (output) scales: output_s is the per-tensor weight scale (scale2), output_in_s is the input scale
struct ggml_tensor * output_s = nullptr;
struct ggml_tensor * output_in_s = nullptr;

// classifier
struct ggml_tensor * cls = nullptr;
struct ggml_tensor * cls_b = nullptr;
Expand Down
2 changes: 1 addition & 1 deletion src/models/afmoe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

Expand Down
2 changes: 1 addition & 1 deletion src/models/apertus.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/arcee.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/arctic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/arwkv7.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/baichuan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/bailingmoe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/bailingmoe2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/bloom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ llama_model_bloom::graph::graph(const llama_model & model, const llm_graph_param
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/chameleon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ llama_model_chameleon::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output_with_img_logits", -1);

// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
Expand Down
2 changes: 1 addition & 1 deletion src/models/chatglm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ llama_model_chatglm::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/codeshell.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ llama_model_codeshell::graph::graph(const llama_model & model, const llm_graph_p
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/cogvlm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ llama_model_cogvlm::graph::graph(const llama_model & model, const llm_graph_para
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
Expand Down
2 changes: 1 addition & 1 deletion src/models/cohere2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

if (f_logit_scale) {
cur = ggml_scale(ctx0, cur, f_logit_scale);
Expand Down
2 changes: 1 addition & 1 deletion src/models/command-r.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ llama_model_command_r::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

if (f_logit_scale) {
cur = ggml_scale(ctx0, cur, f_logit_scale);
Expand Down
2 changes: 1 addition & 1 deletion src/models/dbrx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ llama_model_dbrx::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/deci.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ llama_model_deci::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/deepseek.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ llama_model_deepseek::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/dots1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ llama_model_dots1::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/dream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ llama_model_dream::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/ernie4-5-moe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ llama_model_ernie4_5_moe::graph::graph(const llama_model & model, const llm_grap
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/ernie4-5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ llama_model_ernie4_5::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/exaone-moe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/exaone.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ llama_model_exaone::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/exaone4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/falcon-h1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ llama_model_falcon_h1::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/falcon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ llama_model_falcon::graph::graph(const llama_model & model, const llm_graph_para
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ llama_model_gemma::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ llama_model_gemma2::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

// final logit soft-capping
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ llama_model_gemma3::graph<iswa>::graph(const llama_model & model, const llm_grap
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

if (hparams.f_final_logit_softcapping) {
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma3n.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ llama_model_gemma3n::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

{
// final logit soft-capping
Expand Down
2 changes: 1 addition & 1 deletion src/models/gemma4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

if (hparams.f_final_logit_softcapping) {
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
Expand Down
2 changes: 1 addition & 1 deletion src/models/glm4-moe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/glm4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;

// Output projection
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/gpt2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ llama_model_gpt2::graph::graph(const llama_model & model, const llm_graph_params
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/gptneox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ llama_model_gptneox::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;

cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

cb(cur, "result_output", -1);
res->t_logits = cur;
Expand Down
2 changes: 1 addition & 1 deletion src/models/granite-hybrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);

// For Granite architectures - scale logits
if (hparams.f_logit_scale) {
Expand Down
Loading
Loading