ORippler
diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp
Lines changed: 20 additions & 6 deletions
diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp
Lines changed: 5 additions & 2 deletions
diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp
Lines changed: 14 additions & 3 deletions
diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp
Lines changed: 5 additions & 2 deletions
diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp
Lines changed: 5 additions & 2 deletions
diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp
Lines changed: 14 additions & 3 deletions
diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp
Lines changed: 18 additions & 4 deletions
diff --git a/src/models/bert.cpp b/src/models/bert.cpp
Lines changed: 20 additions & 4 deletions
diff --git a/src/models/bitnet.cpp b/src/models/bitnet.cpp
Lines changed: 6 additions & 3 deletions
@@ -151,7 +151,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
                     n_embd_head, n_head, n_head_kv, il);
 
             // compute gate from input
-            ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
+            ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp, model.layers[il].wqkv_gate_s, model.layers[il].wqkv_gate_in_s);
             cb(gate, "attn_gate_proj", il);
 
             // Q/K normalization
@@ -186,7 +186,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
             cb(cur, "attn_gated", il);
 
             // now apply output projection
-            cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
+            cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s, model.layers[il].wo_in_s);
             cb(cur, "attn_o_proj", il);
         }
 
@@ -224,7 +224,15 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
                     hparams.expert_weights_norm,           // norm_w (route_norm=True)
                     hparams.expert_weights_scale,          // w_scale (route_scale=2.826)
                     (llama_expert_gating_func_type) hparams.expert_gating_func,
-                    il);
+                    il,
+                    nullptr,
+                    nullptr,
+                    model.layers[il].ffn_up_exps_s,
+                    model.layers[il].ffn_gate_exps_s,
+                    model.layers[il].ffn_down_exps_s,
+                    model.layers[il].ffn_up_exps_in_s,
+                    model.layers[il].ffn_gate_exps_in_s,
+                    model.layers[il].ffn_down_exps_in_s);
             cb(moe_out, "ffn_moe_out", il);
 
             // shared expert
@@ -234,7 +242,10 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
                         model.layers[il].ffn_gate_shexp, NULL, NULL,
                         model.layers[il].ffn_down_shexp, NULL, NULL,
                         NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                        LLM_FFN_SILU, LLM_FFN_PAR, il,
+                        model.layers[il].ffn_up_shexp_in_s,
+                        model.layers[il].ffn_gate_shexp_in_s,
+                        model.layers[il].ffn_down_shexp_in_s);
                 cb(ffn_shexp, "ffn_shexp", il);
 
                 cur = ggml_add(ctx0, moe_out, ffn_shexp);
@@ -249,7 +260,10 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    LLM_FFN_SILU, LLM_FFN_PAR, il,
+                    model.layers[il].ffn_up_in_s,
+                    model.layers[il].ffn_gate_in_s,
+                    model.layers[il].ffn_down_in_s);
             cb(cur, "ffn_out", il);
         }
 
@@ -277,7 +291,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
     res->t_embd = cur;
 
     // lm_head
-    cur = build_lora_mm(model.output, cur, model.output_s);
+    cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
 
@@ -125,7 +125,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
             cb(cur, "ffn_norm", il);
 
             // Up projection
-            ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
+            ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur, model.layers[il].ffn_up_s, model.layers[il].ffn_up_in_s);
             cb(up, "ffn_up", il);
 
             float alpha_n_val = hparams.xielu_alpha_n[il];
@@ -138,7 +138,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
             cb(activated, "ffn_xielu", il);
 
             // Down projection
-            cur = build_lora_mm(model.layers[il].ffn_down, activated);
+            cur = build_lora_mm(model.layers[il].ffn_down, activated, model.layers[il].ffn_down_s, model.layers[il].ffn_down_in_s);
             cb(cur, "ffn_down", il);
         }
 
@@ -160,7 +160,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
     res->t_embd = cur;
 
     // lm_head
-    cur = build_lora_mm(model.output, cur, model.output_s);
+    cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
 
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
@@ -125,7 +125,10 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param
                 NULL,                      NULL, NULL,
                 model.layers[il].ffn_down, NULL, NULL,
                 NULL,
-                LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+                LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il,
+                model.layers[il].ffn_up_in_s,
+                nullptr,
+                model.layers[il].ffn_down_in_s);
         cb(cur, "ffn_out", il);
 
         cur = ggml_add(ctx0, cur, ffn_inp);
@@ -148,7 +151,7 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param
     res->t_embd = cur;
 
     // lm_head
-    cur = build_lora_mm(model.output, cur, model.output_s);
+    cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
 
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
@@ -126,7 +126,10 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
                 model.layers[il].ffn_gate, NULL, NULL,
                 model.layers[il].ffn_down, NULL, NULL,
                 NULL,
-                LLM_FFN_SILU, LLM_FFN_PAR, il);
+                LLM_FFN_SILU, LLM_FFN_PAR, il,
+                model.layers[il].ffn_up_in_s,
+                model.layers[il].ffn_gate_in_s,
+                model.layers[il].ffn_down_in_s);
         cb(cur, "ffn_out", il);
 
         ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
@@ -148,7 +151,15 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
                 LLM_FFN_SILU, true,
                 hparams.expert_weights_scale,
                 LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                il);
+                il,
+                nullptr,
+                nullptr,
+                model.layers[il].ffn_up_exps_s,
+                model.layers[il].ffn_gate_exps_s,
+                model.layers[il].ffn_down_exps_s,
+                model.layers[il].ffn_up_exps_in_s,
+                model.layers[il].ffn_gate_exps_in_s,
+                model.layers[il].ffn_down_exps_in_s);
         cb(cur, "ffn_moe_out", il);
 
         cur = ggml_add(ctx0, cur, ffn_out);
@@ -171,7 +182,7 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
     res->t_embd = cur;
 
     // lm_head
-    cur = build_lora_mm(model.output, cur, model.output_s);
+    cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
 
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
@@ -176,7 +176,10 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para
                 model.layers[il].ffn_gate, NULL, NULL,
                 model.layers[il].ffn_down, NULL, NULL,
                 NULL,
-                LLM_FFN_SILU, LLM_FFN_PAR, il);
+                LLM_FFN_SILU, LLM_FFN_PAR, il,
+                model.layers[il].ffn_up_in_s,
+                model.layers[il].ffn_gate_in_s,
+                model.layers[il].ffn_down_in_s);
         cb(cur, "ffn_out", il);
 
         cur = ggml_add(ctx0, cur, ffn_inp);
@@ -193,7 +196,7 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
-    cur = build_lora_mm(model.output, cur, model.output_s);
+    cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
 
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
@@ -123,7 +123,10 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    LLM_FFN_SILU, LLM_FFN_PAR, il,
+                    model.layers[il].ffn_up_in_s,
+                    model.layers[il].ffn_gate_in_s,
+                    model.layers[il].ffn_down_in_s);
             cb(cur, "ffn_out", il);
         }
 
@@ -146,7 +149,7 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa
     res->t_embd = cur;
 
     // lm_head
-    cur = build_lora_mm(model.output, cur, model.output_s);
+    cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
 
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
@@ -135,7 +135,15 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
                     LLM_FFN_SILU, hparams.expert_weights_norm,
                     hparams.expert_weights_scale,
                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                    il);
+                    il,
+                    nullptr,
+                    nullptr,
+                    model.layers[il].ffn_up_exps_s,
+                    model.layers[il].ffn_gate_exps_s,
+                    model.layers[il].ffn_down_exps_s,
+                    model.layers[il].ffn_up_exps_in_s,
+                    model.layers[il].ffn_gate_exps_in_s,
+                    model.layers[il].ffn_down_exps_in_s);
         cb(moe_out, "ffn_moe_out", il);
 
         // FFN shared expert
@@ -145,7 +153,10 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
                     model.layers[il].ffn_gate_shexp, NULL, NULL,
                     model.layers[il].ffn_down_shexp, NULL, NULL,
                     NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    LLM_FFN_SILU, LLM_FFN_PAR, il,
+                    model.layers[il].ffn_up_shexp_in_s,
+                    model.layers[il].ffn_gate_shexp_in_s,
+                    model.layers[il].ffn_down_shexp_in_s);
             cb(ffn_shexp, "ffn_shexp", il);
 
             cur = ggml_add(ctx0, moe_out, ffn_shexp);
@@ -171,7 +182,7 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
     res->t_embd = cur;
 
     // lm_head
-    cur = build_lora_mm(model.output, cur, model.output_s);
+    cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
 
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
@@ -163,7 +163,10 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
                     model.layers[il].ffn_up, NULL, NULL,
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, NULL, NULL,
-                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il,
+                    model.layers[il].ffn_up_in_s,
+                    model.layers[il].ffn_gate_in_s,
+                    model.layers[il].ffn_down_in_s);
             cb(cur, "ffn_out", il);
         } else {
             ggml_tensor * moe_out = build_moe_ffn(cur,
@@ -176,7 +179,15 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
                 LLM_FFN_SILU, hparams.expert_weights_norm,
                 hparams.expert_weights_scale,
                 (llama_expert_gating_func_type) hparams.expert_gating_func,
-                il);
+                il,
+                nullptr,
+                nullptr,
+                model.layers[il].ffn_up_exps_s,
+                model.layers[il].ffn_gate_exps_s,
+                model.layers[il].ffn_down_exps_s,
+                model.layers[il].ffn_up_exps_in_s,
+                model.layers[il].ffn_gate_exps_in_s,
+                model.layers[il].ffn_down_exps_in_s);
             cb(moe_out, "ffn_moe_out", il);
 
             {
@@ -185,7 +196,10 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
                         model.layers[il].ffn_up_shexp, NULL, NULL,
                         model.layers[il].ffn_gate_shexp, NULL, NULL,
                         model.layers[il].ffn_down_shexp, NULL, NULL,
-                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il,
+                        model.layers[il].ffn_up_shexp_in_s,
+                        model.layers[il].ffn_gate_shexp_in_s,
+                        model.layers[il].ffn_down_shexp_in_s);
                 cb(ffn_shexp, "ffn_shexp", il);
 
                 cur = ggml_add(ctx0, moe_out, ffn_shexp);
@@ -210,7 +224,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
     res->t_embd = cur;
 
     // lm_head
-    cur = build_lora_mm(model.output, cur, model.output_s);
+    cur = build_lora_mm(model.output, cur, model.output_s, model.output_in_s);
 
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
@@ -186,15 +186,25 @@ llama_model_bert::graph::graph(const llama_model & model, const llm_graph_params
                     LLM_FFN_GELU, false,
                     hparams.expert_weights_scale,
                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
-                    il);
+                    il,
+                    nullptr, nullptr,
+                    model.layers[il].ffn_up_exps_s,
+                    nullptr,
+                    model.layers[il].ffn_down_exps_s,
+                    model.layers[il].ffn_up_exps_in_s,
+                    nullptr,
+                    model.layers[il].ffn_down_exps_in_s);
             cb(cur, "ffn_moe_out", il);
         } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
                    model.arch == LLM_ARCH_JINA_BERT_V3) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                     NULL, NULL, NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
+                    LLM_FFN_GELU, LLM_FFN_SEQ, il,
+                    model.layers[il].ffn_up_in_s,
+                    nullptr,
+                    model.layers[il].ffn_down_in_s);
             cb(cur, "ffn_out", il);
         } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
             const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
@@ -203,14 +213,20 @@ llama_model_bert::graph::graph(const llama_model & model, const llm_graph_params
                     model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-                    type_op, LLM_FFN_PAR, il);
+                    type_op, LLM_FFN_PAR, il,
+                    model.layers[il].ffn_up_in_s,
+                    model.layers[il].ffn_gate_in_s,
+                    model.layers[il].ffn_down_in_s);
             cb(cur, "ffn_out", il);
         } else {
             cur = build_ffn(cur,
                 model.layers[il].ffn_up, NULL, NULL,
                 model.layers[il].ffn_gate, NULL, NULL,
                 model.layers[il].ffn_down, NULL, NULL,
-                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il,
+                model.layers[il].ffn_up_in_s,
+                model.layers[il].ffn_gate_in_s,
+                model.layers[il].ffn_down_in_s);
             cb(cur, "ffn_out", il);
         }
 
 
@@ -103,7 +103,7 @@ llama_model_bitnet::graph::graph(const llama_model & model, const llm_graph_para
                     LLM_NORM_RMS, il);
             cb(cur, "attn_sub_norm", il);
 
-            cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
+            cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s, model.layers[il].wo_in_s);
             if (model.layers[il].wo_b) {
                 cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
             }
@@ -129,15 +129,18 @@ llama_model_bitnet::graph::graph(const llama_model & model, const llm_graph_para
                 model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
                 NULL,                      NULL, NULL,
                 NULL,
-                LLM_FFN_SILU, LLM_FFN_PAR, il);
+                LLM_FFN_SILU, LLM_FFN_PAR, il,
+                model.layers[il].ffn_up_in_s,
+                model.layers[il].ffn_gate_in_s,
+                nullptr);
         cb(cur, "ffn_sub_out", il);
 
         cur = build_norm(cur,
                 model.layers[il].ffn_sub_norm, NULL,
                 LLM_NORM_RMS, il);
         cb(cur, "ffn_sub_norm", il);
 
-        cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
+        cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s, model.layers[il].ffn_down_in_s);
         cb(cur, "ffn_down", il);
 
         cur = ggml_add(ctx0, cur, ffn_inp);