@@ -151,7 +151,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
151151 n_embd_head, n_head, n_head_kv, il);
152152
153153 // compute gate from input
154- ggml_tensor * gate = build_lora_mm (model.layers [il].wqkv_gate , attn_inp);
154+ ggml_tensor * gate = build_lora_mm (model.layers [il].wqkv_gate , attn_inp, model. layers [il]. wqkv_gate_s , model. layers [il]. wqkv_gate_in_s );
155155 cb (gate, " attn_gate_proj" , il);
156156
157157 // Q/K normalization
@@ -186,7 +186,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
186186 cb (cur, " attn_gated" , il);
187187
188188 // now apply output projection
189- cur = build_lora_mm (model.layers [il].wo , cur, model.layers [il].wo_s );
189+ cur = build_lora_mm (model.layers [il].wo , cur, model.layers [il].wo_s , model. layers [il]. wo_in_s );
190190 cb (cur, " attn_o_proj" , il);
191191 }
192192
@@ -224,7 +224,15 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
224224 hparams.expert_weights_norm , // norm_w (route_norm=True)
225225 hparams.expert_weights_scale , // w_scale (route_scale=2.826)
226226 (llama_expert_gating_func_type) hparams.expert_gating_func ,
227- il);
227+ il,
228+ nullptr ,
229+ nullptr ,
230+ model.layers [il].ffn_up_exps_s ,
231+ model.layers [il].ffn_gate_exps_s ,
232+ model.layers [il].ffn_down_exps_s ,
233+ model.layers [il].ffn_up_exps_in_s ,
234+ model.layers [il].ffn_gate_exps_in_s ,
235+ model.layers [il].ffn_down_exps_in_s );
228236 cb (moe_out, " ffn_moe_out" , il);
229237
230238 // shared expert
@@ -234,7 +242,10 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
234242 model.layers [il].ffn_gate_shexp , NULL , NULL ,
235243 model.layers [il].ffn_down_shexp , NULL , NULL ,
236244 NULL ,
237- LLM_FFN_SILU , LLM_FFN_PAR , il);
245+ LLM_FFN_SILU , LLM_FFN_PAR , il,
246+ model.layers [il].ffn_up_shexp_in_s ,
247+ model.layers [il].ffn_gate_shexp_in_s ,
248+ model.layers [il].ffn_down_shexp_in_s );
238249 cb (ffn_shexp, " ffn_shexp" , il);
239250
240251 cur = ggml_add (ctx0, moe_out, ffn_shexp);
@@ -249,7 +260,10 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
249260 model.layers [il].ffn_gate , NULL , NULL ,
250261 model.layers [il].ffn_down , NULL , NULL ,
251262 NULL ,
252- LLM_FFN_SILU , LLM_FFN_PAR , il);
263+ LLM_FFN_SILU , LLM_FFN_PAR , il,
264+ model.layers [il].ffn_up_in_s ,
265+ model.layers [il].ffn_gate_in_s ,
266+ model.layers [il].ffn_down_in_s );
253267 cb (cur, " ffn_out" , il);
254268 }
255269
@@ -277,7 +291,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
277291 res->t_embd = cur;
278292
279293 // lm_head
280- cur = build_lora_mm (model.output , cur, model.output_s );
294+ cur = build_lora_mm (model.output , cur, model.output_s , model. output_in_s );
281295 cb (cur, " result_output" , -1 );
282296 res->t_logits = cur;
283297
0 commit comments