@@ -209,16 +209,16 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
209209 }
210210 cur = inpL;
211211
212+ // post-norm hidden state feeds both the LM head and the MTP seed below
213+ cur = build_norm (cur, model.output_norm , nullptr , LLM_NORM_RMS, -1 );
214+
212215 cb (cur, " h_pre_norm" , -1 );
213216 res->t_h_pre_norm = cur;
214217
215218 if (!cparams.embeddings_pre_norm_masked && inp_out_ids) {
216219 cur = ggml_get_rows (ctx0, cur, inp_out_ids);
217220 }
218221
219- // Final norm
220- cur = build_norm (cur, model.output_norm , nullptr , LLM_NORM_RMS, -1 );
221-
222222 cb (cur, " result_norm" , -1 );
223223 res->t_embd = cur;
224224
@@ -625,18 +625,17 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
625625 cur = ggml_add (ctx0, cur, ffn_residual);
626626 cb (cur, " mtp_post_ffn" , il);
627627
628- // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
629- // (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.)
630- cb (cur, " h_pre_norm" , -1 );
631- res->t_h_pre_norm = cur;
632-
633- cur = ggml_get_rows (ctx0, cur, inp_out_ids);
634-
635628 ggml_tensor * head_norm_w = layer.nextn .shared_head_norm
636629 ? layer.nextn .shared_head_norm
637630 : model.output_norm ;
638631 GGML_ASSERT (head_norm_w && " QWEN35 MTP: missing both nextn.shared_head_norm and output_norm" );
639632 cur = build_norm (cur, head_norm_w, nullptr , LLM_NORM_RMS, -1 );
633+
634+ // NOTE: despite the "h_pre_norm" name, this tensor is captured after the head RMS norm above (i.e. it is post-norm)
635+ cb (cur, " h_pre_norm" , -1 );
636+ res->t_h_pre_norm = cur;
637+
638+ cur = ggml_get_rows (ctx0, cur, inp_out_ids);
640639 cb (cur, " mtp_shared_head_norm" , -1 );
641640
642641 ggml_tensor * head_w = layer.nextn .shared_head_head ? layer.nextn .shared_head_head : model.output ;
0 commit comments