@@ -208,16 +208,16 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
208208 }
209209 cur = inpL;
210210
211+ // post-norm hidden state feeds both the LM head and the MTP seed below
212+ cur = build_norm (cur, model.output_norm , nullptr , LLM_NORM_RMS, -1 );
213+
211214 cb (cur, " h_pre_norm" , -1 );
212215 res->t_h_pre_norm = cur;
213216
214217 if (!cparams.embeddings_pre_norm_masked && inp_out_ids) {
215218 cur = ggml_get_rows (ctx0, cur, inp_out_ids);
216219 }
217220
218- // Final norm
219- cur = build_norm (cur, model.output_norm , nullptr , LLM_NORM_RMS, -1 );
220-
221221 cb (cur, " result_norm" , -1 );
222222 res->t_embd = cur;
223223
@@ -624,18 +624,17 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
624624 cur = ggml_add (ctx0, cur, ffn_residual);
625625 cb (cur, " mtp_post_ffn" , il);
626626
627- // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
628- // (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.)
629- cb (cur, " h_pre_norm" , -1 );
630- res->t_h_pre_norm = cur;
631-
632- cur = ggml_get_rows (ctx0, cur, inp_out_ids);
633-
634627 ggml_tensor * head_norm_w = layer.nextn .shared_head_norm
635628 ? layer.nextn .shared_head_norm
636629 : model.output_norm ;
637630 GGML_ASSERT (head_norm_w && " QWEN35 MTP: missing both nextn.shared_head_norm and output_norm" );
638631 cur = build_norm (cur, head_norm_w, nullptr , LLM_NORM_RMS, -1 );
632+
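633+ // NOTE: despite the "h_pre_norm" label and the t_h_pre_norm slot name, the tensor stored here is taken after the head RMS norm above, i.e. it is post-norm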
634+ cb (cur, " h_pre_norm" , -1 );
635+ res->t_h_pre_norm = cur;
636+
637+ cur = ggml_get_rows (ctx0, cur, inp_out_ids);
639638 cb (cur, " mtp_shared_head_norm" , -1 );
640639
641640 ggml_tensor * head_w = layer.nextn .shared_head_head ? layer.nextn .shared_head_head : model.output ;
0 commit comments