Skip to content

Commit e94341d

Browse files
committed
qwen35: use post-norm hidden state for MTP
1 parent 06938ac commit e94341d

2 files changed

Lines changed: 18 additions & 19 deletions

File tree

src/models/qwen35.cpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -209,16 +209,16 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
209209
}
210210
cur = inpL;
211211

212+
// post-norm hidden state feeds both the LM head and the MTP seed below
213+
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
214+
212215
cb(cur, "h_pre_norm", -1);
213216
res->t_h_pre_norm = cur;
214217

215218
if (!cparams.embeddings_pre_norm_masked && inp_out_ids) {
216219
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
217220
}
218221

219-
// Final norm
220-
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
221-
222222
cb(cur, "result_norm", -1);
223223
res->t_embd = cur;
224224

@@ -625,18 +625,17 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
625625
cur = ggml_add(ctx0, cur, ffn_residual);
626626
cb(cur, "mtp_post_ffn", il);
627627

628-
// Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
629-
// (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.)
630-
cb(cur, "h_pre_norm", -1);
631-
res->t_h_pre_norm = cur;
632-
633-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
634-
635628
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
636629
? layer.nextn.shared_head_norm
637630
: model.output_norm;
638631
GGML_ASSERT(head_norm_w && "QWEN35 MTP: missing both nextn.shared_head_norm and output_norm");
639632
cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
633+
634+
// NOTE: despite the "h_pre_norm" name, this tensor is now the post-norm hidden state (captured after build_norm above)
635+
cb(cur, "h_pre_norm", -1);
636+
res->t_h_pre_norm = cur;
637+
638+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
640639
cb(cur, "mtp_shared_head_norm", -1);
641640

642641
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;

src/models/qwen35moe.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -232,16 +232,16 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
232232
}
233233
cur = inpL;
234234

235+
// post-norm hidden state feeds both the LM head and the MTP seed below
236+
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
237+
235238
cb(cur, "h_pre_norm", -1);
236239
res->t_h_pre_norm = cur;
237240

238241
if (!cparams.embeddings_pre_norm_masked && inp_out_ids) {
239242
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
240243
}
241244

242-
// Final norm
243-
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
244-
245245
cb(cur, "result_norm", -1);
246246
res->t_embd = cur;
247247

@@ -721,17 +721,17 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
721721
cur = ggml_add(ctx0, cur, ffn_residual);
722722
cb(cur, "mtp_post_ffn", il);
723723

724-
// Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
725-
cb(cur, "h_pre_norm", -1);
726-
res->t_h_pre_norm = cur;
727-
728-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
729-
730724
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
731725
? layer.nextn.shared_head_norm
732726
: model.output_norm;
733727
GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm");
734728
cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
729+
730+
// NOTE: despite the "h_pre_norm" name, this tensor is now the post-norm hidden state (captured after build_norm above)
731+
cb(cur, "h_pre_norm", -1);
732+
res->t_h_pre_norm = cur;
733+
734+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
735735
cb(cur, "mtp_shared_head_norm", -1);
736736

737737
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;

0 commit comments

Comments
 (0)