Skip to content

Commit fbd94d8

Browse files
committed
qwen35: use post-norm hidden state for MTP
1 parent 031ddb2 commit fbd94d8

2 files changed

Lines changed: 18 additions & 19 deletions

File tree

src/models/qwen35.cpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -208,16 +208,16 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
208208
}
209209
cur = inpL;
210210

211+
// post-norm hidden state feeds both the LM head and the MTP seed below
212+
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
213+
211214
cb(cur, "h_pre_norm", -1);
212215
res->t_h_pre_norm = cur;
213216

214217
if (!cparams.embeddings_pre_norm_masked && inp_out_ids) {
215218
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
216219
}
217220

218-
// Final norm
219-
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
220-
221221
cb(cur, "result_norm", -1);
222222
res->t_embd = cur;
223223

@@ -624,18 +624,17 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
624624
cur = ggml_add(ctx0, cur, ffn_residual);
625625
cb(cur, "mtp_post_ffn", il);
626626

627-
// Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
628-
// (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.)
629-
cb(cur, "h_pre_norm", -1);
630-
res->t_h_pre_norm = cur;
631-
632-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
633-
634627
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
635628
? layer.nextn.shared_head_norm
636629
: model.output_norm;
637630
GGML_ASSERT(head_norm_w && "QWEN35 MTP: missing both nextn.shared_head_norm and output_norm");
638631
cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
632+
633+
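// NOTE: despite the "h_pre_norm" name, this tensor is captured after the build_norm above, so it holds the post-norm hidden state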
634+
cb(cur, "h_pre_norm", -1);
635+
res->t_h_pre_norm = cur;
636+
637+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
639638
cb(cur, "mtp_shared_head_norm", -1);
640639

641640
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;

src/models/qwen35moe.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -231,16 +231,16 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
231231
}
232232
cur = inpL;
233233

234+
// post-norm hidden state feeds both the LM head and the MTP seed below
235+
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
236+
234237
cb(cur, "h_pre_norm", -1);
235238
res->t_h_pre_norm = cur;
236239

237240
if (!cparams.embeddings_pre_norm_masked && inp_out_ids) {
238241
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
239242
}
240243

241-
// Final norm
242-
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
243-
244244
cb(cur, "result_norm", -1);
245245
res->t_embd = cur;
246246

@@ -720,17 +720,17 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
720720
cur = ggml_add(ctx0, cur, ffn_residual);
721721
cb(cur, "mtp_post_ffn", il);
722722

723-
// Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
724-
cb(cur, "h_pre_norm", -1);
725-
res->t_h_pre_norm = cur;
726-
727-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
728-
729723
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
730724
? layer.nextn.shared_head_norm
731725
: model.output_norm;
732726
GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm");
733727
cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
728+
729+
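// NOTE: despite the "h_pre_norm" name, this tensor is captured after the build_norm above, so it holds the post-norm hidden state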
730+
cb(cur, "h_pre_norm", -1);
731+
res->t_h_pre_norm = cur;
732+
733+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
734734
cb(cur, "mtp_shared_head_norm", -1);
735735

736736
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;

0 commit comments

Comments
 (0)