eagle3: fix params bug (pass masked=true to llama_set_embeddings_pre_norm)

ruixiang63 · ruixiang63 · commit 5eafd448a57a · 2026-05-18T16:18:35.000Z
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -381,8 +381,8 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
 //
 // Performance is overall good but there is waste in verify cycle:
 //   process() runs encoder + decoder on the *full* verify batch including rows for
-//   rejected drafts. The KV at those positions is then dropped. 
-// 
+//   rejected drafts. The KV at those positions is then dropped.
+//
 // TODO: Not sure if we need optimization for this waste?
 // If so we may need hybrid stash:
 //      in verify mode, have process() only stash features and let draft() seed run
@@ -460,8 +460,8 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         }
 
         // turn on extraction of the draft model's pre-norm hidden state
-        // (used both for the encoder output g_embd and the decoder pre-norm output)
-        llama_set_embeddings_pre_norm(ctx_dft, true);
+        // (used both for the encoder output g_embd and the decoder pre-norm output).
+        llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
 
         pending_g_last.assign(n_seq, std::vector<float>(n_embd_dec, 0.0f));
         pending_pos_last.assign(n_seq, -1);
diff --git a/src/llama-context.h b/src/llama-context.h
@@ -230,7 +230,7 @@ struct llama_context {
     // map the output row index `i` to batch index
     int64_t output_resolve_row(int32_t i) const;
 
-    // async-copy enabled layer-input tensors (per cparams.output_layer_inp) 
+    // async-copy enabled layer-input tensors (per cparams.output_layer_inp)
     // from backend into host-side embd_layer_inp buffers
     void extract_layer_inputs(const llm_graph_result * res);