File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -381,8 +381,8 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
381381//
382382// Performance is overall good but there is waste in verify cycle:
383383// process() runs encoder + decoder on the *full* verify batch including rows for
384- // rejected drafts. The KV at those positions is then dropped.
385- //
384+ // rejected drafts. The KV at those positions is then dropped.
385+ //
386386// TODO: Not sure if we need optimization for this waste?
387387// If so we may need hybrid stash:
388388// in verify mode, have process() only stash features and let draft() seed run
@@ -460,8 +460,8 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
460460 }
461461
462462 // turn on extraction of the draft model's pre-norm hidden state
463- // (used both for the encoder output g_embd and the decoder pre-norm output)
464- llama_set_embeddings_pre_norm (ctx_dft, true );
463+ // (used both for the encoder output g_embd and the decoder pre-norm output).
464+ llama_set_embeddings_pre_norm (ctx_dft, true , /* masked */ true );
465465
466466 pending_g_last.assign (n_seq, std::vector<float >(n_embd_dec, 0 .0f ));
467467 pending_pos_last.assign (n_seq, -1 );
Original file line number Diff line number Diff line change @@ -230,7 +230,7 @@ struct llama_context {
230230 // map the output row index `i` to batch index
231231 int64_t output_resolve_row (int32_t i) const ;
232232
233- // async-copy enabled layer-input tensors (per cparams.output_layer_inp)
233+ // async-copy enabled layer-input tensors (per cparams.output_layer_inp)
234234 // from backend into host-side embd_layer_inp buffers
235235 void extract_layer_inputs (const llm_graph_result * res);
236236
You can’t perform that action at this time.
0 commit comments