@@ -2676,6 +2676,7 @@ struct server_context_impl {
26762676 if (ctx_dft) {
26772677 // TODO: in the future, figure out how to infuse target embeddings to the images
26782678 // for now, we skip this for simplicity
2679+ // maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above?
26792680 res = input_tokens.process_chunk (ctx_dft.get (), mctx, slot.prompt .n_tokens (), slot.prompt .tokens .pos_next (), slot.id , n_tokens_out);
26802681 if (res != 0 ) {
26812682 GGML_ABORT (" failed to process multi-modal data on draft context\n " );
@@ -2925,36 +2926,44 @@ struct server_context_impl {
29252926 // | Eagle3 | yes |
29262927 // | DFlash | yes | https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4405406982
29272928 //
2928- // TODO: move to `common_speculative_process(spec, batch, ...)` [TAG_COMMON_SPECULATIVE_PROCESS]
2929- if (ctx_dft) {
2930- // TODO: update as needed for MTP, Eagle3, etc.
2931- const bool need_tgt_embd = false ;
2932-
2933- if (need_tgt_embd) {
2934- llama_synchronize (ctx_tgt);
2935- }
2936-
2937- // the logic here varies depending on the speculative decoding method
2938- // - some draft contexts require embeddings from the target context, others don't
2939- // - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings
2940- // TODO: extract this in a function ?
2941- {
2942- // TODO: hook the embeddings from the last target batch here
2943- if (llama_model_has_encoder (model_dft.get ())) {
2944- // llama_encode(ctx_dft, ...);
2945-
2946- GGML_ABORT (" not implemented yet\n " );
2947- }
2948-
2949- const int ret = llama_decode (ctx_dft.get (), batch_view);
2950-
2951- if (ret != 0 ) {
2952- SRV_ERR (" failed to decode draft batch, ret = %d\n " , ret);
2953-
2954- // TODO: handle error
2955- break ;
2956- }
2957- }
2929+ // note: this logic has been moved into `common_speculative_process()`
2930+ // keeping the sketch here for a bit, until the logic is finalized
2931+ //
2932+ // if (ctx_dft) {
2933+ // // TODO: update as needed for MTP, Eagle3, etc.
2934+ // const bool need_tgt_embd = false;
2935+
2936+ // if (need_tgt_embd) {
2937+ // llama_synchronize(ctx_tgt);
2938+ // }
2939+
2940+ // // the logic here varies depending on the speculative decoding method
2941+ // // - some draft contexts require embeddings from the target context, others don't
2942+ // // - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings
2943+ // // TODO: extract this in a function ?
2944+ // {
2945+ // // TODO: hook the embeddings from the last target batch here
2946+ // if (llama_model_has_encoder(model_dft.get())) {
2947+ // //llama_encode(ctx_dft, ...);
2948+
2949+ // GGML_ABORT("not implemented yet\n");
2950+ // }
2951+
2952+ // const int ret = llama_decode(ctx_dft.get(), batch_view);
2953+
2954+ // if (ret != 0) {
2955+ // SRV_ERR("failed to decode draft batch, ret = %d\n", ret);
2956+
2957+ // // TODO: handle error
2958+ // break;
2959+ // }
2960+ // }
2961+ // }
2962+ if (!common_speculative_process (spec.get (), batch_view)) {
2963+ SRV_ERR (" %s" , " failed to process speculative batch\n " );
2964+
2965+ // TODO: handle error
2966+ break ;
29582967 }
29592968
29602969 // move the head of the batch forward with the number of tokens we just processed
0 commit comments