eagle3 : fix ubatch handling in embd_layer_inp extraction and encoder

ruixiang63 · Dogacel · ruixiang63 · commit e8ddf01ed25a · 2026-05-27T16:41:44.000Z
Co-authored-by: Doğaç Eldenk <dogacel@gmail.com>
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -432,6 +432,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
 
     // scratch buffer for concatenated target features [n_tokens, n_embd_enc]
     std::vector<float> features_buf;
+    std::vector<float> g_embd_buf;
 
     common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
         : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
@@ -569,25 +570,39 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
             }
         }
 
-        llama_batch enc_batch = {
-            /*.n_tokens =*/ n_tokens,
-            /*.token    =*/ nullptr,
-            /*.embd     =*/ features_buf.data(),
-            /*.pos      =*/ nullptr,
-            /*.n_seq_id =*/ nullptr,
-            /*.seq_id   =*/ nullptr,
-            /*.logits   =*/ nullptr,
-        };
-        int rc = llama_encode(ctx_dft, enc_batch);
-        if (rc != 0) {
-            LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d)\n",
-                    __func__, rc, (int) n_tokens);
-            return false;
+        g_embd_buf.resize((size_t) n_tokens * n_embd_dec);
+
+        // llama_encode() requires the full encoder batch to fit in n_ubatch.
+        // Allow batch > ubatch: eagle3's per-token encoder can be chunked safely.
+        const int32_t n_ubatch_dft = (int32_t) llama_n_ubatch(ctx_dft);
+        for (int32_t i = 0; i < n_tokens; i += n_ubatch_dft) {
+            const int32_t n_chunk = std::min(n_ubatch_dft, n_tokens - i);
+
+            llama_batch enc_batch = {
+                /*.n_tokens =*/ n_chunk,
+                /*.token    =*/ nullptr,
+                /*.embd     =*/ features_buf.data() + (size_t) i * n_embd_enc,
+                /*.pos      =*/ nullptr,
+                /*.n_seq_id =*/ nullptr,
+                /*.seq_id   =*/ nullptr,
+                /*.logits   =*/ nullptr,
+            };
+            const int32_t rc = llama_encode(ctx_dft, enc_batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n",
+                        __func__, rc, (int) n_chunk, (int) i);
+                return false;
+            }
+
+            // g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer.
+            const float * g_embd_chunk = llama_get_embeddings_pre_norm(ctx_dft);
+            GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output.");
+            std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec,
+                        g_embd_chunk,
+                        (size_t) n_chunk * n_embd_dec * sizeof(float));
         }
 
-        // g_embd has shape [n_tokens, n_embd_dec] in ctx_dft's pre-norm embeddings buffer
-        const float * g_embd = llama_get_embeddings_pre_norm(ctx_dft);
-        GGML_ASSERT(g_embd && "EAGLE3 encoder produced no output.");
+        const float * g_embd = g_embd_buf.data();
 
         const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
 
@@ -648,7 +663,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         }
 
         if (batch.n_tokens > 0) {
-            rc = llama_decode(ctx_dft, batch);
+            const int32_t rc = llama_decode(ctx_dft, batch);
             if (rc != 0) {
                 LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
                         __func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -1260,10 +1260,10 @@ void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) {
 }
 
 float * llama_context::get_output_layer_inp(uint32_t layer_id) {
-    if (layer_id >= embd_layer_inp.size() || embd_layer_inp[layer_id].empty()) {
+    if (layer_id >= embd_layer_inp.size() || !embd_layer_inp[layer_id].has_data()) {
         return nullptr;
     }
-    return embd_layer_inp[layer_id].data();
+    return embd_layer_inp[layer_id].data;
 }
 
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
@@ -1960,7 +1960,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
             }
         }
 
-        extract_layer_inputs(res);
+        extract_layer_inputs(res, n_tokens_prev, ubatch.n_tokens);
 
         // extract pre-norm embeddings (hidden state before the final output norm)
         // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
@@ -2081,6 +2081,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 
     size_t backend_float_count = 0;
     size_t backend_token_count = 0;
+    size_t embd_layer_inp_float_count = 0;
 
     logits.size        = has_logits        ? n_vocab*n_outputs_max     : 0;
     embd.size          = has_embd          ? n_embd_out*n_outputs_max  : 0;
@@ -2092,6 +2093,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
         embd_pre_norm.size = (size_t) n_embd * n_batch;
     }
 
+    for (bool enabled : cparams.output_layer_inp) {
+        if (enabled) {
+            embd_layer_inp_float_count += (size_t) n_embd * n_batch;
+        }
+    }
+
     // Allocate backend sampling output buffers if there are backend samplers configured.
     const bool has_sampling = !sampling.samplers.empty();
     if (has_sampling) {
@@ -2106,8 +2113,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 
     const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
     const size_t new_size  =
-        (logits.size + embd.size + embd_pre_norm.size + backend_float_count) * sizeof(float) +
-        (                                               backend_token_count) * sizeof(llama_token);
+        (logits.size + embd.size + embd_pre_norm.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) +
+        (                                                                       backend_token_count) * sizeof(llama_token);
 
     // alloc only when more than the current capacity is required
     // TODO: also consider shrinking the buffer
@@ -2124,6 +2131,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
             logits.data = nullptr;
             embd.data = nullptr;
             embd_pre_norm.data = nullptr;
+            for (auto & layer_inp : embd_layer_inp) {
+                layer_inp = {nullptr, 0};
+            }
         }
 
         auto * buft = ggml_backend_cpu_buffer_type();
@@ -2155,6 +2165,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     embd_pre_norm = has_embd_pre_norm ? buffer_view<float>{(float *) (base + offset), embd_pre_norm.size} : buffer_view<float>{nullptr, 0};
     offset += embd_pre_norm.size * sizeof(float);
 
+    for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) {
+        if (cparams.output_layer_inp[il]) {
+            embd_layer_inp[il] = buffer_view<float>{(float *) (base + offset), (size_t) n_embd * n_batch};
+            offset += embd_layer_inp[il].size * sizeof(float);
+        } else {
+            embd_layer_inp[il] = buffer_view<float>{nullptr, 0};
+        }
+    }
+
     if (has_sampling) {
         sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
         offset += sampling.logits.size * sizeof(float);
@@ -2199,20 +2218,30 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     return n_outputs_max;
 }
 
-void llama_context::extract_layer_inputs(const llm_graph_result * res) {
+void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) {
     for (uint32_t il = 0; il < cparams.output_layer_inp.size(); ++il) {
         if (!cparams.output_layer_inp[il]) {
             continue;
         }
+        if (!embd_layer_inp[il].has_data()) {
+            continue;
+        }
         ggml_tensor * t = res->get_layer_inp((int) il);
         if (!t) {
             continue;
         }
         const size_t nbytes = ggml_nbytes(t);
-        embd_layer_inp[il].resize(nbytes / sizeof(float));
+        const size_t nfloats = nbytes / sizeof(float);
+        GGML_ASSERT(n_tokens > 0);
+        GGML_ASSERT(nfloats % n_tokens == 0);
+
+        const size_t row_floats = nfloats / n_tokens;
+        const size_t dst_offset = token_offset * row_floats;
+        GGML_ASSERT(dst_offset + nfloats <= embd_layer_inp[il].size);
+
         ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t);
         GGML_ASSERT(backend != nullptr);
-        ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data(), 0, nbytes);
+        ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data + dst_offset, 0, nbytes);
     }
 }
 
diff --git a/src/llama-context.h b/src/llama-context.h
@@ -232,7 +232,7 @@ struct llama_context {
 
     // async-copy enabled layer-input tensors (per cparams.output_layer_inp)
     // from backend into host-side embd_layer_inp buffers
-    void extract_layer_inputs(const llm_graph_result * res);
+    void extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens);
 
     //
     // graph
@@ -364,7 +364,7 @@ struct llama_context {
 
     // host buffer for output layer input embeddings, per layer
     // populated when cparams.output_layer_inp[il] is true
-    std::vector<std::vector<float>> embd_layer_inp;
+    std::vector<buffer_view<float>> embd_layer_inp;
 
     // keep copies of the per-sequence memory on the device
     std::map<llama_seq_id, llama_memory_buffers> mem_storage;
diff --git a/src/llama-ext.h b/src/llama-ext.h
@@ -112,7 +112,7 @@ LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx,
 // set if the layer input embeddings should be outputed
 LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable);
 
-// read back the input embeddings of the specified layer for the most recent ubatch
+// read back the input embeddings of the specified layer for the most recent decode batch
 // the layer must have been enabled via llama_set_output_layer_inp
 LLAMA_API float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id);