server: stop DFlash at grammar tool boundaries

Anbeeld · Anbeeld · commit 82ecad0ca4c7 · 2026-05-11T04:29:07.000+02:00
Keep DFlash active before lazy grammar triggers, then stop speculative accept/drafting once grammar, reasoning-budget forcing, or raw tool-call markers require normal token-by-token sampling. Track accepted draft tokens separately from hidden-state rows so DFlash rollback and ring updates stay aligned at grammar/tool boundaries. Fixes #5. Refs #6.
diff --git a/common/sampling.cpp b/common/sampling.cpp
@@ -653,19 +653,38 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     return id;
 }
 
+static bool common_sampler_has_speculative_unsafe_grammar(const struct common_sampler * gsmpl) {
+    if (!gsmpl || !gsmpl->grmr) {
+        return false;
+    }
+
+    // Lazy grammars are safe to speculate while still awaiting their trigger.
+    // Once triggered, grammar-constrained regions need normal full-vocab
+    // sampling and one-token streaming/parser boundaries.
+    return llama_sampler_grammar_is_active(gsmpl->grmr);
+}
+
+bool common_sampler_blocks_speculative(const struct common_sampler * gsmpl) {
+    if (!gsmpl) {
+        return true;
+    }
+    if (common_sampler_has_speculative_unsafe_grammar(gsmpl)) {
+        return true;
+    }
+    return common_reasoning_budget_get_state(gsmpl->rbudget) == REASONING_BUDGET_FORCING;
+}
+
 bool common_sampler_supports_reduced(struct common_sampler * gsmpl) {
     if (!gsmpl) {
         return false;
     }
-    // A grammar sampler exists but may be lazy+inactive (awaiting trigger).
-    // Only reject when grammar is actively constraining tokens.
-    if (gsmpl->grmr && llama_sampler_grammar_is_active(gsmpl->grmr)) {
+    if (common_sampler_has_speculative_unsafe_grammar(gsmpl)) {
         return false;
     }
-    if (common_reasoning_budget_get_state(gsmpl->rbudget) == REASONING_BUDGET_FORCING) {
-        return false;
+    if (common_reasoning_budget_get_state(gsmpl->rbudget) != REASONING_BUDGET_FORCING) {
+        return true;
     }
-    return true;
+    return false;
 }
 
 std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
@@ -682,6 +701,10 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
 
         result.push_back(id);
 
+        if (common_sampler_blocks_speculative(gsmpl)) {
+            break;
+        }
+
         if (draft[i] != id) {
             break;
         }
@@ -760,6 +783,10 @@ std::vector<llama_token> common_sampler_sample_reduced_and_accept_n(
         common_sampler_accept(gsmpl, id, true);
         result.push_back(id);
 
+        if (common_sampler_blocks_speculative(gsmpl)) {
+            break;
+        }
+
         if (draft[i] != id) {
             break;
         }
diff --git a/common/sampling.h b/common/sampling.h
@@ -67,6 +67,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
 bool common_sampler_supports_reduced(struct common_sampler * gsmpl);
+bool common_sampler_blocks_speculative(const struct common_sampler * gsmpl);
 
 // generalized version of common_sampler_sample
 //
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -255,7 +255,7 @@ struct common_speculative_state {
 
     // called after verification decode with logits still in ctx
     // batch_tokens: tokens that were in the batch [id_last, draft0, draft1, ...]
-    // n_accepted: how many were accepted (ids.size(), including the bonus token)
+    // n_accepted: number of decoded batch rows to commit (root + accepted draft tokens)
     virtual void update_logits(llama_context * /*ctx*/, const llama_tokens & /*batch_tokens*/, int /*n_accepted*/) {}
 
     // tree variant: accept specific capture-buffer indices instead of a contiguous block.
@@ -2106,9 +2106,9 @@ struct common_speculative_state_dflash : public common_speculative_state {
     void update_logits(llama_context * ctx, const llama_tokens & batch_tokens, int n_accepted) override {
         GGML_UNUSED(ctx);
         GGML_UNUSED(batch_tokens);
-        // n_accepted includes the bonus token: [id_last, draft0, ..., draftN-1] → accepted count
-        // the verification batch had (1 + n_draft) tokens
-        // only the first n_accepted tokens' hidden states should be kept
+        // In this path n_accepted means committed hidden-state rows, not output-token count.
+        // [id_last, draft0, ..., draftN-1] => root + accepted draft tokens.
+        // Boundary stops pass root + accepted draft tokens even when no bonus token was sampled.
         append_target_hiddens(n_accepted);
     }
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp