@@ -260,32 +260,35 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         }
     }
 
-    // Feed generation prompt tokens to the grammar sampler so it advances past
-    // tokens the template already placed in the prompt.
-    // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
+    // Compute prefill tokens from the generation prompt
     std::vector<llama_token> prefill_tokens;
-    if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) {
+    if (!params.generation_prompt.empty()) {
         GGML_ASSERT(vocab != nullptr);
-        prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true);
-        if (!prefill_tokens.empty()) {
-            std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true);
-            if (std::isspace(first_token[0]) && !std::isspace(params.generation_prompt[0])) {
-                // Some tokenizers will add a space before the first special token, need to remove
-                prefill_tokens = std::vector<llama_token>(prefill_tokens.begin() + 1, prefill_tokens.end());
+        auto tokens = common_tokenize(vocab, params.generation_prompt, false, true);
+        for (size_t i = 0; i < tokens.size(); i++) {
+            std::string piece = common_token_to_piece(vocab, tokens[i], true);
+            if (i == 0 && std::isspace(piece[0]) && !std::isspace(params.generation_prompt[0])) {
+                // Some tokenizers will add a space before the first special token, need to exclude
+                continue;
             }
+            LOG_DBG("%s: prefill token: %d = %s\n", __func__, tokens[i], piece.c_str());
+            prefill_tokens.push_back(tokens[i]);
         }
+    }
 
-        if (grmr && !params.grammar_lazy) {
-            try {
-                for (const auto & token : prefill_tokens) {
-                    llama_sampler_accept(grmr, token);
-                    LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
-                }
-            } catch (std::exception & e) {
-                LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
-                        common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
-                throw e;
+    // Feed generation prompt tokens to the grammar sampler so it advances past
+    // tokens the template already placed in the prompt.
+    // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
+    if (grmr && !params.grammar_lazy && common_grammar_needs_prefill(params.grammar)) {
+        try {
+            for (const auto & token : prefill_tokens) {
+                llama_sampler_accept(grmr, token);
+                LOG_DBG("%s: grammar accepted prefill token (%d)\n", __func__, token);
             }
+        } catch (std::exception & e) {
+            LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
+                    common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
+            throw e;
         }
     }
 
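The net effect of this hunk: prefill tokens are now computed unconditionally (and logged per token), while the grammar-prefill gate moves down to the acceptance step. The subtle part is the first-token check: some tokenizers emit a leading-space token when the text starts with a special token, even though the prompt itself has no leading whitespace. Here is a minimal standalone sketch of that filter, with the token ids and their text pieces passed in directly so it compiles without the llama.cpp headers (`filter_prefill` and its parameters are illustrative names, not part of this patch):

```cpp
#include <cctype>
#include <string>
#include <vector>

// Keep every token except a tokenizer-injected leading space: drop token i
// only when it is the first token, its piece starts with whitespace, and
// the prompt itself does not start with whitespace.
std::vector<int> filter_prefill(const std::string & prompt,
                                const std::vector<int> & tokens,
                                const std::vector<std::string> & pieces) {
    std::vector<int> out;
    for (size_t i = 0; i < tokens.size(); i++) {
        const std::string & piece = pieces[i];
        if (i == 0 && !piece.empty() && std::isspace((unsigned char) piece[0]) &&
            !prompt.empty() && !std::isspace((unsigned char) prompt[0])) {
            continue;
        }
        out.push_back(tokens[i]);
    }
    return out;
}
```

Compared to the old slice-off-the-first-token approach, the `continue`-based loop keeps a single code path for filtering, logging, and collection.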
@@ -296,8 +299,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
             params.reasoning_budget_start,
             params.reasoning_budget_end,
             params.reasoning_budget_forced,
-            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
-            prefill_tokens);
+            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens);
+
+        for (const auto & token : prefill_tokens) {
+            llama_sampler_accept(rbudget, token);
+            LOG_DBG("%s: reasoning-budget accepted prefill token (%d)\n", __func__, token);
+        }
     }
 
     if (params.has_logit_bias()) {
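The reasoning-budget sampler no longer receives `prefill_tokens` through its constructor; the same list is instead replayed through `llama_sampler_accept`, the same mechanism the grammar sampler uses above. A hedged sketch of the pattern, assuming only llama.cpp's public `llama_sampler_accept(sampler, token)` API (the helper name is illustrative):

```cpp
#include <vector>
#include "llama.h"

// Fast-forward a stateful sampler over tokens that are already in the
// context, so its internal state matches what the model has actually seen.
// accept() mutates sampler state without sampling anything.
static void prefill_sampler(llama_sampler * smpl,
                            const std::vector<llama_token> & prefill_tokens) {
    for (const llama_token token : prefill_tokens) {
        llama_sampler_accept(smpl, token);
    }
}
```

Decoupling prefill from construction means one token list can drive any number of stateful samplers, which is exactly what the two acceptance loops in this file now do.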
@@ -431,17 +438,19 @@ static bool grammar_should_apply(struct common_sampler * gsmpl) {
     return true;
 }
 
-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated) {
     if (!gsmpl) {
         return;
     }
 
     const auto tm = gsmpl->tm();
 
     // grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
-    accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
+    const auto accept_grammar = is_generated && grammar_should_apply(gsmpl);
 
-    llama_sampler_accept(gsmpl->rbudget, token);
+    if (gsmpl->rbudget && is_generated) {
+        llama_sampler_accept(gsmpl->rbudget, token);
+    }
 
     if (gsmpl->grmr && accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
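Renaming `accept_grammar` to `is_generated` changes the contract for callers: the flag now states where the token came from, and the function decides internally which samplers should see it (the grammar only when `grammar_should_apply()` agrees, the reasoning budget only for generated tokens). A sketch of caller-side usage under the new signature, assuming the usual llama.cpp generation loop (the loop details and header path are illustrative, not taken from this PR):

```cpp
#include <vector>
#include "sampling.h"

void run_generation(llama_context * ctx, common_sampler * gsmpl,
                    const std::vector<llama_token> & prompt_tokens, int n_predict) {
    // prompt tokens are not model output: is_generated = false keeps them
    // out of the reasoning budget and the grammar
    for (const llama_token t : prompt_tokens) {
        common_sampler_accept(gsmpl, t, /*is_generated=*/false);
    }
    for (int i = 0; i < n_predict; i++) {
        const llama_token id = common_sampler_sample(gsmpl, ctx, -1);
        common_sampler_accept(gsmpl, id, /*is_generated=*/true);
        // ... feed id back through llama_decode() and stop on end-of-generation ...
    }
}
```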