fix(tokenizer): respect GGUF tokenizer.ggml.add_bos_token (R8)

unamedkr · claude · unamedkr · commit 714cd4c29d63 · 2026-04-26T16:06:06.000+09:00
Generalises R7 from a Qwen3.6 family-only heuristic to a model-agnostic
GGUF metadata read.

Adds tq_tokenizer_t.add_bos_token tristate field:
   1 = explicit true
  -1 = explicit false (suppress BOS prepend even if vocab lookup
       would have enabled it)
   0 = unset (fall through to existing heuristics)

tq_load_tokenizer_from_gguf parses tokenizer.ggml.add_bos_token (bool)
and sets the field accordingly.

tq_generate.c BOS-decision block consults the tristate before any
heuristic, so models that explicitly declare add_bos_token=false
(both Qwen3.6-27B Q4_K_M and 35B-A3B-UD-IQ4_XS do, per direct GGUF
metadata read) are honoured regardless of vocab content.

Verification (clean rebuild, Metal=ON):
- 35B-A3B IQ4_XS quantum: 149 tok natural EOS (matches baseline,
  Tier 2 — restored from 94 rep loop seen with R1 BOS auto-enable)
- add_bos=-1 logged at load time confirms the metadata path fires

Note: an earlier incremental rebuild after the struct change produced
an ABI mismatch (102 rep loop instead of 149 EOS). Always do a clean
rebuild after touching tokenizer struct layout. Also rediscovered:
src/engine/tq_moe.c uses `goto moe_shared_expert` outside the
`#ifdef TQ_HAS_METAL` block but the label is inside, so non-Metal
configurations fail to build — keep TQ_BUILD_METAL=ON for now.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/include/turboquant/tq_engine.h b/include/turboquant/tq_engine.h
@@ -459,6 +459,11 @@ typedef struct {
     int* sorted_indices;
     /* Merge table: pairs of token IDs that merge into a result */
     int* merge_pairs;    /* [n_merges * 3]: (token_a, token_b, result_id) */
+    /* GGUF metadata flag: tokenizer.ggml.add_bos_token
+     *   1  = explicitly true  (force BOS prepend)
+     *  -1  = explicitly false (suppress BOS prepend regardless of vocab lookup)
+     *   0  = unset (use heuristic vocab lookup) */
+    int add_bos_token;
 } tq_tokenizer_t;
 
 /* ============================================================
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
@@ -333,9 +333,17 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
          * Gemma 3/4: model_type==1, BOS=2 (required)
          * Phi-3 / LLaMA 2: vocab has <s> as BOS (required)
          * LLaMA 3: BOS=128000 (<|begin_of_text|>) — tq_encode lookup chain handles it
-         * Qwen3.5 / GPT-2 BPE: no native BOS, skip */
+         * Qwen3.5 / GPT-2 BPE: no native BOS, skip
+         *
+         * Precedence: GGUF metadata `tokenizer.ggml.add_bos_token` wins over
+         * heuristics. -1 = explicit false (suppress), +1 = explicit true,
+         * 0 = unset (fall through to heuristic). */
         int add_bos = 0;
-        if (model->config.model_type == 1) {
+        if (tokenizer->add_bos_token == -1) {
+            /* explicit false — Qwen3.6 27B/35B-A3B path; skip everything */
+        } else if (tokenizer->add_bos_token == 1) {
+            add_bos = 1;
+        } else if (model->config.model_type == 1) {
             add_bos = 1; /* Gemma: always prepend BOS=2 */
         } else {
             /* Auto-detect BOS: check if vocab contains <s> (LLaMA 2, Phi-3)
diff --git a/src/engine/tq_tokenizer.c b/src/engine/tq_tokenizer.c
@@ -950,8 +950,20 @@ tq_tokenizer_t* tq_load_tokenizer_from_gguf(const void* gguf_ctx_ptr) {
         }
     }
 
-    fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d)\n",
-            tok->vocab_size, tok->max_token_len);
+    /* tokenizer.ggml.add_bos_token (bool, optional)
+     *   true  → tok->add_bos_token = +1 (force BOS prepend)
+     *   false → tok->add_bos_token = -1 (suppress BOS prepend)
+     *   unset →  0 (let tq_generate fall back to vocab heuristic) */
+    int64_t add_bos_idx = tq_gguf_find_key(gguf, "tokenizer.ggml.add_bos_token");
+    if (add_bos_idx >= 0) {
+        const tq_gguf_kv_t* kv = &gguf->kv[add_bos_idx];
+        if (kv->type == TQ_GGUF_TYPE_BOOL) {
+            tok->add_bos_token = kv->value.bool_val ? 1 : -1;
+        }
+    }
+
+    fprintf(stderr, "tq_load_tokenizer_from_gguf: loaded %d tokens (max_len=%d) add_bos=%d\n",
+            tok->vocab_size, tok->max_token_len, tok->add_bos_token);
     return tok;
 }