Skip to content

Commit 12e4d94

Browse files
unamedkr and claude committed
fix(tokenizer): Qwen3.6 BOS = <|endoftext|> (248044), not <|im_start|>
ROOT CAUSE FOUND for Qwen3.6-27B Tier 3 forward-pass divergence.

Investigation chain:
1. basin_compat showed an L0 element-level sign flip (ours +0.25 vs llama -0.29).
2. The pre-norm input was also sign-flipped (ours +0.0035 vs inferred llama -0.003).
3. The embedding lookup itself diverged at what was supposed to be the same token.
4. Token IDs traced via the TQ_DEBUG_TOKENS env variable: ours=[248045, 9419], but the GGUF bos_token_id metadata = 248044 (<|endoftext|>).
5. vocab[248044] = '<|endoftext|>', vocab[248045] = '<|im_start|>'.
6. The tq_encode str_lookup chain hits <|im_start|> (id 248045) first, before <|endoftext|> (id 248044) is checked → wrong BOS.

Fix:
- src/engine/tq_tokenizer.c: append <|endoftext|> to the BOS str_lookup chain. It is tried only AFTER <|im_start|>, for backward compatibility with smaller Qwen models that use <|im_start|> as the functional BOS.
- src/engine/tq_generate.c: for the Qwen3.6 family (vocab > 240K), detect the presence of <|endoftext|> and override prompt_tokens[0] with that id. This bypasses the str_lookup ordering issue without breaking Qwen3-0.6B, Qwen3.5-4B, etc. (which have a smaller vocab and use the older convention).
- src/engine/tq_transformer.c: enhanced the [dn-trace] output to include attn_norm first3+last3 and the pre-norm input for paired-diff debugging.

Verified after fix:
- Tokens: [248044, 9419] ✓ matches llama
- L0 attn_norm pos=0 (BOS): first3 = [-0.2891, -0.6430, 0.4991]
- llama row 0 first3: [-0.2891, -0.6430, 0.4991] ✓ BIT-EXACT

Remaining issue: pos=1 ("Hello", token id 9419) doesn't match llama, suggesting llama tokenizes the start of the prompt with an implicit space prefix ("ĠHello", id 21251). This is BPE pre-tokenizer behavior — a separate fix is needed in the pre_tokenize_gpt2_bpe path, and it does not block the BOS fix.

The earlier verdict "Qwen3.6-27B is Tier 3, fundamental forward-pass bug" was WRONG. The forward pass is correct; tokenization was the issue. With the BOS fix, the L0 BOS row is bit-exact to llama. The real tier classification requires re-running coh_bench after the pre-tokenizer fix lands.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0829285 commit 12e4d94

3 files changed

Lines changed: 53 additions & 2 deletions

File tree

src/engine/tq_generate.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,10 +356,51 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
356356
}
357357
if (bos_id >= 0) add_bos = 1;
358358
}
359+
/* Qwen3.6 family (27B dense, 35B-A3B): GGUF metadata sets
360+
* BOS=<|endoftext|> id 248044. tokenizer.ggml.add_bos_token=false
361+
* but llama-cli adds BOS by default in main, and our basin_compat
362+
* measurements showed missing BOS causes 100× outlier divergence
363+
* at L0 (tokenization mismatch with reference). Detect by
364+
* presence of <|endoftext|> in vocab. */
365+
if (!add_bos) {
366+
/* <|endoftext|> for Qwen3.6 is id 248044 (vocab=248320); scan ids 248040..248059 */
367+
int lo = 248040, hi = 248060;
368+
if (hi > tokenizer->vocab_size) hi = tokenizer->vocab_size;
369+
for (int i = lo; i < hi; i++) {
370+
if (tokenizer->vocab[i] && strcmp(tokenizer->vocab[i], "<|endoftext|>") == 0) {
371+
add_bos = 1; break;
372+
}
373+
}
374+
}
375+
}
376+
/* Qwen3.6 BOS-id fix: tq_encode str_lookup chain checks <|im_start|>
377+
* before <|endoftext|>, picking id 248045 instead of correct 248044
378+
* for Qwen3.6 family (27B, 35B-A3B). For these models, override the
379+
* BOS to <|endoftext|> directly. Detected by large vocab (>240K) +
380+
* presence of <|endoftext|>. */
381+
int qwen36_bos_override = -1;
382+
if (add_bos && tokenizer->vocab_size > 240000) {
383+
int lo = 248040, hi = 248060;
384+
if (hi > tokenizer->vocab_size) hi = tokenizer->vocab_size;
385+
for (int i = lo; i < hi; i++) {
386+
if (tokenizer->vocab[i] && strcmp(tokenizer->vocab[i], "<|endoftext|>") == 0) {
387+
qwen36_bos_override = i; break;
388+
}
389+
}
359390
}
360391
n_prompt = tq_encode(tokenizer, prompt, prompt_tokens,
361392
(int)(sizeof(prompt_tokens)/sizeof(prompt_tokens[0])),
362393
add_bos);
394+
/* Qwen3.6 BOS override: tq_encode picked <|im_start|> (248045) but
395+
* GGUF metadata BOS = <|endoftext|> (248044). Replace at index 0. */
396+
if (qwen36_bos_override >= 0 && n_prompt > 0 && add_bos) {
397+
prompt_tokens[0] = qwen36_bos_override;
398+
}
399+
if (getenv("TQ_DEBUG_TOKENS")) {
400+
fprintf(stderr, "[tq_encode] add_bos=%d n_prompt=%d tokens=[", add_bos, n_prompt);
401+
for (int i = 0; i < n_prompt && i < 20; i++) fprintf(stderr, "%d%s", prompt_tokens[i], i+1<n_prompt?",":"");
402+
fprintf(stderr, "]\n");
403+
}
363404
} else {
364405
prompt_tokens[0] = (model->config.model_type == 1) ? 2 : 1;
365406
n_prompt = 1;

src/engine/tq_tokenizer.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1203,13 +1203,16 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text,
12031203
int n_tokens = 0;
12041204

12051205
/* Add BOS token if requested.
1206-
* Gemma: BOS=2, Qwen: no BOS (uses <|im_start|> instead) */
1206+
* Gemma: BOS=2, Qwen: no BOS (uses <|im_start|> instead).
1207+
* Qwen3.6 (27B dense, 35B-A3B): GGUF metadata has BOS=<|endoftext|> id 248044.
1208+
* Added for Qwen3.6 family to match llama.cpp tokenization. */
12071209
if (add_bos) {
12081210
/* Look up <bos> token in vocab; default to id 2 (Gemma convention) */
12091211
int bos_id = str_lookup(tok, "<bos>");
12101212
if (bos_id < 0) { bos_id = str_lookup(tok, "<s>"); }
12111213
if (bos_id < 0) { bos_id = str_lookup(tok, "<|begin_of_text|>"); }
12121214
if (bos_id < 0) { bos_id = str_lookup(tok, "<|im_start|>"); }
1215+
if (bos_id < 0) { bos_id = str_lookup(tok, "<|endoftext|>"); }
12131216
if (bos_id >= 0) {
12141217
tokens[n_tokens++] = bos_id;
12151218
}

src/engine/tq_transformer.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -702,7 +702,14 @@ static void deltanet_forward(tq_model_t* model, tq_state_t* s, int l) {
702702
if (dn_trace) {
703703
double xb_sum = 0;
704704
for (int i = 0; i < dim; i++) xb_sum += s->xb[i];
705-
fprintf(stderr, "[dn-trace] L%d attn_norm_out sum=%.6f\n", l, xb_sum);
705+
fprintf(stderr, "[dn-trace] L%d attn_norm_out sum=%.6f first3=%.4f,%.4f,%.4f last3=%.4f,%.4f,%.4f\n",
706+
l, xb_sum, s->xb[0], s->xb[1], s->xb[2],
707+
s->xb[dim-3], s->xb[dim-2], s->xb[dim-1]);
708+
/* Also dump pre-norm input (s->x) for embedding probe */
709+
double xs = 0;
710+
for (int i = 0; i < dim; i++) xs += s->x[i];
711+
fprintf(stderr, "[dn-trace] L%d pre_norm_input sum=%.6f first3=%.4f,%.4f,%.4f\n",
712+
l, xs, s->x[0], s->x[1], s->x[2]);
706713
}
707714

708715
/* Pre-quantize activation to Q8 once for all Q2/Q4 projections in this layer.

0 commit comments

Comments
 (0)