Fix 3 quality issues: weight label, think tags, repetition penalty

unamedkr · claude · unamedkr · commit 444240006f9a · 2026-03-29T23:37:04.000+09:00
1. weights=Q4 now correctly displayed for TQM files (was FP32)
2. &lt;think&gt;/&lt;\/think&gt; tokens filtered from output
3. Repetition penalty (1.1x, 32-token window) prevents degenerate loops
   Before: "190cm tall, 70kg weight, 190cm tall, 70kg weight..."
   After: diverse text generation with minimal repetition

20/20 tests pass. 24.9 tok/s on Q4 TQM.

Co-Authored-By: Claude Opus 4.6 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/.claude/state.md b/.claude/state.md
@@ -1,6 +1,6 @@
 # TurboQuant.cpp — Session State
 
-**Last updated**: 2026-03-29 (v0.9.2 TQM format for instant model loading)
+**Last updated**: 2026-03-29 (v0.9.3 bugfixes: weights label, think filter, repetition penalty)
 **Last commit**: pending
 
 ## Speed Progression
@@ -14,7 +14,7 @@ llama.cpp Q4_K_M:  ~50   tok/s  ← target
 ```
 
 ## What Works
-- All 19 tests pass, zero warnings
+- All 20 tests pass, zero warnings
 - Q4 weights: 270 MB, Q8: 533 MB (vs 2.1 GB FP32)
 - Self-contained C inference engine, 0 dependencies
 - DeltaNet + Self-Attention hybrid forward pass
@@ -115,6 +115,31 @@ This takes ~6s for an 0.8B model. Goal: <0.5s via pre-quantized mmap-ready forma
 - `tests/test_tqm.cpp` — NEW test file (6 tests)
 - `CMakeLists.txt` — added tq_convert build target
 
+## v0.9.3 Changes — Inference Quality Fixes
+
+### Fix 1: TQM weights label shows "Q4" instead of "FP32"
+- `tools/tq_run.c`: Changed `wq_name` to check `model->use_q4_weights` / `model->use_q8_weights`
+  instead of the CLI `quant_mode` flag, so TQM-loaded models correctly report "Q4"
+- `tq_load_tqm()` already set `model->use_q4_weights = 1` (no change needed)
+
+### Fix 2: Filter `<think>` tags from output
+- `src/engine/tq_generate.c`: After `tq_decode()`, skip tokens containing `<think>` or `</think>`
+- Prevents Qwen3.5 thinking-mode artifacts from appearing in generated output
+
+### Fix 3: Repetition penalty to prevent degenerate loops
+- Added `rep_penalty` (float, default 1.1) and `rep_window` (int, default 32) to `tq_gen_config_t`
+- `include/turboquant/tq_engine.h`: New fields in gen config struct
+- `src/engine/tq_ops.c`: Default values in `tq_default_gen_config()`
+- `src/engine/tq_generate.c`: Circular buffer tracks recent tokens (up to 64);
+  before each `tq_sample_topp()` call, penalizes logits of recently generated tokens
+  (positive logits divided by penalty, negative logits multiplied)
+
+### Files Modified
+- `include/turboquant/tq_engine.h` — rep_penalty, rep_window fields in tq_gen_config_t
+- `src/engine/tq_generate.c` — think filter + repetition penalty logic
+- `src/engine/tq_ops.c` — default rep_penalty=1.1, rep_window=32
+- `tools/tq_run.c` — weights label based on model flags
+
 ## What Needs Work
 1. Measure actual speed improvement (need model file for tq_run)
 2. Q4 quality on short prompts
diff --git a/include/turboquant/tq_engine.h b/include/turboquant/tq_engine.h
@@ -196,6 +196,8 @@ typedef struct {
     int max_tokens;
     tq_type kv_type;     /* KV cache quantization type */
     int n_threads;
+    float rep_penalty;    /* repetition penalty (default: 1.1, 1.0 = disabled) */
+    int rep_window;       /* how many recent tokens to penalize (default: 32) */
     /* Callback for streaming output */
     void (*on_token)(const char* text, void* user_data);
     void* user_data;
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
@@ -167,13 +167,47 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         tq_forward(model, state, prompt_tokens[i], i);
     }
 
+    /* Repetition penalty setup */
+    int vocab_size = model->config.vocab_size;
+    float rep_penalty = config->rep_penalty;
+    int rep_window = config->rep_window;
+    if (rep_window > 64) rep_window = 64;
+    int recent_tokens[64];
+    int recent_count = 0;
+
+    /* Seed recent tokens with tail of prompt for better penalty coverage */
+    for (int i = (n_prompt > rep_window ? n_prompt - rep_window : 0); i < n_prompt; i++) {
+        recent_tokens[recent_count % 64] = prompt_tokens[i];
+        recent_count++;
+    }
+
+    /* Apply repetition penalty to logits before first sample */
+    if (rep_penalty > 1.0f) {
+        int window = recent_count < rep_window ? recent_count : rep_window;
+        for (int r = 0; r < window; r++) {
+            int idx = (recent_count - 1 - r) % 64;
+            if (idx < 0) idx += 64;
+            int tok = recent_tokens[idx];
+            if (tok >= 0 && tok < vocab_size) {
+                if (state->logits[tok] > 0)
+                    state->logits[tok] /= rep_penalty;
+                else
+                    state->logits[tok] *= rep_penalty;
+            }
+        }
+    }
+
     /* Sample first generated token */
     int pos = n_prompt;
     unsigned long long rng_state = 42;
-    int next_token = tq_sample_topp(state->logits, model->config.vocab_size,
+    int next_token = tq_sample_topp(state->logits, vocab_size,
                                      config->temperature, config->top_p,
                                      &rng_state);
 
+    /* Record first sampled token */
+    recent_tokens[recent_count % 64] = next_token;
+    recent_count++;
+
     int generated = 0;
     int output_pos = 0;
     int prev_token = prompt_tokens[n_prompt - 1];
@@ -194,6 +228,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         /* Decode token to text */
         if (tokenizer) {
             const char* piece = tq_decode(tokenizer, prev_token, next_token);
+
+            /* Skip thinking tokens (e.g. Qwen3.5 <think>...</think>) */
+            if (piece && (strstr(piece, "<think>") || strstr(piece, "</think>"))) {
+                piece = "";
+            }
+
             int piece_len = (int)strlen(piece);
 
             /* Stream callback */
@@ -214,10 +254,30 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         pos++;
         generated++;
 
+        /* Apply repetition penalty before sampling */
+        if (rep_penalty > 1.0f) {
+            int window = recent_count < rep_window ? recent_count : rep_window;
+            for (int r = 0; r < window; r++) {
+                int idx = (recent_count - 1 - r) % 64;
+                if (idx < 0) idx += 64;
+                int tok = recent_tokens[idx];
+                if (tok >= 0 && tok < vocab_size) {
+                    if (state->logits[tok] > 0)
+                        state->logits[tok] /= rep_penalty;
+                    else
+                        state->logits[tok] *= rep_penalty;
+                }
+            }
+        }
+
         /* Sample next token */
-        next_token = tq_sample_topp(state->logits, model->config.vocab_size,
+        next_token = tq_sample_topp(state->logits, vocab_size,
                                      config->temperature, config->top_p,
                                      &rng_state);
+
+        /* Record sampled token for repetition penalty */
+        recent_tokens[recent_count % 64] = next_token;
+        recent_count++;
     }
 
     /* Null-terminate output */
diff --git a/src/engine/tq_ops.c b/src/engine/tq_ops.c
@@ -901,6 +901,8 @@ tq_gen_config_t tq_default_gen_config(void) {
     config.max_tokens = 256;
     config.kv_type = TQ_TYPE_UNIFORM_4B;
     config.n_threads = 1;
+    config.rep_penalty = 1.1f;
+    config.rep_window = 32;
     config.on_token = NULL;
     config.user_data = NULL;
     return config;
diff --git a/tools/tq_run.c b/tools/tq_run.c
@@ -206,7 +206,7 @@ int main(int argc, char** argv) {
     fprintf(stderr, "\n---\n");
     if (n_generated > 0 && elapsed > 0.0) {
         double tok_per_sec = (double)n_generated / elapsed;
-        const char* wq_name = quant_mode == 4 ? "Q4" : (quant_mode == 8 ? "Q8" : "FP32");
+        const char* wq_name = model->use_q4_weights ? "Q4" : (model->use_q8_weights ? "Q8" : "FP32");
         fprintf(stderr, "%d tokens in %.1fs (%.1f tok/s, %d threads, weights=%s, kv=%s)\n",
                 n_generated, elapsed, tok_per_sec, tq_get_threads(), wq_name,
                 kv_type < TQ_TYPE_COUNT ? tq_type_name(kv_type) : "fp32");