@@ -167,13 +167,47 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
167167 tq_forward (model , state , prompt_tokens [i ], i );
168168 }
169169
170+ /* Repetition penalty setup */
171+ int vocab_size = model -> config .vocab_size ;
172+ float rep_penalty = config -> rep_penalty ;
173+ int rep_window = config -> rep_window ;
174+ if (rep_window > 64 ) rep_window = 64 ;
175+ int recent_tokens [64 ];
176+ int recent_count = 0 ;
177+
178+ /* Seed recent tokens with tail of prompt for better penalty coverage */
179+ for (int i = (n_prompt > rep_window ? n_prompt - rep_window : 0 ); i < n_prompt ; i ++ ) {
180+ recent_tokens [recent_count % 64 ] = prompt_tokens [i ];
181+ recent_count ++ ;
182+ }
183+
184+ /* Apply repetition penalty to logits before first sample */
185+ if (rep_penalty > 1.0f ) {
186+ int window = recent_count < rep_window ? recent_count : rep_window ;
187+ for (int r = 0 ; r < window ; r ++ ) {
188+ int idx = (recent_count - 1 - r ) % 64 ;
189+ if (idx < 0 ) idx += 64 ;
190+ int tok = recent_tokens [idx ];
191+ if (tok >= 0 && tok < vocab_size ) {
192+ if (state -> logits [tok ] > 0 )
193+ state -> logits [tok ] /= rep_penalty ;
194+ else
195+ state -> logits [tok ] *= rep_penalty ;
196+ }
197+ }
198+ }
199+
170200 /* Sample first generated token */
171201 int pos = n_prompt ;
172202 unsigned long long rng_state = 42 ;
173- int next_token = tq_sample_topp (state -> logits , model -> config . vocab_size ,
203+ int next_token = tq_sample_topp (state -> logits , vocab_size ,
174204 config -> temperature , config -> top_p ,
175205 & rng_state );
176206
207+ /* Record first sampled token */
208+ recent_tokens [recent_count % 64 ] = next_token ;
209+ recent_count ++ ;
210+
177211 int generated = 0 ;
178212 int output_pos = 0 ;
179213 int prev_token = prompt_tokens [n_prompt - 1 ];
@@ -194,6 +228,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
194228 /* Decode token to text */
195229 if (tokenizer ) {
196230 const char * piece = tq_decode (tokenizer , prev_token , next_token );
231+
232+ /* Blank out pieces containing a <think> or </think> tag (e.g. Qwen3.5); text between the tags is not filtered by this check */
233+ if (piece && (strstr (piece , "<think>" ) || strstr (piece , "</think>" ))) {
234+ piece = "" ;
235+ }
236+
197237 int piece_len = (int )strlen (piece );
198238
199239 /* Stream callback */
@@ -214,10 +254,30 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
214254 pos ++ ;
215255 generated ++ ;
216256
257+ /* Apply repetition penalty before sampling */
258+ if (rep_penalty > 1.0f ) {
259+ int window = recent_count < rep_window ? recent_count : rep_window ;
260+ for (int r = 0 ; r < window ; r ++ ) {
261+ int idx = (recent_count - 1 - r ) % 64 ;
262+ if (idx < 0 ) idx += 64 ;
263+ int tok = recent_tokens [idx ];
264+ if (tok >= 0 && tok < vocab_size ) {
265+ if (state -> logits [tok ] > 0 )
266+ state -> logits [tok ] /= rep_penalty ;
267+ else
268+ state -> logits [tok ] *= rep_penalty ;
269+ }
270+ }
271+ }
272+
217273 /* Sample next token */
218- next_token = tq_sample_topp (state -> logits , model -> config . vocab_size ,
274+ next_token = tq_sample_topp (state -> logits , vocab_size ,
219275 config -> temperature , config -> top_p ,
220276 & rng_state );
277+
278+ /* Record sampled token for repetition penalty */
279+ recent_tokens [recent_count % 64 ] = next_token ;
280+ recent_count ++ ;
221281 }
222282
223283 /* Null-terminate output */
0 commit comments