@@ -165,14 +165,20 @@ async function llmCall(messages, opts = {}) {
165165 }
166166
167167 const model = opts . model || ( opts . vlm ? VLM_MODEL : LLM_MODEL ) || undefined ;
168- // For JSON-expected tests, disable thinking (Qwen3 /no_think directive)
169- // This prevents the model from wasting tokens on reasoning before outputting JSON
168+ // For JSON-expected tests, disable thinking (Qwen3.5 doesn't support /no_think)
169+ // Method 1: Inject empty <think></think> assistant prefix to skip reasoning phase
170+ // Method 2: chat_template_kwargs via extra_body (works if server supports it)
170171 if ( opts . expectJSON ) {
171- const lastUserIdx = messages . findLastIndex ( m => m . role === 'user' ) ;
172- if ( lastUserIdx >= 0 ) {
173- messages = [ ...messages ] ;
174- messages [ lastUserIdx ] = { ...messages [ lastUserIdx ] , content : messages [ lastUserIdx ] . content + ' /no_think' } ;
175- }
172+ messages = [ ...messages ] ;
173+ // Remove any leftover /no_think from messages
174+ messages = messages . map ( m => {
175+ if ( m . role === 'user' && typeof m . content === 'string' && m . content . endsWith ( ' /no_think' ) ) {
176+ return { ...m , content : m . content . slice ( 0 , - 10 ) } ;
177+ }
178+ return m ;
179+ } ) ;
180+ // Inject empty think block as assistant prefix (most portable method)
181+ messages . push ( { role : 'assistant' , content : '<think>\n</think>\n' } ) ;
176182 }
177183
178184 // Build request params
@@ -182,7 +188,9 @@ async function llmCall(messages, opts = {}) {
182188 ...( model && { model } ) ,
183189 ...( opts . temperature !== undefined && { temperature : opts . temperature } ) ,
184190 ...( opts . maxTokens && { max_completion_tokens : opts . maxTokens } ) ,
185- ...( opts . expectJSON && { response_format : { type : 'json_object' } } ) ,
191+ // Qwen3.5 non-thinking mode recommended params
192+ ...( opts . expectJSON && opts . temperature === undefined && { temperature : 0.7 } ) ,
193+ ...( opts . expectJSON && { top_p : 0.8 , presence_penalty : 1.5 } ) ,
186194 ...( opts . tools && { tools : opts . tools } ) ,
187195 } ;
188196
@@ -192,7 +200,7 @@ async function llmCall(messages, opts = {}) {
192200 let idleTimer = setTimeout ( ( ) => controller . abort ( ) , idleMs ) ;
193201 const resetIdle = ( ) => { clearTimeout ( idleTimer ) ; idleTimer = setTimeout ( ( ) => controller . abort ( ) , idleMs ) ; } ;
194202 // Log prompt being sent
195- log ( `\n 📤 Prompt (${ messages . length } messages, params: ${ JSON . stringify ( { maxTokens : opts . maxTokens , expectJSON : ! ! opts . expectJSON , response_format : params . response_format } ) } ):` ) ;
203+ log ( `\n 📤 Prompt (${ messages . length } messages, params: ${ JSON . stringify ( { maxTokens : opts . maxTokens , expectJSON : ! ! opts . expectJSON } ) } ):` ) ;
196204 for ( const m of messages ) {
197205 if ( typeof m . content === 'string' ) {
198206 log ( ` [${ m . role } ] ${ m . content } ` ) ;
@@ -274,10 +282,15 @@ async function llmCall(messages, opts = {}) {
274282 break ;
275283 }
276284 }
277- // Hard cap: abort if token count far exceeds maxTokens (server may
278- // not count thinking tokens toward the limit)
279- if ( opts . maxTokens && tokenCount > opts . maxTokens * 3 ) {
280- log ( ` ⚠ Aborting: ${ tokenCount } tokens exceeds ${ opts . maxTokens } ×3 safety limit` ) ;
285+ // Hard cap: abort if token count far exceeds maxTokens
286+ if ( opts . maxTokens && tokenCount > opts . maxTokens * 2 ) {
287+ log ( ` ⚠ Aborting: ${ tokenCount } tokens exceeds ${ opts . maxTokens } ×2 safety limit` ) ;
288+ controller . abort ( ) ;
289+ break ;
290+ }
291+ // Global safety limit: no benchmark test should ever need >2000 tokens
292+ if ( tokenCount > 2000 ) {
293+ log ( ` ⚠ Aborting: ${ tokenCount } tokens exceeds global 2000-token safety limit` ) ;
281294 controller . abort ( ) ;
282295 break ;
283296 }
/**
 * Parse the first JSON value found in raw LLM output.
 *
 * Strategy:
 *   1. Strip any <think>…</think> reasoning block (via sibling stripThink).
 *   2. Prefer the contents of a fenced ```json code block when present.
 *   3. Otherwise locate the first '{' or '[' and take the balanced span,
 *      skipping over string literals and backslash escapes so braces
 *      inside strings do not confuse the depth counter.
 *
 * @param {string} text - Raw model output, possibly wrapped in prose.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} If no parseable JSON can be extracted.
 */
function parseJSON(text) {
  const body = stripThink(text);

  // Fenced code block wins outright.
  const fence = body.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
  if (fence) return JSON.parse(fence[1].trim());

  let candidate = body;
  const start = body.search(/[{[]/);
  if (start >= 0) {
    const open = body[start];
    const close = open === '{' ? '}' : ']';
    let depth = 0;
    let quoted = false;
    let escaped = false;
    for (let i = start; i < body.length; i++) {
      const ch = body[i];
      if (escaped) {
        escaped = false;            // character after backslash is literal
      } else if (ch === '\\' && quoted) {
        escaped = true;             // escapes only matter inside strings
      } else if (ch === '"') {
        quoted = !quoted;
      } else if (!quoted && ch === open) {
        depth += 1;
      } else if (!quoted && ch === close) {
        depth -= 1;
        if (depth === 0) {
          candidate = body.slice(start, i + 1);
          break;
        }
      }
    }
    // If the span never balanced, fall through with the full text and
    // let JSON.parse raise a SyntaxError — same as the original.
  }
  return JSON.parse(candidate.trim());
}
0 commit comments