@@ -165,9 +165,10 @@ async function llmCall(messages, opts = {}) {
165165 }
166166
167167 const model = opts . model || ( opts . vlm ? VLM_MODEL : LLM_MODEL ) || undefined ;
168- // For JSON-expected tests, disable thinking (Qwen3.5 doesn't support /no_think)
169- // Method 1: Inject empty <think></think> assistant prefix to skip reasoning phase
170- // Method 2: chat_template_kwargs via extra_body (works if server supports it)
168+ // For JSON-expected tests, default to temperature 0.7 (unless the caller sets one)
169+ // and top_p 0.8 to encourage direct JSON output without extended reasoning.
170+ // NOTE: Do NOT inject assistant prefill — Qwen3.5 rejects prefill
171+ // when enable_thinking is active (400 error).
171172 if ( opts . expectJSON ) {
172173 messages = [ ...messages ] ;
173174 // Remove any leftover /no_think from messages
@@ -177,8 +178,6 @@ async function llmCall(messages, opts = {}) {
177178 }
178179 return m ;
179180 } ) ;
180- // Inject empty think block as assistant prefix (most portable method)
181- messages . push ( { role : 'assistant' , content : '<think>\n</think>\n' } ) ;
182181 }
183182
184183 // Build request params
@@ -188,9 +187,8 @@ async function llmCall(messages, opts = {}) {
188187 ...( model && { model } ) ,
189188 ...( opts . temperature !== undefined && { temperature : opts . temperature } ) ,
190189 ...( opts . maxTokens && { max_tokens : opts . maxTokens } ) ,
191- // Qwen3.5 non-thinking mode recommended params
192190 ...( opts . expectJSON && opts . temperature === undefined && { temperature : 0.7 } ) ,
193- ...( opts . expectJSON && { top_p : 0.8 , presence_penalty : 1.5 } ) ,
191+ ...( opts . expectJSON && { top_p : 0.8 } ) ,
194192 ...( opts . tools && { tools : opts . tools } ) ,
195193 } ;
196194
@@ -2021,10 +2019,11 @@ async function main() {
20212019 const indexFile = path . join ( RESULTS_DIR , 'index.json' ) ;
20222020 let index = [ ] ;
20232021 try { index = JSON . parse ( fs . readFileSync ( indexFile , 'utf8' ) ) ; } catch { }
2024- // Compute LLM vs VLM split
2025- const vlmSuite = results . suites . find ( s => s . name . includes ( 'VLM' ) ) ;
2026- const vlmPassed = vlmSuite ? vlmSuite . tests . filter ( t => t . status === 'pass' ) . length : 0 ;
2027- const vlmTotal = vlmSuite ? vlmSuite . tests . length : 0 ;
2022+ // Compute LLM vs VLM split (only count image analysis suites as VLM)
2023+ const isVlmImageSuite = ( name ) => name . includes ( 'VLM Scene' ) || name . includes ( '📸' ) ;
2024+ const vlmSuites = results . suites . filter ( s => isVlmImageSuite ( s . name ) ) ;
2025+ const vlmPassed = vlmSuites . reduce ( ( n , s ) => n + s . tests . filter ( t => t . status === 'pass' ) . length , 0 ) ;
2026+ const vlmTotal = vlmSuites . reduce ( ( n , s ) => n + s . tests . length , 0 ) ;
20282027 const llmPassed = passed - vlmPassed ;
20292028 const llmTotal = total - vlmTotal ;
20302029
0 commit comments