@@ -387,14 +387,52 @@ async function llmCall(messages, opts = {}) {
387387 let tokenCount = 0 ;
388388 let tokenBuffer = '' ;
389389 let firstTokenTime = null ; // For TTFT measurement
390+ // For JSON-expected tests: track whether we've seen the start of JSON content.
391+ // Until we see '{' or '[', everything in delta.content is treated as reasoning.
392+ let jsonContentStarted = ! opts . expectJSON ; // true immediately for non-JSON tests
390393
391394 for await ( const chunk of stream ) {
392395 resetIdle ( ) ;
393396
394397 if ( chunk . model ) model = chunk . model ;
395398
396399 const delta = chunk . choices ?. [ 0 ] ?. delta ;
397- if ( delta ?. content ) content += delta . content ;
400+ if ( delta ?. content ) {
401+ if ( jsonContentStarted ) {
402+ // Already in content mode — accumulate normally
403+ content += delta . content ;
404+ } else {
405+ // JSON test: haven't seen JSON start yet.
406+ // Check if this chunk contains the start of JSON.
407+ // First strip any <think>...</think> blocks.
408+ const cleaned = ( content + delta . content )
409+ . replace ( / < t h i n k > [ \s \S ] * ?< \/ t h i n k > \s * / gi, '' )
410+ . trimStart ( ) ;
411+ const jsonIdx = cleaned . search ( / [ { \[ ] / ) ;
412+ if ( jsonIdx >= 0 ) {
413+ // Found JSON start — split: everything before is reasoning,
414+ // everything from JSON start onwards is content.
415+ jsonContentStarted = true ;
416+ const allText = content + delta . content ;
417+ // Find the actual position in the raw text
418+ const rawCleaned = allText . replace ( / < t h i n k > [ \s \S ] * ?< \/ t h i n k > \s * / gi, '' ) ;
419+ const rawJsonIdx = rawCleaned . search ( / [ { \[ ] / ) ;
420+ if ( rawJsonIdx >= 0 ) {
421+ const thinkingPart = rawCleaned . slice ( 0 , rawJsonIdx ) ;
422+ const contentPart = rawCleaned . slice ( rawJsonIdx ) ;
423+ if ( thinkingPart . trim ( ) ) reasoningContent += thinkingPart ;
424+ content = contentPart ;
425+ } else {
426+ content = allText ;
427+ }
428+ } else {
429+ // Still no JSON — accumulate as reasoning
430+ reasoningContent += delta . content ;
431+ // Keep the raw content in a temp buffer for the split logic above
432+ content += delta . content ;
433+ }
434+ }
435+ }
398436 if ( delta ?. reasoning_content ) reasoningContent += delta . reasoning_content ;
399437 // Fallback: Mistral Small 4 in llama-server may route thinking tokens through
400438 // `delta.thinking` even when reasoning_effort=none is requested (llama.cpp
@@ -407,7 +445,7 @@ async function llmCall(messages, opts = {}) {
407445 // Capture TTFT on first content/reasoning token
408446 if ( ! firstTokenTime ) firstTokenTime = Date . now ( ) ;
409447 // Buffer and log tokens — tag with field source
410- const isContent = ! ! delta ?. content ;
448+ const isContent = ! ! delta ?. content && jsonContentStarted ;
411449 const tok = delta ?. content || delta ?. reasoning_content || delta ?. reasoning || '' ;
412450 // Tag first token of each field type
413451 if ( tokenCount === 1 ) tokenBuffer += isContent ? '[C] ' : '[R] ' ;
@@ -428,18 +466,10 @@ async function llmCall(messages, opts = {}) {
428466 controller . abort ( ) ;
429467 break ;
430468 }
431- // If content is arriving, check it starts with JSON.
432- // Be patient with thinking models: llama-server sends reasoning
433- // as plain text in delta.content (no <think> tags or separate
434- // reasoning field). Wait for enough content before deciding.
435- if ( opts . expectJSON && isContent && content . length >= 200 ) {
436- // Strip <think> blocks AND common plain-text reasoning prefixes
437- // that thinking models (Qwen3.5, etc.) emit before JSON output
438- let stripped = content . replace ( / < t h i n k > [ \s \S ] * ?< \/ t h i n k > \s * / gi, '' ) . trimStart ( ) ;
439- // Strip leading plain-text reasoning (models often start with
440- // "Let me analyze...", "I need to...", followed by actual JSON)
441- stripped = stripped . replace ( / ^ (?: L e t m e | I n e e d t o | I ' l l | I w i l l | F i r s t , | O k a y , | S u r e , | A l r i g h t , | H e r e ' s | L o o k i n g a t | A n a l y z i n g ) [ \s \S ] * ?(? = \s * [ { \[ ] ) / i, '' ) . trimStart ( ) ;
442- if ( stripped . length >= 200 && ! / ^ \s * [ { \[ ] / . test ( stripped ) ) {
469+ // If we have actual JSON content, verify it looks valid
470+ if ( opts . expectJSON && jsonContentStarted && content . length >= 50 ) {
471+ const stripped = content . trimStart ( ) ;
472+ if ( stripped . length >= 50 && ! / ^ \s * [ { \[ ] / . test ( stripped ) ) {
443473 log ( ` ⚠ Aborting: expected JSON but got: "${ stripped . slice ( 0 , 80 ) } …"` ) ;
444474 controller . abort ( ) ;
445475 break ;
@@ -482,6 +512,13 @@ async function llmCall(messages, opts = {}) {
482512 // Flush remaining token buffer
483513 if ( tokenBuffer ) log ( tokenBuffer ) ;
484514
515+ // If JSON was expected but never found in content, the content is all thinking text.
516+ // Clear it so the reasoning fallback below can extract JSON from reasoningContent.
517+ if ( opts . expectJSON && ! jsonContentStarted ) {
518+ log ( ` 💭 Model produced ${ tokenCount } thinking tokens, no JSON content yet — checking reasoning for JSON` ) ;
519+ content = '' ;
520+ }
521+
485522 // If the model only produced reasoning_content (thinking) with no content,
486523 // use the reasoning output as the response content for evaluation purposes.
487524 // Try to extract JSON from reasoning if this was a JSON-expected call.
0 commit comments