Skip to content

Commit e908615

Browse files
committed
fix: properly split thinking from content in benchmark streaming
For JSON-expected tests, delta.content text arriving before the start of any JSON value (the first `{` or `[`) is now routed to reasoningContent instead of content. This handles llama-server with Qwen3.5-Claude models, where thinking appears as plain text in delta.content without <think> tags. The model's thinking is logged and shown but NOT evaluated as output; only the actual JSON content (from the first `{` or `[` onward) is treated as the model's response for test evaluation.
1 parent d5c7c9d commit e908615

File tree

1 file changed

+51
-14
lines changed

1 file changed

+51
-14
lines changed

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -387,14 +387,52 @@ async function llmCall(messages, opts = {}) {
387387
let tokenCount = 0;
388388
let tokenBuffer = '';
389389
let firstTokenTime = null; // For TTFT measurement
390+
// For JSON-expected tests: track whether we've seen the start of JSON content.
391+
// Until we see '{' or '[', everything in delta.content is treated as reasoning.
392+
let jsonContentStarted = !opts.expectJSON; // true immediately for non-JSON tests
390393

391394
for await (const chunk of stream) {
392395
resetIdle();
393396

394397
if (chunk.model) model = chunk.model;
395398

396399
const delta = chunk.choices?.[0]?.delta;
397-
if (delta?.content) content += delta.content;
400+
if (delta?.content) {
401+
if (jsonContentStarted) {
402+
// Already in content mode — accumulate normally
403+
content += delta.content;
404+
} else {
405+
// JSON test: haven't seen JSON start yet.
406+
// Check if this chunk contains the start of JSON.
407+
// First strip any <think>...</think> blocks.
408+
const cleaned = (content + delta.content)
409+
.replace(/<think>[\s\S]*?<\/think>\s*/gi, '')
410+
.trimStart();
411+
const jsonIdx = cleaned.search(/[{\[]/);
412+
if (jsonIdx >= 0) {
413+
// Found JSON start — split: everything before is reasoning,
414+
// everything from JSON start onwards is content.
415+
jsonContentStarted = true;
416+
const allText = content + delta.content;
417+
// Find the actual position in the raw text
418+
const rawCleaned = allText.replace(/<think>[\s\S]*?<\/think>\s*/gi, '');
419+
const rawJsonIdx = rawCleaned.search(/[{\[]/);
420+
if (rawJsonIdx >= 0) {
421+
const thinkingPart = rawCleaned.slice(0, rawJsonIdx);
422+
const contentPart = rawCleaned.slice(rawJsonIdx);
423+
if (thinkingPart.trim()) reasoningContent += thinkingPart;
424+
content = contentPart;
425+
} else {
426+
content = allText;
427+
}
428+
} else {
429+
// Still no JSON — accumulate as reasoning
430+
reasoningContent += delta.content;
431+
// Keep the raw content in a temp buffer for the split logic above
432+
content += delta.content;
433+
}
434+
}
435+
}
398436
if (delta?.reasoning_content) reasoningContent += delta.reasoning_content;
399437
// Fallback: Mistral Small 4 in llama-server may route thinking tokens through
400438
// `delta.thinking` even when reasoning_effort=none is requested (llama.cpp
@@ -407,7 +445,7 @@ async function llmCall(messages, opts = {}) {
407445
// Capture TTFT on first content/reasoning token
408446
if (!firstTokenTime) firstTokenTime = Date.now();
409447
// Buffer and log tokens — tag with field source
410-
const isContent = !!delta?.content;
448+
const isContent = !!delta?.content && jsonContentStarted;
411449
const tok = delta?.content || delta?.reasoning_content || delta?.reasoning || '';
412450
// Tag first token of each field type
413451
if (tokenCount === 1) tokenBuffer += isContent ? '[C] ' : '[R] ';
@@ -428,18 +466,10 @@ async function llmCall(messages, opts = {}) {
428466
controller.abort();
429467
break;
430468
}
431-
// If content is arriving, check it starts with JSON.
432-
// Be patient with thinking models: llama-server sends reasoning
433-
// as plain text in delta.content (no <think> tags or separate
434-
// reasoning field). Wait for enough content before deciding.
435-
if (opts.expectJSON && isContent && content.length >= 200) {
436-
// Strip <think> blocks AND common plain-text reasoning prefixes
437-
// that thinking models (Qwen3.5, etc.) emit before JSON output
438-
let stripped = content.replace(/<think>[\s\S]*?<\/think>\s*/gi, '').trimStart();
439-
// Strip leading plain-text reasoning (models often start with
440-
// "Let me analyze...", "I need to...", followed by actual JSON)
441-
stripped = stripped.replace(/^(?:Let me|I need to|I'll|I will|First,|Okay,|Sure,|Alright,|Here's|Looking at|Analyzing)[\s\S]*?(?=\s*[{\[])/i, '').trimStart();
442-
if (stripped.length >= 200 && !/^\s*[{\[]/.test(stripped)) {
469+
// If we have actual JSON content, verify it looks valid
470+
if (opts.expectJSON && jsonContentStarted && content.length >= 50) {
471+
const stripped = content.trimStart();
472+
if (stripped.length >= 50 && !/^\s*[{\[]/.test(stripped)) {
443473
log(` ⚠ Aborting: expected JSON but got: "${stripped.slice(0, 80)}…"`);
444474
controller.abort();
445475
break;
@@ -482,6 +512,13 @@ async function llmCall(messages, opts = {}) {
482512
// Flush remaining token buffer
483513
if (tokenBuffer) log(tokenBuffer);
484514

515+
// If JSON was expected but never found in content, the content is all thinking text.
516+
// Clear it so the reasoning fallback below can extract JSON from reasoningContent.
517+
if (opts.expectJSON && !jsonContentStarted) {
518+
log(` 💭 Model produced ${tokenCount} thinking tokens, no JSON content yet — checking reasoning for JSON`);
519+
content = '';
520+
}
521+
485522
// If the model only produced reasoning_content (thinking) with no content,
486523
// use the reasoning output as the response content for evaluation purposes.
487524
// Try to extract JSON from reasoning if this was a JSON-expected call.

0 commit comments

Comments
 (0)