Skip to content

Commit c031470

Browse files
committed
fix: handle delta.reasoning field from mlx-lm Python server
1 parent e4fc076 commit c031470

File tree

1 file changed: +91 additions, -15 deletions

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs

Lines changed: 91 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,15 @@ const MODEL_FAMILIES = [
174174
},
175175
// Qwen3.5 thinking is handled via prompt-level /no_think and the 500-token reasoning
176176
// abort in llmCall — no extra per-request params needed.
177+
{
178+
name: 'GPT-OSS',
179+
// gpt-oss-20b uses <|channel|>analysis/final structure.
180+
// reasoning_effort=none hints the model to minimize analysis (injected into system prompt
181+
// by the chat template). The mlx-server OutputFilter suppresses analysis at token ID level.
182+
match: (m) => m.includes('gpt-oss'),
183+
apiParams: { reasoning_effort: 'none' },
184+
serverFlags: '--chat-template-kwargs {"reasoning_effort":"none"}',
185+
},
177186
];
178187

179188
/**
@@ -391,13 +400,15 @@ async function llmCall(messages, opts = {}) {
391400
// `delta.thinking` even when reasoning_effort=none is requested (llama.cpp
392401
// compatibility varies by version). Capture it so the idle timer resets.
393402
if (delta?.thinking) reasoningContent += delta.thinking;
394-
if (delta?.content || delta?.reasoning_content || delta?.thinking) {
403+
// mlx-lm Python server uses `delta.reasoning` instead of `delta.reasoning_content`
404+
if (delta?.reasoning) reasoningContent += delta.reasoning;
405+
if (delta?.content || delta?.reasoning_content || delta?.thinking || delta?.reasoning) {
395406
tokenCount++;
396407
// Capture TTFT on first content/reasoning token
397408
if (!firstTokenTime) firstTokenTime = Date.now();
398409
// Buffer and log tokens — tag with field source
399410
const isContent = !!delta?.content;
400-
const tok = delta?.content || delta?.reasoning_content || '';
411+
const tok = delta?.content || delta?.reasoning_content || delta?.reasoning || '';
401412
// Tag first token of each field type
402413
if (tokenCount === 1) tokenBuffer += isContent ? '[C] ' : '[R] ';
403414
tokenBuffer += tok;
@@ -526,10 +537,19 @@ async function llmCall(messages, opts = {}) {
526537
if (decodeTokensPerSec !== null) results.perfTotals.decodeTokensPerSec.push(decodeTokensPerSec);
527538

528539
// Capture model name from first response
540+
// MLX server reports the model name as its full filesystem path,
// e.g. /Users/simba/.aegis-ai/models/mlx_models/mlx-community/Qwen3.5-9B-8bit.
// Reduce that to the trailing org/model pair: mlx-community/Qwen3.5-9B-8bit.
// Short ids (org/model, bare names) and empty values pass through untouched.
const cleanName = (n) => {
  if (!n) return n;
  const segments = n.split('/');
  // Three or fewer segments ⇒ already an org/model id or a bare name.
  if (segments.length <= 3) return n;
  // Path-like: keep only the last two segments (org + model).
  return segments.slice(-2).join('/');
};
529549
if (opts.vlm) {
530-
if (!results.model.vlm && model) results.model.vlm = model;
550+
if (!results.model.vlm && model) results.model.vlm = cleanName(model);
531551
} else {
532-
if (!results.model.name && model) results.model.name = model;
552+
if (!results.model.name && model) results.model.name = cleanName(model);
533553
}
534554

535555
return { content, toolCalls, usage: callTokens, perf: callPerf, model };
@@ -545,6 +565,11 @@ function stripThink(text) {
545565
// Strip Qwen3.5 'Thinking Process:' blocks (outputs plain text reasoning
546566
// instead of <think> tags when enable_thinking is active)
547567
cleaned = cleaned.replace(/^Thinking Process[:\s]*[\s\S]*?(?=\n\s*[{\[]|\n```|$)/i, '').trim();
568+
// Strip gpt-oss <|channel|>...<|message|> routing tokens
569+
// e.g. "<|channel|>analysis<|message|>We need to decide..." → "We need to decide..."
570+
cleaned = cleaned.replace(/^<\|channel\|>[^<]*<\|message\|>/i, '').trim();
571+
// Strip any remaining <|...|> special tokens (end_turn, etc.)
572+
cleaned = cleaned.replace(/<\|[^|]+\|>/g, '').trim();
548573
return cleaned;
549574
}
550575

@@ -555,24 +580,38 @@ function parseJSON(text) {
555580
if (codeBlock) {
556581
jsonStr = codeBlock[1];
557582
} else {
558-
// Find first { or [ and extract balanced JSON
559-
const startIdx = cleaned.search(/[{\[]/);
560-
if (startIdx >= 0) {
583+
// Extract ALL balanced JSON objects/arrays, then pick the largest.
584+
// Some models (gpt-oss) emit an empty `{}` prefix before the real JSON.
585+
const candidates = [];
586+
let searchFrom = 0;
587+
while (searchFrom < cleaned.length) {
588+
const sub = cleaned.slice(searchFrom);
589+
const startOff = sub.search(/[{\[]/);
590+
if (startOff < 0) break;
591+
const startIdx = searchFrom + startOff;
561592
const opener = cleaned[startIdx];
562593
const closer = opener === '{' ? '}' : ']';
563-
let depth = 0;
564-
let inString = false;
565-
let escape = false;
594+
let depth = 0, inString = false, escape = false, endIdx = -1;
566595
for (let i = startIdx; i < cleaned.length; i++) {
567596
const ch = cleaned[i];
568597
if (escape) { escape = false; continue; }
569598
if (ch === '\\' && inString) { escape = true; continue; }
570599
if (ch === '"') { inString = !inString; continue; }
571600
if (!inString) {
572601
if (ch === opener) depth++;
573-
else if (ch === closer) { depth--; if (depth === 0) { jsonStr = cleaned.slice(startIdx, i + 1); break; } }
602+
else if (ch === closer) { depth--; if (depth === 0) { endIdx = i; break; } }
574603
}
575604
}
605+
if (endIdx >= 0) {
606+
candidates.push(cleaned.slice(startIdx, endIdx + 1));
607+
searchFrom = endIdx + 1;
608+
} else {
609+
break;
610+
}
611+
}
612+
// Prefer the longest candidate (most likely the real response)
613+
if (candidates.length > 0) {
614+
jsonStr = candidates.reduce((a, b) => a.length >= b.length ? a : b);
576615
}
577616
}
578617
// Clean common local model artifacts before parsing:
@@ -592,7 +631,12 @@ function parseJSON(text) {
592631
.replace(/"placeholder"(\s*"placeholder")*/g, '"placeholder"') // collapse repeated placeholders
593632
.replace(/\bplaceholder\b/g, '""') // placeholder → empty string
594633
.replace(/,\s*([}\]])/g, '$1'); // re-clean trailing commas
595-
return JSON.parse(aggressive.trim());
634+
try {
635+
return JSON.parse(aggressive.trim());
636+
} catch (secondErr) {
637+
// Include raw content in error for diagnostics
638+
throw new Error(`${secondErr.message} | raw(120): "${(text || '').slice(0, 120)}"`);
639+
}
596640
}
597641
}
598642

@@ -646,6 +690,38 @@ function sampleResourceMetrics() {
646690
return sample;
647691
}
648692

693+
/**
 * Collapse a series of resource samples into one representative summary.
 *
 * GPU utilization is reported as the PEAK observed across all samples
 * (point-in-time samples often miss active inference), GPU memory as the
 * high-water mark, and everything else (e.g. system memory) comes from the
 * most recent sample.
 *
 * @param {Array<object>|null|undefined} samples - Collected resource samples.
 * @returns {object|null} Aggregated summary; null when there are no samples.
 *   If no sample carries GPU data, the last sample is returned unchanged.
 */
function aggregateResourceSamples(samples) {
  if (!samples || samples.length === 0) return null;

  const withGpu = samples.filter((s) => s.gpu);
  const newest = samples.at(-1);

  // No GPU data anywhere — still surface system memory from the last sample.
  if (withGpu.length === 0) return newest;

  // Single pass to locate both the peak-utilization and the max-memory sample.
  let peakUtil = withGpu[0];
  let peakMem = withGpu[0];
  for (const s of withGpu) {
    if (s.gpu.util > (peakUtil.gpu?.util ?? -1)) peakUtil = s;
    if ((s.gpu.memUsedGB || 0) > (peakMem.gpu?.memUsedGB || 0)) peakMem = s;
  }

  return {
    ...newest,
    gpu: {
      util: peakUtil.gpu.util,
      renderer: peakUtil.gpu.renderer,
      tiler: peakUtil.gpu.tiler,
      memUsedGB: peakMem.gpu.memUsedGB,
      memAllocGB: peakMem.gpu.memAllocGB,
    },
  };
}
724+
649725
// ─── Live progress: intermediate saves + report regeneration ────────────────
650726
let _liveReportOpened = false;
651727
let _runStartedAt = null; // Set when runSuites() begins
@@ -697,7 +773,7 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
697773
prefillTokensPerSec: results.perfTotals.prefillTokensPerSec,
698774
decodeTokensPerSec: results.perfTotals.serverDecodeTokensPerSec,
699775
},
700-
resource: results.resourceSamples.length > 0 ? results.resourceSamples[results.resourceSamples.length - 1] : null,
776+
resource: aggregateResourceSamples(results.resourceSamples),
701777
} : null;
702778

703779
// Preserve previous runs in index for comparison sidebar
@@ -2454,7 +2530,7 @@ async function main() {
24542530
...(LLM_MODEL && { model: LLM_MODEL }),
24552531
messages: [{ role: 'user', content: 'Reply with just the word: hello' }],
24562532
stream: true,
2457-
max_tokens: 10,
2533+
max_tokens: 200, // models with thinking/analysis phases need >10 tokens to reach final output
24582534
...getModelApiParams(LLM_MODEL),
24592535
};
24602536
const warmupStream = await llmClient.chat.completions.create(warmupParams);
@@ -2615,7 +2691,7 @@ async function main() {
26152691
tokens: results.tokenTotals.total,
26162692
perfSummary: {
26172693
...(results.perfSummary || {}),
2618-
resource: results.resourceSamples?.length > 0 ? results.resourceSamples[results.resourceSamples.length - 1] : null,
2694+
resource: aggregateResourceSamples(results.resourceSamples),
26192695
},
26202696
});
26212697
fs.writeFileSync(indexFile, JSON.stringify(index, null, 2));

0 commit comments

Comments
 (0)