@@ -174,6 +174,15 @@ const MODEL_FAMILIES = [
174174 } ,
175175 // Qwen3.5 thinking is handled via prompt-level /no_think and the 500-token reasoning
176176 // abort in llmCall — no extra per-request params needed.
177+ {
178+ name : 'GPT-OSS' ,
179+ // gpt-oss-20b uses <|channel|>analysis/final structure.
180+ // reasoning_effort=none hints the model to minimize analysis (injected into system prompt
181+ // by the chat template). The mlx-server OutputFilter suppresses analysis at token ID level.
182+ match : ( m ) => m . includes ( 'gpt-oss' ) ,
183+ apiParams : { reasoning_effort : 'none' } ,
184+ serverFlags : '--chat-template-kwargs {"reasoning_effort":"none"}' ,
185+ } ,
177186] ;
178187
179188/**
@@ -391,13 +400,15 @@ async function llmCall(messages, opts = {}) {
391400 // `delta.thinking` even when reasoning_effort=none is requested (llama.cpp
392401 // compatibility varies by version). Capture it so the idle timer resets.
393402 if ( delta ?. thinking ) reasoningContent += delta . thinking ;
394- if ( delta ?. content || delta ?. reasoning_content || delta ?. thinking ) {
403+ // mlx-lm Python server uses `delta.reasoning` instead of `delta.reasoning_content`
404+ if ( delta ?. reasoning ) reasoningContent += delta . reasoning ;
405+ if ( delta ?. content || delta ?. reasoning_content || delta ?. thinking || delta ?. reasoning ) {
395406 tokenCount ++ ;
396407 // Capture TTFT on first content/reasoning token
397408 if ( ! firstTokenTime ) firstTokenTime = Date . now ( ) ;
398409 // Buffer and log tokens — tag with field source
399410 const isContent = ! ! delta ?. content ;
400- const tok = delta ?. content || delta ?. reasoning_content || '' ;
411+ const tok = delta ?. content || delta ?. reasoning_content || delta ?. reasoning || '' ;
401412 // Tag first token of each field type
402413 if ( tokenCount === 1 ) tokenBuffer += isContent ? '[C] ' : '[R] ' ;
403414 tokenBuffer += tok ;
@@ -526,10 +537,19 @@ async function llmCall(messages, opts = {}) {
526537 if ( decodeTokensPerSec !== null ) results . perfTotals . decodeTokensPerSec . push ( decodeTokensPerSec ) ;
527538
528539 // Capture model name from first response
540+ // MLX server returns the full filesystem path as model name
541+ // e.g. /Users/simba/.aegis-ai/models/mlx_models/mlx-community/Qwen3.5-9B-8bit
542+ // Strip to just the org/model portion: mlx-community/Qwen3.5-9B-8bit
543+ const cleanName = ( n ) => {
544+ if ( ! n || ! n . includes ( '/' ) ) return n ;
545+ const parts = n . split ( '/' ) ;
546+ // If it looks like a filesystem path (>3 segments), keep last 2 (org/model)
547+ return parts . length > 3 ? parts . slice ( - 2 ) . join ( '/' ) : n ;
548+ } ;
529549 if ( opts . vlm ) {
530- if ( ! results . model . vlm && model ) results . model . vlm = model ;
550+ if ( ! results . model . vlm && model ) results . model . vlm = cleanName ( model ) ;
531551 } else {
532- if ( ! results . model . name && model ) results . model . name = model ;
552+ if ( ! results . model . name && model ) results . model . name = cleanName ( model ) ;
533553 }
534554
535555 return { content, toolCalls, usage : callTokens , perf : callPerf , model } ;
@@ -545,6 +565,11 @@ function stripThink(text) {
545565 // Strip Qwen3.5 'Thinking Process:' blocks (outputs plain text reasoning
546566 // instead of <think> tags when enable_thinking is active)
547567 cleaned = cleaned . replace ( / ^ T h i n k i n g P r o c e s s [: \s] * [ \s \S ] * ?(? = \n \s * [ { \[ ] | \n ` ` ` | $ ) / i, '' ) . trim ( ) ;
568+ // Strip gpt-oss <|channel|>...<|message|> routing tokens
569+ // e.g. "<|channel|>analysis<|message|>We need to decide..." → "We need to decide..."
570+ cleaned = cleaned . replace ( / ^ < \| c h a n n e l \| > [ ^ < ] * < \| m e s s a g e \| > / i, '' ) . trim ( ) ;
571+ // Strip any remaining <|...|> special tokens (end_turn, etc.)
572+ cleaned = cleaned . replace ( / < \| [ ^ | ] + \| > / g, '' ) . trim ( ) ;
548573 return cleaned ;
549574}
550575
@@ -555,24 +580,38 @@ function parseJSON(text) {
555580 if ( codeBlock ) {
556581 jsonStr = codeBlock [ 1 ] ;
557582 } else {
558- // Find first { or [ and extract balanced JSON
559- const startIdx = cleaned . search ( / [ { \[ ] / ) ;
560- if ( startIdx >= 0 ) {
583+ // Extract ALL balanced JSON objects/arrays, then pick the largest.
584+ // Some models (gpt-oss) emit an empty `{}` prefix before the real JSON.
585+ const candidates = [ ] ;
586+ let searchFrom = 0 ;
587+ while ( searchFrom < cleaned . length ) {
588+ const sub = cleaned . slice ( searchFrom ) ;
589+ const startOff = sub . search ( / [ { \[ ] / ) ;
590+ if ( startOff < 0 ) break ;
591+ const startIdx = searchFrom + startOff ;
561592 const opener = cleaned [ startIdx ] ;
562593 const closer = opener === '{' ? '}' : ']' ;
563- let depth = 0 ;
564- let inString = false ;
565- let escape = false ;
594+ let depth = 0 , inString = false , escape = false , endIdx = - 1 ;
566595 for ( let i = startIdx ; i < cleaned . length ; i ++ ) {
567596 const ch = cleaned [ i ] ;
568597 if ( escape ) { escape = false ; continue ; }
569598 if ( ch === '\\' && inString ) { escape = true ; continue ; }
570599 if ( ch === '"' ) { inString = ! inString ; continue ; }
571600 if ( ! inString ) {
572601 if ( ch === opener ) depth ++ ;
573- else if ( ch === closer ) { depth -- ; if ( depth === 0 ) { jsonStr = cleaned . slice ( startIdx , i + 1 ) ; break ; } }
602+ else if ( ch === closer ) { depth -- ; if ( depth === 0 ) { endIdx = i ; break ; } }
574603 }
575604 }
605+ if ( endIdx >= 0 ) {
606+ candidates . push ( cleaned . slice ( startIdx , endIdx + 1 ) ) ;
607+ searchFrom = endIdx + 1 ;
608+ } else {
609+ break ;
610+ }
611+ }
612+ // Prefer the longest candidate (most likely the real response)
613+ if ( candidates . length > 0 ) {
614+ jsonStr = candidates . reduce ( ( a , b ) => a . length >= b . length ? a : b ) ;
576615 }
577616 }
578617 // Clean common local model artifacts before parsing:
@@ -592,7 +631,12 @@ function parseJSON(text) {
592631 . replace ( / " p l a c e h o l d e r " ( \s * " p l a c e h o l d e r " ) * / g, '"placeholder"' ) // collapse repeated placeholders
593632 . replace ( / \b p l a c e h o l d e r \b / g, '""' ) // placeholder → empty string
594633 . replace ( / , \s * ( [ } \] ] ) / g, '$1' ) ; // re-clean trailing commas
595- return JSON . parse ( aggressive . trim ( ) ) ;
634+ try {
635+ return JSON . parse ( aggressive . trim ( ) ) ;
636+ } catch ( secondErr ) {
637+ // Include raw content in error for diagnostics
638+ throw new Error ( `${ secondErr . message } | raw(120): "${ ( text || '' ) . slice ( 0 , 120 ) } "` ) ;
639+ }
596640 }
597641}
598642
@@ -646,6 +690,38 @@ function sampleResourceMetrics() {
646690 return sample ;
647691}
648692
/**
 * Summarize a series of resource samples into one representative snapshot.
 *
 * Point-in-time polling frequently lands between inference bursts, so the
 * summary reports the PEAK GPU utilization seen across the run and the
 * high-water mark for GPU memory, while non-GPU fields (e.g. system memory)
 * come from the most recent sample.
 *
 * @param {Array<Object>|null} samples - chronological resource samples; each may carry a `gpu` object
 * @returns {Object|null} aggregated sample, the last raw sample when no GPU data exists, or null for empty input
 */
function aggregateResourceSamples(samples) {
  if (!samples || samples.length === 0) return null;

  const withGpu = samples.filter((s) => s.gpu);
  const newest = samples[samples.length - 1];

  // Without any GPU readings, the freshest sample (system memory etc.) is
  // the best summary we can offer.
  if (withGpu.length === 0) return newest;

  // Single pass: track the sample with the highest utilization and the one
  // with the largest memory footprint (strict > keeps the earliest on ties,
  // matching first-wins semantics).
  let peakUtilSample = withGpu[0];
  let peakMemSample = withGpu[0];
  for (const s of withGpu) {
    if (s.gpu.util > (peakUtilSample.gpu.util ?? -1)) peakUtilSample = s;
    if ((s.gpu.memUsedGB || 0) > (peakMemSample.gpu.memUsedGB || 0)) peakMemSample = s;
  }

  // Base everything on the newest sample, then overlay the GPU peaks:
  // util/renderer/tiler from the busiest moment, memory from the high-water mark.
  return {
    ...newest,
    gpu: {
      util: peakUtilSample.gpu.util,
      renderer: peakUtilSample.gpu.renderer,
      tiler: peakUtilSample.gpu.tiler,
      memUsedGB: peakMemSample.gpu.memUsedGB,
      memAllocGB: peakMemSample.gpu.memAllocGB,
    },
  };
}
724+
649725// ─── Live progress: intermediate saves + report regeneration ────────────────
650726let _liveReportOpened = false ;
651727let _runStartedAt = null ; // Set when runSuites() begins
@@ -697,7 +773,7 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
697773 prefillTokensPerSec : results . perfTotals . prefillTokensPerSec ,
698774 decodeTokensPerSec : results . perfTotals . serverDecodeTokensPerSec ,
699775 } ,
700- resource : results . resourceSamples . length > 0 ? results . resourceSamples [ results . resourceSamples . length - 1 ] : null ,
776+ resource : aggregateResourceSamples ( results . resourceSamples ) ,
701777 } : null ;
702778
703779 // Preserve previous runs in index for comparison sidebar
@@ -2454,7 +2530,7 @@ async function main() {
24542530 ...( LLM_MODEL && { model : LLM_MODEL } ) ,
24552531 messages : [ { role : 'user' , content : 'Reply with just the word: hello' } ] ,
24562532 stream : true ,
2457- max_tokens : 10 ,
2533+ max_tokens : 200 , // models with thinking/analysis phases need >10 tokens to reach final output
24582534 ...getModelApiParams ( LLM_MODEL ) ,
24592535 } ;
24602536 const warmupStream = await llmClient . chat . completions . create ( warmupParams ) ;
@@ -2615,7 +2691,7 @@ async function main() {
26152691 tokens : results . tokenTotals . total ,
26162692 perfSummary : {
26172693 ...( results . perfSummary || { } ) ,
2618- resource : results . resourceSamples ?. length > 0 ? results . resourceSamples [ results . resourceSamples . length - 1 ] : null ,
2694+ resource : aggregateResourceSamples ( results . resourceSamples ) ,
26192695 } ,
26202696 } ) ;
26212697 fs . writeFileSync ( indexFile , JSON . stringify ( index , null , 2 ) ) ;
0 commit comments