@@ -120,6 +120,66 @@ const vlmClient = VLM_URL ? new OpenAI({
120120 baseURL : `${ strip ( VLM_URL ) } /v1` ,
121121} ) : null ;
122122
123+ // ─── Model Family Capabilities Config ────────────────────────────────────────
124+ //
125+ // Different model families require different per-request params to control
126+ // thinking/reasoning behavior. This table centralizes those differences so
127+ // llmCall() can dispatch them automatically.
128+ //
129+ // Fields:
130+ // match — fn(modelName: string) → bool
131+ // apiParams — extra params merged into every chat/completions request
132+ // serverFlags — llama-server startup flags needed for full control
133+ // (documentation only — llmCall is a client and cannot set these)
134+ //
135+ // ┌─────────────────────┬──────────────────────────────┬──────────────────────────────────────────┐
136+ // │ Family │ Per-request param │ llama-server startup flag │
137+ // ├─────────────────────┼──────────────────────────────┼──────────────────────────────────────────┤
138+ // │ Mistral Small 4+ │ reasoning_effort: 'none' │ --reasoning-budget 0 │
139+ // │ Qwen3.5 (thinking) │ (none needed — handled by │ --chat-template-kwargs │
140+ // │ │ /no_think prompt suffix and │ '{"enable_thinking":false}' │
141+ // │ │ 500-token reasoning abort) │ │
142+ // │ GPT / Claude │ (none — cloud API, no local │ N/A │
143+ // │ │ thinking tokens) │ │
144+ // └─────────────────────┴──────────────────────────────┴──────────────────────────────────────────┘
145+ //
146+ // To add a new model family: append an entry to MODEL_FAMILIES.
147+ // The match fn receives the lower-cased model name/filename.
148+
const MODEL_FAMILIES = [
  {
    name: 'Mistral',
    // Covers: Mistral-Small-4, Mistral-*, Magistral-*, Mixtral-*
    match: (m) => m.includes('mistral') || m.includes('magistral') || m.includes('mixtral'),
    // reasoning_effort=none disables thinking and routes all output to delta.content.
    // Supported by both Mistral cloud API and llama-server (forwarded as chat template kwarg).
    // Without this Mistral routes ALL output to delta.thinking, causing 30s idle timeouts.
    apiParams: { reasoning_effort: 'none' },
    serverFlags: '--reasoning-budget 0',
  },
  // Qwen3.5 thinking is handled via prompt-level /no_think and the 500-token reasoning
  // abort in llmCall — no extra per-request params needed.
  // {
  //   name: 'Qwen3',
  //   match: (m) => m.includes('qwen') || m.includes('qwq'),
  //   apiParams: {}, // could add: { chat_template_kwargs: { enable_thinking: false } }
  //   serverFlags: "--chat-template-kwargs '{\"enable_thinking\":false}'",
  // },
];

/**
 * Look up the extra per-request API params for the given model name.
 *
 * Note: this returns the `apiParams` of the FIRST family whose `match`
 * predicate accepts the name (families are checked in declaration order);
 * params from multiple matching families are never merged together.
 *
 * @param {string|null|undefined} modelName - Model name/filename; matched
 *   case-insensitively against the family predicates.
 * @returns {object} Extra params to spread into the chat/completions request,
 *   or {} when modelName is falsy or matches no known family.
 */
function getModelApiParams(modelName) {
  if (!modelName) return {};
  const lower = modelName.toLowerCase();
  const family = MODEL_FAMILIES.find((f) => f.match(lower));
  return family?.apiParams || {};
}
182+
123183// ─── Skill Protocol: JSON lines on stdout, human text on stderr ──────────────
124184
125185/**
@@ -226,6 +286,10 @@ async function llmCall(messages, opts = {}) {
226286 // Sending max_tokens to thinking models (Qwen3.5) starves actual output since
227287 // reasoning_content counts against the limit.
228288
289+ // Lookup model-family-specific extra params (e.g. reasoning_effort for Mistral).
290+ // VLM calls skip the LLM family table — VLM models are always local llava-compatible.
291+ const modelFamilyParams = opts . vlm ? { } : getModelApiParams ( model || LLM_MODEL ) ;
292+
229293 // Build request params
230294 const params = {
231295 messages,
@@ -238,6 +302,9 @@ async function llmCall(messages, opts = {}) {
238302 ...( opts . expectJSON && opts . temperature === undefined && { temperature : 0.7 } ) ,
239303 ...( opts . expectJSON && { top_p : 0.8 } ) ,
240304 ...( opts . tools && { tools : opts . tools } ) ,
305+ // Model-family-specific params (e.g. reasoning_effort:'none' for Mistral).
306+ // These are merged last so they take precedence over defaults.
307+ ...modelFamilyParams ,
241308 } ;
242309
243310 // Use an AbortController with idle timeout that resets on each streamed chunk.
@@ -297,7 +364,11 @@ async function llmCall(messages, opts = {}) {
297364 const delta = chunk . choices ?. [ 0 ] ?. delta ;
298365 if ( delta ?. content ) content += delta . content ;
299366 if ( delta ?. reasoning_content ) reasoningContent += delta . reasoning_content ;
300- if ( delta ?. content || delta ?. reasoning_content ) {
367+ // Fallback: Mistral Small 4 in llama-server may route thinking tokens through
368+ // `delta.thinking` even when reasoning_effort=none is requested (llama.cpp
369+ // compatibility varies by version). Capture it so the idle timer resets.
370+ if ( delta ?. thinking ) reasoningContent += delta . thinking ;
371+ if ( delta ?. content || delta ?. reasoning_content || delta ?. thinking ) {
301372 tokenCount ++ ;
302373 // Capture TTFT on first content/reasoning token
303374 if ( ! firstTokenTime ) firstTokenTime = Date . now ( ) ;
@@ -2347,8 +2418,61 @@ async function main() {
23472418 emit ( { event : 'error' , message : `Cannot reach LLM endpoint: ${ err . message } ` } ) ;
23482419 process . exit ( IS_SKILL_MODE ? 0 : 1 ) ;
23492420 }
2421+ // ── Streaming sanity check ────────────────────────────────────────────────
2422+ // Fires a tiny streaming call to verify the model actually produces content.
2423+ // Catches the Mistral "token-loop" bug: server started with a Qwen-specific
2424+ // --chat-template-kwargs flag causes Mistral to emit only empty token ID 31
2425+ // on every chunk, giving 0 content tokens for every test.
2426+ //
2427+ // This check saves ~30 minutes of doomed benchmark runs by failing fast.
2428+ log ( '\n 🔍 Streaming sanity check (10 tokens)...' ) ;
2429+ try {
2430+ const warmupParams = {
2431+ ...( LLM_MODEL && { model : LLM_MODEL } ) ,
2432+ messages : [ { role : 'user' , content : 'Reply with just the word: hello' } ] ,
2433+ stream : true ,
2434+ max_tokens : 10 ,
2435+ ...getModelApiParams ( LLM_MODEL ) ,
2436+ } ;
2437+ const warmupStream = await llmClient . chat . completions . create ( warmupParams ) ;
2438+ let warmupContent = '' ;
2439+ let warmupChunks = 0 ;
2440+ const warmupController = new AbortController ( ) ;
2441+ const warmupTimeout = setTimeout ( ( ) => warmupController . abort ( ) , 15000 ) ;
2442+ try {
2443+ for await ( const chunk of warmupStream ) {
2444+ warmupChunks ++ ;
2445+ const d = chunk . choices ?. [ 0 ] ?. delta ;
2446+ if ( d ?. content ) warmupContent += d . content ;
2447+ if ( d ?. reasoning_content ) warmupContent += d . reasoning_content ;
2448+ if ( d ?. thinking ) warmupContent += d . thinking ;
2449+ if ( warmupChunks >= 30 ) break ; // enough chunks to decide
2450+ }
2451+ } finally {
2452+ clearTimeout ( warmupTimeout ) ;
2453+ }
2454+
2455+ if ( warmupContent . trim ( ) . length === 0 ) {
2456+ // Model produced chunks but zero content — server is in a bad state
2457+ const modelName = results . model . name || LLM_MODEL || 'current model' ;
2458+ log ( `\n ❌ STREAMING SANITY CHECK FAILED` ) ;
2459+ log ( ` The model (${ modelName } ) produced ${ warmupChunks } stream chunks but 0 content tokens.` ) ;
2460+ log ( ` This usually means the llama-server was started with an incompatible` ) ;
2461+ log ( ` --chat-template-kwargs flag (e.g. Qwen's enable_thinking:false applied to Mistral).` ) ;
2462+ log ( `\n ➡ Fix: Reload the model in Aegis-AI to restart the llama-server with` ) ;
2463+ log ( ` the correct flags for this model family.` ) ;
2464+ log ( ` Mistral requires: --reasoning-budget 0` ) ;
2465+ log ( ` Qwen requires: --chat-template-kwargs '{"enable_thinking":false}'\n` ) ;
2466+ emit ( { event : 'error' , message : `Streaming sanity failed: ${ warmupChunks } chunks, 0 content tokens. Reload the model in Aegis-AI to fix.` } ) ;
2467+ process . exit ( IS_SKILL_MODE ? 0 : 1 ) ;
2468+ }
2469+
2470+ log ( ` ✅ Streaming OK — ${ warmupContent . trim ( ) . split ( / \s + / ) . length } words, ${ warmupChunks } chunks` ) ;
2471+ } catch ( err ) {
2472+ // Non-fatal — if warmup errors, let the benchmark try; individual tests will surface the issue
2473+ log ( ` ⚠️ Streaming warmup error (non-fatal): ${ err . message } ` ) ;
2474+ }
23502475
2351- // Collect system info
23522476 results . system = collectSystemInfo ( ) ;
23532477 log ( ` System: ${ results . system . cpu } (${ results . system . cpuCores } cores)` ) ;
23542478 log ( ` Memory: ${ results . system . freeMemoryGB } GB free / ${ results . system . totalMemoryGB } GB total` ) ;
0 commit comments