@@ -120,6 +120,66 @@ const vlmClient = VLM_URL ? new OpenAI({
120120 baseURL : `${ strip ( VLM_URL ) } /v1` ,
121121} ) : null ;
122122
123+ // ─── Model Family Capabilities Config ────────────────────────────────────────
124+ //
125+ // Different model families require different per-request params to control
126+ // thinking/reasoning behavior. This table centralizes those differences so
127+ // llmCall() can dispatch them automatically.
128+ //
129+ // Fields:
130+ // match — fn(modelName: string) → bool
131+ // apiParams — extra params merged into every chat/completions request
132+ // serverFlags — llama-server startup flags needed for full control
133+ // (documentation only — llmCall is a client and cannot set these)
134+ //
135+ // ┌─────────────────────┬──────────────────────────────┬──────────────────────────────────────────┐
136+ // │ Family │ Per-request param │ llama-server startup flag │
137+ // ├─────────────────────┼──────────────────────────────┼──────────────────────────────────────────┤
138+ // │ Mistral Small 4+ │ reasoning_effort: 'none' │ --reasoning-budget 0 │
139+ // │ Qwen3.5 (thinking) │ (none needed — handled by │ --chat-template-kwargs │
140+ // │ │ /no_think prompt suffix and │ '{"enable_thinking":false}' │
141+ // │ │ 500-token reasoning abort) │ │
142+ // │ GPT / Claude │ (none — cloud API, no local │ N/A │
143+ // │ │ thinking tokens) │ │
144+ // └─────────────────────┴──────────────────────────────┴──────────────────────────────────────────┘
145+ //
146+ // To add a new model family: append an entry to MODEL_FAMILIES.
147+ // The match fn receives the lower-cased model name/filename.
148+
const MODEL_FAMILIES = [
  {
    name: 'Mistral',
    // Covers: Mistral-Small-4, Mistral-*, Magistral-*, Mixtral-*
    match: (m) => m.includes('mistral') || m.includes('magistral') || m.includes('mixtral'),
    // reasoning_effort=none disables thinking and routes all output to delta.content.
    // Supported by both Mistral cloud API and llama-server (forwarded as chat template kwarg).
    // Without this Mistral routes ALL output to delta.thinking, causing 30s idle timeouts.
    apiParams: { reasoning_effort: 'none' },
    serverFlags: '--reasoning-budget 0',
  },
  // Qwen3.5 thinking is handled via prompt-level /no_think and the 500-token reasoning
  // abort in llmCall — no extra per-request params needed.
  // {
  //   name: 'Qwen3',
  //   match: (m) => m.includes('qwen') || m.includes('qwq'),
  //   apiParams: {}, // could add: { chat_template_kwargs: { enable_thinking: false } }
  //   serverFlags: "--chat-template-kwargs '{\"enable_thinking\":false}'",
  // },
];

/**
 * Look up the extra per-request API params for the given model name.
 *
 * Note: this returns the `apiParams` of the FIRST family whose `match`
 * predicate accepts the name (families are checked in declaration order);
 * params from multiple matching families are never merged together.
 *
 * @param {string|null|undefined} modelName - Model name/filename; matched
 *   case-insensitively against the family predicates.
 * @returns {object} Extra params to spread into the chat/completions request,
 *   or {} when modelName is falsy or matches no known family.
 */
function getModelApiParams(modelName) {
  if (!modelName) return {};
  const lower = modelName.toLowerCase();
  const family = MODEL_FAMILIES.find((f) => f.match(lower));
  return family?.apiParams || {};
}
182+
123183// ─── Skill Protocol: JSON lines on stdout, human text on stderr ──────────────
124184
125185/**
@@ -226,6 +286,10 @@ async function llmCall(messages, opts = {}) {
226286 // Sending max_tokens to thinking models (Qwen3.5) starves actual output since
227287 // reasoning_content counts against the limit.
228288
289+ // Lookup model-family-specific extra params (e.g. reasoning_effort for Mistral).
290+ // VLM calls skip the LLM family table — VLM models are always local llava-compatible.
291+ const modelFamilyParams = opts . vlm ? { } : getModelApiParams ( model || LLM_MODEL ) ;
292+
229293 // Build request params
230294 const params = {
231295 messages,
@@ -238,6 +302,9 @@ async function llmCall(messages, opts = {}) {
238302 ...( opts . expectJSON && opts . temperature === undefined && { temperature : 0.7 } ) ,
239303 ...( opts . expectJSON && { top_p : 0.8 } ) ,
240304 ...( opts . tools && { tools : opts . tools } ) ,
305+ // Model-family-specific params (e.g. reasoning_effort:'none' for Mistral).
306+ // These are merged last so they take precedence over defaults.
307+ ...modelFamilyParams ,
241308 } ;
242309
243310 // Use an AbortController with idle timeout that resets on each streamed chunk.
@@ -297,7 +364,11 @@ async function llmCall(messages, opts = {}) {
297364 const delta = chunk . choices ?. [ 0 ] ?. delta ;
298365 if ( delta ?. content ) content += delta . content ;
299366 if ( delta ?. reasoning_content ) reasoningContent += delta . reasoning_content ;
300- if ( delta ?. content || delta ?. reasoning_content ) {
367+ // Fallback: Mistral Small 4 in llama-server may route thinking tokens through
368+ // `delta.thinking` even when reasoning_effort=none is requested (llama.cpp
369+ // compatibility varies by version). Capture it so the idle timer resets.
370+ if ( delta ?. thinking ) reasoningContent += delta . thinking ;
371+ if ( delta ?. content || delta ?. reasoning_content || delta ?. thinking ) {
301372 tokenCount ++ ;
302373 // Capture TTFT on first content/reasoning token
303374 if ( ! firstTokenTime ) firstTokenTime = Date . now ( ) ;
@@ -2347,8 +2418,61 @@ async function main() {
23472418 emit ( { event : 'error' , message : `Cannot reach LLM endpoint: ${ err . message } ` } ) ;
23482419 process . exit ( IS_SKILL_MODE ? 0 : 1 ) ;
23492420 }
2421+ // ── Streaming sanity check ────────────────────────────────────────────────
2422+ // Fires a tiny streaming call to verify the model actually produces content.
2423+ // Catches the Mistral "token-loop" bug: server started with a Qwen-specific
2424+ // --chat-template-kwargs flag causes Mistral to emit only empty token ID 31
2425+ // on every chunk, giving 0 content tokens for every test.
2426+ //
2427+ // This check saves ~30 minutes of doomed benchmark runs by failing fast.
2428+ log ( '\n 🔍 Streaming sanity check (10 tokens)...' ) ;
2429+ try {
2430+ const warmupParams = {
2431+ ...( LLM_MODEL && { model : LLM_MODEL } ) ,
2432+ messages : [ { role : 'user' , content : 'Reply with just the word: hello' } ] ,
2433+ stream : true ,
2434+ max_tokens : 10 ,
2435+ ...getModelApiParams ( LLM_MODEL ) ,
2436+ } ;
2437+ const warmupStream = await llmClient . chat . completions . create ( warmupParams ) ;
2438+ let warmupContent = '' ;
2439+ let warmupChunks = 0 ;
2440+ const warmupController = new AbortController ( ) ;
2441+ const warmupTimeout = setTimeout ( ( ) => warmupController . abort ( ) , 15000 ) ;
2442+ try {
2443+ for await ( const chunk of warmupStream ) {
2444+ warmupChunks ++ ;
2445+ const d = chunk . choices ?. [ 0 ] ?. delta ;
2446+ if ( d ?. content ) warmupContent += d . content ;
2447+ if ( d ?. reasoning_content ) warmupContent += d . reasoning_content ;
2448+ if ( d ?. thinking ) warmupContent += d . thinking ;
2449+ if ( warmupChunks >= 30 ) break ; // enough chunks to decide
2450+ }
2451+ } finally {
2452+ clearTimeout ( warmupTimeout ) ;
2453+ }
2454+
2455+ if ( warmupContent . trim ( ) . length === 0 ) {
2456+ // Model produced chunks but zero content — server is in a bad state
2457+ const modelName = results . model . name || LLM_MODEL || 'current model' ;
2458+ log ( `\n ❌ STREAMING SANITY CHECK FAILED` ) ;
2459+ log ( ` The model (${ modelName } ) produced ${ warmupChunks } stream chunks but 0 content tokens.` ) ;
2460+ log ( ` This usually means the llama-server was started with an incompatible` ) ;
2461+ log ( ` --chat-template-kwargs flag (e.g. Qwen's enable_thinking:false applied to Mistral).` ) ;
2462+ log ( `\n ➡ Fix: Reload the model in Aegis-AI to restart the llama-server with` ) ;
2463+ log ( ` the correct flags for this model family.` ) ;
2464+ log ( ` Mistral requires: --reasoning-budget 0` ) ;
2465+ log ( ` Qwen requires: --chat-template-kwargs '{"enable_thinking":false}'\n` ) ;
2466+ emit ( { event : 'error' , message : `Streaming sanity failed: ${ warmupChunks } chunks, 0 content tokens. Reload the model in Aegis-AI to fix.` } ) ;
2467+ process . exit ( IS_SKILL_MODE ? 0 : 1 ) ;
2468+ }
2469+
2470+ log ( ` ✅ Streaming OK — ${ warmupContent . trim ( ) . split ( / \s + / ) . length } words, ${ warmupChunks } chunks` ) ;
2471+ } catch ( err ) {
2472+ // Non-fatal — if warmup errors, let the benchmark try; individual tests will surface the issue
2473+ log ( ` ⚠️ Streaming warmup error (non-fatal): ${ err . message } ` ) ;
2474+ }
23502475
2351- // Collect system info
23522476 results . system = collectSystemInfo ( ) ;
23532477 log ( ` System: ${ results . system . cpu } (${ results . system . cpuCores } cores)` ) ;
23542478 log ( ` Memory: ${ results . system . freeMemoryGB } GB free / ${ results . system . totalMemoryGB } GB total` ) ;
0 commit comments