Skip to content

Commit fa07745

Browse files
committed
fix(benchmark): model-family-aware thinking suppression + streaming sanity check
- Add MODEL_FAMILIES config table with per-model API params and server flags
- Add getModelApiParams() helper to inject reasoning_effort:none for Mistral
- Add delta.thinking fallback in streaming loop to capture thinking tokens
- Add streaming sanity check before benchmark run (detects empty-token loops)
- Add test-model-config.cjs with 17 unit tests for model detection logic
1 parent c7e9ddd commit fa07745

File tree

2 files changed

+296
-2
lines changed

2 files changed

+296
-2
lines changed

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs

Lines changed: 126 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,66 @@ const vlmClient = VLM_URL ? new OpenAI({
120120
baseURL: `${strip(VLM_URL)}/v1`,
121121
}) : null;
122122

123+
// ─── Model Family Capabilities Config ────────────────────────────────────────
//
// Different model families require different per-request params to control
// thinking/reasoning behavior. This table centralizes those differences so
// llmCall() can dispatch them automatically.
//
// Fields:
//   match       — fn(modelName: string) → bool (receives the lower-cased
//                 model name/filename)
//   apiParams   — extra params merged into every chat/completions request
//   serverFlags — llama-server startup flags needed for full control
//                 (documentation only — llmCall is a client and cannot set these)
//
// ┌─────────────────────┬──────────────────────────────┬──────────────────────────────────────────┐
// │ Family              │ Per-request param            │ llama-server startup flag                │
// ├─────────────────────┼──────────────────────────────┼──────────────────────────────────────────┤
// │ Mistral Small 4+    │ reasoning_effort: 'none'     │ --reasoning-budget 0                     │
// │ Qwen3.5 (thinking)  │ (none needed — handled by    │ --chat-template-kwargs                   │
// │                     │ /no_think prompt suffix and  │   '{"enable_thinking":false}'            │
// │                     │ 500-token reasoning abort)   │                                          │
// │ GPT / Claude        │ (none — cloud API, no local  │ N/A                                      │
// │                     │ thinking tokens)             │                                          │
// └─────────────────────┴──────────────────────────────┴──────────────────────────────────────────┘
//
// To add a new model family: append an entry to MODEL_FAMILIES.

const MODEL_FAMILIES = [
  {
    name: 'Mistral',
    // Covers: Mistral-Small-4, Mistral-*, Magistral-*, Mixtral-*
    match: (m) => m.includes('mistral') || m.includes('magistral') || m.includes('mixtral'),
    // reasoning_effort=none disables thinking and routes all output to delta.content.
    // Supported by both Mistral cloud API and llama-server (forwarded as chat template kwarg).
    // Without this Mistral routes ALL output to delta.thinking, causing 30s idle timeouts.
    apiParams: { reasoning_effort: 'none' },
    serverFlags: '--reasoning-budget 0',
  },
  // Qwen3.5 thinking is handled via prompt-level /no_think and the 500-token reasoning
  // abort in llmCall — no extra per-request params needed.
  // {
  //   name: 'Qwen3',
  //   match: (m) => m.includes('qwen') || m.includes('qwq'),
  //   apiParams: {}, // could add: { chat_template_kwargs: { enable_thinking: false } }
  //   serverFlags: "--chat-template-kwargs '{\"enable_thinking\":false}'",
  // },
];

/**
 * Return the extra API params for the given model name.
 *
 * @param {string|undefined|null} modelName - Model name or GGUF filename;
 *   matching is case-insensitive. Falsy values yield {}.
 * @returns {object} A fresh shallow copy of the matching family's apiParams,
 *   or {} if the model is not in any known family. A copy is returned so a
 *   caller that mutates the result (e.g. tagging the params object) cannot
 *   corrupt the shared MODEL_FAMILIES table for subsequent calls.
 */
function getModelApiParams(modelName) {
  if (!modelName) return {};
  const lower = modelName.toLowerCase();
  const family = MODEL_FAMILIES.find((f) => f.match(lower));
  // Shallow copy is sufficient: apiParams objects are flat key/value maps.
  return family ? { ...(family.apiParams || {}) } : {};
}
182+
123183
// ─── Skill Protocol: JSON lines on stdout, human text on stderr ──────────────
124184

125185
/**
@@ -226,6 +286,10 @@ async function llmCall(messages, opts = {}) {
226286
// Sending max_tokens to thinking models (Qwen3.5) starves actual output since
227287
// reasoning_content counts against the limit.
228288

289+
// Lookup model-family-specific extra params (e.g. reasoning_effort for Mistral).
290+
// VLM calls skip the LLM family table — VLM models are always local llava-compatible.
291+
const modelFamilyParams = opts.vlm ? {} : getModelApiParams(model || LLM_MODEL);
292+
229293
// Build request params
230294
const params = {
231295
messages,
@@ -238,6 +302,9 @@ async function llmCall(messages, opts = {}) {
238302
...(opts.expectJSON && opts.temperature === undefined && { temperature: 0.7 }),
239303
...(opts.expectJSON && { top_p: 0.8 }),
240304
...(opts.tools && { tools: opts.tools }),
305+
// Model-family-specific params (e.g. reasoning_effort:'none' for Mistral).
306+
// These are merged last so they take precedence over defaults.
307+
...modelFamilyParams,
241308
};
242309

243310
// Use an AbortController with idle timeout that resets on each streamed chunk.
@@ -297,7 +364,11 @@ async function llmCall(messages, opts = {}) {
297364
const delta = chunk.choices?.[0]?.delta;
298365
if (delta?.content) content += delta.content;
299366
if (delta?.reasoning_content) reasoningContent += delta.reasoning_content;
300-
if (delta?.content || delta?.reasoning_content) {
367+
// Fallback: Mistral Small 4 in llama-server may route thinking tokens through
368+
// `delta.thinking` even when reasoning_effort=none is requested (llama.cpp
369+
// compatibility varies by version). Capture it so the idle timer resets.
370+
if (delta?.thinking) reasoningContent += delta.thinking;
371+
if (delta?.content || delta?.reasoning_content || delta?.thinking) {
301372
tokenCount++;
302373
// Capture TTFT on first content/reasoning token
303374
if (!firstTokenTime) firstTokenTime = Date.now();
@@ -2347,8 +2418,61 @@ async function main() {
23472418
emit({ event: 'error', message: `Cannot reach LLM endpoint: ${err.message}` });
23482419
process.exit(IS_SKILL_MODE ? 0 : 1);
23492420
}
2421+
// ── Streaming sanity check ────────────────────────────────────────────────
2422+
// Fires a tiny streaming call to verify the model actually produces content.
2423+
// Catches the Mistral "token-loop" bug: server started with a Qwen-specific
2424+
// --chat-template-kwargs flag causes Mistral to emit only empty token ID 31
2425+
// on every chunk, giving 0 content tokens for every test.
2426+
//
2427+
// This check saves ~30 minutes of doomed benchmark runs by failing fast.
2428+
log('\n 🔍 Streaming sanity check (10 tokens)...');
2429+
try {
2430+
const warmupParams = {
2431+
...(LLM_MODEL && { model: LLM_MODEL }),
2432+
messages: [{ role: 'user', content: 'Reply with just the word: hello' }],
2433+
stream: true,
2434+
max_tokens: 10,
2435+
...getModelApiParams(LLM_MODEL),
2436+
};
2437+
const warmupStream = await llmClient.chat.completions.create(warmupParams);
2438+
let warmupContent = '';
2439+
let warmupChunks = 0;
2440+
const warmupController = new AbortController();
2441+
const warmupTimeout = setTimeout(() => warmupController.abort(), 15000);
2442+
try {
2443+
for await (const chunk of warmupStream) {
2444+
warmupChunks++;
2445+
const d = chunk.choices?.[0]?.delta;
2446+
if (d?.content) warmupContent += d.content;
2447+
if (d?.reasoning_content) warmupContent += d.reasoning_content;
2448+
if (d?.thinking) warmupContent += d.thinking;
2449+
if (warmupChunks >= 30) break; // enough chunks to decide
2450+
}
2451+
} finally {
2452+
clearTimeout(warmupTimeout);
2453+
}
2454+
2455+
if (warmupContent.trim().length === 0) {
2456+
// Model produced chunks but zero content — server is in a bad state
2457+
const modelName = results.model.name || LLM_MODEL || 'current model';
2458+
log(`\n ❌ STREAMING SANITY CHECK FAILED`);
2459+
log(` The model (${modelName}) produced ${warmupChunks} stream chunks but 0 content tokens.`);
2460+
log(` This usually means the llama-server was started with an incompatible`);
2461+
log(` --chat-template-kwargs flag (e.g. Qwen's enable_thinking:false applied to Mistral).`);
2462+
log(`\n ➡ Fix: Reload the model in Aegis-AI to restart the llama-server with`);
2463+
log(` the correct flags for this model family.`);
2464+
log(` Mistral requires: --reasoning-budget 0`);
2465+
log(` Qwen requires: --chat-template-kwargs '{"enable_thinking":false}'\n`);
2466+
emit({ event: 'error', message: `Streaming sanity failed: ${warmupChunks} chunks, 0 content tokens. Reload the model in Aegis-AI to fix.` });
2467+
process.exit(IS_SKILL_MODE ? 0 : 1);
2468+
}
2469+
2470+
log(` ✅ Streaming OK — ${warmupContent.trim().split(/\s+/).length} words, ${warmupChunks} chunks`);
2471+
} catch (err) {
2472+
// Non-fatal — if warmup errors, let the benchmark try; individual tests will surface the issue
2473+
log(` ⚠️ Streaming warmup error (non-fatal): ${err.message}`);
2474+
}
23502475

2351-
// Collect system info
23522476
results.system = collectSystemInfo();
23532477
log(` System: ${results.system.cpu} (${results.system.cpuCores} cores)`);
23542478
log(` Memory: ${results.system.freeMemoryGB}GB free / ${results.system.totalMemoryGB}GB total`);
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
#!/usr/bin/env node
2+
/**
3+
* Unit tests for MODEL_FAMILIES / getModelApiParams logic.
4+
*
5+
* Tests the model-family detection and per-request param injection
6+
* without needing a running LLM server.
7+
*
8+
* Usage:
9+
* node scripts/test-model-config.cjs
10+
*/
11+
12+
// ── Inline the config under test ─────────────────────────────────────────────
13+
// (Kept in sync with run-benchmark.cjs MODEL_FAMILIES section)
14+
15+
// Mirror of the production table: only Mistral-family models need extra
// per-request params to suppress thinking.
const MODEL_FAMILIES = [
  {
    name: 'Mistral',
    match: (m) => m.includes('mistral') || m.includes('magistral') || m.includes('mixtral'),
    apiParams: { reasoning_effort: 'none' },
    serverFlags: '--reasoning-budget 0',
  },
  // Qwen3.5: no extra per-request params needed (handled by prompt + abort logic)
];

// Look up the extra chat/completions params for a model name or GGUF filename.
// Matching is case-insensitive; unknown, empty, or missing names yield {}.
function getModelApiParams(modelName) {
  if (!modelName) return {};
  const needle = modelName.toLowerCase();
  const hit = MODEL_FAMILIES.find((family) => family.match(needle));
  return (hit && hit.apiParams) || {};
}
33+
34+
// ── Mirror the server-manager detection ──────────────────────────────────────
// Decide which llama-server startup flag a model file needs, based solely on
// its lower-cased path: Mistral-family models get a zero reasoning budget;
// everything else gets the Qwen-style enable_thinking:false template kwarg.
function getServerFlags(modelFilePath) {
  const path = modelFilePath.toLowerCase();
  const mistralMarkers = ['mistral', 'magistral', 'mixtral'];
  if (mistralMarkers.some((marker) => path.includes(marker))) {
    return { flag: '--reasoning-budget', value: '0' };
  }
  return { flag: '--chat-template-kwargs', value: '{"enable_thinking":false}' };
}
44+
45+
// ── Test harness ─────────────────────────────────────────────────────────────

let passed = 0;
let failed = 0;

// Run one named test case. The callback signals failure by throwing; each
// case prints a single ✅/❌ line and is tallied into the counters above.
function test(name, fn) {
  try {
    fn();
  } catch (err) {
    console.log(`  ❌ ${name}: ${err.message}`);
    failed++;
    return;
  }
  console.log(`  ✅ ${name}`);
  passed++;
}
60+
61+
// Minimal truthiness assertion: throws `msg` (or a default) when falsy.
function assert(condition, msg) {
  if (condition) return;
  throw new Error(msg || 'Assertion failed');
}

// Structural equality via JSON serialization — sufficient for the flat
// param objects under test. The error message shows both serialized values.
function assertDeepEqual(a, b, msg) {
  const actual = JSON.stringify(a);
  const expected = JSON.stringify(b);
  if (actual !== expected) {
    throw new Error(`${msg || 'Not equal'}: got ${actual}, expected ${expected}`);
  }
}
69+
70+
// ── Tests ────────────────────────────────────────────────────────────────────

console.log('\n=== MODEL_FAMILIES / getModelApiParams ===\n');

// Mistral-family names (GGUF filenames and cloud API IDs) must all map to
// { reasoning_effort: 'none' }. Table-driven: [test label, model name].
const MISTRAL_CASES = [
  ['Mistral-Small-4-119B GGUF filename → reasoning_effort:none', 'Mistral-Small-4-119B-2603-UD-IQ1_M.gguf'],
  ['Mistral-Small-4 Q2_K_XL variant → reasoning_effort:none', 'Mistral-Small-4-119B-2603-UD-Q2_K_XL.gguf'],
  ['Magistral model → reasoning_effort:none', 'magistral-medium-2506.gguf'],
  ['Mixtral-8x7B → reasoning_effort:none', 'Mixtral-8x7B-Instruct-v0.1.Q4_K_M.gguf'],
  ['Mistral cloud API model ID → reasoning_effort:none', 'mistral-small-latest'],
];
for (const [label, model] of MISTRAL_CASES) {
  test(label, () => {
    assertDeepEqual(getModelApiParams(model), { reasoning_effort: 'none' });
  });
}

// Everything else — Qwen, Nemotron, LFM2, cloud GPT IDs, and blank/missing
// names — must receive no extra params.
const NO_PARAM_CASES = [
  ['Qwen3.5-9B → no extra params (handled by prompt)', 'Qwen3.5-9B-Q4_K_M.gguf'],
  ['Qwen3.5-27B → no extra params', 'Qwen3.5-27B-UD-Q8_K_XL.gguf'],
  ['NVIDIA Nemotron-30B → no extra params', 'NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf'],
  ['LFM2-24B → no extra params', 'LFM2-24B-A2B-Q8_0.gguf'],
  ['GPT-5.4 → no extra params', 'gpt-5.4-2026-03-05'],
  ['Empty model name → no extra params', ''],
  ['Undefined model name → no extra params', undefined],
];
for (const [label, model] of NO_PARAM_CASES) {
  test(label, () => {
    assertDeepEqual(getModelApiParams(model), {});
  });
}

// ── Server-manager startup flags (mirrors llm-server-manager.cjs logic) ──────

console.log('\n=== Server-manager startup flags ===\n');

test('Mistral GGUF path → --reasoning-budget 0', () => {
  const f = getServerFlags('/Users/simba/.aegis-ai/models/Mistral-Small-4-119B-2603-UD-IQ1_M.gguf');
  assert(f.flag === '--reasoning-budget' && f.value === '0',
    `Expected --reasoning-budget 0, got ${f.flag} ${f.value}`);
});

test('Magistral path → --reasoning-budget 0', () => {
  const f = getServerFlags('/models/magistral-medium.gguf');
  assert(f.flag === '--reasoning-budget' && f.value === '0');
});

test('Qwen path → --chat-template-kwargs enable_thinking:false', () => {
  const f = getServerFlags('/models/Qwen3.5-9B-Q4_K_M.gguf');
  assert(f.flag === '--chat-template-kwargs');
  assert(f.value.includes('enable_thinking'));
  assert(f.value.includes('false'));
});

test('Nemotron path → --chat-template-kwargs enable_thinking:false', () => {
  const f = getServerFlags('/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf');
  assert(f.flag === '--chat-template-kwargs');
});

test('LFM2 path → --chat-template-kwargs enable_thinking:false', () => {
  const f = getServerFlags('/models/LFM2-24B-A2B-Q8_0.gguf');
  assert(f.flag === '--chat-template-kwargs');
});

// ── Summary ──────────────────────────────────────────────────────────────────

console.log(`\n${passed + failed} tests: ${passed} passed, ${failed} failed\n`);
process.exit(failed > 0 ? 1 : 0);

0 commit comments

Comments
 (0)