Skip to content

Commit e4fc076

Browse files
committed
fix(benchmark): temperature clamping for Nemotron and LFM2
Add Nemotron and LFM2 model families to MODEL_FAMILIES with minTemperature: 1.0 — these models reject temperature < 1.0 with HTTP 400. The benchmark now clamps temperature to the family minimum before sending the request.
- Refactor getModelApiParams → getModelFamily (returns full config)
- Add resolveTemperature logic in llmCall params builder
- Update test-model-config.cjs: 27 tests including temperature clamp
- Fix Mistral serverFlags to match current llm-server-manager.cjs
1 parent f99d28b commit e4fc076

File tree

2 files changed

+160
-70
lines changed

2 files changed

+160
-70
lines changed

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -155,31 +155,45 @@ const MODEL_FAMILIES = [
155155
// Supported by both Mistral cloud API and llama-server (forwarded as chat template kwarg).
156156
// Without this Mistral routes ALL output to delta.thinking, causing 30s idle timeouts.
157157
apiParams: { reasoning_effort: 'none' },
158-
serverFlags: '--reasoning-budget 0',
158+
serverFlags: '--chat-template-kwargs {"reasoning_effort":"none"} --parallel 1',
159+
},
160+
{
161+
name: 'Nemotron',
162+
// NVIDIA Nemotron-3-Nano (4B, 30B) — rejects temperature < 1.0 with HTTP 400:
163+
// "Unsupported value: 'temperature' does not support 0.1 with this model"
164+
match: (m) => m.includes('nemotron'),
165+
apiParams: {},
166+
minTemperature: 1.0,
167+
},
168+
{
169+
name: 'LFM',
170+
// Liquid LFM2 / LFM2.5 — same temperature restriction as Nemotron
171+
match: (m) => m.includes('lfm'),
172+
apiParams: {},
173+
minTemperature: 1.0,
159174
},
160175
// Qwen3.5 thinking is handled via prompt-level /no_think and the 500-token reasoning
161176
// abort in llmCall — no extra per-request params needed.
162-
// {
163-
// name: 'Qwen3',
164-
// match: (m) => m.includes('qwen') || m.includes('qwq'),
165-
// apiParams: {}, // could add: { chat_template_kwargs: { enable_thinking: false } }
166-
// serverFlags: "--chat-template-kwargs '{\"enable_thinking\":false}'",
167-
// },
168177
];
169178

170179
/**
171-
* Return the merged extra API params for the given model name.
180+
* Return the matched MODEL_FAMILIES entry for the given model name.
172181
* Returns {} if the model is not in any known family.
173182
*/
174-
function getModelApiParams(modelName) {
183+
function getModelFamily(modelName) {
175184
if (!modelName) return {};
176185
const lower = modelName.toLowerCase();
177186
for (const family of MODEL_FAMILIES) {
178-
if (family.match(lower)) return family.apiParams || {};
187+
if (family.match(lower)) return family;
179188
}
180189
return {};
181190
}
182191

192+
/** Return extra API params for the model (e.g. reasoning_effort for Mistral). */
193+
function getModelApiParams(modelName) {
194+
return getModelFamily(modelName).apiParams || {};
195+
}
196+
183197
// ─── Skill Protocol: JSON lines on stdout, human text on stderr ──────────────
184198

185199
/**
@@ -286,9 +300,19 @@ async function llmCall(messages, opts = {}) {
286300
// Sending max_tokens to thinking models (Qwen3.5) starves actual output since
287301
// reasoning_content counts against the limit.
288302

289-
// Lookup model-family-specific extra params (e.g. reasoning_effort for Mistral).
303+
// Lookup model-family-specific config (e.g. reasoning_effort for Mistral,
304+
// minTemperature for Nemotron/LFM2).
290305
// VLM calls skip the LLM family table — VLM models are always local llava-compatible.
291-
const modelFamilyParams = opts.vlm ? {} : getModelApiParams(model || LLM_MODEL);
306+
const modelFamily = opts.vlm ? {} : getModelFamily(model || LLM_MODEL);
307+
const modelFamilyParams = modelFamily.apiParams || {};
308+
309+
// Resolve temperature: apply model-specific minimum if needed.
310+
// Nemotron and LFM2 reject temperature < 1.0 with HTTP 400.
311+
let temperature = opts.temperature;
312+
if (temperature === undefined && opts.expectJSON) temperature = 0.7;
313+
if (temperature !== undefined && modelFamily.minTemperature !== undefined) {
314+
temperature = Math.max(temperature, modelFamily.minTemperature);
315+
}
292316

293317
// Build request params
294318
const params = {
@@ -298,8 +322,7 @@ async function llmCall(messages, opts = {}) {
298322
// llama-server crashes with "Failed to parse input" when stream_options is present)
299323
...(isCloudApi && { stream_options: { include_usage: true } }),
300324
...(model && { model }),
301-
...(opts.temperature !== undefined && { temperature: opts.temperature }),
302-
...(opts.expectJSON && opts.temperature === undefined && { temperature: 0.7 }),
325+
...(temperature !== undefined && { temperature }),
303326
...(opts.expectJSON && { top_p: 0.8 }),
304327
...(opts.tools && { tools: opts.tools }),
305328
// Model-family-specific params (e.g. reasoning_effort:'none' for Mistral).

skills/analysis/home-security-benchmark/scripts/test-model-config.cjs

Lines changed: 123 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#!/usr/bin/env node
22
/**
3-
* Unit tests for MODEL_FAMILIES / getModelApiParams logic.
3+
* Unit tests for MODEL_FAMILIES / getModelFamily / getModelApiParams logic.
44
*
5-
* Tests the model-family detection and per-request param injection
6-
* without needing a running LLM server.
5+
* Tests the model-family detection, per-request param injection,
6+
* and temperature clamping without needing a running LLM server.
77
*
88
* Usage:
99
* node scripts/test-model-config.cjs
@@ -17,28 +17,54 @@ const MODEL_FAMILIES = [
1717
name: 'Mistral',
1818
match: (m) => m.includes('mistral') || m.includes('magistral') || m.includes('mixtral'),
1919
apiParams: { reasoning_effort: 'none' },
20-
serverFlags: '--reasoning-budget 0',
20+
serverFlags: '--chat-template-kwargs {"reasoning_effort":"none"} --parallel 1',
21+
},
22+
{
23+
name: 'Nemotron',
24+
match: (m) => m.includes('nemotron'),
25+
apiParams: {},
26+
minTemperature: 1.0,
27+
},
28+
{
29+
name: 'LFM',
30+
match: (m) => m.includes('lfm'),
31+
apiParams: {},
32+
minTemperature: 1.0,
2133
},
22-
// Qwen3.5: no extra per-request params needed (handled by prompt + abort logic)
2334
];
2435

25-
function getModelApiParams(modelName) {
36+
function getModelFamily(modelName) {
2637
if (!modelName) return {};
2738
const lower = modelName.toLowerCase();
2839
for (const family of MODEL_FAMILIES) {
29-
if (family.match(lower)) return family.apiParams || {};
40+
if (family.match(lower)) return family;
3041
}
3142
return {};
3243
}
3344

45+
function getModelApiParams(modelName) {
46+
return getModelFamily(modelName).apiParams || {};
47+
}
48+
49+
/** Simulate the temperature clamping logic from llmCall(). */
50+
function resolveTemperature(modelName, requestedTemp, expectJSON) {
51+
const family = getModelFamily(modelName);
52+
let temperature = requestedTemp;
53+
if (temperature === undefined && expectJSON) temperature = 0.7;
54+
if (temperature !== undefined && family.minTemperature !== undefined) {
55+
temperature = Math.max(temperature, family.minTemperature);
56+
}
57+
return temperature;
58+
}
59+
3460
// ── Mirror the server-manager detection ──────────────────────────────────────
3561
function getServerFlags(modelFilePath) {
3662
const lower = modelFilePath.toLowerCase();
3763
const isMistralFamily = lower.includes('mistral') ||
3864
lower.includes('magistral') ||
3965
lower.includes('mixtral');
4066
return isMistralFamily
41-
? { flag: '--reasoning-budget', value: '0' }
67+
? { flag: '--chat-template-kwargs', value: '{"reasoning_effort":"none"}' }
4268
: { flag: '--chat-template-kwargs', value: '{"enable_thinking":false}' };
4369
}
4470

@@ -72,98 +98,139 @@ function assertDeepEqual(a, b, msg) {
7298
console.log('\n=== MODEL_FAMILIES / getModelApiParams ===\n');
7399

74100
// ── Mistral detection ─────────────────────────────────────────────────────────
75-
test('Mistral-Small-4-119B GGUF filename → reasoning_effort:none', () => {
76-
const p = getModelApiParams('Mistral-Small-4-119B-2603-UD-IQ1_M.gguf');
77-
assertDeepEqual(p, { reasoning_effort: 'none' });
101+
test('Mistral-Small-4-119B GGUF → reasoning_effort:none', () => {
102+
assertDeepEqual(getModelApiParams('Mistral-Small-4-119B-2603-UD-IQ1_M.gguf'), { reasoning_effort: 'none' });
78103
});
79104

80-
test('Mistral-Small-4 Q2_K_XL variant → reasoning_effort:none', () => {
81-
const p = getModelApiParams('Mistral-Small-4-119B-2603-UD-Q2_K_XL.gguf');
82-
assertDeepEqual(p, { reasoning_effort: 'none' });
105+
test('Mistral-Small-4 Q2_K_XL → reasoning_effort:none', () => {
106+
assertDeepEqual(getModelApiParams('Mistral-Small-4-119B-2603-UD-Q2_K_XL.gguf'), { reasoning_effort: 'none' });
83107
});
84108

85109
test('Magistral model → reasoning_effort:none', () => {
86-
const p = getModelApiParams('magistral-medium-2506.gguf');
87-
assertDeepEqual(p, { reasoning_effort: 'none' });
110+
assertDeepEqual(getModelApiParams('magistral-medium-2506.gguf'), { reasoning_effort: 'none' });
88111
});
89112

90113
test('Mixtral-8x7B → reasoning_effort:none', () => {
91-
const p = getModelApiParams('Mixtral-8x7B-Instruct-v0.1.Q4_K_M.gguf');
92-
assertDeepEqual(p, { reasoning_effort: 'none' });
114+
assertDeepEqual(getModelApiParams('Mixtral-8x7B-Instruct-v0.1.Q4_K_M.gguf'), { reasoning_effort: 'none' });
93115
});
94116

95117
test('Mistral cloud API model ID → reasoning_effort:none', () => {
96-
const p = getModelApiParams('mistral-small-latest');
97-
assertDeepEqual(p, { reasoning_effort: 'none' });
118+
assertDeepEqual(getModelApiParams('mistral-small-latest'), { reasoning_effort: 'none' });
98119
});
99120

100-
// ── Non-Mistral: should get no extra params ───────────────────────────────────
101-
test('Qwen3.5-9B → no extra params (handled by prompt)', () => {
102-
const p = getModelApiParams('Qwen3.5-9B-Q4_K_M.gguf');
103-
assertDeepEqual(p, {});
121+
// ── Nemotron detection ────────────────────────────────────────────────────────
122+
test('Nemotron-4B → no extra apiParams', () => {
123+
assertDeepEqual(getModelApiParams('NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf'), {});
124+
});
125+
126+
test('Nemotron-30B → no extra apiParams', () => {
127+
assertDeepEqual(getModelApiParams('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf'), {});
128+
});
129+
130+
test('Nemotron-30B → minTemperature = 1.0', () => {
131+
const f = getModelFamily('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf');
132+
assert(f.minTemperature === 1.0, `Expected 1.0, got ${f.minTemperature}`);
104133
});
105134

106-
test('Qwen3.5-27B → no extra params', () => {
107-
const p = getModelApiParams('Qwen3.5-27B-UD-Q8_K_XL.gguf');
108-
assertDeepEqual(p, {});
135+
// ── LFM detection ─────────────────────────────────────────────────────────────
136+
test('LFM2-24B → no extra apiParams', () => {
137+
assertDeepEqual(getModelApiParams('LFM2-24B-A2B-Q8_0.gguf'), {});
109138
});
110139

111-
test('NVIDIA Nemotron-30B → no extra params', () => {
112-
const p = getModelApiParams('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf');
113-
assertDeepEqual(p, {});
140+
test('LFM2.5-1.2B → no extra apiParams', () => {
141+
assertDeepEqual(getModelApiParams('LFM2.5-1.2B-Instruct-BF16.gguf'), {});
114142
});
115143

116-
test('LFM2-24B → no extra params', () => {
117-
const p = getModelApiParams('LFM2-24B-A2B-Q8_0.gguf');
118-
assertDeepEqual(p, {});
144+
test('LFM2-24B → minTemperature = 1.0', () => {
145+
const f = getModelFamily('LFM2-24B-A2B-Q8_0.gguf');
146+
assert(f.minTemperature === 1.0, `Expected 1.0, got ${f.minTemperature}`);
147+
});
148+
149+
// ── Non-matching: should get no family config ─────────────────────────────────
150+
test('Qwen3.5-9B → no extra params (handled by prompt)', () => {
151+
assertDeepEqual(getModelApiParams('Qwen3.5-9B-Q4_K_M.gguf'), {});
119152
});
120153

121154
test('GPT-5.4 → no extra params', () => {
122-
const p = getModelApiParams('gpt-5.4-2026-03-05');
123-
assertDeepEqual(p, {});
155+
assertDeepEqual(getModelApiParams('gpt-5.4-2026-03-05'), {});
124156
});
125157

126158
test('Empty model name → no extra params', () => {
127-
const p = getModelApiParams('');
128-
assertDeepEqual(p, {});
159+
assertDeepEqual(getModelApiParams(''), {});
129160
});
130161

131162
test('Undefined model name → no extra params', () => {
132-
const p = getModelApiParams(undefined);
133-
assertDeepEqual(p, {});
163+
assertDeepEqual(getModelApiParams(undefined), {});
134164
});
135165

136-
// ── Server-manager flags (mirrors llm-server-manager.cjs logic) ───────────────
137-
console.log('\n=== Server-manager startup flags ===\n');
166+
// ── Temperature clamping ──────────────────────────────────────────────────────
167+
console.log('\n=== Temperature clamping ===\n');
138168

139-
test('Mistral GGUF path → --reasoning-budget 0', () => {
140-
const f = getServerFlags('/Users/simba/.aegis-ai/models/Mistral-Small-4-119B-2603-UD-IQ1_M.gguf');
141-
assert(f.flag === '--reasoning-budget' && f.value === '0',
142-
`Expected --reasoning-budget 0, got ${f.flag} ${f.value}`);
169+
test('Nemotron + temp 0.1 → clamped to 1.0', () => {
170+
const t = resolveTemperature('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf', 0.1, false);
171+
assert(t === 1.0, `Expected 1.0, got ${t}`);
143172
});
144173

145-
test('Magistral path → --reasoning-budget 0', () => {
146-
const f = getServerFlags('/models/magistral-medium.gguf');
147-
assert(f.flag === '--reasoning-budget' && f.value === '0');
174+
test('LFM2 + temp 0.1 → clamped to 1.0', () => {
175+
const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', 0.1, false);
176+
assert(t === 1.0, `Expected 1.0, got ${t}`);
148177
});
149178

150-
test('Qwen path → --chat-template-kwargs enable_thinking:false', () => {
179+
test('LFM2 + temp 0.7 (expectJSON) → clamped to 1.0', () => {
180+
const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', 0.7, true);
181+
assert(t === 1.0, `Expected 1.0, got ${t}`);
182+
});
183+
184+
test('LFM2 + temp undefined + expectJSON → clamped from 0.7 to 1.0', () => {
185+
const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', undefined, true);
186+
assert(t === 1.0, `Expected 1.0, got ${t}`);
187+
});
188+
189+
test('LFM2 + temp 1.5 → kept at 1.5 (above min)', () => {
190+
const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', 1.5, false);
191+
assert(t === 1.5, `Expected 1.5, got ${t}`);
192+
});
193+
194+
test('Qwen + temp 0.1 → kept at 0.1 (no clamp)', () => {
195+
const t = resolveTemperature('Qwen3.5-9B-Q4_K_M.gguf', 0.1, false);
196+
assert(t === 0.1, `Expected 0.1, got ${t}`);
197+
});
198+
199+
test('Mistral + temp 0.1 → kept at 0.1 (no minTemperature)', () => {
200+
const t = resolveTemperature('Mistral-Small-4-119B-2603-UD-Q2_K_XL.gguf', 0.1, false);
201+
assert(t === 0.1, `Expected 0.1, got ${t}`);
202+
});
203+
204+
test('Qwen + temp undefined + no expectJSON → stays undefined', () => {
205+
const t = resolveTemperature('Qwen3.5-9B-Q4_K_M.gguf', undefined, false);
206+
assert(t === undefined, `Expected undefined, got ${t}`);
207+
});
208+
209+
test('Nemotron + temp undefined + no expectJSON → stays undefined', () => {
210+
const t = resolveTemperature('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf', undefined, false);
211+
assert(t === undefined, `Expected undefined, got ${t}`);
212+
});
213+
214+
// ── Server-manager flags ─────────────────────────────────────────────────────
215+
console.log('\n=== Server-manager startup flags ===\n');
216+
217+
test('Mistral GGUF path → chat-template-kwargs with reasoning_effort:none', () => {
218+
const f = getServerFlags('/models/Mistral-Small-4-119B-2603-UD-IQ1_M.gguf');
219+
assert(f.flag === '--chat-template-kwargs', `Expected --chat-template-kwargs, got ${f.flag}`);
220+
assert(f.value.includes('reasoning_effort'), `Expected reasoning_effort in value`);
221+
});
222+
223+
test('Qwen path → chat-template-kwargs with enable_thinking:false', () => {
151224
const f = getServerFlags('/models/Qwen3.5-9B-Q4_K_M.gguf');
152225
assert(f.flag === '--chat-template-kwargs');
153226
assert(f.value.includes('enable_thinking'));
154-
assert(f.value.includes('false'));
155227
});
156228

157-
test('Nemotron path → --chat-template-kwargs enable_thinking:false', () => {
229+
test('Nemotron path → chat-template-kwargs (non-Mistral default)', () => {
158230
const f = getServerFlags('/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf');
159231
assert(f.flag === '--chat-template-kwargs');
160232
});
161233

162-
test('LFM2 path → --chat-template-kwargs enable_thinking:false', () => {
163-
const f = getServerFlags('/models/LFM2-24B-A2B-Q8_0.gguf');
164-
assert(f.flag === '--chat-template-kwargs');
165-
});
166-
167234
// ── Summary ──────────────────────────────────────────────────────────────────
168235

169236
console.log(`\n${passed + failed} tests: ${passed} passed, ${failed} failed\n`);

0 commit comments

Comments (0)