Skip to content

Commit e4fc076

Browse files
committed
fix(benchmark): temperature clamping for Nemotron and LFM2
Add Nemotron and LFM2 model families to MODEL_FAMILIES with minTemperature: 1.0 — these models reject temperature < 1.0 with HTTP 400. The benchmark now clamps temperature to the family minimum before sending the request.
- Refactor getModelApiParams → getModelFamily (returns full config)
- Add resolveTemperature logic in llmCall params builder
- Update test-model-config.cjs: 27 tests including temperature clamp
- Fix Mistral serverFlags to match current llm-server-manager.cjs
1 parent f99d28b commit e4fc076

File tree

2 files changed

+160
-70
lines changed

2 files changed

+160
-70
lines changed

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -155,31 +155,45 @@ const MODEL_FAMILIES = [
155155
// Supported by both Mistral cloud API and llama-server (forwarded as chat template kwarg).
156156
// Without this Mistral routes ALL output to delta.thinking, causing 30s idle timeouts.
157157
apiParams: { reasoning_effort: 'none' },
158-
serverFlags: '--reasoning-budget 0',
158+
serverFlags: '--chat-template-kwargs {"reasoning_effort":"none"} --parallel 1',
159+
},
160+
{
161+
name: 'Nemotron',
162+
// NVIDIA Nemotron-3-Nano (4B, 30B) — rejects temperature < 1.0 with HTTP 400:
163+
// "Unsupported value: 'temperature' does not support 0.1 with this model"
164+
match: (m) => m.includes('nemotron'),
165+
apiParams: {},
166+
minTemperature: 1.0,
167+
},
168+
{
169+
name: 'LFM',
170+
// Liquid LFM2 / LFM2.5 — same temperature restriction as Nemotron
171+
match: (m) => m.includes('lfm'),
172+
apiParams: {},
173+
minTemperature: 1.0,
159174
},
160175
// Qwen3.5 thinking is handled via prompt-level /no_think and the 500-token reasoning
161176
// abort in llmCall — no extra per-request params needed.
162-
// {
163-
// name: 'Qwen3',
164-
// match: (m) => m.includes('qwen') || m.includes('qwq'),
165-
// apiParams: {}, // could add: { chat_template_kwargs: { enable_thinking: false } }
166-
// serverFlags: "--chat-template-kwargs '{\"enable_thinking\":false}'",
167-
// },
168177
];
169178

170179
/**
171-
* Return the merged extra API params for the given model name.
180+
* Return the matched MODEL_FAMILIES entry for the given model name.
172181
* Returns {} if the model is not in any known family.
173182
*/
174-
function getModelApiParams(modelName) {
183+
function getModelFamily(modelName) {
175184
if (!modelName) return {};
176185
const lower = modelName.toLowerCase();
177186
for (const family of MODEL_FAMILIES) {
178-
if (family.match(lower)) return family.apiParams || {};
187+
if (family.match(lower)) return family;
179188
}
180189
return {};
181190
}
182191

192+
/** Return extra API params for the model (e.g. reasoning_effort for Mistral). */
193+
function getModelApiParams(modelName) {
194+
return getModelFamily(modelName).apiParams || {};
195+
}
196+
183197
// ─── Skill Protocol: JSON lines on stdout, human text on stderr ──────────────
184198

185199
/**
@@ -286,9 +300,19 @@ async function llmCall(messages, opts = {}) {
286300
// Sending max_tokens to thinking models (Qwen3.5) starves actual output since
287301
// reasoning_content counts against the limit.
288302

289-
// Lookup model-family-specific extra params (e.g. reasoning_effort for Mistral).
303+
// Lookup model-family-specific config (e.g. reasoning_effort for Mistral,
304+
// minTemperature for Nemotron/LFM2).
290305
// VLM calls skip the LLM family table — VLM models are always local llava-compatible.
291-
const modelFamilyParams = opts.vlm ? {} : getModelApiParams(model || LLM_MODEL);
306+
const modelFamily = opts.vlm ? {} : getModelFamily(model || LLM_MODEL);
307+
const modelFamilyParams = modelFamily.apiParams || {};
308+
309+
// Resolve temperature: apply model-specific minimum if needed.
310+
// Nemotron and LFM2 reject temperature < 1.0 with HTTP 400.
311+
let temperature = opts.temperature;
312+
if (temperature === undefined && opts.expectJSON) temperature = 0.7;
313+
if (temperature !== undefined && modelFamily.minTemperature !== undefined) {
314+
temperature = Math.max(temperature, modelFamily.minTemperature);
315+
}
292316

293317
// Build request params
294318
const params = {
@@ -298,8 +322,7 @@ async function llmCall(messages, opts = {}) {
298322
// llama-server crashes with "Failed to parse input" when stream_options is present)
299323
...(isCloudApi && { stream_options: { include_usage: true } }),
300324
...(model && { model }),
301-
...(opts.temperature !== undefined && { temperature: opts.temperature }),
302-
...(opts.expectJSON && opts.temperature === undefined && { temperature: 0.7 }),
325+
...(temperature !== undefined && { temperature }),
303326
...(opts.expectJSON && { top_p: 0.8 }),
304327
...(opts.tools && { tools: opts.tools }),
305328
// Model-family-specific params (e.g. reasoning_effort:'none' for Mistral).

skills/analysis/home-security-benchmark/scripts/test-model-config.cjs

Lines changed: 123 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#!/usr/bin/env node
22
/**
3-
* Unit tests for MODEL_FAMILIES / getModelApiParams logic.
3+
* Unit tests for MODEL_FAMILIES / getModelFamily / getModelApiParams logic.
44
*
5-
* Tests the model-family detection and per-request param injection
6-
* without needing a running LLM server.
5+
* Tests the model-family detection, per-request param injection,
6+
* and temperature clamping without needing a running LLM server.
77
*
88
* Usage:
99
* node scripts/test-model-config.cjs
@@ -17,28 +17,54 @@ const MODEL_FAMILIES = [
1717
name: 'Mistral',
1818
match: (m) => m.includes('mistral') || m.includes('magistral') || m.includes('mixtral'),
1919
apiParams: { reasoning_effort: 'none' },
20-
serverFlags: '--reasoning-budget 0',
20+
serverFlags: '--chat-template-kwargs {"reasoning_effort":"none"} --parallel 1',
21+
},
22+
{
23+
name: 'Nemotron',
24+
match: (m) => m.includes('nemotron'),
25+
apiParams: {},
26+
minTemperature: 1.0,
27+
},
28+
{
29+
name: 'LFM',
30+
match: (m) => m.includes('lfm'),
31+
apiParams: {},
32+
minTemperature: 1.0,
2133
},
22-
// Qwen3.5: no extra per-request params needed (handled by prompt + abort logic)
2334
];
2435

25-
function getModelApiParams(modelName) {
36+
function getModelFamily(modelName) {
2637
if (!modelName) return {};
2738
const lower = modelName.toLowerCase();
2839
for (const family of MODEL_FAMILIES) {
29-
if (family.match(lower)) return family.apiParams || {};
40+
if (family.match(lower)) return family;
3041
}
3142
return {};
3243
}
3344

45+
function getModelApiParams(modelName) {
46+
return getModelFamily(modelName).apiParams || {};
47+
}
48+
49+
/** Simulate the temperature clamping logic from llmCall(). */
50+
function resolveTemperature(modelName, requestedTemp, expectJSON) {
51+
const family = getModelFamily(modelName);
52+
let temperature = requestedTemp;
53+
if (temperature === undefined && expectJSON) temperature = 0.7;
54+
if (temperature !== undefined && family.minTemperature !== undefined) {
55+
temperature = Math.max(temperature, family.minTemperature);
56+
}
57+
return temperature;
58+
}
59+
3460
// ── Mirror the server-manager detection ──────────────────────────────────────
3561
function getServerFlags(modelFilePath) {
3662
const lower = modelFilePath.toLowerCase();
3763
const isMistralFamily = lower.includes('mistral') ||
3864
lower.includes('magistral') ||
3965
lower.includes('mixtral');
4066
return isMistralFamily
41-
? { flag: '--reasoning-budget', value: '0' }
67+
? { flag: '--chat-template-kwargs', value: '{"reasoning_effort":"none"}' }
4268
: { flag: '--chat-template-kwargs', value: '{"enable_thinking":false}' };
4369
}
4470

@@ -72,98 +98,139 @@ function assertDeepEqual(a, b, msg) {
7298
console.log('\n=== MODEL_FAMILIES / getModelApiParams ===\n');
7399

74100
// ── Mistral detection ─────────────────────────────────────────────────────────
75-
test('Mistral-Small-4-119B GGUF filename → reasoning_effort:none', () => {
76-
const p = getModelApiParams('Mistral-Small-4-119B-2603-UD-IQ1_M.gguf');
77-
assertDeepEqual(p, { reasoning_effort: 'none' });
101+
test('Mistral-Small-4-119B GGUF → reasoning_effort:none', () => {
102+
assertDeepEqual(getModelApiParams('Mistral-Small-4-119B-2603-UD-IQ1_M.gguf'), { reasoning_effort: 'none' });
78103
});
79104

80-
test('Mistral-Small-4 Q2_K_XL variant → reasoning_effort:none', () => {
81-
const p = getModelApiParams('Mistral-Small-4-119B-2603-UD-Q2_K_XL.gguf');
82-
assertDeepEqual(p, { reasoning_effort: 'none' });
105+
test('Mistral-Small-4 Q2_K_XL → reasoning_effort:none', () => {
106+
assertDeepEqual(getModelApiParams('Mistral-Small-4-119B-2603-UD-Q2_K_XL.gguf'), { reasoning_effort: 'none' });
83107
});
84108

85109
test('Magistral model → reasoning_effort:none', () => {
86-
const p = getModelApiParams('magistral-medium-2506.gguf');
87-
assertDeepEqual(p, { reasoning_effort: 'none' });
110+
assertDeepEqual(getModelApiParams('magistral-medium-2506.gguf'), { reasoning_effort: 'none' });
88111
});
89112

90113
test('Mixtral-8x7B → reasoning_effort:none', () => {
91-
const p = getModelApiParams('Mixtral-8x7B-Instruct-v0.1.Q4_K_M.gguf');
92-
assertDeepEqual(p, { reasoning_effort: 'none' });
114+
assertDeepEqual(getModelApiParams('Mixtral-8x7B-Instruct-v0.1.Q4_K_M.gguf'), { reasoning_effort: 'none' });
93115
});
94116

95117
test('Mistral cloud API model ID → reasoning_effort:none', () => {
96-
const p = getModelApiParams('mistral-small-latest');
97-
assertDeepEqual(p, { reasoning_effort: 'none' });
118+
assertDeepEqual(getModelApiParams('mistral-small-latest'), { reasoning_effort: 'none' });
98119
});
99120

100-
// ── Non-Mistral: should get no extra params ───────────────────────────────────
101-
test('Qwen3.5-9B → no extra params (handled by prompt)', () => {
102-
const p = getModelApiParams('Qwen3.5-9B-Q4_K_M.gguf');
103-
assertDeepEqual(p, {});
121+
// ── Nemotron detection ────────────────────────────────────────────────────────
122+
test('Nemotron-4B → no extra apiParams', () => {
123+
assertDeepEqual(getModelApiParams('NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf'), {});
124+
});
125+
126+
test('Nemotron-30B → no extra apiParams', () => {
127+
assertDeepEqual(getModelApiParams('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf'), {});
128+
});
129+
130+
test('Nemotron-30B → minTemperature = 1.0', () => {
131+
const f = getModelFamily('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf');
132+
assert(f.minTemperature === 1.0, `Expected 1.0, got ${f.minTemperature}`);
104133
});
105134

106-
test('Qwen3.5-27B → no extra params', () => {
107-
const p = getModelApiParams('Qwen3.5-27B-UD-Q8_K_XL.gguf');
108-
assertDeepEqual(p, {});
135+
// ── LFM detection ─────────────────────────────────────────────────────────────
136+
test('LFM2-24B → no extra apiParams', () => {
137+
assertDeepEqual(getModelApiParams('LFM2-24B-A2B-Q8_0.gguf'), {});
109138
});
110139

111-
test('NVIDIA Nemotron-30B → no extra params', () => {
112-
const p = getModelApiParams('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf');
113-
assertDeepEqual(p, {});
140+
test('LFM2.5-1.2B → no extra apiParams', () => {
141+
assertDeepEqual(getModelApiParams('LFM2.5-1.2B-Instruct-BF16.gguf'), {});
114142
});
115143

116-
test('LFM2-24B → no extra params', () => {
117-
const p = getModelApiParams('LFM2-24B-A2B-Q8_0.gguf');
118-
assertDeepEqual(p, {});
144+
test('LFM2-24B → minTemperature = 1.0', () => {
145+
const f = getModelFamily('LFM2-24B-A2B-Q8_0.gguf');
146+
assert(f.minTemperature === 1.0, `Expected 1.0, got ${f.minTemperature}`);
147+
});
148+
149+
// ── Non-matching: should get no family config ─────────────────────────────────
150+
test('Qwen3.5-9B → no extra params (handled by prompt)', () => {
151+
assertDeepEqual(getModelApiParams('Qwen3.5-9B-Q4_K_M.gguf'), {});
119152
});
120153

121154
test('GPT-5.4 → no extra params', () => {
122-
const p = getModelApiParams('gpt-5.4-2026-03-05');
123-
assertDeepEqual(p, {});
155+
assertDeepEqual(getModelApiParams('gpt-5.4-2026-03-05'), {});
124156
});
125157

126158
test('Empty model name → no extra params', () => {
127-
const p = getModelApiParams('');
128-
assertDeepEqual(p, {});
159+
assertDeepEqual(getModelApiParams(''), {});
129160
});
130161

131162
test('Undefined model name → no extra params', () => {
132-
const p = getModelApiParams(undefined);
133-
assertDeepEqual(p, {});
163+
assertDeepEqual(getModelApiParams(undefined), {});
134164
});
135165

136-
// ── Server-manager flags (mirrors llm-server-manager.cjs logic) ───────────────
137-
console.log('\n=== Server-manager startup flags ===\n');
166+
// ── Temperature clamping ──────────────────────────────────────────────────────
167+
console.log('\n=== Temperature clamping ===\n');
138168

139-
test('Mistral GGUF path → --reasoning-budget 0', () => {
140-
const f = getServerFlags('/Users/simba/.aegis-ai/models/Mistral-Small-4-119B-2603-UD-IQ1_M.gguf');
141-
assert(f.flag === '--reasoning-budget' && f.value === '0',
142-
`Expected --reasoning-budget 0, got ${f.flag} ${f.value}`);
169+
test('Nemotron + temp 0.1 → clamped to 1.0', () => {
170+
const t = resolveTemperature('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf', 0.1, false);
171+
assert(t === 1.0, `Expected 1.0, got ${t}`);
143172
});
144173

145-
test('Magistral path → --reasoning-budget 0', () => {
146-
const f = getServerFlags('/models/magistral-medium.gguf');
147-
assert(f.flag === '--reasoning-budget' && f.value === '0');
174+
test('LFM2 + temp 0.1 → clamped to 1.0', () => {
175+
const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', 0.1, false);
176+
assert(t === 1.0, `Expected 1.0, got ${t}`);
148177
});
149178

150-
test('Qwen path → --chat-template-kwargs enable_thinking:false', () => {
179+
test('LFM2 + temp 0.7 (expectJSON) → clamped to 1.0', () => {
180+
const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', 0.7, true);
181+
assert(t === 1.0, `Expected 1.0, got ${t}`);
182+
});
183+
184+
test('LFM2 + temp undefined + expectJSON → clamped from 0.7 to 1.0', () => {
185+
const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', undefined, true);
186+
assert(t === 1.0, `Expected 1.0, got ${t}`);
187+
});
188+
189+
test('LFM2 + temp 1.5 → kept at 1.5 (above min)', () => {
190+
const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', 1.5, false);
191+
assert(t === 1.5, `Expected 1.5, got ${t}`);
192+
});
193+
194+
test('Qwen + temp 0.1 → kept at 0.1 (no clamp)', () => {
195+
const t = resolveTemperature('Qwen3.5-9B-Q4_K_M.gguf', 0.1, false);
196+
assert(t === 0.1, `Expected 0.1, got ${t}`);
197+
});
198+
199+
test('Mistral + temp 0.1 → kept at 0.1 (no minTemperature)', () => {
200+
const t = resolveTemperature('Mistral-Small-4-119B-2603-UD-Q2_K_XL.gguf', 0.1, false);
201+
assert(t === 0.1, `Expected 0.1, got ${t}`);
202+
});
203+
204+
test('Qwen + temp undefined + no expectJSON → stays undefined', () => {
205+
const t = resolveTemperature('Qwen3.5-9B-Q4_K_M.gguf', undefined, false);
206+
assert(t === undefined, `Expected undefined, got ${t}`);
207+
});
208+
209+
test('Nemotron + temp undefined + no expectJSON → stays undefined', () => {
210+
const t = resolveTemperature('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf', undefined, false);
211+
assert(t === undefined, `Expected undefined, got ${t}`);
212+
});
213+
214+
// ── Server-manager flags ─────────────────────────────────────────────────────
215+
console.log('\n=== Server-manager startup flags ===\n');
216+
217+
test('Mistral GGUF path → chat-template-kwargs with reasoning_effort:none', () => {
218+
const f = getServerFlags('/models/Mistral-Small-4-119B-2603-UD-IQ1_M.gguf');
219+
assert(f.flag === '--chat-template-kwargs', `Expected --chat-template-kwargs, got ${f.flag}`);
220+
assert(f.value.includes('reasoning_effort'), `Expected reasoning_effort in value`);
221+
});
222+
223+
test('Qwen path → chat-template-kwargs with enable_thinking:false', () => {
151224
const f = getServerFlags('/models/Qwen3.5-9B-Q4_K_M.gguf');
152225
assert(f.flag === '--chat-template-kwargs');
153226
assert(f.value.includes('enable_thinking'));
154-
assert(f.value.includes('false'));
155227
});
156228

157-
test('Nemotron path → --chat-template-kwargs enable_thinking:false', () => {
229+
test('Nemotron path → chat-template-kwargs (non-Mistral default)', () => {
158230
const f = getServerFlags('/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf');
159231
assert(f.flag === '--chat-template-kwargs');
160232
});
161233

162-
test('LFM2 path → --chat-template-kwargs enable_thinking:false', () => {
163-
const f = getServerFlags('/models/LFM2-24B-A2B-Q8_0.gguf');
164-
assert(f.flag === '--chat-template-kwargs');
165-
});
166-
167234
// ── Summary ──────────────────────────────────────────────────────────────────
168235

169236
console.log(`\n${passed + failed} tests: ${passed} passed, ${failed} failed\n`);

0 commit comments

Comments (0)