feat(proxy): upgrade LLM to 405B Hermes cascade + improve prompts (DAK-6944) (#224)

ferhimedamine · claude · web-flow · commit f2bfbbeee049 · 2026-06-17T22:10:50.000+02:00
- Model cascade: Hermes 3 405B → gpt-oss-120b → Nemotron Ultra 550B → Llama 3.3 70B
  (replaces single gemma-4-26b model)
- max_tokens: 512 → 800 for thorough answers
- System prompts rewritten: natural memory synthesis, no raw scores or source listing
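- Cascade tries each model in order with a 10s timeout per attempt, skips to the next model
  on 429, any other HTTP status >= 400, a failed request, or a parsed result with an error
  or no response text, and returns the first success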
- Tests updated: 58/58 pass (model override → cascade behavior, 402 → all_models_failed)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/docker/playground/proxy/llm-compare.js b/docker/playground/proxy/llm-compare.js
@@ -77,7 +77,15 @@ const SEED_MEMORIES = [
   },
 ];
 
-const DEFAULT_MODEL = 'google/gemma-4-26b-a4b-it:free';
+// Model cascade (DAK-6944): 405B Hermes primary, then gpt-oss-120b, Nemotron Ultra, Llama 70b.
+const MODEL_CASCADE = [
+  'nousresearch/hermes-3-llama-3.1-405b:free',
+  'openai/gpt-oss-120b:free',
+  'nvidia/nemotron-3-ultra-550b-a55b:free',
+  'meta-llama/llama-3.3-70b-instruct:free',
+];
+const DEFAULT_MODEL = MODEL_CASCADE[0];
+const LLM_MAX_TOKENS = 800;
 const SEED_TIMEOUT_MS = 8_000;
 
 // ---------------------------------------------------------------------------
@@ -86,7 +94,7 @@ const SEED_TIMEOUT_MS = 8_000;
 
 function _callOpenRouter(apiKey, model, messages, timeoutMs) {
   return new Promise((resolve, reject) => {
-    const body = JSON.stringify({ model, messages, max_tokens: 512 });
+    const body = JSON.stringify({ model, messages, max_tokens: LLM_MAX_TOKENS, temperature: 0.7 });
     const req = https.request(
       {
         hostname: 'openrouter.ai',
@@ -234,7 +242,7 @@ async function handleLlmCompare(config, store, resolved, bodyBuf, opts) {
   if (!question) {
     return { status: 400, error: 'bad_request', message: 'Field "question" is required and must be a non-empty string.' };
   }
-  const model = typeof parsed.model === 'string' && parsed.model.trim() ? parsed.model.trim() : DEFAULT_MODEL;
+  // Model selection is now handled by the cascade — client-specified model is ignored.
 
   const ns = sessionNamespace(resolved.id);
   const timeout = config.llmCompareTimeoutMs || 30_000;
@@ -271,56 +279,57 @@ async function handleLlmCompare(config, store, resolved, bodyBuf, opts) {
   }
 
   // Steps 2 + 3: OpenRouter calls — without and with memory context (parallel).
-  const withoutMessages = [{ role: 'user', content: question }];
+  const withoutMessages = [
+    {
+      role: 'system',
+      content: 'You are a general-purpose AI assistant. Answer the user\'s question based only on your training knowledge. Be helpful, thorough, and accurate. Where your answer would benefit from access to specific domain records, user history, or organizational context, acknowledge that gap naturally.',
+    },
+    { role: 'user', content: question },
+  ];
   const withMessages =
     memories.length > 0
       ? [
           {
             role: 'system',
             content:
-              'You have access to the following relevant records and memories:\n\n' +
-              memories.join('\n\n') +
-              '\n\nUse this context to provide an accurate, specific answer.',
+              'You are a knowledgeable AI assistant with access to the user\'s stored records. Here is the relevant context:\n\n' +
+              memories.map(m => `- ${m}`).join('\n') +
+              '\n\nInstructions:\n- Answer the question directly and confidently, weaving in specifics from the context above.\n- Write as if you naturally know this information — do NOT say "according to the records" or "based on stored memories."\n- Never list sources, show scores, or mention a memory system.\n- Be thorough: give actionable, specific details. If the context contains dates, numbers, or names, use them.\n- If the context doesn\'t fully answer the question, supplement with general knowledge but prioritize the stored context.',
           },
           { role: 'user', content: question },
         ]
       : withoutMessages;
 
-  const [withoutSettled, withSettled] = await Promise.allSettled([
-    callOR(config.openRouterApiKey, model, withoutMessages, timeout),
-    callOR(config.openRouterApiKey, model, withMessages, timeout),
-  ]);
-
-  const processingTimeMs = Date.now() - startMs;
-
-  function resolveResult(settled, includeMemories) {
-    if (settled.status === 'rejected') {
-      const base = { error: 'request_failed', message: 'Failed to call OpenRouter.', model };
-      return includeMemories ? { ...base, memories_used: memories } : base;
-    }
-    const { status: httpStatus, body } = settled.value;
-    if (httpStatus === 402) {
-      const base = { error: 'credits_exhausted', message: 'OpenRouter free-tier credits exhausted. Please try again later.', model };
-      return includeMemories ? { ...base, memories_used: memories } : base;
-    }
-    if (httpStatus >= 400) {
-      let msg = `OpenRouter returned HTTP ${httpStatus}.`;
+  // Model cascade: try each model in MODEL_CASCADE with 10s per attempt.
+  async function callCascade(messages) {
+    for (const m of MODEL_CASCADE) {
       try {
-        const p = JSON.parse(body);
-        if (p.error && p.error.message) msg = p.error.message;
-      } catch { /**/ }
-      const base = { error: 'openrouter_error', message: msg, model };
-      return includeMemories ? { ...base, memories_used: memories } : base;
+        const res = await callOR(config.openRouterApiKey, m, messages, 10000);
+        if (res.status === 429) continue;
+        if (res.status >= 400) continue;
+        const parsed = _parseOrResponse(res.body, m);
+        if (parsed.error) continue;
+        if (!parsed.response) continue;
+        return parsed;
+      } catch {
+        continue;
+      }
     }
-    const base = _parseOrResponse(body, model);
-    return includeMemories ? { ...base, memories_used: memories } : base;
+    return { error: 'all_models_failed', message: 'All LLM models are currently unavailable. Please try again later.', model: DEFAULT_MODEL };
   }
 
-  const withoutMemory = resolveResult(withoutSettled, false);
-  let withMemory = resolveResult(withSettled, true);
+  const [withoutResult, withResult] = await Promise.all([
+    callCascade(withoutMessages),
+    callCascade(withMessages),
+  ]);
+
+  const processingTimeMs = Date.now() - startMs;
+
+  const withoutMemory = withoutResult;
+  let withMemory = { ...withResult, memories_used: memories };
   if (recallWarning) withMemory = { ...withMemory, recall_warning: recallWarning };
 
   return { status: 200, without_memory: withoutMemory, with_memory: withMemory, processing_time_ms: processingTimeMs };
 }
 
-module.exports = { handleLlmCompare, SEED_MEMORIES, DEFAULT_MODEL };
+module.exports = { handleLlmCompare, SEED_MEMORIES, DEFAULT_MODEL, MODEL_CASCADE, LLM_MAX_TOKENS };
diff --git a/docker/playground/proxy/proxy.test.js b/docker/playground/proxy/proxy.test.js
@@ -704,11 +704,11 @@ test('batch store namespaces every item against the session (DAK-6757)', async (
 // unit + integration: LLM compare (DAK-6845)
 // ---------------------------------------------------------------------------
 
-const { handleLlmCompare, SEED_MEMORIES, DEFAULT_MODEL } = require('./llm-compare');
+const { handleLlmCompare, SEED_MEMORIES, DEFAULT_MODEL, MODEL_CASCADE } = require('./llm-compare');
 
 // Minimal noop mocks for the internal I/O helpers.
 function makeOrMock(response) {
-  return async () => ({ status: 200, body: JSON.stringify({ model: 'google/gemma-4-26b-a4b-it:free', choices: [{ message: { content: response } }] }) });
+  return async (_key, model) => ({ status: 200, body: JSON.stringify({ model: model || DEFAULT_MODEL, choices: [{ message: { content: response } }] }) });
 }
 const noopSeed = async () => ({ status: 200 });
 const emptyRecall = async () => ({ status: 200, body: JSON.stringify({ results: [] }) });
@@ -788,15 +788,15 @@ test('llm-compare successful call returns correct structure (DAK-6845)', async (
   assert.equal(typeof result.processing_time_ms, 'number');
   assert.ok(Array.isArray(result.with_memory.memories_used));
   assert.ok(result.with_memory.memories_used.length > 0);
-  assert.equal(result.without_memory.model, DEFAULT_MODEL);
+  assert.equal(result.without_memory.model, MODEL_CASCADE[0]);
   assert.equal(result.without_memory.response, 'Some medication answer');
 });
 
-test('llm-compare passes model override to OpenRouter (DAK-6845)', async () => {
+test('llm-compare uses model cascade — first successful model wins (DAK-6944)', async () => {
   const store = makeStore();
   const resolved = makeResolved(store);
   const capturedModels = [];
-  const result = await handleLlmCompare(makeConfig(), store, resolved, Buffer.from(JSON.stringify({ question: 'hello', model: 'google/gemma-3-27b-it:free' })), {
+  const result = await handleLlmCompare(makeConfig(), store, resolved, Buffer.from(JSON.stringify({ question: 'hello' })), {
     _callOpenRouter: async (key, model) => {
       capturedModels.push(model);
       return { status: 200, body: JSON.stringify({ model, choices: [{ message: { content: 'hi' } }] }) };
@@ -805,7 +805,7 @@ test('llm-compare passes model override to OpenRouter (DAK-6845)', async () => {
     _callDakeraStoreBatch: noopSeed,
   });
   assert.equal(result.status, 200);
-  assert.ok(capturedModels.every((m) => m === 'google/gemma-3-27b-it:free'));
+  assert.ok(capturedModels.every((m) => m === MODEL_CASCADE[0]));
 });
 
 test('llm-compare LLM-specific rate limit blocks after 5 calls per 10 min (DAK-6845)', async () => {
@@ -831,7 +831,7 @@ test('llm-compare LLM-specific rate limit blocks after 5 calls per 10 min (DAK-6
   assert.equal(allowed.status, 200);
 });
 
-test('llm-compare handles OpenRouter 402 gracefully (DAK-6845)', async () => {
+test('llm-compare handles all models failing gracefully via cascade (DAK-6944)', async () => {
   const store = makeStore();
   const resolved = makeResolved(store);
   const result = await handleLlmCompare(makeConfig(), store, resolved, Buffer.from(JSON.stringify({ question: 'test' })), {
@@ -840,8 +840,8 @@ test('llm-compare handles OpenRouter 402 gracefully (DAK-6845)', async () => {
     _callDakeraStoreBatch: noopSeed,
   });
   assert.equal(result.status, 200);
-  assert.equal(result.without_memory.error, 'credits_exhausted');
-  assert.equal(result.with_memory.error, 'credits_exhausted');
+  assert.equal(result.without_memory.error, 'all_models_failed');
+  assert.equal(result.with_memory.error, 'all_models_failed');
 });
 
 test('llm-compare proceeds when Dakera recall fails (DAK-6845)', async () => {