Skip to content

Commit f2bfbbe

Browse files
feat(proxy): upgrade LLM to 405B Hermes cascade + improve prompts (DAK-6944) (#224)
- Model cascade: Hermes 3 405B → gpt-oss-120b → Nemotron Ultra 550B → Llama 3.3 70B (replaces single gemma-4-26b model)
- max_tokens: 512 → 800 for thorough answers
- System prompts rewritten: natural memory synthesis, no raw scores or source listing
- Cascade tries each model with 10s timeout, skips on 429/error, returns first success
- Tests updated: 58/58 pass (model override → cascade behavior, 402 → all_models_failed)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7234b42 commit f2bfbbe

2 files changed

Lines changed: 54 additions & 45 deletions

File tree

docker/playground/proxy/llm-compare.js

Lines changed: 45 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,15 @@ const SEED_MEMORIES = [
7777
},
7878
];
7979

80-
const DEFAULT_MODEL = 'google/gemma-4-26b-a4b-it:free';
80+
// Model cascade (DAK-6944): Hermes 3 405B primary, then gpt-oss-120b, Nemotron Ultra 550B, Llama 3.3 70B.
81+
const MODEL_CASCADE = [
82+
'nousresearch/hermes-3-llama-3.1-405b:free',
83+
'openai/gpt-oss-120b:free',
84+
'nvidia/nemotron-3-ultra-550b-a55b:free',
85+
'meta-llama/llama-3.3-70b-instruct:free',
86+
];
87+
const DEFAULT_MODEL = MODEL_CASCADE[0];
88+
const LLM_MAX_TOKENS = 800;
8189
const SEED_TIMEOUT_MS = 8_000;
8290

8391
// ---------------------------------------------------------------------------
@@ -86,7 +94,7 @@ const SEED_TIMEOUT_MS = 8_000;
8694

8795
function _callOpenRouter(apiKey, model, messages, timeoutMs) {
8896
return new Promise((resolve, reject) => {
89-
const body = JSON.stringify({ model, messages, max_tokens: 512 });
97+
const body = JSON.stringify({ model, messages, max_tokens: LLM_MAX_TOKENS, temperature: 0.7 });
9098
const req = https.request(
9199
{
92100
hostname: 'openrouter.ai',
@@ -234,7 +242,7 @@ async function handleLlmCompare(config, store, resolved, bodyBuf, opts) {
234242
if (!question) {
235243
return { status: 400, error: 'bad_request', message: 'Field "question" is required and must be a non-empty string.' };
236244
}
237-
const model = typeof parsed.model === 'string' && parsed.model.trim() ? parsed.model.trim() : DEFAULT_MODEL;
245+
// Model selection is now handled by the cascade — client-specified model is ignored.
238246

239247
const ns = sessionNamespace(resolved.id);
240248
const timeout = config.llmCompareTimeoutMs || 30_000;
@@ -271,56 +279,57 @@ async function handleLlmCompare(config, store, resolved, bodyBuf, opts) {
271279
}
272280

273281
// Steps 2 + 3: OpenRouter calls — without and with memory context (parallel).
274-
const withoutMessages = [{ role: 'user', content: question }];
282+
const withoutMessages = [
283+
{
284+
role: 'system',
285+
content: 'You are a general-purpose AI assistant. Answer the user\'s question based only on your training knowledge. Be helpful, thorough, and accurate. Where your answer would benefit from access to specific domain records, user history, or organizational context, acknowledge that gap naturally.',
286+
},
287+
{ role: 'user', content: question },
288+
];
275289
const withMessages =
276290
memories.length > 0
277291
? [
278292
{
279293
role: 'system',
280294
content:
281-
'You have access to the following relevant records and memories:\n\n' +
282-
memories.join('\n\n') +
283-
'\n\nUse this context to provide an accurate, specific answer.',
295+
'You are a knowledgeable AI assistant with access to the user\'s stored records. Here is the relevant context:\n\n' +
296+
memories.map(m => `- ${m}`).join('\n') +
297+
'\n\nInstructions:\n- Answer the question directly and confidently, weaving in specifics from the context above.\n- Write as if you naturally know this information — do NOT say "according to the records" or "based on stored memories."\n- Never list sources, show scores, or mention a memory system.\n- Be thorough: give actionable, specific details. If the context contains dates, numbers, or names, use them.\n- If the context doesn\'t fully answer the question, supplement with general knowledge but prioritize the stored context.',
284298
},
285299
{ role: 'user', content: question },
286300
]
287301
: withoutMessages;
288302

289-
const [withoutSettled, withSettled] = await Promise.allSettled([
290-
callOR(config.openRouterApiKey, model, withoutMessages, timeout),
291-
callOR(config.openRouterApiKey, model, withMessages, timeout),
292-
]);
293-
294-
const processingTimeMs = Date.now() - startMs;
295-
296-
function resolveResult(settled, includeMemories) {
297-
if (settled.status === 'rejected') {
298-
const base = { error: 'request_failed', message: 'Failed to call OpenRouter.', model };
299-
return includeMemories ? { ...base, memories_used: memories } : base;
300-
}
301-
const { status: httpStatus, body } = settled.value;
302-
if (httpStatus === 402) {
303-
const base = { error: 'credits_exhausted', message: 'OpenRouter free-tier credits exhausted. Please try again later.', model };
304-
return includeMemories ? { ...base, memories_used: memories } : base;
305-
}
306-
if (httpStatus >= 400) {
307-
let msg = `OpenRouter returned HTTP ${httpStatus}.`;
303+
// Model cascade: try each model in MODEL_CASCADE in order, with a 10s timeout per attempt; the first successful response wins.
304+
async function callCascade(messages) {
305+
for (const m of MODEL_CASCADE) {
308306
try {
309-
const p = JSON.parse(body);
310-
if (p.error && p.error.message) msg = p.error.message;
311-
} catch { /**/ }
312-
const base = { error: 'openrouter_error', message: msg, model };
313-
return includeMemories ? { ...base, memories_used: memories } : base;
307+
const res = await callOR(config.openRouterApiKey, m, messages, 10000);
308+
if (res.status === 429) continue;
309+
if (res.status >= 400) continue;
310+
const parsed = _parseOrResponse(res.body, m);
311+
if (parsed.error) continue;
312+
if (!parsed.response) continue;
313+
return parsed;
314+
} catch {
315+
continue;
316+
}
314317
}
315-
const base = _parseOrResponse(body, model);
316-
return includeMemories ? { ...base, memories_used: memories } : base;
318+
return { error: 'all_models_failed', message: 'All LLM models are currently unavailable. Please try again later.', model: DEFAULT_MODEL };
317319
}
318320

319-
const withoutMemory = resolveResult(withoutSettled, false);
320-
let withMemory = resolveResult(withSettled, true);
321+
const [withoutResult, withResult] = await Promise.all([
322+
callCascade(withoutMessages),
323+
callCascade(withMessages),
324+
]);
325+
326+
const processingTimeMs = Date.now() - startMs;
327+
328+
const withoutMemory = withoutResult;
329+
let withMemory = { ...withResult, memories_used: memories };
321330
if (recallWarning) withMemory = { ...withMemory, recall_warning: recallWarning };
322331

323332
return { status: 200, without_memory: withoutMemory, with_memory: withMemory, processing_time_ms: processingTimeMs };
324333
}
325334

326-
module.exports = { handleLlmCompare, SEED_MEMORIES, DEFAULT_MODEL };
335+
module.exports = { handleLlmCompare, SEED_MEMORIES, DEFAULT_MODEL, MODEL_CASCADE, LLM_MAX_TOKENS };

docker/playground/proxy/proxy.test.js

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -704,11 +704,11 @@ test('batch store namespaces every item against the session (DAK-6757)', async (
704704
// unit + integration: LLM compare (DAK-6845)
705705
// ---------------------------------------------------------------------------
706706

707-
const { handleLlmCompare, SEED_MEMORIES, DEFAULT_MODEL } = require('./llm-compare');
707+
const { handleLlmCompare, SEED_MEMORIES, DEFAULT_MODEL, MODEL_CASCADE } = require('./llm-compare');
708708

709709
// Minimal noop mocks for the internal I/O helpers.
710710
function makeOrMock(response) {
711-
return async () => ({ status: 200, body: JSON.stringify({ model: 'google/gemma-4-26b-a4b-it:free', choices: [{ message: { content: response } }] }) });
711+
return async (_key, model) => ({ status: 200, body: JSON.stringify({ model: model || DEFAULT_MODEL, choices: [{ message: { content: response } }] }) });
712712
}
713713
const noopSeed = async () => ({ status: 200 });
714714
const emptyRecall = async () => ({ status: 200, body: JSON.stringify({ results: [] }) });
@@ -788,15 +788,15 @@ test('llm-compare successful call returns correct structure (DAK-6845)', async (
788788
assert.equal(typeof result.processing_time_ms, 'number');
789789
assert.ok(Array.isArray(result.with_memory.memories_used));
790790
assert.ok(result.with_memory.memories_used.length > 0);
791-
assert.equal(result.without_memory.model, DEFAULT_MODEL);
791+
assert.equal(result.without_memory.model, MODEL_CASCADE[0]);
792792
assert.equal(result.without_memory.response, 'Some medication answer');
793793
});
794794

795-
test('llm-compare passes model override to OpenRouter (DAK-6845)', async () => {
795+
test('llm-compare uses model cascade — first successful model wins (DAK-6944)', async () => {
796796
const store = makeStore();
797797
const resolved = makeResolved(store);
798798
const capturedModels = [];
799-
const result = await handleLlmCompare(makeConfig(), store, resolved, Buffer.from(JSON.stringify({ question: 'hello', model: 'google/gemma-3-27b-it:free' })), {
799+
const result = await handleLlmCompare(makeConfig(), store, resolved, Buffer.from(JSON.stringify({ question: 'hello' })), {
800800
_callOpenRouter: async (key, model) => {
801801
capturedModels.push(model);
802802
return { status: 200, body: JSON.stringify({ model, choices: [{ message: { content: 'hi' } }] }) };
@@ -805,7 +805,7 @@ test('llm-compare passes model override to OpenRouter (DAK-6845)', async () => {
805805
_callDakeraStoreBatch: noopSeed,
806806
});
807807
assert.equal(result.status, 200);
808-
assert.ok(capturedModels.every((m) => m === 'google/gemma-3-27b-it:free'));
808+
assert.ok(capturedModels.every((m) => m === MODEL_CASCADE[0]));
809809
});
810810

811811
test('llm-compare LLM-specific rate limit blocks after 5 calls per 10 min (DAK-6845)', async () => {
@@ -831,7 +831,7 @@ test('llm-compare LLM-specific rate limit blocks after 5 calls per 10 min (DAK-6
831831
assert.equal(allowed.status, 200);
832832
});
833833

834-
test('llm-compare handles OpenRouter 402 gracefully (DAK-6845)', async () => {
834+
test('llm-compare handles all models failing gracefully via cascade (DAK-6944)', async () => {
835835
const store = makeStore();
836836
const resolved = makeResolved(store);
837837
const result = await handleLlmCompare(makeConfig(), store, resolved, Buffer.from(JSON.stringify({ question: 'test' })), {
@@ -840,8 +840,8 @@ test('llm-compare handles OpenRouter 402 gracefully (DAK-6845)', async () => {
840840
_callDakeraStoreBatch: noopSeed,
841841
});
842842
assert.equal(result.status, 200);
843-
assert.equal(result.without_memory.error, 'credits_exhausted');
844-
assert.equal(result.with_memory.error, 'credits_exhausted');
843+
assert.equal(result.without_memory.error, 'all_models_failed');
844+
assert.equal(result.with_memory.error, 'all_models_failed');
845845
});
846846

847847
test('llm-compare proceeds when Dakera recall fails (DAK-6845)', async () => {

0 commit comments

Comments
 (0)