Skip to content

Commit 9d207d5

Browse files
author
Brendan Gray
committed
fix: Issue1 toolFeedback \n\n boundary + Issue2 partial GPU layer fallback
1 parent e4e6796 commit 9d207d5

2 files changed

Lines changed: 45 additions & 10 deletions

File tree

main/agenticChat.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2289,6 +2289,11 @@ function register(ctx) {
22892289
// of each iteration's prompt (before tool results) instead of appended at the end.
22902290
// This ensures the model sees ground truth FIRST, not as an afterthought.
22912291

2292+
// Normalize toolFeedback to end with \n\n so ChatPanel's trailingProse regex
2293+
// finds a clean paragraph boundary before model synthesis. Without this, the
2294+
// synthesis paragraph is consumed by the tool section stripper.
2295+
if (!toolFeedback.endsWith('\n\n')) toolFeedback = toolFeedback.trimEnd() + '\n\n';
2296+
22922297
// Send progress update to UI
22932298
if (mainWindow) {
22942299
mainWindow.webContents.send('llm-token', toolFeedback);

main/llmEngine.js

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class LLMEngine extends EventEmitter {
5454
this._initializingPromise = null; // Tracks in-flight initialize() for serialization (prevents native C++ double-op crash)
5555
this.gpuInfo = null;
5656
this.gpuPreference = 'auto'; // 'auto' = prefer GPU, 'cpu' = force CPU only
57+
this.requireMinContextForGpu = false; // if true: discard GPU load when context < 4096 and retry CPU for more context
5758
this.reasoningEffort = 'medium'; // 'low', 'medium', 'high'
5859
this.thoughtTokenBudget = 2048; // Updated from ModelProfile after model load
5960

@@ -221,7 +222,8 @@ class LLMEngine extends EventEmitter {
221222
// Uses node-llama-cpp's gpuLayers: "auto" — automatically detects available VRAM
222223
// and offloads the optimal number of layers. ONE load attempt, not 7+.
223224
// This is exactly how LM Studio achieves instant loads.
224-
const gpuModes = this.gpuPreference === 'cpu' ? [false] : ['auto', false];
225+
// Declared with `let` because the list may gain a partial-layer fallback after the nvidia-smi VRAM check below
226+
let gpuModes = this.gpuPreference === 'cpu' ? [false] : ['auto', false];
225227

226228
// Detect real dedicated VRAM via nvidia-smi BEFORE calling getLlama().
227229
// Problem: Vulkan on systems with GTT/shared memory reports dedicated VRAM + system RAM
@@ -245,6 +247,20 @@ class LLMEngine extends EventEmitter {
245247
}
246248
}
247249

250+
// If the model is too large for full GPU offload, insert a partial layer fallback
251+
// between 'auto' and false so we get partial offload instead of pure CPU.
252+
// LM Studio does the same. Without this, auto fails → 0 layers every time.
253+
if (this.gpuPreference !== 'cpu' && nvidiaDedicatedVramBytes > 0) {
254+
const usableVram = nvidiaDedicatedVramBytes * 0.75; // 75%: leaves 25% for KV cache
255+
if (modelStats.size > usableVram) {
256+
const fraction = usableVram / modelStats.size;
257+
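// 80 is a heuristic ceiling chosen to cover large models (up to ~200B), not the model's actual layer count; for models with fewer layers this over-estimates how many layers fit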
258+
const partialLayers = Math.max(1, Math.floor(80 * fraction));
259+
console.log(`[LLM] Model (${(modelStats.size/(1024**3)).toFixed(1)}GB) exceeds usable VRAM (${(usableVram/(1024**3)).toFixed(1)}GB) — partial fallback: ${partialLayers} layers (~${(fraction*100).toFixed(0)}% offloaded)`);
260+
gpuModes = ['auto', partialLayers, false];
261+
}
262+
}
263+
248264
let gpuLayers = 0;
249265
let contextSize = 8192;
250266
let gpuMode = 'auto';
@@ -256,16 +272,18 @@ class LLMEngine extends EventEmitter {
256272

257273
this.emit('status', {
258274
state: 'loading',
259-
message: tryGpuMode === 'auto' ? 'Initializing GPU...' : 'Falling back to CPU...',
275+
message: tryGpuMode === 'auto' ? 'Initializing GPU...' : (typeof tryGpuMode === 'number' ? `Trying partial GPU (${tryGpuMode} layers)...` : 'Falling back to CPU...'),
260276
progress: 0.05
261277
});
262278

263279
if (this.model) { try { await this.model.dispose(); } catch(e) {} this.model = null; }
264280
if (this.context) { try { await this.context.dispose(); } catch(e) {} this.context = null; }
265281

266282
try {
267-
// Reuse existing llama instance if same GPU mode (skip expensive CUDA init)
268-
const canReuse = this.llamaInstance && this._lastGpuMode === tryGpuMode;
283+
// Reuse existing llama instance if same GPU mode (skip expensive CUDA init).
284+
// Numeric fallback modes reuse the 'auto' instance — same GPU backend, different gpuLayers.
285+
const canReuse = this.llamaInstance &&
286+
(this._lastGpuMode === tryGpuMode || (typeof tryGpuMode === 'number' && this._lastGpuMode === 'auto'));
269287
if (canReuse) {
270288
console.log(`[LLM] Reusing existing llama instance (gpu=${tryGpuMode})`);
271289
} else {
@@ -276,7 +294,9 @@ class LLMEngine extends EventEmitter {
276294
// (GTT/shared memory case), cap the usable budget to real dedicated VRAM only.
277295
// Otherwise gpuLayers:'auto' over-allocates onto non-existent memory and fails.
278296
this.llamaInstance = await this._withTimeout(getLlama({
279-
gpu: tryGpuMode,
297+
// Numeric modes still use gpu:'auto' for backend init — the layer count
298+
// is passed to loadModel, not getLlama. Only false disables GPU entirely.
299+
gpu: (tryGpuMode === false) ? false : 'auto',
280300
vramPadding: (totalVram) => {
281301
// Use nvidia-smi value if Vulkan is reporting GTT-inflated total
282302
const effectiveBudget = (nvidiaDedicatedVramBytes > 0 && nvidiaDedicatedVramBytes < totalVram * 0.7)
@@ -316,7 +336,7 @@ class LLMEngine extends EventEmitter {
316336
// internally). Do NOT convert to a file:// URL here — that breaks path.resolve.
317337
this.model = await this._withTimeout(this.llamaInstance.loadModel({
318338
modelPath: modelPath,
319-
gpuLayers: tryGpuMode === 'auto' ? 'auto' : 0,
339+
gpuLayers: tryGpuMode === 'auto' ? 'auto' : (typeof tryGpuMode === 'number' ? tryGpuMode : 0),
320340
defaultContextFlashAttention: true,
321341
useMmap: true,
322342
onLoadProgress: (progress) => {
@@ -328,6 +348,11 @@ class LLMEngine extends EventEmitter {
328348
// Read actual GPU layers from the loaded model
329349
try { gpuLayers = this.model.gpuLayers ?? 0; } catch (_) { gpuLayers = 0; }
330350
console.log(`[LLM] Model loaded: ${gpuLayers} GPU layers (mode: ${tryGpuMode})`);
351+
// If auto returned 0 layers and we have a partial fallback waiting, skip to it
352+
if (tryGpuMode === 'auto' && gpuLayers === 0 && gpuModes.some(m => typeof m === 'number')) {
353+
console.log('[LLM] Auto returned 0 GPU layers — skipping to partial layer fallback');
354+
continue;
355+
}
331356
} catch (loadErr) {
332357
console.log(`[LLM] Model load (gpu=${tryGpuMode}) failed: ${loadErr.message?.substring(0, 120)}`);
333358
continue;
@@ -378,11 +403,11 @@ class LLMEngine extends EventEmitter {
378403
}
379404
}
380405

381-
// If context is critically small (< 4096), don't accept — fall through to CPU.
382-
// 4096 is the absolute minimum for any useful agentic chat with tool definitions.
406+
// If context is critically small (< 4096), optionally fall through to CPU.
407+
// Controlled by requireMinContextForGpu setting (default: false = always keep GPU).
383408
const MIN_AGENTIC_CONTEXT = 4096;
384-
if (success && contextSize < MIN_AGENTIC_CONTEXT && tryGpuMode !== false) {
385-
console.log(`[LLM] GPU context too small (${contextSize} < ${MIN_AGENTIC_CONTEXT}) — retrying with CPU for larger context`);
409+
if (this.requireMinContextForGpu && success && contextSize < MIN_AGENTIC_CONTEXT && tryGpuMode !== false) {
410+
console.log(`[LLM] GPU context too small (${contextSize} < ${MIN_AGENTIC_CONTEXT}) — requireMinContextForGpu=true, retrying with CPU for larger context`);
386411
success = false;
387412
if (this.context) { try { await this.context.dispose(); } catch(e) {} this.context = null; }
388413
}
@@ -1509,6 +1534,11 @@ PERSISTENCE:
15091534
}
15101535
}
15111536

1537+
setRequireMinContextForGpu(val) {
1538+
this.requireMinContextForGpu = !!val;
1539+
console.log(`[LLM] requireMinContextForGpu set to: ${this.requireMinContextForGpu}`);
1540+
}
1541+
15121542
updateParams(params) {
15131543
this.defaultParams = { ...this.defaultParams, ...params };
15141544
}

0 commit comments

Comments
 (0)