Skip to content

Commit 8083b35

Browse files
author
Brendan Gray
committed
v1.8.2: Fix fallback filename detection, agentic loop completion, compact preamble stop guidance
1 parent 18caa3d commit 8083b35

8 files changed

Lines changed: 128 additions & 122 deletions

File tree

electron-main.js

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -692,7 +692,7 @@ app.on('before-quit', () => {
692692
try { log.close(); } catch (_) {} // Flush persistent log file
693693
// Fire-and-forget async cleanup with a hard deadline
694694
const cleanupDone = Promise.all([
695-
memoryStore.dispose().catch(() => {}),
695+
Promise.resolve(memoryStore.dispose()).catch(() => {}),
696696
llmEngine.dispose().catch(() => {}),
697697
]);
698698
// Give async cleanup 3 seconds max, then force exit

main/agenticChat.js

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1054,7 +1054,22 @@ function register(ctx) {
10541054
const _stitchedForMcp = _pendingPartialBlock ? _pendingPartialBlock + responseText : responseText;
10551055
_pendingPartialBlock = null;
10561056
const _fenceIdx = _stitchedForMcp.search(/```(?:json|tool_call|tool)\b/);
1057-
const _hasUnclosedToolFence = _fenceIdx !== -1 && !_stitchedForMcp.slice(_fenceIdx).includes('\n```');
1057+
let _hasUnclosedToolFence = _fenceIdx !== -1 && !_stitchedForMcp.slice(_fenceIdx).includes('\n```');
1058+
1059+
// If the unclosed fence contains a complete JSON tool call, don't treat as truncated
1060+
if (_hasUnclosedToolFence) {
1061+
const fenceContent = _stitchedForMcp.slice(_fenceIdx);
1062+
const jsonMatch = fenceContent.match(/```(?:json|tool_call|tool)\s*\n?([\s\S]*)/);
1063+
if (jsonMatch) {
1064+
try {
1065+
const parsed = JSON.parse(jsonMatch[1].trim());
1066+
if (parsed && typeof parsed.tool === 'string') {
1067+
_hasUnclosedToolFence = false;
1068+
}
1069+
} catch {}
1070+
}
1071+
}
1072+
10581073
const _wasTruncated = (
10591074
(result?.stopReason === 'maxTokens' || result?.stopReason === 'max-tokens') ||
10601075
_hasUnclosedToolFence
@@ -1219,8 +1234,9 @@ function register(ctx) {
12191234
break;
12201235
}
12211236

1222-
// Code-dump nudge
1223-
const hasCodeBlocks = /```(?:html?|css|javascript|js|typescript|ts|python|py|json)\s*\n[\s\S]{50,}/i.test(responseText);
1237+
// Code-dump nudge — only for large blocks likely to be full files
1238+
const _codeBlockMatch = responseText.match(/```(?:html?|css|javascript|js|typescript|ts|python|py|json)\s*\n([\s\S]*?)```/i);
1239+
const hasCodeBlocks = _codeBlockMatch && _codeBlockMatch[1].length > 500;
12241240
if (hasCodeBlocks && nudgesRemaining > 0 && iteration < MAX_AGENTIC_ITERATIONS - 1) {
12251241
nudgesRemaining--;
12261242
currentPrompt = {
@@ -1325,7 +1341,7 @@ function register(ctx) {
13251341
const hasBrowserAction = toolResults.results.some(tr => tr.tool?.startsWith('browser_'));
13261342
const continueInstruction = hasBrowserAction
13271343
? '\n\nThe snapshot above has [ref=N]. Use browser_click/type with ref. Output next tool call now.'
1328-
: '\n\nOutput the next tool call to make progress. Only summarize when ALL steps are complete.';
1344+
: '\n\nIf more steps are needed, output the next tool call. If the task is complete, summarize what was done — do not call more tools.';
13291345

13301346
const iterContext = executionBlock + stepDirective + taskReminder;
13311347
const allFeedback = toolFeedback + snapFeedback;

main/constants.js

Lines changed: 15 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -61,55 +61,25 @@ const DEFAULT_SYSTEM_PREAMBLE = `You are a local AI coding assistant with tools.
6161
- Browser: browser_navigate → browser_snapshot → browser_click/type using refs from snapshot
6262
- Multi-step tasks (3+ steps): use write_todos to plan, update_todo as each step completes`;
6363

64-
const DEFAULT_COMPACT_PREAMBLE = `You are a local AI coding assistant with tools. Use them to take real action — never just describe what you'd do.
64+
const DEFAULT_COMPACT_PREAMBLE = `You are a local AI coding assistant with tools. Call tools to take action — never just describe what you'd do.
6565
6666
## Tools
67-
- read_file: read a file from the project
68-
- write_file: create or OVERWRITE a file (replaces entire content)
69-
- append_to_file: add content to the end of a file without overwriting — use when building a large file across multiple calls
70-
- edit_file: modify a file using exact oldText + newText (read_file first)
71-
- list_directory: list files in a directory — use "." to list the project root
72-
- find_files: find files by name or pattern
73-
- grep_search: search file contents for a string or pattern
74-
- run_command: run a shell command (Windows PowerShell)
75-
- web_search: search for live/current external information only
76-
- fetch_webpage: fetch content from a URL
77-
- browser_navigate: open a URL in real Chrome
78-
- browser_snapshot: read the current browser page (call before clicking)
79-
- browser_click / browser_type: interact with elements by ref from snapshot
80-
- search_codebase: search indexed project code
81-
- analyze_error: analyze an error against the codebase
82-
83-
## Behavior
84-
- **Your tools are real and execute in the live environment.** Call them — do not describe what you would do instead of doing it.
85-
- **When asked to CREATE a new file that does not exist yet — call write_file immediately. Do not call list_directory or read_file first. Exploration tools are for tasks involving existing files, not for creating new ones.**
86-
- **To generate any file with multiple required sections or components (HTML page, multi-part document, large code file): always write the file section by section. Call write_file with ONLY the first section (e.g. HTML head + styles, or opening boilerplate). After it succeeds, call append_to_file with the NEXT section. Repeat until ALL sections are complete. Do NOT write the entire file in a single write_file call. One section per tool call — so every section gets full content, not an abbreviated placeholder. Never call write_file again on a file you are already building — it erases everything. Only append_to_file after the first call.**
87-
- **When your response would contain a complete file (code, markup, config, data) — call write_file. File content belongs in the filesystem, not in chat.**
88-
- **For tasks that require creating multiple files: write ONE file per tool call — do not enumerate all files or steps first, and do not output file content as prose. Call write_file immediately for the first file; after it succeeds, write the next file, and so on, one file per turn.**
89-
- **Never say you created, saved, ran, or navigated to something unless you called a tool that did it.**
90-
- **Never claim you searched for something, looked it up, or checked a source unless you actually called web_search or fetch_webpage in this response.**
91-
- **You do not know today's date or current real-world state. If asked for the date, time, or any live or time-sensitive information — call web_search immediately. Never state a current date, time, or real-world value from memory.**
92-
- Acknowledge the user's request, then call the tools needed — you have no knowledge of file contents until you read them
93-
- After tools return, explain what you found — don't just say a tool ran
94-
- After completing a tool call, always write at least one sentence confirming what was done — never end your response on a bare tool call with no acknowledgment
95-
- Never copy or repeat sentences you have already written in this response.
96-
- Ask a specific follow-up if you need more context
97-
- When asked to visit, open, navigate to, or browse a URL or website, call \`browser_navigate\` as your first action.
98-
- When asked to save, write, or store data, results, or any content to a file, call \`write_file\` to create that file.
67+
read_file, write_file, edit_file, list_directory, find_files, grep_search, run_command, web_search, fetch_webpage, browser_navigate, browser_snapshot, browser_click, browser_type, search_codebase, analyze_error, append_to_file
9968
10069
## Rules
101-
- **You have no knowledge of what any file contains until you call read_file.** Never guess or invent file contents.
102-
- **You have no knowledge of what files exist in the project until you call list_directory.** Never list, name, or assume project files from memory — always call list_directory first.
103-
- Use tools when action is required: reading files, running commands, browsing, writing code
104-
- For general knowledge questions (concepts, how-to, code explanations), write your full answer immediately — start your response with the content, not a statement about tools
105-
- When the user describes a bug, error, or unexpected behavior in their project: call read_file on the relevant file first, then diagnose — name the file
106-
- If a bug is described with no file name or error message, ask ONE clarifying question — do not call tools yet
107-
- When asked about anything that may have changed since your training — live data, current events, real-time information, or anything time-sensitive — call web_search immediately. You have real internet access. Never say you cannot access live information — use the tool. Do not use for static programming knowledge you can answer directly.
108-
- If a tool fails, retry once with corrected parameters — never give up on the first failure or invent a result
109-
- Never claim a task is done before calling the tool that completes it — writing a file requires write_file, searching requires web_search
110-
- When read_file fails with ENOENT, call find_files to locate the file by name
111-
- Tool format: {"tool":"read_file","params":{"filePath":"src/app.js"}}
112-
- For conversational messages — greetings, casual chat, simple questions — respond directly with text. No tools needed.`;
70+
- Tools execute in the live environment. Call them — do not describe what you would do.
71+
- Never say you did something unless you called the tool that did it.
72+
- You do not know file contents until you call read_file. Never guess.
73+
- You do not know what files exist until you call list_directory.
74+
- For general knowledge or concept questions, answer directly — no tools needed.
75+
- For bugs: read_file the relevant file first, then diagnose.
76+
- For live/current/time-sensitive info: call web_search. Never guess dates or current state.
77+
- To visit a URL: call browser_navigate. To read a page: browser_snapshot first.
78+
- If a tool fails, retry once with corrected parameters.
79+
- For new files: call write_file immediately.
80+
- For large files: write_file first section, then append_to_file for each remaining section.
81+
- For conversations, greetings, simple questions: respond with text, no tools needed.
82+
- Once a task is complete (file written, question answered, error explained), respond with a brief summary. Do not call more tools after the task is done.`;
11383

11484
const DEFAULT_CHAT_PREAMBLE = `Answer questions, help with code and concepts, and have normal conversations.
11585
Be concise, direct, and helpful.`;

main/llmEngine.js

Lines changed: 65 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ const TOOL_DETECT_BUFFER_MAX = 60_000;
2222
const KV_REUSE_COOLDOWN_TURNS = 2;
2323
const MAX_PARALLEL_FUNCTION_CALLS = 4;
2424
const CONTEXT_ABSOLUTE_CEILING = 131_072;
25-
const VRAM_PADDING_FLOOR_MB = 800;
25+
const VRAM_PADDING_FLOOR_MB = 0;
2626

2727
let _genCounter = 0;
2828

@@ -92,7 +92,7 @@ class LLMEngine extends EventEmitter {
9292
const modelSizeGB = (modelSizeBytes || 0) / (1024 ** 3);
9393
// Heuristic: estimate layers from model size
9494
const estLayers = modelSizeGB < 2 ? 32 : modelSizeGB < 8 ? 40 : modelSizeGB < 20 ? 48 : 80;
95-
const usableVram = Math.max(0, vramGB - 0.8); // reserve ~800MB
95+
const usableVram = Math.max(0, vramGB - VRAM_PADDING_FLOOR_MB / 1024);
9696
const fitsRatio = Math.min(1, usableVram / Math.max(0.1, modelSizeGB));
9797
const roughMaxLayers = Math.floor(estLayers * fitsRatio);
9898
return { roughMaxLayers, estimatedLayers: estLayers, vramGB, modelSizeGB };
@@ -226,36 +226,31 @@ class LLMEngine extends EventEmitter {
226226
const modelStats = fs.statSync(modelPath);
227227
const gpuConfig = this._getGPUConfig(modelStats.size);
228228

229-
// GPU mode fallback chain
229+
// GPU mode fallback chain — model LOAD + CONTEXT creation together
230230
const gpuModes = this._buildGpuModeList(gpuConfig);
231231
let loadedModel = null;
232+
let loadedContext = null;
232233
let usedGpuMode = false;
233234
let bestAutoGpuLayers = 0;
234235

235236
for (const mode of gpuModes) {
236237
if (loadSignal.aborted) throw new Error('Load cancelled');
237238
try {
238239
// Create or reuse Llama backend instance
239-
if (!this.llamaInstance || this._lastGpuMode !== mode) {
240+
const backendMode = mode === false ? false : (typeof mode === 'number' ? 'cuda' : mode);
241+
if (!this.llamaInstance || this._lastGpuMode !== backendMode) {
240242
if (this.llamaInstance) {
241243
// Don't dispose — reuse for CUDA kernel caching
242244
}
243245
this.llamaInstance = await this._withTimeout(
244246
getLlama({
245-
gpu: mode === false ? false : mode,
246-
vramPadding: (ctx) => {
247-
const padding = Math.max(VRAM_PADDING_FLOOR_MB * 1024 * 1024, ctx.totalVram * 0.05);
248-
return padding;
249-
},
250-
ramPadding: () => {
251-
const totalRam = os.totalmem();
252-
return Math.min(totalRam * 0.08, 2 * 1024 ** 3);
253-
},
247+
gpu: backendMode,
248+
vramPadding: 0,
254249
}),
255250
GPU_INIT_TIMEOUT,
256251
'GPU initialization',
257252
);
258-
this._lastGpuMode = mode;
253+
this._lastGpuMode = backendMode;
259254
}
260255

261256
this.emit('status', { state: 'loading', message: `Trying GPU mode: ${mode}...` });
@@ -274,46 +269,56 @@ class LLMEngine extends EventEmitter {
274269
'Model loading',
275270
);
276271

277-
// Check if auto mode returned fewer layers than partial fallback
272+
// Track auto mode GPU layer usage (do not reject — auto mode optimizes layer split)
278273
if (mode === 'auto' && loadedModel.gpuLayers != null) {
279274
bestAutoGpuLayers = loadedModel.gpuLayers;
280-
if (gpuConfig.roughMaxLayers > 0 && loadedModel.gpuLayers < gpuConfig.roughMaxLayers) {
281-
loadedModel.dispose?.();
282-
loadedModel = null;
283-
continue;
284-
}
275+
}
276+
277+
// Now try to create context on this model
278+
const ctxTimeout = mode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU;
279+
let maxCtx = this._computeMaxContext(gpuConfig.modelSizeGB);
280+
// CPU mode: cap context for responsive generation
281+
if (mode === false) maxCtx = Math.min(maxCtx, 8192);
282+
loadedContext = await this._withTimeout(
283+
loadedModel.createContext({
284+
contextSize: { min: 512, max: maxCtx },
285+
flashAttention: true,
286+
ignoreMemorySafetyChecks: true,
287+
failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
288+
}),
289+
ctxTimeout,
290+
'Context creation',
291+
);
292+
293+
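// Verify context is usable: reject GPU modes whose context is under 1024 tokens (presumably to leave at least 512 tokens after the system prompt); CPU mode is accepted as-is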
294+
const actualCtx = loadedContext.contextSize || 0;
295+
if (actualCtx < 1024 && mode !== false) {
296+
const log = require('./logger');
297+
log.warn(`GPU mode ${mode} context too small (${actualCtx}), trying next mode`);
298+
loadedContext.dispose?.();
299+
loadedContext = null;
300+
loadedModel.dispose?.();
301+
loadedModel = null;
302+
continue;
285303
}
286304

287305
usedGpuMode = mode;
288306
break;
289307
} catch (err) {
290308
const log = require('./logger');
291309
log.warn(`GPU mode ${mode} failed: ${err.message}`);
292-
loadedModel = null;
310+
if (loadedModel) { loadedModel.dispose?.(); loadedModel = null; }
311+
if (loadedContext) { loadedContext.dispose?.(); loadedContext = null; }
293312
}
294313
}
295314

296-
if (!loadedModel) throw new Error(`Failed to load model from ${modelPath} on any GPU mode`);
297-
if (loadSignal.aborted) { loadedModel.dispose?.(); throw new Error('Load cancelled'); }
315+
if (!loadedModel || !loadedContext) throw new Error(`Failed to load model from ${modelPath} on any GPU mode`);
316+
if (loadSignal.aborted) { loadedContext.dispose?.(); loadedModel.dispose?.(); throw new Error('Load cancelled'); }
298317

299318
this.model = loadedModel;
319+
this.context = loadedContext;
300320
this.currentModelPath = modelPath;
301321

302-
// Context creation with retry/shrink
303-
const ctxTimeout = usedGpuMode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU;
304-
const maxCtx = this._computeMaxContext(gpuConfig.modelSizeGB);
305-
306-
this.context = await this._withTimeout(
307-
this.model.createContext({
308-
contextSize: { min: 2048, max: maxCtx },
309-
flashAttention: true,
310-
ignoreMemorySafetyChecks: usedGpuMode === false,
311-
failedCreationRemedy: { retries: 8, autoContextSizeShrink: 0.5 },
312-
}),
313-
ctxTimeout,
314-
'Context creation',
315-
);
316-
317322
// Reject GPU context if too small
318323
if (this.requireMinContextForGpu && usedGpuMode !== false) {
319324
const actualCtxSize = this.context.contextSize || 0;
@@ -415,6 +420,11 @@ class LLMEngine extends EventEmitter {
415420
const modes = ['cuda', 'auto'];
416421
if (gpuConfig.roughMaxLayers > 0) {
417422
modes.push(gpuConfig.roughMaxLayers);
423+
// Add partial GPU modes (half layers, quarter layers) for small VRAM GPUs
424+
const half = Math.floor(gpuConfig.roughMaxLayers / 2);
425+
const quarter = Math.floor(gpuConfig.roughMaxLayers / 4);
426+
if (half >= 4) modes.push(half);
427+
if (quarter >= 4 && quarter !== half) modes.push(quarter);
418428
}
419429
modes.push(false); // CPU fallback
420430
return modes;
@@ -711,7 +721,9 @@ class LLMEngine extends EventEmitter {
711721

712722
// Context overflow detection
713723
const msg = (err.message || '').toLowerCase();
724+
console.error(`[LLM] Generation error (non-abort): name=${err.name}, message=${err.message}, stack=${err.stack?.split('\n').slice(0,3).join(' | ')}`);
714725
if (msg.includes('compress') || msg.includes('context') || msg.includes('too long')) {
726+
console.error(`[LLM] Treating as CONTEXT_OVERFLOW (matched: ${msg.includes('compress') ? 'compress' : msg.includes('context') ? 'context' : 'too long'})`);
715727
const summary = this.getConversationSummary();
716728
this.resetSession(true);
717729
const overflowErr = new Error(`CONTEXT_OVERFLOW:${summary}`);
@@ -1012,17 +1024,23 @@ class LLMEngine extends EventEmitter {
10121024

10131025
// ─── Session Management ───
10141026
async resetSession(useCompactPrompt = false) {
1027+
// Wait for any in-flight model load to finish first
1028+
if (this._initializingPromise) {
1029+
try { await this._initializingPromise; } catch {}
1030+
}
1031+
1032+
if (!this.model || !this.isReady) {
1033+
throw new Error('Cannot reset session — no model loaded');
1034+
}
1035+
10151036
// Check if context is still usable
10161037
if (!this.context || this.context._disposed) {
1017-
if (this.model) {
1018-
this.context = await this.model.createContext({
1019-
contextSize: { min: 2048, max: this._computeMaxContext(0) },
1020-
flashAttention: true,
1021-
failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
1022-
});
1023-
} else {
1024-
throw new Error('Cannot reset session — no model loaded');
1025-
}
1038+
this.context = await this.model.createContext({
1039+
contextSize: { min: 512, max: this._computeMaxContext(0) },
1040+
flashAttention: true,
1041+
ignoreMemorySafetyChecks: true,
1042+
failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
1043+
});
10261044
}
10271045

10281046
// Dispose old chat

0 commit comments

Comments
 (0)