FileShot
diff --git a/electron-main.js b/electron-main.js
Lines changed: 1 addition & 1 deletion
diff --git a/main/agenticChat.js b/main/agenticChat.js
Lines changed: 20 additions & 4 deletions
diff --git a/main/constants.js b/main/constants.js
Lines changed: 15 additions & 45 deletions
diff --git a/main/llmEngine.js b/main/llmEngine.js
Lines changed: 65 additions & 47 deletions
@@ -692,7 +692,7 @@ app.on('before-quit', () => {
   try { log.close(); } catch (_) {} // Flush persistent log file
   // Fire-and-forget async cleanup with a hard deadline
   const cleanupDone = Promise.all([
-    memoryStore.dispose().catch(() => {}),
+    Promise.resolve(memoryStore.dispose()).catch(() => {}),
     llmEngine.dispose().catch(() => {}),
   ]);
   // Give async cleanup 3 seconds max, then force exit
 
@@ -1054,7 +1054,22 @@ function register(ctx) {
       const _stitchedForMcp = _pendingPartialBlock ? _pendingPartialBlock + responseText : responseText;
       _pendingPartialBlock = null;
       const _fenceIdx = _stitchedForMcp.search(/```(?:json|tool_call|tool)\b/);
-      const _hasUnclosedToolFence = _fenceIdx !== -1 && !_stitchedForMcp.slice(_fenceIdx).includes('\n```');
+      let _hasUnclosedToolFence = _fenceIdx !== -1 && !_stitchedForMcp.slice(_fenceIdx).includes('\n```');
+
+      // If the unclosed fence contains a complete JSON tool call, don't treat as truncated
+      if (_hasUnclosedToolFence) {
+        const fenceContent = _stitchedForMcp.slice(_fenceIdx);
+        const jsonMatch = fenceContent.match(/```(?:json|tool_call|tool)\s*\n?([\s\S]*)/);
+        if (jsonMatch) {
+          try {
+            const parsed = JSON.parse(jsonMatch[1].trim());
+            if (parsed && typeof parsed.tool === 'string') {
+              _hasUnclosedToolFence = false;
+            }
+          } catch {}
+        }
+      }
+
       const _wasTruncated = (
         (result?.stopReason === 'maxTokens' || result?.stopReason === 'max-tokens') ||
         _hasUnclosedToolFence
@@ -1219,8 +1234,9 @@ function register(ctx) {
           break;
         }
 
-        // Code-dump nudge
-        const hasCodeBlocks = /```(?:html?|css|javascript|js|typescript|ts|python|py|json)\s*\n[\s\S]{50,}/i.test(responseText);
+        // Code-dump nudge — only for large blocks likely to be full files
+        const _codeBlockMatch = responseText.match(/```(?:html?|css|javascript|js|typescript|ts|python|py|json)\s*\n([\s\S]*?)```/i);
+        const hasCodeBlocks = _codeBlockMatch && _codeBlockMatch[1].length > 500;
         if (hasCodeBlocks && nudgesRemaining > 0 && iteration < MAX_AGENTIC_ITERATIONS - 1) {
           nudgesRemaining--;
           currentPrompt = {
@@ -1325,7 +1341,7 @@ function register(ctx) {
       const hasBrowserAction = toolResults.results.some(tr => tr.tool?.startsWith('browser_'));
       const continueInstruction = hasBrowserAction
         ? '\n\nThe snapshot above has [ref=N]. Use browser_click/type with ref. Output next tool call now.'
-        : '\n\nOutput the next tool call to make progress. Only summarize when ALL steps are complete.';
+        : '\n\nIf more steps are needed, output the next tool call. If the task is complete, summarize what was done — do not call more tools.';
 
       const iterContext = executionBlock + stepDirective + taskReminder;
       const allFeedback = toolFeedback + snapFeedback;
 
@@ -61,55 +61,25 @@ const DEFAULT_SYSTEM_PREAMBLE = `You are a local AI coding assistant with tools.
 - Browser: browser_navigate → browser_snapshot → browser_click/type using refs from snapshot
 - Multi-step tasks (3+ steps): use write_todos to plan, update_todo as each step completes`;
 
-const DEFAULT_COMPACT_PREAMBLE = `You are a local AI coding assistant with tools. Use them to take real action — never just describe what you'd do.
+const DEFAULT_COMPACT_PREAMBLE = `You are a local AI coding assistant with tools. Call tools to take action — never just describe what you'd do.
 
 ## Tools
-- read_file: read a file from the project
-- write_file: create or OVERWRITE a file (replaces entire content)
-- append_to_file: add content to the end of a file without overwriting — use when building a large file across multiple calls
-- edit_file: modify a file using exact oldText + newText (read_file first)
-- list_directory: list files in a directory — use "." to list the project root
-- find_files: find files by name or pattern
-- grep_search: search file contents for a string or pattern
-- run_command: run a shell command (Windows PowerShell)
-- web_search: search for live/current external information only
-- fetch_webpage: fetch content from a URL
-- browser_navigate: open a URL in real Chrome
-- browser_snapshot: read the current browser page (call before clicking)
-- browser_click / browser_type: interact with elements by ref from snapshot
-- search_codebase: search indexed project code
-- analyze_error: analyze an error against the codebase
-
-## Behavior
-- **Your tools are real and execute in the live environment.** Call them — do not describe what you would do instead of doing it.
-- **When asked to CREATE a new file that does not exist yet — call write_file immediately. Do not call list_directory or read_file first. Exploration tools are for tasks involving existing files, not for creating new ones.**
-- **To generate any file with multiple required sections or components (HTML page, multi-part document, large code file): always write the file section by section. Call write_file with ONLY the first section (e.g. HTML head + styles, or opening boilerplate). After it succeeds, call append_to_file with the NEXT section. Repeat until ALL sections are complete. Do NOT write the entire file in a single write_file call. One section per tool call — so every section gets full content, not an abbreviated placeholder. Never call write_file again on a file you are already building — it erases everything. Only append_to_file after the first call.**
-- **When your response would contain a complete file (code, markup, config, data) — call write_file. File content belongs in the filesystem, not in chat.**
-- **For tasks that require creating multiple files: write ONE file per tool call — do not enumerate all files or steps first, and do not output file content as prose. Call write_file immediately for the first file; after it succeeds, write the next file, and so on, one file per turn.**
-- **Never say you created, saved, ran, or navigated to something unless you called a tool that did it.**
-- **Never claim you searched for something, looked it up, or checked a source unless you actually called web_search or fetch_webpage in this response.**
-- **You do not know today's date or current real-world state. If asked for the date, time, or any live or time-sensitive information — call web_search immediately. Never state a current date, time, or real-world value from memory.**
-- Acknowledge the user's request, then call the tools needed — you have no knowledge of file contents until you read them
-- After tools return, explain what you found — don't just say a tool ran
-- After completing a tool call, always write at least one sentence confirming what was done — never end your response on a bare tool call with no acknowledgment
-- Never copy or repeat sentences you have already written in this response.
-- Ask a specific follow-up if you need more context
-- When asked to visit, open, navigate to, or browse a URL or website, call \`browser_navigate\` as your first action.
-- When asked to save, write, or store data, results, or any content to a file, call \`write_file\` to create that file.
+read_file, write_file, edit_file, list_directory, find_files, grep_search, run_command, web_search, fetch_webpage, browser_navigate, browser_snapshot, browser_click, browser_type, search_codebase, analyze_error, append_to_file
 
 ## Rules
-- **You have no knowledge of what any file contains until you call read_file.** Never guess or invent file contents.
-- **You have no knowledge of what files exist in the project until you call list_directory.** Never list, name, or assume project files from memory — always call list_directory first.
-- Use tools when action is required: reading files, running commands, browsing, writing code
-- For general knowledge questions (concepts, how-to, code explanations), write your full answer immediately — start your response with the content, not a statement about tools
-- When the user describes a bug, error, or unexpected behavior in their project: call read_file on the relevant file first, then diagnose — name the file
-- If a bug is described with no file name or error message, ask ONE clarifying question — do not call tools yet
-- When asked about anything that may have changed since your training — live data, current events, real-time information, or anything time-sensitive — call web_search immediately. You have real internet access. Never say you cannot access live information — use the tool. Do not use for static programming knowledge you can answer directly.
-- If a tool fails, retry once with corrected parameters — never give up on the first failure or invent a result
-- Never claim a task is done before calling the tool that completes it — writing a file requires write_file, searching requires web_search
-- When read_file fails with ENOENT, call find_files to locate the file by name
-- Tool format: {"tool":"read_file","params":{"filePath":"src/app.js"}}
-- For conversational messages — greetings, casual chat, simple questions — respond directly with text. No tools needed.`;
+- Tools execute in the live environment. Call them — do not describe what you would do.
+- Never say you did something unless you called the tool that did it.
+- You do not know file contents until you call read_file. Never guess.
+- You do not know what files exist until you call list_directory.
+- For general knowledge or concept questions, answer directly — no tools needed.
+- For bugs: read_file the relevant file first, then diagnose.
+- For live/current/time-sensitive info: call web_search. Never guess dates or current state.
+- To visit a URL: call browser_navigate. To read a page: browser_snapshot first.
+- If a tool fails, retry once with corrected parameters.
+- For new files: call write_file immediately.
+- For large files: write_file first section, then append_to_file for each remaining section.
+- For conversations, greetings, simple questions: respond with text, no tools needed.
+- Once a task is complete (file written, question answered, error explained), respond with a brief summary. Do not call more tools after the task is done.`;
 
 const DEFAULT_CHAT_PREAMBLE = `Answer questions, help with code and concepts, and have normal conversations.
 Be concise, direct, and helpful.`;
 
@@ -22,7 +22,7 @@ const TOOL_DETECT_BUFFER_MAX = 60_000;
 const KV_REUSE_COOLDOWN_TURNS = 2;
 const MAX_PARALLEL_FUNCTION_CALLS = 4;
 const CONTEXT_ABSOLUTE_CEILING = 131_072;
-const VRAM_PADDING_FLOOR_MB = 800;
+const VRAM_PADDING_FLOOR_MB = 0;
 
 let _genCounter = 0;
 
@@ -92,7 +92,7 @@ class LLMEngine extends EventEmitter {
       const modelSizeGB = (modelSizeBytes || 0) / (1024 ** 3);
       // Heuristic: estimate layers from model size
       const estLayers = modelSizeGB < 2 ? 32 : modelSizeGB < 8 ? 40 : modelSizeGB < 20 ? 48 : 80;
-      const usableVram = Math.max(0, vramGB - 0.8); // reserve ~800MB
+      const usableVram = Math.max(0, vramGB - VRAM_PADDING_FLOOR_MB / 1024);
       const fitsRatio = Math.min(1, usableVram / Math.max(0.1, modelSizeGB));
       const roughMaxLayers = Math.floor(estLayers * fitsRatio);
       return { roughMaxLayers, estimatedLayers: estLayers, vramGB, modelSizeGB };
@@ -226,36 +226,31 @@ class LLMEngine extends EventEmitter {
       const modelStats = fs.statSync(modelPath);
       const gpuConfig = this._getGPUConfig(modelStats.size);
 
-      // GPU mode fallback chain
+      // GPU mode fallback chain — model LOAD + CONTEXT creation together
       const gpuModes = this._buildGpuModeList(gpuConfig);
       let loadedModel = null;
+      let loadedContext = null;
       let usedGpuMode = false;
       let bestAutoGpuLayers = 0;
 
       for (const mode of gpuModes) {
         if (loadSignal.aborted) throw new Error('Load cancelled');
         try {
           // Create or reuse Llama backend instance
-          if (!this.llamaInstance || this._lastGpuMode !== mode) {
+          const backendMode = mode === false ? false : (typeof mode === 'number' ? 'cuda' : mode);
+          if (!this.llamaInstance || this._lastGpuMode !== backendMode) {
             if (this.llamaInstance) {
               // Don't dispose — reuse for CUDA kernel caching
             }
             this.llamaInstance = await this._withTimeout(
               getLlama({
-                gpu: mode === false ? false : mode,
-                vramPadding: (ctx) => {
-                  const padding = Math.max(VRAM_PADDING_FLOOR_MB * 1024 * 1024, ctx.totalVram * 0.05);
-                  return padding;
-                },
-                ramPadding: () => {
-                  const totalRam = os.totalmem();
-                  return Math.min(totalRam * 0.08, 2 * 1024 ** 3);
-                },
+                gpu: backendMode,
+                vramPadding: 0,
               }),
               GPU_INIT_TIMEOUT,
               'GPU initialization',
             );
-            this._lastGpuMode = mode;
+            this._lastGpuMode = backendMode;
           }
 
           this.emit('status', { state: 'loading', message: `Trying GPU mode: ${mode}...` });
@@ -274,46 +269,56 @@ class LLMEngine extends EventEmitter {
             'Model loading',
           );
 
-          // Check if auto mode returned fewer layers than partial fallback
+          // Track auto mode GPU layer usage (do not reject — auto mode optimizes layer split)
           if (mode === 'auto' && loadedModel.gpuLayers != null) {
             bestAutoGpuLayers = loadedModel.gpuLayers;
-            if (gpuConfig.roughMaxLayers > 0 && loadedModel.gpuLayers < gpuConfig.roughMaxLayers) {
-              loadedModel.dispose?.();
-              loadedModel = null;
-              continue;
-            }
+          }
+
+          // Now try to create context on this model
+          const ctxTimeout = mode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU;
+          let maxCtx = this._computeMaxContext(gpuConfig.modelSizeGB);
+          // CPU mode: cap context for responsive generation
+          if (mode === false) maxCtx = Math.min(maxCtx, 8192);
+          loadedContext = await this._withTimeout(
+            loadedModel.createContext({
+              contextSize: { min: 512, max: maxCtx },
+              flashAttention: true,
+              ignoreMemorySafetyChecks: true,
+              failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
+            }),
+            ctxTimeout,
+            'Context creation',
+          );
+
+          // Verify the context is usable: a GPU-mode context under 1024 tokens is rejected (leaves at least 512 tokens after the system prompt)
+          const actualCtx = loadedContext.contextSize || 0;
+          if (actualCtx < 1024 && mode !== false) {
+            const log = require('./logger');
+            log.warn(`GPU mode ${mode} context too small (${actualCtx}), trying next mode`);
+            loadedContext.dispose?.();
+            loadedContext = null;
+            loadedModel.dispose?.();
+            loadedModel = null;
+            continue;
           }
 
           usedGpuMode = mode;
           break;
         } catch (err) {
           const log = require('./logger');
           log.warn(`GPU mode ${mode} failed: ${err.message}`);
-          loadedModel = null;
+          if (loadedModel) { loadedModel.dispose?.(); loadedModel = null; }
+          if (loadedContext) { loadedContext.dispose?.(); loadedContext = null; }
         }
       }
 
-      if (!loadedModel) throw new Error(`Failed to load model from ${modelPath} on any GPU mode`);
-      if (loadSignal.aborted) { loadedModel.dispose?.(); throw new Error('Load cancelled'); }
+      if (!loadedModel || !loadedContext) throw new Error(`Failed to load model from ${modelPath} on any GPU mode`);
+      if (loadSignal.aborted) { loadedContext.dispose?.(); loadedModel.dispose?.(); throw new Error('Load cancelled'); }
 
       this.model = loadedModel;
+      this.context = loadedContext;
       this.currentModelPath = modelPath;
 
-      // Context creation with retry/shrink
-      const ctxTimeout = usedGpuMode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU;
-      const maxCtx = this._computeMaxContext(gpuConfig.modelSizeGB);
-
-      this.context = await this._withTimeout(
-        this.model.createContext({
-          contextSize: { min: 2048, max: maxCtx },
-          flashAttention: true,
-          ignoreMemorySafetyChecks: usedGpuMode === false,
-          failedCreationRemedy: { retries: 8, autoContextSizeShrink: 0.5 },
-        }),
-        ctxTimeout,
-        'Context creation',
-      );
-
       // Reject GPU context if too small
       if (this.requireMinContextForGpu && usedGpuMode !== false) {
         const actualCtxSize = this.context.contextSize || 0;
@@ -415,6 +420,11 @@ class LLMEngine extends EventEmitter {
     const modes = ['cuda', 'auto'];
     if (gpuConfig.roughMaxLayers > 0) {
       modes.push(gpuConfig.roughMaxLayers);
+      // Add partial GPU modes (half layers, quarter layers) for small VRAM GPUs
+      const half = Math.floor(gpuConfig.roughMaxLayers / 2);
+      const quarter = Math.floor(gpuConfig.roughMaxLayers / 4);
+      if (half >= 4) modes.push(half);
+      if (quarter >= 4 && quarter !== half) modes.push(quarter);
     }
     modes.push(false); // CPU fallback
     return modes;
@@ -711,7 +721,9 @@ class LLMEngine extends EventEmitter {
 
     // Context overflow detection
     const msg = (err.message || '').toLowerCase();
+    console.error(`[LLM] Generation error (non-abort): name=${err.name}, message=${err.message}, stack=${err.stack?.split('\n').slice(0,3).join(' | ')}`);
     if (msg.includes('compress') || msg.includes('context') || msg.includes('too long')) {
+      console.error(`[LLM] Treating as CONTEXT_OVERFLOW (matched: ${msg.includes('compress') ? 'compress' : msg.includes('context') ? 'context' : 'too long'})`);
       const summary = this.getConversationSummary();
       this.resetSession(true);
       const overflowErr = new Error(`CONTEXT_OVERFLOW:${summary}`);
@@ -1012,17 +1024,23 @@ class LLMEngine extends EventEmitter {
 
   // ─── Session Management ───
   async resetSession(useCompactPrompt = false) {
+    // Wait for any in-flight model load to finish first
+    if (this._initializingPromise) {
+      try { await this._initializingPromise; } catch {}
+    }
+
+    if (!this.model || !this.isReady) {
+      throw new Error('Cannot reset session — no model loaded');
+    }
+
     // Check if context is still usable
     if (!this.context || this.context._disposed) {
-      if (this.model) {
-        this.context = await this.model.createContext({
-          contextSize: { min: 2048, max: this._computeMaxContext(0) },
-          flashAttention: true,
-          failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
-        });
-      } else {
-        throw new Error('Cannot reset session — no model loaded');
-      }
+      this.context = await this.model.createContext({
+        contextSize: { min: 512, max: this._computeMaxContext(0) },
+        flashAttention: true,
+        ignoreMemorySafetyChecks: true,
+        failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
+      });
     }
 
     // Dispose old chat