v1.8.37: Fix 6 regression groups from v1.8.2 (context creation, streaming display, overlap dedup, duplicate code blocks, compaction, GPT profile)

Brendan Gray · Brendan Gray · commit a05a260037ab · 2026-03-13T18:16:31.000-04:00
diff --git a/main/agenticChat.js b/main/agenticChat.js
@@ -1439,11 +1439,10 @@ function register(ctx) {
       displayResponseText += displayChunk;
 
       // Correct UI stream buffer: the overlapping tokens were already streamed
-      // during generation. Trim them by resetting to iteration start and
-      // re-sending just the de-duplicated new content.
+      // during generation. Replace the current iteration's display with the
+      // de-duplicated content instead of resetting (avoids a jarring visual flash).
       if (_overlapLen > 0 && mainWindow && !mainWindow.isDestroyed()) {
-        mainWindow.webContents.send('llm-stream-reset');
-        if (displayChunk) mainWindow.webContents.send('llm-token', displayChunk);
+        mainWindow.webContents.send('llm-replace-last', displayChunk);
       }
 
       // ── SEAMLESS CONTINUATION — stitch for MCP tool detection ──
@@ -1751,6 +1750,16 @@ function register(ctx) {
               systemContext: currentPrompt.systemContext,
               userMessage: continuationMsg,
             };
+
+            // Sync frontend buffer before continuation: strip tool-fence fragments
+            // from raw streamed tokens so the committed message doesn't contain broken
+            // code fences. The overlap-based llm-replace-last (line ~1444) only fires
+            // when _overlapLen > 0. For the first pass or when overlap detection fails,
+            // raw partial fences persist in the frontend buffer. Fix that here.
+            if (_overlapLen === 0 && mainWindow && !mainWindow.isDestroyed()) {
+              mainWindow.webContents.send('llm-replace-last', displayChunk);
+            }
+
             continue;
           }
         }
diff --git a/main/agenticChatHelpers.js b/main/agenticChatHelpers.js
@@ -369,7 +369,17 @@ function progressiveContextCompaction(options) {
       }
     }
     if (newFullResponseText.length > 15000) {
-      newFullResponseText = newFullResponseText.substring(newFullResponseText.length - 15000);
+      // Find a paragraph or line boundary near the truncation point instead of
+      // slicing blindly through code blocks or sentences.
+      const target = newFullResponseText.length - 15000;
+      let cutPoint = newFullResponseText.indexOf('\n\n', target);
+      if (cutPoint === -1 || cutPoint > target + 500) {
+        cutPoint = newFullResponseText.indexOf('\n', target);
+      }
+      if (cutPoint === -1 || cutPoint > target + 500) {
+        cutPoint = target;
+      }
+      newFullResponseText = newFullResponseText.substring(cutPoint);
       pruned++;
     }
     if (chatHistory) pruned += pruneVerboseHistory(chatHistory, 2);
diff --git a/main/llmEngine.js b/main/llmEngine.js
@@ -10,7 +10,7 @@ const { detectFamily, detectParamSize } = require('./modelDetection');
 const { sanitizeResponse } = require('./sanitize');
 
 // ─── Constants ───
-const STALL_TIMEOUT_MS = 45_000;
+const STALL_TIMEOUT_MS = 90_000;
 const MAX_HISTORY_ENTRIES = 40;
 const GPU_INIT_TIMEOUT = 120_000;
 const MODEL_LOAD_TIMEOUT = 180_000;
@@ -23,7 +23,7 @@ const TOOL_DETECT_BUFFER_MAX = 60_000;
 const KV_REUSE_COOLDOWN_TURNS = 2;
 const MAX_PARALLEL_FUNCTION_CALLS = 4;
 const CONTEXT_ABSOLUTE_CEILING = 131_072;
-const VRAM_PADDING_FLOOR_MB = 0;
+const VRAM_PADDING_FLOOR_MB = 800;
 
 let _genCounter = 0;
 
@@ -276,7 +276,14 @@ class LLMEngine extends EventEmitter {
             this.llamaInstance = await this._withTimeout(
               getLlama({
                 gpu: backendMode,
-                vramPadding: 0,
+                vramPadding: (totalVram) => {
+                  // node-llama-cpp passes total VRAM (bytes) as a plain number; reserve 5% of it, never less than the floor.
+                  return Math.max(VRAM_PADDING_FLOOR_MB * 1024 * 1024, totalVram * 0.05);
+                },
+                ramPadding: () => {
+                  const totalRam = os.totalmem();
+                  return Math.min(totalRam * 0.08, 2 * 1024 ** 3);
+                },
               }),
               GPU_INIT_TIMEOUT,
               'GPU initialization',
@@ -320,14 +327,14 @@ class LLMEngine extends EventEmitter {
           const ctxTimeout = mode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU;
           let maxCtx = this._computeMaxContext(gpuConfig.modelSizeGB);
           // CPU mode uses same RAM-based context sizing as GPU — no artificial cap
-          const contextMin = MIN_USABLE_GPU_CONTEXT;
+          const contextMin = (mode === false) ? 2048 : MIN_USABLE_GPU_CONTEXT;
           console.log(`[LLM DIAG] Context creation: mode=${mode}, maxCtx=${maxCtx}, contextMin=${contextMin}, modelSizeGB=${gpuConfig.modelSizeGB.toFixed(2)}`);
           loadedContext = await this._withTimeout(
             loadedModel.createContext({
               contextSize: { min: contextMin, max: maxCtx },
               flashAttention: true,
-              ignoreMemorySafetyChecks: true,
-              failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
+              ignoreMemorySafetyChecks: mode === false,
+              failedCreationRemedy: { retries: 8, autoContextSizeShrink: 0.5 },
             }),
             ctxTimeout,
             'Context creation',
@@ -1145,10 +1152,10 @@ class LLMEngine extends EventEmitter {
     // Check if context is still usable
     if (!this.context || this.context._disposed) {
       this.context = await this.model.createContext({
-        contextSize: { min: 512, max: this._computeMaxContext(0) },
+        contextSize: { min: 2048, max: this._computeMaxContext(0) },
         flashAttention: true,
-        ignoreMemorySafetyChecks: true,
-        failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
+        ignoreMemorySafetyChecks: !this.modelInfo || this.modelInfo.gpuMode === false,
+        failedCreationRemedy: { retries: 8, autoContextSizeShrink: 0.5 },
       });
     }
 
@@ -1174,10 +1181,10 @@ class LLMEngine extends EventEmitter {
         // Context is exhausted, recreate it
         try { this.context.dispose?.(); } catch {}
         this.context = await this.model.createContext({
-          contextSize: { min: 512, max: this._computeMaxContext(0) },
+          contextSize: { min: 2048, max: this._computeMaxContext(0) },
           flashAttention: true,
-          ignoreMemorySafetyChecks: true,
-          failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
+          ignoreMemorySafetyChecks: !this.modelInfo || this.modelInfo.gpuMode === false,
+          failedCreationRemedy: { retries: 8, autoContextSizeShrink: 0.5 },
         });
         
         if (this.context) {
diff --git a/main/modelProfiles.js b/main/modelProfiles.js
@@ -419,6 +419,33 @@ const FAMILY_PROFILES = {
       context: { effectiveContextSize: 65536, maxResponseTokens: 16384 },
     },
   },
+
+  gpt: {
+    base: {
+      sampling: { temperature: 0.50, topP: 0.90, topK: 40, repeatPenalty: 1.08 },
+      thinkTokens: { mode: 'none', budget: 0 },
+    },
+    small: {
+      sampling: { temperature: 0.40, topP: 0.85, topK: 25, repeatPenalty: 1.12 },
+      prompt: { style: 'compact', fewShotExamples: 1 },
+      generation: { maxToolsPerTurn: 14 },
+    },
+    medium: {
+      sampling: { temperature: 0.50, topP: 0.90, topK: 30, repeatPenalty: 1.08 },
+      context: { effectiveContextSize: 16384 },
+      prompt: { style: 'full' },
+      generation: { maxToolsPerTurn: 15 },
+    },
+    large: {
+      context: { effectiveContextSize: 32768, maxResponseTokens: 8192 },
+      prompt: { style: 'full' },
+      generation: { maxToolsPerTurn: 25 },
+    },
+    xlarge: {
+      context: { effectiveContextSize: 65536, maxResponseTokens: 16384 },
+      generation: { maxToolsPerTurn: 50 },
+    },
+  },
 };
 
 // ─── Deep Merge (non-mutating, arrays replaced not concatenated) ───
diff --git a/src/components/Chat/ChatPanel.tsx b/src/components/Chat/ChatPanel.tsx
@@ -2078,13 +2078,20 @@ ${e.message}`,
         // "name" keys to avoid false-positives on non-tool JSON (e.g. {"name": "John"}).
         const incompleteToolMatch = remaining.match(/```(?:json|tool)?\s*\n?\s*\{[\s\S]*?"tool"/);
         if (incompleteToolMatch) {
-          // Suppress — incomplete tool block mid-stream is already tracked in executingTools/ToolCallGroup.
-          // Only render any text that appeared before the opening ```.
+          // Suppress the incomplete tool JSON block mid-stream.
+          // Render any text that appeared before the opening ```.
           const beforeBlock = remaining.substring(0, remaining.indexOf('```'));
           if (beforeBlock.trim()) {
             parts.push(<InlineMarkdownText key={`s-${idx}`} content={beforeBlock} />);
             idx++;
           }
+          // Show a generating indicator so the user sees activity instead of a blank screen
+          parts.push(
+            <div key={`tool-gen-${idx}`} className="flex items-center gap-2 py-1 text-xs text-[#858585]">
+              <span className="animate-pulse">Generating tool call...</span>
+            </div>
+          );
+          idx++;
         } else if (remaining.trim()) {
           // Suppress incomplete non-tool fence artifact (e.g. ```html\n<div being typed)
           // — the raw backtick+language glyph shows as loose text until the block is complete.

Original file line number	Diff line number	Diff line change
`@@ -369,7 +369,17 @@ function progressiveContextCompaction(options) {`
`369`	`369`	`}`
`370`	`370`	`}`
`371`	`371`	`if (newFullResponseText.length > 15000) {`
`372`		`- newFullResponseText = newFullResponseText.substring(newFullResponseText.length - 15000);`
	`372`	`+ // Find a paragraph or line boundary near the truncation point instead of`
	`373`	`+ // slicing blindly through code blocks or sentences.`
	`374`	`+ const target = newFullResponseText.length - 15000;`
	`375`	`+ let cutPoint = newFullResponseText.indexOf('\n\n', target);`
	`376`	`+ if (cutPoint === -1 \|\| cutPoint > target + 500) {`
	`377`	`+ cutPoint = newFullResponseText.indexOf('\n', target);`
	`378`	`+ }`
	`379`	`+ if (cutPoint === -1 \|\| cutPoint > target + 500) {`
	`380`	`+ cutPoint = target;`
	`381`	`+ }`
	`382`	`+ newFullResponseText = newFullResponseText.substring(cutPoint);`
`373`	`383`	`pruned++;`
`374`	`384`	`}`
`375`	`385`	`if (chatHistory) pruned += pruneVerboseHistory(chatHistory, 2);`