fix: v1.6.9 - remove Auto Mode provider label from chat, fix truncated cloud responses, cache nvidia-smi VRAM per session

Brendan Gray · Brendan Gray · commit 5f5b5e18cf56 · 2026-03-01T19:13:38.000-05:00
diff --git a/main/agenticChat.js b/main/agenticChat.js
@@ -295,10 +295,7 @@ function register(ctx) {
           context.cloudProvider = autoSelect.provider;
           context.cloudModel = autoSelect.model;
           console.log(`[Auto Mode] Selected: ${autoSelect.provider} / ${autoSelect.model}`);
-          // Notify the UI which model was auto-selected
-          if (mainWindow) {
-            mainWindow.webContents.send('llm-token', `*Auto Mode selected: ${cloudLLM._getProviderLabel(autoSelect.provider)} / ${autoSelect.model}*\n\n`);
-          }
+          // Model selection is intentionally not shown in chat — auto mode is meant to be seamless
         } else {
           console.log('[Auto Mode] No cloud providers available, falling back to local model');
           if (mainWindow) {
diff --git a/main/agenticChatHelpers.js b/main/agenticChatHelpers.js
@@ -267,15 +267,13 @@ function createIpcTokenBatcher(mainWindow, channel, canSend, opts = {}) {
   };
 
   const dispose = () => {
-    if (charsPerFlush) {
-      // Paced batcher: do NOT cancel the timer or instant-dump the buffer.
-      // Let the existing timer continue draining at the charsPerFlush rate.
-      // Instant-dumping here bypasses pacing and causes a wall-of-text flash
-      // when a fast cloud API has buffered many tokens by response end.
-      // canSend() will block stale sends if a new request starts before drain completes.
-      return;
-    }
-    // Non-paced batcher: cancel timer and flush remaining buffer immediately as before.
+    // Always cancel the timer and flush any remaining buffer immediately.
+    // For paced batchers (charsPerFlush > 0), pacing is for smooth live display
+    // DURING streaming. At dispose() time streaming is complete — leaving chars
+    // stranded in the buffer means the aiChat() IPC resolve races ahead of the
+    // remaining tokens, causing the frontend to commit a truncated message
+    // (e.g. "Hell" instead of "Hello. How can I help you?").
+    // Flushing immediately here ensures all tokens arrive BEFORE the IPC resolve.
     if (timer) {
       clearTimeout(timer);
       timer = null;
diff --git a/main/llmEngine.js b/main/llmEngine.js
@@ -257,19 +257,25 @@ class LLMEngine extends EventEmitter {
       // Windows), then CPU. 'auto' left in the chain so non-NVIDIA systems still get GPU.
       let nvidiaDedicatedVramBytes = 0;
       if (this.gpuPreference !== 'cpu') {
-        try {
-          const { execSync } = require('child_process');
-          const nvOut = execSync('nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits', {
-            timeout: 3000, encoding: 'utf8', windowsHide: true,
-          });
-          const mib = parseFloat(nvOut.trim());
-          if (mib > 0) {
-            nvidiaDedicatedVramBytes = mib * 1024 * 1024; // MiB → bytes
-            console.log(`[LLM] nvidia-smi dedicated VRAM: ${(nvidiaDedicatedVramBytes / (1024 ** 3)).toFixed(1)}GB`);
+        // Cache nvidia-smi result — only probe once per session. Avoids a 100–300ms sync
+        // block (or 3s timeout on non-NVIDIA systems) on every model load/switch.
+        if (this._cachedNvidiaDedicatedVramBytes === undefined) {
+          this._cachedNvidiaDedicatedVramBytes = 0; // default: unknown / non-NVIDIA
+          try {
+            const { execSync } = require('child_process');
+            const nvOut = execSync('nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits', {
+              timeout: 3000, encoding: 'utf8', windowsHide: true,
+            });
+            const mib = parseFloat(nvOut.trim());
+            if (mib > 0) {
+              this._cachedNvidiaDedicatedVramBytes = mib * 1024 * 1024; // MiB → bytes
+              console.log(`[LLM] nvidia-smi dedicated VRAM: ${(this._cachedNvidiaDedicatedVramBytes / (1024 ** 3)).toFixed(1)}GB (cached for session)`);
+            }
+          } catch (_) {
+            console.log('[LLM] nvidia-smi unavailable — Vulkan total VRAM used as-is for padding');
           }
-        } catch (_) {
-          console.log('[LLM] nvidia-smi unavailable — Vulkan total VRAM used as-is for padding');
         }
+        nvidiaDedicatedVramBytes = this._cachedNvidiaDedicatedVramBytes;
       }
 
       // If the model is too large for full GPU offload, insert a partial layer fallback
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "guide-ide",
-  "version": "1.6.8",
+  "version": "1.6.9",
   "description": "guIDE - AI-Powered Offline IDE with local LLM, RAG, MCP tools, browser automation, and integrated terminal",
   "author": {
     "name": "Brendan Gray",

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,6 +1,6 @@` |
| `1` | `1` | `{` |
| `2` | `2` | `"name": "guide-ide",` |
| `3` | | `- "version": "1.6.8",` |
| | `3` | `+ "version": "1.6.9",` |
| `4` | `4` | `"description": "guIDE - AI-Powered Offline IDE with local LLM, RAG, MCP tools, browser automation, and integrated terminal",` |
| `5` | `5` | `"author": {` |
| `6` | `6` | `"name": "Brendan Gray",` |