Skip to content

Commit a05a260

Browse files
author
Brendan Gray
committed
v1.8.37: Fix 6 regression groups from v1.8.2 (context creation, streaming display, overlap dedup, duplicate code blocks, compaction, GPT profile)
1 parent fc6e8d8 commit a05a260

5 files changed

Lines changed: 79 additions & 19 deletions

File tree

main/agenticChat.js

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1439,11 +1439,10 @@ function register(ctx) {
14391439
displayResponseText += displayChunk;
14401440

14411441
// Correct UI stream buffer: the overlapping tokens were already streamed
1442-
// during generation. Trim them by resetting to iteration start and
1443-
// re-sending just the de-duplicated new content.
1442+
// during generation. Replace the current iteration's display with the
1443+
// de-duplicated content instead of resetting (avoids a jarring visual flash).
14441444
if (_overlapLen > 0 && mainWindow && !mainWindow.isDestroyed()) {
1445-
mainWindow.webContents.send('llm-stream-reset');
1446-
if (displayChunk) mainWindow.webContents.send('llm-token', displayChunk);
1445+
mainWindow.webContents.send('llm-replace-last', displayChunk);
14471446
}
14481447

14491448
// ── SEAMLESS CONTINUATION — stitch for MCP tool detection ──
@@ -1751,6 +1750,16 @@ function register(ctx) {
17511750
systemContext: currentPrompt.systemContext,
17521751
userMessage: continuationMsg,
17531752
};
1753+
1754+
// Sync frontend buffer before continuation: strip tool-fence fragments
1755+
// from raw streamed tokens so the committed message doesn't contain broken
1756+
// code fences. The overlap-based llm-replace-last (line ~1444) only fires
1757+
// when _overlapLen > 0. For the first pass or when overlap detection fails,
1758+
// raw partial fences persist in the frontend buffer. Fix that here.
1759+
if (_overlapLen === 0 && mainWindow && !mainWindow.isDestroyed()) {
1760+
mainWindow.webContents.send('llm-replace-last', displayChunk);
1761+
}
1762+
17541763
continue;
17551764
}
17561765
}

main/agenticChatHelpers.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,17 @@ function progressiveContextCompaction(options) {
369369
}
370370
}
371371
if (newFullResponseText.length > 15000) {
372-
newFullResponseText = newFullResponseText.substring(newFullResponseText.length - 15000);
372+
// Find a paragraph or line boundary near the truncation point instead of
373+
// slicing blindly through code blocks or sentences.
374+
const target = newFullResponseText.length - 15000;
375+
let cutPoint = newFullResponseText.indexOf('\n\n', target);
376+
if (cutPoint === -1 || cutPoint > target + 500) {
377+
cutPoint = newFullResponseText.indexOf('\n', target);
378+
}
379+
if (cutPoint === -1 || cutPoint > target + 500) {
380+
cutPoint = target;
381+
}
382+
newFullResponseText = newFullResponseText.substring(cutPoint);
373383
pruned++;
374384
}
375385
if (chatHistory) pruned += pruneVerboseHistory(chatHistory, 2);

main/llmEngine.js

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ const { detectFamily, detectParamSize } = require('./modelDetection');
1010
const { sanitizeResponse } = require('./sanitize');
1111

1212
// ─── Constants ───
13-
const STALL_TIMEOUT_MS = 45_000;
13+
const STALL_TIMEOUT_MS = 90_000;
1414
const MAX_HISTORY_ENTRIES = 40;
1515
const GPU_INIT_TIMEOUT = 120_000;
1616
const MODEL_LOAD_TIMEOUT = 180_000;
@@ -23,7 +23,7 @@ const TOOL_DETECT_BUFFER_MAX = 60_000;
2323
const KV_REUSE_COOLDOWN_TURNS = 2;
2424
const MAX_PARALLEL_FUNCTION_CALLS = 4;
2525
const CONTEXT_ABSOLUTE_CEILING = 131_072;
26-
const VRAM_PADDING_FLOOR_MB = 0;
26+
const VRAM_PADDING_FLOOR_MB = 800;
2727

2828
let _genCounter = 0;
2929

@@ -276,7 +276,14 @@ class LLMEngine extends EventEmitter {
276276
this.llamaInstance = await this._withTimeout(
277277
getLlama({
278278
gpu: backendMode,
279-
vramPadding: 0,
279+
vramPadding: (ctx) => {
280+
const padding = Math.max(VRAM_PADDING_FLOOR_MB * 1024 * 1024, ctx.totalVram * 0.05);
281+
return padding;
282+
},
283+
ramPadding: () => {
284+
const totalRam = os.totalmem();
285+
return Math.min(totalRam * 0.08, 2 * 1024 ** 3);
286+
},
280287
}),
281288
GPU_INIT_TIMEOUT,
282289
'GPU initialization',
@@ -320,14 +327,14 @@ class LLMEngine extends EventEmitter {
320327
const ctxTimeout = mode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU;
321328
let maxCtx = this._computeMaxContext(gpuConfig.modelSizeGB);
322329
// CPU mode uses same RAM-based context sizing as GPU — no artificial cap
323-
const contextMin = MIN_USABLE_GPU_CONTEXT;
330+
const contextMin = (mode === false) ? 2048 : MIN_USABLE_GPU_CONTEXT;
324331
console.log(`[LLM DIAG] Context creation: mode=${mode}, maxCtx=${maxCtx}, contextMin=${contextMin}, modelSizeGB=${gpuConfig.modelSizeGB.toFixed(2)}`);
325332
loadedContext = await this._withTimeout(
326333
loadedModel.createContext({
327334
contextSize: { min: contextMin, max: maxCtx },
328335
flashAttention: true,
329-
ignoreMemorySafetyChecks: true,
330-
failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
336+
ignoreMemorySafetyChecks: mode === false,
337+
failedCreationRemedy: { retries: 8, autoContextSizeShrink: 0.5 },
331338
}),
332339
ctxTimeout,
333340
'Context creation',
@@ -1145,10 +1152,10 @@ class LLMEngine extends EventEmitter {
11451152
// Check if context is still usable
11461153
if (!this.context || this.context._disposed) {
11471154
this.context = await this.model.createContext({
1148-
contextSize: { min: 512, max: this._computeMaxContext(0) },
1155+
contextSize: { min: 2048, max: this._computeMaxContext(0) },
11491156
flashAttention: true,
1150-
ignoreMemorySafetyChecks: true,
1151-
failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
1157+
ignoreMemorySafetyChecks: !this.modelInfo || this.modelInfo.gpuMode === false,
1158+
failedCreationRemedy: { retries: 8, autoContextSizeShrink: 0.5 },
11521159
});
11531160
}
11541161

@@ -1174,10 +1181,10 @@ class LLMEngine extends EventEmitter {
11741181
// Context is exhausted, recreate it
11751182
try { this.context.dispose?.(); } catch {}
11761183
this.context = await this.model.createContext({
1177-
contextSize: { min: 512, max: this._computeMaxContext(0) },
1184+
contextSize: { min: 2048, max: this._computeMaxContext(0) },
11781185
flashAttention: true,
1179-
ignoreMemorySafetyChecks: true,
1180-
failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
1186+
ignoreMemorySafetyChecks: !this.modelInfo || this.modelInfo.gpuMode === false,
1187+
failedCreationRemedy: { retries: 8, autoContextSizeShrink: 0.5 },
11811188
});
11821189

11831190
if (this.context) {

main/modelProfiles.js

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,33 @@ const FAMILY_PROFILES = {
419419
context: { effectiveContextSize: 65536, maxResponseTokens: 16384 },
420420
},
421421
},
422+
423+
gpt: {
424+
base: {
425+
sampling: { temperature: 0.50, topP: 0.90, topK: 40, repeatPenalty: 1.08 },
426+
thinkTokens: { mode: 'none', budget: 0 },
427+
},
428+
small: {
429+
sampling: { temperature: 0.40, topP: 0.85, topK: 25, repeatPenalty: 1.12 },
430+
prompt: { style: 'compact', fewShotExamples: 1 },
431+
generation: { maxToolsPerTurn: 14 },
432+
},
433+
medium: {
434+
sampling: { temperature: 0.50, topP: 0.90, topK: 30, repeatPenalty: 1.08 },
435+
context: { effectiveContextSize: 16384 },
436+
prompt: { style: 'full' },
437+
generation: { maxToolsPerTurn: 15 },
438+
},
439+
large: {
440+
context: { effectiveContextSize: 32768, maxResponseTokens: 8192 },
441+
prompt: { style: 'full' },
442+
generation: { maxToolsPerTurn: 25 },
443+
},
444+
xlarge: {
445+
context: { effectiveContextSize: 65536, maxResponseTokens: 16384 },
446+
generation: { maxToolsPerTurn: 50 },
447+
},
448+
},
422449
};
423450

424451
// ─── Deep Merge (non-mutating, arrays replaced not concatenated) ───

src/components/Chat/ChatPanel.tsx

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2078,13 +2078,20 @@ ${e.message}`,
20782078
// "name" keys to avoid false-positives on non-tool JSON (e.g. {"name": "John"}).
20792079
const incompleteToolMatch = remaining.match(/```(?:json|tool)?\s*\n?\s*\{[\s\S]*?"tool"/);
20802080
if (incompleteToolMatch) {
2081-
// Suppress incomplete tool block mid-stream is already tracked in executingTools/ToolCallGroup.
2082-
// Only render any text that appeared before the opening ```.
2081+
// Suppress the incomplete tool JSON block mid-stream.
2082+
// Render any text that appeared before the opening ```.
20832083
const beforeBlock = remaining.substring(0, remaining.indexOf('```'));
20842084
if (beforeBlock.trim()) {
20852085
parts.push(<InlineMarkdownText key={`s-${idx}`} content={beforeBlock} />);
20862086
idx++;
20872087
}
2088+
// Show a generating indicator so the user sees activity instead of a blank screen
2089+
parts.push(
2090+
<div key={`tool-gen-${idx}`} className="flex items-center gap-2 py-1 text-xs text-[#858585]">
2091+
<span className="animate-pulse">Generating tool call...</span>
2092+
</div>
2093+
);
2094+
idx++;
20882095
} else if (remaining.trim()) {
20892096
// Suppress incomplete non-tool fence artifact (e.g. ```html\n<div being typed)
20902097
// — the raw backtick+language glyph shows as loose text until the block is complete.

0 commit comments

Comments
 (0)