@@ -785,7 +785,10 @@ function register(ctx) {
785785 const hwContextSize = modelStatus . modelInfo ?. contextSize || 32768 ;
786786
787787 // Helper functions (defined early — needed for budget calculation)
788- const estimateTokens = ( text ) => Math . ceil ( ( text || '' ) . length / 4 ) ;
788+ // Rough token estimate: ceil(chars / 3.5); null/undefined/empty text counts as 0 tokens.
789+ // Dividing by 3.5 instead of 4 gives ~14% higher (more conservative) estimates — real LLM tokenizers
790+ // reportedly produce 3–3.5 chars/token for code and JSON, so buildStaticPrompt + buildDynamicContext are less likely to overcommit budget.
791+ const estimateTokens = ( text ) => Math . ceil ( ( text || '' ) . length / 3.5 ) ;
789792
790793 // ── ModelProfile-driven budgeting ──
791794 // The ModelProfile registry provides effective context size, response reserve %,
@@ -931,11 +934,13 @@ function register(ctx) {
931934 // Dynamic context: memory, RAG, file, error — changes between iterations.
932935 // Injected into user message instead of system context to avoid KV cache invalidation.
933936 // Chat mode: skip ALL dynamic context to maximize conversation space.
934- const buildDynamicContext = ( taskTypeOverride ) => {
937+ // budgetOverride: optional cap for dynamic context tokens — used by overflow retry
938+ // to shed memory/RAG/file context while preserving tools and preamble.
939+ const buildDynamicContext = ( taskTypeOverride , budgetOverride ) => {
935940 const effectiveTaskType = taskTypeOverride || taskType ;
936941 // Chat mode: no dynamic context injection — keep the full context for conversation
937942 if ( effectiveTaskType === 'chat' ) return '' ;
938- let tokenBudget = Math . floor ( maxPromptTokens * 0.4 ) ; // Reserve budget for dynamic context
943+ let tokenBudget = budgetOverride !== undefined ? budgetOverride : Math . floor ( maxPromptTokens * 0.4 ) ; // default: 40% of prompt budget
939944 let prompt = '' ;
940945
941946 const appendIfBudget = ( text , label ) => {
@@ -1469,9 +1474,20 @@ function register(ctx) {
14691474 try { await llmEngine . resetSession ( true ) ; } catch ( _ ) { }
14701475 sessionJustRotated = true ;
14711476 const rotatedBase = buildStaticPrompt ( ) ;
1477+ // Fix C: cap dynamic context at 10% of the prompt budget on retry (the default is 40%) — sheds most
1478+ // memory/RAG/file context but keeps tools and preamble fully intact. Reduces the chance of a repeat
1479+ // overflow on small-context models without touching the model's tool access.
1480+ // Fix D: if partial content was generated before the overflow, append its last 1500 characters to the
1481+ // user message so the model continues from where it left off rather than restarting the response from scratch.
1482+ const _firstTurnPartial = fullResponseText . trim ( ) . length > 0
1483+ ? fullResponseText . substring ( Math . max ( 0 , fullResponseText . length - 1500 ) )
1484+ : '' ;
1485+ const _firstTurnHint = _firstTurnPartial
1486+ ? `\n\nYou were generating a response and the context was reset due to size constraints. Here is the end of what you wrote:\n---\n${ _firstTurnPartial } \n---\nContinue directly from where you left off without repeating what you already wrote.`
1487+ : '' ;
14721488 currentPrompt = {
14731489 systemContext : rotatedBase ,
1474- userMessage : buildDynamicContext ( ) + '\n' + message
1490+ userMessage : buildDynamicContext ( undefined , Math . floor ( maxPromptTokens * 0.10 ) ) + '\n' + message + _firstTurnHint
14751491 } ;
14761492 continue ;
14771493 }
@@ -1570,9 +1586,17 @@ function register(ctx) {
15701586 }
15711587
15721588 const rotatedBase = buildStaticPrompt ( ) ;
1589+ // Fix D: include the last 1500 characters generated so far so the model continues seamlessly rather
1590+ // than restarting the response after context rotation; with no partial output, fall back to restating the request (first 300 chars).
1591+ const _rotationPartial = fullResponseText . trim ( ) . length > 0
1592+ ? fullResponseText . substring ( Math . max ( 0 , fullResponseText . length - 1500 ) )
1593+ : '' ;
1594+ const _rotationHint = _rotationPartial
1595+ ? `\n\nYou were generating a response and context was rotated. Here is the end of what you wrote:\n---\n${ _rotationPartial } \n---\nContinue directly from where you left off without repeating what you already wrote.`
1596+ : `\nContext was rotated. The current user request is: ${ message . substring ( 0 , 300 ) } ${ message . length > 300 ? '...' : '' } ` ;
15731597 currentPrompt = {
15741598 systemContext : rotatedBase ,
1575- userMessage : buildDynamicContext ( ) + '\n' + convSummary + `\nContext was rotated. The current user request is: ${ message . substring ( 0 , 300 ) } ${ message . length > 300 ? '...' : '' } `
1599+ userMessage : buildDynamicContext ( ) + '\n' + convSummary + _rotationHint
15761600 } ;
15771601 sessionJustRotated = true ;
15781602 lastConvSummary = convSummary ;
@@ -1937,17 +1961,25 @@ function register(ctx) {
19371961 }
19381962
19391963 // ── Strip code-fence artifacts from displayed text ──
1940- // Re-enabled with per-iteration offset tracking (llm-iteration-begin + iterationStartOffsetRef).
1941- // The frontend prepends prior iterations' text so only the current iteration's portion is replaced.
1964+ // Route any conversational planning text to the thinking panel, then wipe the
1965+ // main chat iteration slot clean. This prevents raw JSON tool calls from flashing
1966+ // in the chat bubble and matches the cloud path behavior.
19421967 if ( toolResults . hasToolCalls && toolResults . results . length > 0 && mainWindow ) {
1943- let cleaned = responseText ;
1944- cleaned = cleaned . replace ( / ` ` ` (?: t o o l _ c a l l | t o o l | j s o n ) [ ^ \n ] * \n [ \s \S ] * ?` ` ` / g, '' ) ;
1945- cleaned = cleaned . replace ( / < t o o l _ c a l l > [ \s \S ] * ?< \/ t o o l _ c a l l > / g, '' ) ;
1946- cleaned = cleaned . replace ( / \{ \s * " (?: t o o l | n a m e ) " \s * : \s * " [ ^ " ] + " \s * , \s * " (?: p a r a m s | a r g u m e n t s ) " [ \s \S ] * ?\} \s * \} / g, '' ) ;
1947- cleaned = cleaned . replace ( / \n { 3 , } / g, '\n\n' ) . trim ( ) ;
1948- if ( cleaned !== responseText ) {
1949- mainWindow . webContents . send ( 'llm-replace-last' , cleaned ) ;
1950- }
1968+ // Extract planning text — everything the model wrote before the first tool call indicator
1969+ const toolIndicators = [ '{"tool":' , '```tool_call' , '```json\n{"tool"' , '<tool_call>' ] ;
1970+ let splitIdx = responseText . length ;
1971+ for ( const indicator of toolIndicators ) {
1972+ const idx = responseText . indexOf ( indicator ) ;
1973+ if ( idx >= 0 && idx < splitIdx ) splitIdx = idx ;
1974+ }
1975+ const planningText = responseText . substring ( 0 , splitIdx ) . trim ( ) ;
1976+ if ( planningText ) {
1977+ // Planning text belongs in the thinking panel, not the main chat bubble
1978+ mainWindow . webContents . send ( 'llm-thinking-token' , planningText ) ;
1979+ }
1980+ // Wipe this iteration's streamed content from main chat — the final answer
1981+ // streams clean in the last iteration that produces no tool calls.
1982+ mainWindow . webContents . send ( 'llm-replace-last' , '' ) ;
19511983 }
19521984
19531985 if ( ! toolResults . hasToolCalls || toolResults . results . length === 0 ) {
0 commit comments