@@ -18,6 +18,7 @@ const CTX_CREATE_TIMEOUT_GPU = 15_000;
1818const CTX_CREATE_TIMEOUT_CPU = 60_000 ; // presumably milliseconds, like CTX_CREATE_TIMEOUT_GPU (15_000) just above — usage not visible here, confirm
1919const DISPOSE_TIMEOUT = 10_000 ; // presumably milliseconds — usage not visible here, confirm
2020const MIN_AGENTIC_CONTEXT = 4096 ;
21+ const MIN_USABLE_GPU_CONTEXT = 8192 ; // Requested as contextSize.min in non-CPU modes; a GPU-mode context smaller than this is logged as too small and disposed. NOTE(review): contextSize.max still comes from _computeMaxContext() — confirm it can never fall below this floor (min > max).
2122const TOOL_DETECT_BUFFER_MAX = 60_000 ;
2223const KV_REUSE_COOLDOWN_TURNS = 2 ;
2324const MAX_PARALLEL_FUNCTION_CALLS = 4 ;
@@ -287,9 +288,10 @@ class LLMEngine extends EventEmitter {
287288 let maxCtx = this . _computeMaxContext ( gpuConfig . modelSizeGB ) ;
288289 // CPU mode: cap context for responsive generation
289290 if ( mode === false ) maxCtx = Math . min ( maxCtx , 8192 ) ;
291+ const contextMin = ( mode === false ) ? 512 : MIN_USABLE_GPU_CONTEXT ;
290292 loadedContext = await this . _withTimeout (
291293 loadedModel . createContext ( {
292- contextSize : { min : 512 , max : maxCtx } ,
294+ contextSize : { min : contextMin , max : maxCtx } ,
293295 flashAttention : true ,
294296 ignoreMemorySafetyChecks : true ,
295297 failedCreationRemedy : { retries : 4 , autoContextSizeShrink : 0.5 } ,
@@ -298,9 +300,9 @@ class LLMEngine extends EventEmitter {
298300 'Context creation' ,
299301 ) ;
300302
301- // Verify context is usable (at least 512 tokens after system prompt)
303+ // Verify context is usable (need enough for system prompt + meaningful generation )
302304 const actualCtx = loadedContext . contextSize || 0 ;
303- if ( actualCtx < 1024 && mode !== false ) {
305+ if ( actualCtx < MIN_USABLE_GPU_CONTEXT && mode !== false ) {
304306 const log = require ( './logger' ) ;
305307 log . warn ( `GPU mode ${ mode } context too small (${ actualCtx } ), trying next mode` ) ;
306308 loadedContext . dispose ?. ( ) ;
@@ -1071,16 +1073,12 @@ class LLMEngine extends EventEmitter {
10711073 try { this . chat . dispose ?. ( ) ; } catch { }
10721074 }
10731075
1074- // Reuse existing sequence — just clear KV cache
1075- if ( this . sequence && ! this . sequence . _disposed ) {
1076- try {
1077- // Await the erase to prevent race with pending async operations
1078- await this . sequence . eraseContextTokenRanges ( [ { start : 0 , end : this . sequence . nTokens } ] ) ;
1079- } catch {
1080- // If erase fails (e.g. sequence disposed mid-flight), get a new sequence
1081- try { this . sequence = this . context . getSequence ( ) ; } catch { /* context may also be gone */ }
1082- }
1083- } else if ( this . context ) {
1076+ // Dispose old sequence and get a fresh one (avoids eraseContextTokenRanges hang on degraded KV cache)
1077+ if ( this . sequence ) {
1078+ try { this . sequence . dispose ?. ( ) ; } catch { }
1079+ this . sequence = null ;
1080+ }
1081+ if ( this . context ) {
10841082 this . sequence = this . context . getSequence ( ) ;
10851083 }
10861084
0 commit comments