@@ -10,7 +10,7 @@ const { detectFamily, detectParamSize } = require('./modelDetection');
1010const { sanitizeResponse } = require ( './sanitize' ) ;
1111
1212// ─── Constants ───
13- const STALL_TIMEOUT_MS = 45_000 ;
13+ const STALL_TIMEOUT_MS = 90_000 ;
1414const MAX_HISTORY_ENTRIES = 40 ;
1515const GPU_INIT_TIMEOUT = 120_000 ;
1616const MODEL_LOAD_TIMEOUT = 180_000 ;
@@ -23,7 +23,7 @@ const TOOL_DETECT_BUFFER_MAX = 60_000;
2323const KV_REUSE_COOLDOWN_TURNS = 2 ;
2424const MAX_PARALLEL_FUNCTION_CALLS = 4 ;
2525const CONTEXT_ABSOLUTE_CEILING = 131_072 ;
26- const VRAM_PADDING_FLOOR_MB = 0 ;
26+ const VRAM_PADDING_FLOOR_MB = 800 ;
2727
2828let _genCounter = 0 ;
2929
@@ -276,7 +276,14 @@ class LLMEngine extends EventEmitter {
276276 this . llamaInstance = await this . _withTimeout (
277277 getLlama ( {
278278 gpu : backendMode ,
279- vramPadding : 0 ,
279+ vramPadding : ( ctx ) => {
280+ const padding = Math . max ( VRAM_PADDING_FLOOR_MB * 1024 * 1024 , ctx . totalVram * 0.05 ) ;
281+ return padding ;
282+ } ,
283+ ramPadding : ( ) => {
284+ const totalRam = os . totalmem ( ) ;
285+ return Math . min ( totalRam * 0.08 , 2 * 1024 ** 3 ) ;
286+ } ,
280287 } ) ,
281288 GPU_INIT_TIMEOUT ,
282289 'GPU initialization' ,
@@ -320,14 +327,14 @@ class LLMEngine extends EventEmitter {
320327 const ctxTimeout = mode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU ;
321328 let maxCtx = this . _computeMaxContext ( gpuConfig . modelSizeGB ) ;
322329 // CPU mode uses same RAM-based context sizing as GPU — no artificial cap
323- const contextMin = MIN_USABLE_GPU_CONTEXT ;
330+ const contextMin = ( mode === false ) ? 2048 : MIN_USABLE_GPU_CONTEXT ;
324331 console . log ( `[LLM DIAG] Context creation: mode=${ mode } , maxCtx=${ maxCtx } , contextMin=${ contextMin } , modelSizeGB=${ gpuConfig . modelSizeGB . toFixed ( 2 ) } ` ) ;
325332 loadedContext = await this . _withTimeout (
326333 loadedModel . createContext ( {
327334 contextSize : { min : contextMin , max : maxCtx } ,
328335 flashAttention : true ,
329- ignoreMemorySafetyChecks : true ,
330- failedCreationRemedy : { retries : 4 , autoContextSizeShrink : 0.5 } ,
336+ ignoreMemorySafetyChecks : mode === false ,
337+ failedCreationRemedy : { retries : 8 , autoContextSizeShrink : 0.5 } ,
331338 } ) ,
332339 ctxTimeout ,
333340 'Context creation' ,
@@ -1145,10 +1152,10 @@ class LLMEngine extends EventEmitter {
11451152 // Check if context is still usable
11461153 if ( ! this . context || this . context . _disposed ) {
11471154 this . context = await this . model . createContext ( {
1148- contextSize : { min : 512 , max : this . _computeMaxContext ( 0 ) } ,
1155+ contextSize : { min : 2048 , max : this . _computeMaxContext ( 0 ) } ,
11491156 flashAttention : true ,
1150- ignoreMemorySafetyChecks : true ,
1151- failedCreationRemedy : { retries : 4 , autoContextSizeShrink : 0.5 } ,
1157+ ignoreMemorySafetyChecks : ! this . modelInfo || this . modelInfo . gpuMode === false ,
1158+ failedCreationRemedy : { retries : 8 , autoContextSizeShrink : 0.5 } ,
11521159 } ) ;
11531160 }
11541161
@@ -1174,10 +1181,10 @@ class LLMEngine extends EventEmitter {
11741181 // Context is exhausted, recreate it
11751182 try { this . context . dispose ?. ( ) ; } catch { }
11761183 this . context = await this . model . createContext ( {
1177- contextSize : { min : 512 , max : this . _computeMaxContext ( 0 ) } ,
1184+ contextSize : { min : 2048 , max : this . _computeMaxContext ( 0 ) } ,
11781185 flashAttention : true ,
1179- ignoreMemorySafetyChecks : true ,
1180- failedCreationRemedy : { retries : 4 , autoContextSizeShrink : 0.5 } ,
1186+ ignoreMemorySafetyChecks : ! this . modelInfo || this . modelInfo . gpuMode === false ,
1187+ failedCreationRemedy : { retries : 8 , autoContextSizeShrink : 0.5 } ,
11811188 } ) ;
11821189
11831190 if ( this . context ) {
0 commit comments