@@ -54,6 +54,7 @@ class LLMEngine extends EventEmitter {
5454 this . _initializingPromise = null ; // Tracks in-flight initialize() for serialization (prevents native C++ double-op crash)
5555 this . gpuInfo = null ;
5656 this . gpuPreference = 'auto' ; // 'auto' = prefer GPU, 'cpu' = force CPU only
57+ this . requireMinContextForGpu = false ; // if true: discard GPU load when context < 4096 and retry CPU for more context
5758 this . reasoningEffort = 'medium' ; // 'low', 'medium', 'high'
5859 this . thoughtTokenBudget = 2048 ; // Updated from ModelProfile after model load
5960
@@ -221,7 +222,8 @@ class LLMEngine extends EventEmitter {
221222 // Uses node-llama-cpp's gpuLayers: "auto" — automatically detects available VRAM
222223 // and offloads the optimal number of layers. ONE load attempt, not 7+.
223224 // This is exactly how LM Studio achieves instant loads.
224- const gpuModes = this . gpuPreference === 'cpu' ? [ false ] : [ 'auto' , false ] ;
225+ // gpuModes is a let — may be expanded with a partial layer fallback after nvidia-smi
226+ let gpuModes = this . gpuPreference === 'cpu' ? [ false ] : [ 'auto' , false ] ;
225227
226228 // Detect real dedicated VRAM via nvidia-smi BEFORE calling getLlama().
227229 // Problem: Vulkan on systems with GTT/shared memory reports dedicated VRAM + system RAM
@@ -245,6 +247,20 @@ class LLMEngine extends EventEmitter {
245247 }
246248 }
247249
250+ // If the model is too large for full GPU offload, insert a partial layer fallback
251+ // between 'auto' and false so we get partial offload instead of pure CPU.
252+ // LM Studio does the same. Without this, auto fails → 0 layers every time.
253+ if ( this . gpuPreference !== 'cpu' && nvidiaDedicatedVramBytes > 0 ) {
254+ const usableVram = nvidiaDedicatedVramBytes * 0.75 ; // 75%: leaves 25% for KV cache
255+ if ( modelStats . size > usableVram ) {
256+ const fraction = usableVram / modelStats . size ;
257+ // 80 layers is a safe upper bound for any model up to 200B
258+ const partialLayers = Math . max ( 1 , Math . floor ( 80 * fraction ) ) ;
259+ console . log ( `[LLM] Model (${ ( modelStats . size / ( 1024 ** 3 ) ) . toFixed ( 1 ) } GB) exceeds usable VRAM (${ ( usableVram / ( 1024 ** 3 ) ) . toFixed ( 1 ) } GB) — partial fallback: ${ partialLayers } layers (~${ ( fraction * 100 ) . toFixed ( 0 ) } % offloaded)` ) ;
260+ gpuModes = [ 'auto' , partialLayers , false ] ;
261+ }
262+ }
263+
248264 let gpuLayers = 0 ;
249265 let contextSize = 8192 ;
250266 let gpuMode = 'auto' ;
@@ -256,16 +272,18 @@ class LLMEngine extends EventEmitter {
256272
257273 this . emit ( 'status' , {
258274 state : 'loading' ,
259- message : tryGpuMode === 'auto' ? 'Initializing GPU...' : ' Falling back to CPU...',
275+ message : tryGpuMode === 'auto' ? 'Initializing GPU...' : ( typeof tryGpuMode === 'number' ? `Trying partial GPU ( ${ tryGpuMode } layers)...` : ' Falling back to CPU...') ,
260276 progress : 0.05
261277 } ) ;
262278
263279 if ( this . model ) { try { await this . model . dispose ( ) ; } catch ( e ) { } this . model = null ; }
264280 if ( this . context ) { try { await this . context . dispose ( ) ; } catch ( e ) { } this . context = null ; }
265281
266282 try {
267- // Reuse existing llama instance if same GPU mode (skip expensive CUDA init)
268- const canReuse = this . llamaInstance && this . _lastGpuMode === tryGpuMode ;
283+ // Reuse existing llama instance if same GPU mode (skip expensive CUDA init).
284+ // Numeric fallback modes reuse the 'auto' instance — same GPU backend, different gpuLayers.
285+ const canReuse = this . llamaInstance &&
286+ ( this . _lastGpuMode === tryGpuMode || ( typeof tryGpuMode === 'number' && this . _lastGpuMode === 'auto' ) ) ;
269287 if ( canReuse ) {
270288 console . log ( `[LLM] Reusing existing llama instance (gpu=${ tryGpuMode } )` ) ;
271289 } else {
@@ -276,7 +294,9 @@ class LLMEngine extends EventEmitter {
276294 // (GTT/shared memory case), cap the usable budget to real dedicated VRAM only.
277295 // Otherwise gpuLayers:'auto' over-allocates onto non-existent memory and fails.
278296 this . llamaInstance = await this . _withTimeout ( getLlama ( {
279- gpu : tryGpuMode ,
297+ // Numeric modes still use gpu:'auto' for backend init — the layer count
298+ // is passed to loadModel, not getLlama. Only false disables GPU entirely.
299+ gpu : ( tryGpuMode === false ) ? false : 'auto' ,
280300 vramPadding : ( totalVram ) => {
281301 // Use nvidia-smi value if Vulkan is reporting GTT-inflated total
282302 const effectiveBudget = ( nvidiaDedicatedVramBytes > 0 && nvidiaDedicatedVramBytes < totalVram * 0.7 )
@@ -316,7 +336,7 @@ class LLMEngine extends EventEmitter {
316336 // internally). Do NOT convert to a file:// URL here — that breaks path.resolve.
317337 this . model = await this . _withTimeout ( this . llamaInstance . loadModel ( {
318338 modelPath : modelPath ,
319- gpuLayers : tryGpuMode === 'auto' ? 'auto' : 0 ,
339+ gpuLayers : tryGpuMode === 'auto' ? 'auto' : ( typeof tryGpuMode === 'number' ? tryGpuMode : 0 ) ,
320340 defaultContextFlashAttention : true ,
321341 useMmap : true ,
322342 onLoadProgress : ( progress ) => {
@@ -328,6 +348,11 @@ class LLMEngine extends EventEmitter {
328348 // Read actual GPU layers from the loaded model
329349 try { gpuLayers = this . model . gpuLayers ?? 0 ; } catch ( _ ) { gpuLayers = 0 ; }
330350 console . log ( `[LLM] Model loaded: ${ gpuLayers } GPU layers (mode: ${ tryGpuMode } )` ) ;
351+ // If auto returned 0 layers and we have a partial fallback waiting, skip to it
352+ if ( tryGpuMode === 'auto' && gpuLayers === 0 && gpuModes . some ( m => typeof m === 'number' ) ) {
353+ console . log ( '[LLM] Auto returned 0 GPU layers — skipping to partial layer fallback' ) ;
354+ continue ;
355+ }
331356 } catch ( loadErr ) {
332357 console . log ( `[LLM] Model load (gpu=${ tryGpuMode } ) failed: ${ loadErr . message ?. substring ( 0 , 120 ) } ` ) ;
333358 continue ;
@@ -378,11 +403,11 @@ class LLMEngine extends EventEmitter {
378403 }
379404 }
380405
381- // If context is critically small (< 4096), don't accept — fall through to CPU.
382- // 4096 is the absolute minimum for any useful agentic chat with tool definitions .
406+ // If context is critically small (< 4096), optionally fall through to CPU.
407+ // Controlled by requireMinContextForGpu setting (default: false = always keep GPU) .
383408 const MIN_AGENTIC_CONTEXT = 4096 ;
384- if ( success && contextSize < MIN_AGENTIC_CONTEXT && tryGpuMode !== false ) {
385- console . log ( `[LLM] GPU context too small (${ contextSize } < ${ MIN_AGENTIC_CONTEXT } ) — retrying with CPU for larger context` ) ;
409+ if ( this . requireMinContextForGpu && success && contextSize < MIN_AGENTIC_CONTEXT && tryGpuMode !== false ) {
410+ console . log ( `[LLM] GPU context too small (${ contextSize } < ${ MIN_AGENTIC_CONTEXT } ) — requireMinContextForGpu=true, retrying with CPU for larger context` ) ;
386411 success = false ;
387412 if ( this . context ) { try { await this . context . dispose ( ) ; } catch ( e ) { } this . context = null ; }
388413 }
@@ -1509,6 +1534,11 @@ PERSISTENCE:
15091534 }
15101535 }
15111536
1537+ setRequireMinContextForGpu ( val ) {
1538+ this . requireMinContextForGpu = ! ! val ;
1539+ console . log ( `[LLM] requireMinContextForGpu set to: ${ this . requireMinContextForGpu } ` ) ;
1540+ }
1541+
15121542 updateParams ( params ) {
15131543 this . defaultParams = { ...this . defaultParams , ...params } ;
15141544 }
0 commit comments