@@ -108,6 +108,44 @@ function parsePositiveInteger(value) {
108108 return Math . trunc ( numeric ) ;
109109}
110110
/**
 * Parse a loosely-typed value as a (possibly negative) integer.
 *
 * @param {*} value - Candidate value (number, numeric string, ...).
 * @param {number} [fallback=0] - Returned when `value` is not a usable number.
 * @returns {number} Truncated integer, or `fallback`.
 */
function parseInteger(value, fallback = 0) {
  // Guard nullish and blank-string inputs explicitly: Number(null) and
  // Number('') both coerce to 0, which would silently mask the fallback
  // (e.g. a null `mainGpu` would become GPU 0 instead of the -1 default).
  if (value == null || (typeof value === 'string' && value.trim() === '')) {
    return fallback;
  }
  const numeric = Number(value);
  if (!Number.isFinite(numeric)) {
    return fallback;
  }
  return Math.trunc(numeric);
}
118+
/**
 * Coerce a loosely-typed flag into a boolean.
 *
 * Booleans pass through unchanged; finite numbers map 0 -> false and any
 * other value -> true; everything else (strings, null, NaN, ...) yields
 * the fallback.
 *
 * @param {*} value - Candidate flag value.
 * @param {boolean} [fallback=false] - Returned for unusable input.
 * @returns {boolean}
 */
function parseBooleanFlag(value, fallback = false) {
  switch (typeof value) {
    case 'boolean':
      return value;
    case 'number':
      return Number.isFinite(value) ? value !== 0 : fallback;
    default:
      return fallback;
  }
}
128+
/**
 * Tri-state flag parser: 1 (enabled), 0 (disabled), -1 (unset / auto).
 *
 * Booleans and finite numbers are mapped to 1/0; any other input means
 * the caller did not provide the flag and -1 is returned.
 *
 * @param {*} value - Candidate flag value.
 * @returns {number} 1, 0, or -1.
 */
function parseOptionalBooleanFlag(value) {
  if (typeof value === 'boolean') {
    return Number(value);
  }
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    return -1;
  }
  return value === 0 ? 0 : 1;
}
138+
/**
 * Parse `value` as an integer and accept it only when it is a member of
 * `allowed`; anything else resolves to `fallback`.
 *
 * @param {*} value - Candidate value.
 * @param {number[]} allowed - Whitelist of acceptable integers.
 * @param {number} fallback - Returned when the parsed value is not allowed.
 * @returns {number}
 */
function parseEnumValue(value, allowed, fallback) {
  const candidate = parseInteger(value, fallback);
  if (allowed.includes(candidate)) {
    return candidate;
  }
  return fallback;
}
143+
/**
 * Parse a strictly-positive finite number (integer or float).
 *
 * Returns 0 for anything non-finite or <= 0; 0 doubles as the
 * "not provided" sentinel for options such as the rope frequencies.
 *
 * @param {*} value - Candidate value.
 * @returns {number} The positive number, or 0.
 */
function parsePositiveNumber(value) {
  const numeric = Number(value);
  if (!Number.isFinite(numeric)) {
    return 0;
  }
  return numeric > 0 ? numeric : 0;
}
148+
111149function parseTotalFromContentRangeHeader ( contentRangeHeader ) {
112150 if ( typeof contentRangeHeader !== 'string' || contentRangeHeader . length === 0 ) {
113151 return 0 ;
@@ -1329,6 +1367,17 @@ class LlamaWebGpuBridgeRuntime {
13291367 this . _nGpuLayers = Number . isFinite ( config . nGpuLayers )
13301368 ? Number ( config . nGpuLayers )
13311369 : - 1 ;
1370+ this . _nSeqMax = 0 ;
1371+ this . _useMmap = false ;
1372+ this . _useMlock = false ;
1373+ this . _flashAttention = - 1 ;
1374+ this . _cacheTypeK = 1 ;
1375+ this . _cacheTypeV = 1 ;
1376+ this . _kvUnified = - 1 ;
1377+ this . _ropeFrequencyBase = 0 ;
1378+ this . _ropeFrequencyScale = 0 ;
1379+ this . _splitMode = - 1 ;
1380+ this . _mainGpu = - 1 ;
13321381 this . _isSafari = isSafariUserAgent ( this . _config . userAgent ?? globalThis . navigator ?. userAgent ?? '' ) ;
13331382 this . _coreVariant = 'uninitialized' ;
13341383 this . _preferMemory64 = this . _config . preferMemory64 !== false ;
@@ -1963,6 +2012,70 @@ class LlamaWebGpuBridgeRuntime {
19632012 }
19642013 }
19652014
2015+ _resolveNativeLoadOptions ( options = { } ) {
2016+ this . _nSeqMax = parsePositiveInteger ( options . nSeqMax ) ;
2017+ this . _useMmap = parseBooleanFlag ( options . useMmap , false ) ;
2018+ this . _useMlock = parseBooleanFlag ( options . useMlock , false ) ;
2019+ this . _flashAttention = parseEnumValue ( options . flashAttention , [ - 1 , 0 , 1 ] , - 1 ) ;
2020+ this . _cacheTypeK = parseEnumValue ( options . cacheTypeK , [ 1 , 2 , 8 ] , 1 ) ;
2021+ this . _cacheTypeV = parseEnumValue ( options . cacheTypeV , [ 1 , 2 , 8 ] , 1 ) ;
2022+ this . _kvUnified = parseOptionalBooleanFlag ( options . kvUnified ) ;
2023+ this . _ropeFrequencyBase = parsePositiveNumber ( options . ropeFrequencyBase ) ;
2024+ this . _ropeFrequencyScale = parsePositiveNumber ( options . ropeFrequencyScale ) ;
2025+ this . _splitMode = parseEnumValue ( options . splitMode , [ 0 , 1 , 2 , 3 ] , - 1 ) ;
2026+ this . _mainGpu = parseInteger ( options . mainGpu , - 1 ) ;
2027+ if ( this . _mainGpu < 0 ) {
2028+ this . _mainGpu = - 1 ;
2029+ }
2030+
2031+ const wantsQuantizedKvCache = this . _cacheTypeK !== 1 || this . _cacheTypeV !== 1 ;
2032+ if ( this . _flashAttention === 0 && wantsQuantizedKvCache ) {
2033+ throw new Error (
2034+ 'Non-F16 KV cache requires flashAttention to be auto or enabled.' ,
2035+ ) ;
2036+ }
2037+ if ( this . _flashAttention === - 1 && wantsQuantizedKvCache ) {
2038+ this . _flashAttention = 1 ;
2039+ this . _runtimeNotes . push ( 'flash_attention:auto_enabled_for_kv_cache' ) ;
2040+ }
2041+ if ( this . _kvUnified < 0 && this . _nSeqMax > 1 ) {
2042+ this . _kvUnified = 1 ;
2043+ this . _runtimeNotes . push ( 'kv_unified:auto_enabled_for_sequences' ) ;
2044+ }
2045+ }
2046+
2047+ _nativeLoadOptionValues ( ) {
2048+ return [
2049+ this . _nSeqMax ,
2050+ this . _useMmap ? 1 : 0 ,
2051+ this . _useMlock ? 1 : 0 ,
2052+ this . _flashAttention ,
2053+ this . _cacheTypeK ,
2054+ this . _cacheTypeV ,
2055+ this . _kvUnified ,
2056+ this . _ropeFrequencyBase ,
2057+ this . _ropeFrequencyScale ,
2058+ this . _splitMode ,
2059+ this . _mainGpu ,
2060+ ] ;
2061+ }
2062+
2063+ _nativeLoadOptionTypes ( ) {
2064+ return [
2065+ 'number' ,
2066+ 'number' ,
2067+ 'number' ,
2068+ 'number' ,
2069+ 'number' ,
2070+ 'number' ,
2071+ 'number' ,
2072+ 'number' ,
2073+ 'number' ,
2074+ 'number' ,
2075+ 'number' ,
2076+ ] ;
2077+ }
2078+
19662079 async _tryLoadModelFromRemoteFetchBackend ( core , url , options = { } ) {
19672080 if ( ! this . _canUseRemoteFetchBackend ( options ) ) {
19682081 return { loaded : false , sizeBytes : null } ;
@@ -2031,6 +2144,7 @@ class LlamaWebGpuBridgeRuntime {
20312144 'number' ,
20322145 'number' ,
20332146 'number' ,
2147+ ...this . _nativeLoadOptionTypes ( ) ,
20342148 ] ,
20352149 [
20362150 remoteFetchUrl ,
@@ -2041,6 +2155,7 @@ class LlamaWebGpuBridgeRuntime {
20412155 this . _nUbatch ,
20422156 this . _nGpuLayers ,
20432157 chunkBytes ,
2158+ ...this . _nativeLoadOptionValues ( ) ,
20442159 ] ,
20452160 { async : true } ,
20462161 ) ,
@@ -2926,6 +3041,8 @@ class LlamaWebGpuBridgeRuntime {
29263041 this . _nUbatch = this . _nBatch ;
29273042 }
29283043
3044+ this . _resolveNativeLoadOptions ( options ) ;
3045+
29293046 if ( Number . isFinite ( this . _threadPoolSizeHint ) && this . _threadPoolSizeHint > 0 ) {
29303047 this . _pushRuntimeNote ( `thread_pool_size:${ this . _threadPoolSizeHint } ` ) ;
29313048 }
@@ -2947,6 +3064,9 @@ class LlamaWebGpuBridgeRuntime {
29473064 if ( this . _nUbatch > 0 ) {
29483065 this . _pushRuntimeNote ( `n_ubatch:${ this . _nUbatch } ` ) ;
29493066 }
3067+ if ( this . _nSeqMax > 0 ) {
3068+ this . _pushRuntimeNote ( `n_seq_max:${ this . _nSeqMax } ` ) ;
3069+ }
29503070 if ( isCpuModelMode && ! Number . isFinite ( requestedBatch ) && ! Number . isFinite ( requestedUbatch ) ) {
29513071 this . _runtimeNotes . push ( 'cpu_batch_tuned_default' ) ;
29523072 }
@@ -3174,7 +3294,16 @@ class LlamaWebGpuBridgeRuntime {
31743294 await core . ccall (
31753295 'llamadart_webgpu_load_model' ,
31763296 'number' ,
3177- [ 'string' , 'number' , 'number' , 'number' , 'number' , 'number' , 'number' ] ,
3297+ [
3298+ 'string' ,
3299+ 'number' ,
3300+ 'number' ,
3301+ 'number' ,
3302+ 'number' ,
3303+ 'number' ,
3304+ 'number' ,
3305+ ...this . _nativeLoadOptionTypes ( ) ,
3306+ ] ,
31783307 [
31793308 this . _modelPath ,
31803309 this . _nCtx ,
@@ -3183,6 +3312,7 @@ class LlamaWebGpuBridgeRuntime {
31833312 this . _nBatch ,
31843313 this . _nUbatch ,
31853314 this . _nGpuLayers ,
3315+ ...this . _nativeLoadOptionValues ( ) ,
31863316 ] ,
31873317 { async : true } ,
31883318 ) ,
@@ -3307,6 +3437,7 @@ class LlamaWebGpuBridgeRuntime {
33073437 'number' ,
33083438 'number' ,
33093439 'number' ,
3440+ ...this . _nativeLoadOptionTypes ( ) ,
33103441 ] ,
33113442 [
33123443 reloadUrl ,
@@ -3317,6 +3448,7 @@ class LlamaWebGpuBridgeRuntime {
33173448 this . _nUbatch ,
33183449 candidateLayers ,
33193450 remoteFetchReloadChunkBytes ,
3451+ ...this . _nativeLoadOptionValues ( ) ,
33203452 ] ,
33213453 { async : true } ,
33223454 ) ,
@@ -3326,7 +3458,16 @@ class LlamaWebGpuBridgeRuntime {
33263458 await core . ccall (
33273459 'llamadart_webgpu_load_model' ,
33283460 'number' ,
3329- [ 'string' , 'number' , 'number' , 'number' , 'number' , 'number' , 'number' ] ,
3461+ [
3462+ 'string' ,
3463+ 'number' ,
3464+ 'number' ,
3465+ 'number' ,
3466+ 'number' ,
3467+ 'number' ,
3468+ 'number' ,
3469+ ...this . _nativeLoadOptionTypes ( ) ,
3470+ ] ,
33303471 [
33313472 this . _modelPath ,
33323473 this . _nCtx ,
@@ -3335,6 +3476,7 @@ class LlamaWebGpuBridgeRuntime {
33353476 this . _nBatch ,
33363477 this . _nUbatch ,
33373478 candidateLayers ,
3479+ ...this . _nativeLoadOptionValues ( ) ,
33383480 ] ,
33393481 { async : true } ,
33403482 ) ,
@@ -4079,6 +4221,20 @@ class LlamaWebGpuBridgeRuntime {
40794221 'llamadart.webgpu.n_threads_batch' : String ( this . _threadsBatch ) ,
40804222 'llamadart.webgpu.n_batch' : this . _nBatch > 0 ? String ( this . _nBatch ) : '' ,
40814223 'llamadart.webgpu.n_ubatch' : this . _nUbatch > 0 ? String ( this . _nUbatch ) : '' ,
4224+ 'llamadart.webgpu.n_seq_max' : this . _nSeqMax > 0 ? String ( this . _nSeqMax ) : '' ,
4225+ 'llamadart.webgpu.flash_attention' : String ( this . _flashAttention ) ,
4226+ 'llamadart.webgpu.cache_type_k' : String ( this . _cacheTypeK ) ,
4227+ 'llamadart.webgpu.cache_type_v' : String ( this . _cacheTypeV ) ,
4228+ 'llamadart.webgpu.kv_unified' :
4229+ this . _kvUnified >= 0 ? String ( this . _kvUnified ) : '' ,
4230+ 'llamadart.webgpu.rope_freq_base' :
4231+ this . _ropeFrequencyBase > 0 ? String ( this . _ropeFrequencyBase ) : '' ,
4232+ 'llamadart.webgpu.rope_freq_scale' :
4233+ this . _ropeFrequencyScale > 0 ? String ( this . _ropeFrequencyScale ) : '' ,
4234+ 'llamadart.webgpu.split_mode' :
4235+ this . _splitMode >= 0 ? String ( this . _splitMode ) : '' ,
4236+ 'llamadart.webgpu.main_gpu' :
4237+ this . _mainGpu >= 0 ? String ( this . _mainGpu ) : '' ,
40824238 'llamadart.webgpu.thread_pool_size' :
40834239 Number . isFinite ( this . _threadPoolSizeHint ) && this . _threadPoolSizeHint > 0
40844240 ? String ( this . _threadPoolSizeHint )
0 commit comments