@@ -549,6 +549,26 @@ function toUint8Array(value) {
   return null;
 }

+function trimUnstableUtf8Tail(text) {
+  if (typeof text !== 'string' || text.length === 0) {
+    return '';
+  }
+
+  let end = text.length;
+  while (end > 0 && text.charCodeAt(end - 1) === 0xFFFD) {
+    end -= 1;
+  }
+
+  if (end > 0) {
+    const tail = text.charCodeAt(end - 1);
+    if (tail >= 0xD800 && tail <= 0xDBFF) {
+      end -= 1;
+    }
+  }
+
+  return end === text.length ? text : text.slice(0, end);
+}
+
 function toFloat32Array(value) {
   if (!value) {
     return null;
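
A minimal sketch (not part of the diff) of how the helper added above behaves on streamed text whose UTF-8 tail is still incomplete; the sample strings are illustrative only:

  // Assumes trimUnstableUtf8Tail from the hunk above is in scope.
  // A partially decoded multi-byte character shows up as a trailing U+FFFD.
  trimUnstableUtf8Tail('Hello \uFFFD');     // -> 'Hello '
  // A lone high surrogate (first half of an emoji) is also held back.
  trimUnstableUtf8Tail('Hi \uD83D');        // -> 'Hi '
  // Complete text passes through unchanged.
  trimUnstableUtf8Tail('Hi \uD83D\uDE00');  // -> 'Hi 😀'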
@@ -3831,7 +3851,8 @@ class LlamaWebGpuBridgeRuntime {
       const shouldYieldForResponsiveness =
         !(typeof WorkerGlobalScope !== 'undefined' && globalThis instanceof WorkerGlobalScope);
       const yieldInterval = shouldYieldForResponsiveness ? 4 : 0;
-      let streamed = shouldEmitCurrentText ? '' : null;
+      let streamed = '';
+      let emittedStableText = '';

       while (generated < nPredict) {
         if (this._abortRequested || options.signal?.aborted) {
@@ -3888,19 +3909,25 @@ class LlamaWebGpuBridgeRuntime {
         }

         generated += 1;
-        const piece = this._core.ccall('llamadart_webgpu_last_piece', 'string', [], []) || '';
-        if (piece.length === 0) {
+        const fullText = this._core.ccall('llamadart_webgpu_last_output', 'string', [], []) || '';
+        streamed = fullText;
+        const stableText = trimUnstableUtf8Tail(fullText);
+
+        if (!stableText.startsWith(emittedStableText)) {
+          emittedStableText = '';
+        }
+
+        const deltaText = stableText.slice(emittedStableText.length);
+        if (deltaText.length === 0) {
           continue;
         }
+        emittedStableText = stableText;

         if (typeof options.onToken === 'function') {
-          const piecePayload = emitTokenText ? piece : textEncoder.encode(piece);
-          if (shouldEmitCurrentText) {
-            streamed += piece;
-            options.onToken(piecePayload, streamed);
-          } else {
-            options.onToken(piecePayload, null);
-          }
+          const piecePayload = emitTokenText
+            ? deltaText
+            : textEncoder.encode(deltaText);
+          options.onToken(piecePayload, shouldEmitCurrentText ? fullText : null);
         }

         if (yieldInterval > 0 && (generated % yieldInterval) === 0) {
@@ -3909,6 +3936,17 @@ class LlamaWebGpuBridgeRuntime {
       }

       const text = this._core.ccall('llamadart_webgpu_last_output', 'string', [], []) || streamed || '';
+      if (typeof options.onToken === 'function') {
+        const tailText = text.startsWith(emittedStableText)
+          ? text.slice(emittedStableText.length)
+          : '';
+        if (tailText.length > 0) {
+          const piecePayload = emitTokenText
+            ? tailText
+            : textEncoder.encode(tailText);
+          options.onToken(piecePayload, shouldEmitCurrentText ? text : null);
+        }
+      }
       return text;
     } finally {
       if (generationStarted) {
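
A rough standalone sketch of the delta-emission strategy the loop above now uses (illustrative names only; it relies on the trimUnstableUtf8Tail helper added earlier in this diff): each step re-reads the full decoded output, trims the unstable tail, and emits only the newly stabilized suffix.

  function makeDeltaEmitter(onDelta) {
    let emitted = '';
    return (fullText) => {
      const stable = trimUnstableUtf8Tail(fullText);
      // If the decoded text no longer extends what was already emitted
      // (e.g. the output was reset), start over from an empty prefix.
      if (!stable.startsWith(emitted)) {
        emitted = '';
      }
      const delta = stable.slice(emitted.length);
      if (delta.length > 0) {
        emitted = stable;
        onDelta(delta);
      }
    };
  }

  // Usage: feed successive snapshots of the full output.
  const emit = makeDeltaEmitter((d) => console.log(JSON.stringify(d)));
  emit('Hel');            // logs "Hel"
  emit('Hello \uFFFD');   // logs "lo " (the unstable tail is held back)
  emit('Hello 😀');       // logs "😀"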
@@ -4203,6 +4241,40 @@ export class LlamaWebGpuBridge {
     return sanitized;
   }

+  _createCpuSafeMultimodalLoadOptions(options = {}) {
+    const sanitized = this._sanitizeModelLoadOptions(options);
+    sanitized.nGpuLayers = 0;
+
+    if (Number.isFinite(Number(sanitized.nCtx)) && Number(sanitized.nCtx) > 4096) {
+      sanitized.nCtx = 4096;
+    }
+
+    if (!Number.isFinite(Number(sanitized.nThreads)) || Number(sanitized.nThreads) <= 0) {
+      sanitized.nThreads = 4;
+    } else {
+      sanitized.nThreads = Math.min(4, Math.max(1, Math.trunc(Number(sanitized.nThreads))));
+    }
+
+    sanitized.nThreadsBatch = sanitized.nThreads;
+
+    if (!Number.isFinite(Number(sanitized.nBatch)) || Number(sanitized.nBatch) <= 0) {
+      sanitized.nBatch = 128;
+    } else {
+      sanitized.nBatch = Math.min(128, Math.max(32, Math.trunc(Number(sanitized.nBatch))));
+    }
+
+    if (!Number.isFinite(Number(sanitized.nUbatch)) || Number(sanitized.nUbatch) <= 0) {
+      sanitized.nUbatch = Math.min(64, sanitized.nBatch);
+    } else {
+      sanitized.nUbatch = Math.min(
+        sanitized.nBatch,
+        Math.min(64, Math.max(1, Math.trunc(Number(sanitized.nUbatch)))),
+      );
+    }
+
+    return sanitized;
+  }
+
   _rememberLoadedModel(url, options = {}) {
     const normalizedUrl = String(url || '').trim();
     if (normalizedUrl.length === 0) {
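
For illustration (hypothetical input values, assuming _sanitizeModelLoadOptions passes these fields through unchanged), the CPU-safe helper above clamps an aggressive GPU-oriented configuration roughly like this:

  // Hypothetical input; the real option names come from the bridge's load options.
  const requested = { nGpuLayers: 99, nCtx: 8192, nThreads: 16, nBatch: 512, nUbatch: 512 };
  // Expected result of _createCpuSafeMultimodalLoadOptions(requested):
  // { nGpuLayers: 0, nCtx: 4096, nThreads: 4, nThreadsBatch: 4, nBatch: 128, nUbatch: 64 }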
@@ -4277,7 +4349,9 @@ export class LlamaWebGpuBridge {
       return false;
     }

-    const selectedOptions = this._sanitizeModelLoadOptions(this._loadedModelOptions || {});
+    const selectedOptions = this._sanitizeModelLoadOptions(
+      this._loadedModelOptions || {},
+    );

     const applyWorkerSafeMode = async () => {
       await this._callWorker('loadModelFromUrl', [this._loadedModelUrl, selectedOptions]);
@@ -4381,12 +4455,29 @@ export class LlamaWebGpuBridge {
     }

     const forceReloadRequested = options?._llamadartForceRuntimeReload === true;
+    const mediaPartsRequested = this._hasMediaParts(options);
     const shouldEnsureMultimodalInRuntime =
-      this._hasMediaParts(options)
+      mediaPartsRequested
       && typeof this._loadedMmProjUrl === 'string'
       && this._loadedMmProjUrl.length > 0;
+    const workerTimedOut = this._isWorkerTimeoutError(fallbackError);
+    const forcedCpuFallback = this._isForcedCpuMultimodalFallbackError(fallbackError);
+    const dispatchWorkgroupFallback = this._isDispatchWorkgroupLimitError(fallbackError);
+    const loadedGpuLayers = Number(this._loadedModelOptions?.nGpuLayers);
+    const metadataGpuLayers = Number(this._metadata?.['llamadart.webgpu.n_gpu_layers']);
+    const modelLoadedWithGpu = Number.isFinite(loadedGpuLayers)
+      ? loadedGpuLayers !== 0
+      : (Number.isFinite(metadataGpuLayers) ? metadataGpuLayers !== 0 : true);
+    const shouldUseCpuMultimodalFallback =
+      mediaPartsRequested
+      && modelLoadedWithGpu
+      && (dispatchWorkgroupFallback || forcedCpuFallback || workerTimedOut);

-    if (Number(this._runtime?._modelBytes) > 0 && !forceReloadRequested) {
+    if (
+      Number(this._runtime?._modelBytes) > 0
+      && !forceReloadRequested
+      && !shouldUseCpuMultimodalFallback
+    ) {
       if (shouldEnsureMultimodalInRuntime) {
         const runtimeSupportsMedia =
           (typeof this._runtime.supportsVision === 'function' && this._runtime.supportsVision())
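
As a rough standalone sketch (hypothetical plain-value inputs in place of the bridge's error-classifier methods), the new fallback decision introduced above amounts to:

  // Fall back to CPU for multimodal requests only when the model was loaded
  // with GPU offload and the failure looks like a GPU or worker-side problem.
  function shouldFallBackToCpu({ hasMedia, gpuLayers, dispatchLimitHit, forcedCpu, workerTimedOut }) {
    const modelLoadedWithGpu = Number.isFinite(gpuLayers) ? gpuLayers !== 0 : true;
    return hasMedia && modelLoadedWithGpu && (dispatchLimitHit || forcedCpu || workerTimedOut);
  }

  shouldFallBackToCpu({ hasMedia: true, gpuLayers: 32, workerTimedOut: true });   // true
  shouldFallBackToCpu({ hasMedia: true, gpuLayers: 0, dispatchLimitHit: true });  // false (already CPU)
  shouldFallBackToCpu({ hasMedia: false, gpuLayers: 32, forcedCpu: true });       // false (text-only)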
@@ -4407,25 +4498,18 @@ export class LlamaWebGpuBridge {
       return;
     }

-    const loadOptions = this._sanitizeModelLoadOptions(this._loadedModelOptions || {});
-    const workerTimedOut = this._isWorkerTimeoutError(fallbackError);
-    const forcedCpuFallback = this._isForcedCpuMultimodalFallbackError(fallbackError);
-    const forceCpuMultimodalFallback =
-      this._hasMediaParts(options)
-      && (this._isDispatchWorkgroupLimitError(fallbackError)
-        || forcedCpuFallback)
-      && Number(loadOptions.nGpuLayers) !== 0;
-
-    if (forceCpuMultimodalFallback) {
-      loadOptions.nGpuLayers = 0;
-      if (Number.isFinite(loadOptions.nCtx) && Number(loadOptions.nCtx) > 4096) {
-        loadOptions.nCtx = 4096;
-      }
-
+    const loadOptions = shouldUseCpuMultimodalFallback
+      ? this._createCpuSafeMultimodalLoadOptions(this._loadedModelOptions || {})
+      : this._sanitizeModelLoadOptions(this._loadedModelOptions || {});
+    if (shouldUseCpuMultimodalFallback) {
       if (forcedCpuFallback) {
         this._emitBridgeWarn(
           'llamadart: using CPU fallback for multimodal generation stability.',
         );
+      } else if (workerTimedOut) {
+        this._emitBridgeWarn(
+          'llamadart: retrying multimodal generation with CPU fallback after worker timeout.',
+        );
       } else {
         this._emitBridgeWarn(
           'llamadart: retrying multimodal generation with CPU fallback after WebGPU workgroup limit failure.',
@@ -4448,7 +4532,7 @@ export class LlamaWebGpuBridge {
       if (workerTimedOut) {
         this._runtime._runtimeNotes.push('worker_fallback_timeout');
       }
-      if (forceCpuMultimodalFallback) {
+      if (shouldUseCpuMultimodalFallback) {
         this._runtime._runtimeNotes.push('worker_fallback_cpu_multimodal');
       }
     }