Commit f07ee62

Merge pull request #4 from leehack/fix/webgpu-utf8-streaming
fix(webgpu): stabilize qwen streaming and multimodal fallback
2 parents a5317b1 + ab7ad67, commit f07ee62

2 files changed: 121 additions & 29 deletions

File: js/llama_webgpu_bridge.js (113 additions, 29 deletions)
@@ -549,6 +549,26 @@ function toUint8Array(value) {
   return null;
 }
 
+function trimUnstableUtf8Tail(text) {
+  if (typeof text !== 'string' || text.length === 0) {
+    return '';
+  }
+
+  let end = text.length;
+  while (end > 0 && text.charCodeAt(end - 1) === 0xFFFD) {
+    end -= 1;
+  }
+
+  if (end > 0) {
+    const tail = text.charCodeAt(end - 1);
+    if (tail >= 0xD800 && tail <= 0xDBFF) {
+      end -= 1;
+    }
+  }
+
+  return end === text.length ? text : text.slice(0, end);
+}
+
 function toFloat32Array(value) {
   if (!value) {
     return null;
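The new helper exists because the runtime now re-decodes the full accumulated output on every step: when the underlying byte buffer ends mid-codepoint, UTF-8 decoding surfaces the truncated sequence as U+FFFD, and the second check also defends, in principle, against a snapshot ending on the first half of a UTF-16 surrogate pair. Both tails are transient, so the streamer holds them back until the next snapshot. A minimal standalone sketch of the behavior (illustrative, not part of the commit):

```js
// Standalone copy of the helper for experimentation (Node or browser).
function trimUnstableUtf8Tail(text) {
  if (typeof text !== 'string' || text.length === 0) return '';
  let end = text.length;
  // Trailing U+FFFD: a multi-byte UTF-8 sequence was cut off mid-token.
  while (end > 0 && text.charCodeAt(end - 1) === 0xFFFD) end -= 1;
  // Trailing lone high surrogate: first half of a UTF-16 pair, second half pending.
  if (end > 0) {
    const tail = text.charCodeAt(end - 1);
    if (tail >= 0xD800 && tail <= 0xDBFF) end -= 1;
  }
  return end === text.length ? text : text.slice(0, end);
}

// "café" cut one byte into the final 'é' (0xC3 0xA9) decodes to "caf\uFFFD";
// the helper trims the unstable tail so only "caf" is emitted for now.
const partial = new TextDecoder().decode(new Uint8Array([0x63, 0x61, 0x66, 0xc3]));
console.log(JSON.stringify(trimUnstableUtf8Tail(partial))); // "caf"
```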
@@ -3831,7 +3851,8 @@ class LlamaWebGpuBridgeRuntime {
       const shouldYieldForResponsiveness =
         !(typeof WorkerGlobalScope !== 'undefined' && globalThis instanceof WorkerGlobalScope);
       const yieldInterval = shouldYieldForResponsiveness ? 4 : 0;
-      let streamed = shouldEmitCurrentText ? '' : null;
+      let streamed = '';
+      let emittedStableText = '';
 
       while (generated < nPredict) {
         if (this._abortRequested || options.signal?.aborted) {
@@ -3888,19 +3909,25 @@ class LlamaWebGpuBridgeRuntime {
         }
 
         generated += 1;
-        const piece = this._core.ccall('llamadart_webgpu_last_piece', 'string', [], []) || '';
-        if (piece.length === 0) {
+        const fullText = this._core.ccall('llamadart_webgpu_last_output', 'string', [], []) || '';
+        streamed = fullText;
+        const stableText = trimUnstableUtf8Tail(fullText);
+
+        if (!stableText.startsWith(emittedStableText)) {
+          emittedStableText = '';
+        }
+
+        const deltaText = stableText.slice(emittedStableText.length);
+        if (deltaText.length === 0) {
           continue;
         }
+        emittedStableText = stableText;
 
         if (typeof options.onToken === 'function') {
-          const piecePayload = emitTokenText ? piece : textEncoder.encode(piece);
-          if (shouldEmitCurrentText) {
-            streamed += piece;
-            options.onToken(piecePayload, streamed);
-          } else {
-            options.onToken(piecePayload, null);
-          }
+          const piecePayload = emitTokenText
+            ? deltaText
+            : textEncoder.encode(deltaText);
+          options.onToken(piecePayload, shouldEmitCurrentText ? fullText : null);
         }
 
         if (yieldInterval > 0 && (generated % yieldInterval) === 0) {
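The loop no longer trusts per-piece strings from `llamadart_webgpu_last_piece`, which can split a multi-byte character across two pieces; instead it snapshots the full output each step and emits only the newly stable delta. `streamed` now always tracks the latest snapshot, so the post-loop `|| streamed || ''` fallback stays a plain string. A condensed sketch of the pattern, with illustrative names, reusing the helper sketched above:

```js
// Snapshot-diff streaming: derive deltas from successive full-text snapshots.
function makeDeltaEmitter(onDelta) {
  let emitted = '';
  return (fullText) => {
    const stable = trimUnstableUtf8Tail(fullText);
    // If earlier text was rewritten, reset instead of emitting a bogus suffix.
    if (!stable.startsWith(emitted)) emitted = '';
    const delta = stable.slice(emitted.length);
    if (delta.length === 0) return;
    emitted = stable;
    onDelta(delta, fullText);
  };
}

let out = '';
const push = makeDeltaEmitter((delta) => { out += delta; });
push('Hel');           // delta "Hel"
push('Hello, wor');    // delta "lo, wor"
push('Hello, world');  // delta "ld"
console.log(out);      // "Hello, world"
```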
@@ -3909,6 +3936,17 @@ class LlamaWebGpuBridgeRuntime {
       }
 
       const text = this._core.ccall('llamadart_webgpu_last_output', 'string', [], []) || streamed || '';
+      if (typeof options.onToken === 'function') {
+        const tailText = text.startsWith(emittedStableText)
+          ? text.slice(emittedStableText.length)
+          : '';
+        if (tailText.length > 0) {
+          const piecePayload = emitTokenText
+            ? tailText
+            : textEncoder.encode(tailText);
+          options.onToken(piecePayload, shouldEmitCurrentText ? text : null);
+        }
+      }
       return text;
     } finally {
       if (generationStarted) {
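This post-loop flush is the other half of the contract: whatever `trimUnstableUtf8Tail` withheld on the final iteration would otherwise never reach `onToken`. Once generation has ended, the full output is final, so the suffix past `emittedStableText` is safe to deliver; the `startsWith` guard keeps a rewritten final text from producing a garbled tail.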
@@ -4203,6 +4241,40 @@ export class LlamaWebGpuBridge {
     return sanitized;
   }
 
+  _createCpuSafeMultimodalLoadOptions(options = {}) {
+    const sanitized = this._sanitizeModelLoadOptions(options);
+    sanitized.nGpuLayers = 0;
+
+    if (Number.isFinite(Number(sanitized.nCtx)) && Number(sanitized.nCtx) > 4096) {
+      sanitized.nCtx = 4096;
+    }
+
+    if (!Number.isFinite(Number(sanitized.nThreads)) || Number(sanitized.nThreads) <= 0) {
+      sanitized.nThreads = 4;
+    } else {
+      sanitized.nThreads = Math.min(4, Math.max(1, Math.trunc(Number(sanitized.nThreads))));
+    }
+
+    sanitized.nThreadsBatch = sanitized.nThreads;
+
+    if (!Number.isFinite(Number(sanitized.nBatch)) || Number(sanitized.nBatch) <= 0) {
+      sanitized.nBatch = 128;
+    } else {
+      sanitized.nBatch = Math.min(128, Math.max(32, Math.trunc(Number(sanitized.nBatch))));
+    }
+
+    if (!Number.isFinite(Number(sanitized.nUbatch)) || Number(sanitized.nUbatch) <= 0) {
+      sanitized.nUbatch = Math.min(64, sanitized.nBatch);
+    } else {
+      sanitized.nUbatch = Math.min(
+        sanitized.nBatch,
+        Math.min(64, Math.max(1, Math.trunc(Number(sanitized.nUbatch)))),
+      );
+    }
+
+    return sanitized;
+  }
+
   _rememberLoadedModel(url, options = {}) {
     const normalizedUrl = String(url || '').trim();
     if (normalizedUrl.length === 0) {
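To make the clamps concrete, here is a standalone restatement with a worked example. It is illustrative only and assumes `_sanitizeModelLoadOptions` passes these numeric fields through unchanged:

```js
// CPU-safe clamps: disable GPU offload, cap context, threads, and batch sizes.
function cpuSafeClamp(opts) {
  const n = (v) => Number(v);
  const out = { ...opts, nGpuLayers: 0 };
  if (Number.isFinite(n(out.nCtx)) && n(out.nCtx) > 4096) out.nCtx = 4096;
  out.nThreads = Number.isFinite(n(out.nThreads)) && n(out.nThreads) > 0
    ? Math.min(4, Math.max(1, Math.trunc(n(out.nThreads))))
    : 4;
  out.nThreadsBatch = out.nThreads;
  out.nBatch = Number.isFinite(n(out.nBatch)) && n(out.nBatch) > 0
    ? Math.min(128, Math.max(32, Math.trunc(n(out.nBatch))))
    : 128;
  out.nUbatch = Number.isFinite(n(out.nUbatch)) && n(out.nUbatch) > 0
    ? Math.min(out.nBatch, Math.min(64, Math.max(1, Math.trunc(n(out.nUbatch)))))
    : Math.min(64, out.nBatch);
  return out;
}

console.log(cpuSafeClamp({ nGpuLayers: 99, nCtx: 8192, nThreads: 16, nBatch: 512, nUbatch: 512 }));
// → { nGpuLayers: 0, nCtx: 4096, nThreads: 4, nThreadsBatch: 4, nBatch: 128, nUbatch: 64 }
```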
@@ -4277,7 +4349,9 @@ export class LlamaWebGpuBridge {
       return false;
     }
 
-    const selectedOptions = this._sanitizeModelLoadOptions(this._loadedModelOptions || {});
+    const selectedOptions = this._sanitizeModelLoadOptions(
+      this._loadedModelOptions || {},
+    );
 
     const applyWorkerSafeMode = async () => {
       await this._callWorker('loadModelFromUrl', [this._loadedModelUrl, selectedOptions]);
@@ -4381,12 +4455,29 @@ export class LlamaWebGpuBridge {
     }
 
     const forceReloadRequested = options?._llamadartForceRuntimeReload === true;
+    const mediaPartsRequested = this._hasMediaParts(options);
     const shouldEnsureMultimodalInRuntime =
-      this._hasMediaParts(options)
+      mediaPartsRequested
       && typeof this._loadedMmProjUrl === 'string'
       && this._loadedMmProjUrl.length > 0;
+    const workerTimedOut = this._isWorkerTimeoutError(fallbackError);
+    const forcedCpuFallback = this._isForcedCpuMultimodalFallbackError(fallbackError);
+    const dispatchWorkgroupFallback = this._isDispatchWorkgroupLimitError(fallbackError);
+    const loadedGpuLayers = Number(this._loadedModelOptions?.nGpuLayers);
+    const metadataGpuLayers = Number(this._metadata?.['llamadart.webgpu.n_gpu_layers']);
+    const modelLoadedWithGpu = Number.isFinite(loadedGpuLayers)
+      ? loadedGpuLayers !== 0
+      : (Number.isFinite(metadataGpuLayers) ? metadataGpuLayers !== 0 : true);
+    const shouldUseCpuMultimodalFallback =
+      mediaPartsRequested
+      && modelLoadedWithGpu
+      && (dispatchWorkgroupFallback || forcedCpuFallback || workerTimedOut);
 
-    if (Number(this._runtime?._modelBytes) > 0 && !forceReloadRequested) {
+    if (
+      Number(this._runtime?._modelBytes) > 0
+      && !forceReloadRequested
+      && !shouldUseCpuMultimodalFallback
+    ) {
       if (shouldEnsureMultimodalInRuntime) {
         const runtimeSupportsMedia =
           (typeof this._runtime.supportsVision === 'function' && this._runtime.supportsVision())
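The fallback decision is now computed up front: media parts must be present, the model must have been loaded with GPU layers, and the failure must be one of three recognized triggers (workgroup-limit error, forced-CPU error, or, newly, a worker timeout). The GPU-layer check prefers the explicit load option, then model metadata, then conservatively assumes GPU. A small standalone restatement of that resolution, with illustrative inputs:

```js
// Illustrative resolution of "was the model loaded with GPU layers?":
// explicit option wins; otherwise model metadata; otherwise assume GPU.
function resolveModelLoadedWithGpu(loadedGpuLayers, metadataGpuLayers) {
  const fromOptions = Number(loadedGpuLayers);
  const fromMetadata = Number(metadataGpuLayers);
  if (Number.isFinite(fromOptions)) return fromOptions !== 0;
  if (Number.isFinite(fromMetadata)) return fromMetadata !== 0;
  return true; // conservative default: keep the CPU fallback path available
}

console.log(resolveModelLoadedWithGpu(undefined, 24)); // true  (metadata says 24 layers)
console.log(resolveModelLoadedWithGpu(0, undefined));  // false (already CPU-only)
```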
@@ -4407,25 +4498,18 @@ export class LlamaWebGpuBridge {
       return;
     }
 
-    const loadOptions = this._sanitizeModelLoadOptions(this._loadedModelOptions || {});
-    const workerTimedOut = this._isWorkerTimeoutError(fallbackError);
-    const forcedCpuFallback = this._isForcedCpuMultimodalFallbackError(fallbackError);
-    const forceCpuMultimodalFallback =
-      this._hasMediaParts(options)
-      && (this._isDispatchWorkgroupLimitError(fallbackError)
-        || forcedCpuFallback)
-      && Number(loadOptions.nGpuLayers) !== 0;
-
-    if (forceCpuMultimodalFallback) {
-      loadOptions.nGpuLayers = 0;
-      if (Number.isFinite(loadOptions.nCtx) && Number(loadOptions.nCtx) > 4096) {
-        loadOptions.nCtx = 4096;
-      }
-
+    const loadOptions = shouldUseCpuMultimodalFallback
+      ? this._createCpuSafeMultimodalLoadOptions(this._loadedModelOptions || {})
+      : this._sanitizeModelLoadOptions(this._loadedModelOptions || {});
+    if (shouldUseCpuMultimodalFallback) {
       if (forcedCpuFallback) {
         this._emitBridgeWarn(
           'llamadart: using CPU fallback for multimodal generation stability.',
         );
+      } else if (workerTimedOut) {
+        this._emitBridgeWarn(
+          'llamadart: retrying multimodal generation with CPU fallback after worker timeout.',
+        );
       } else {
         this._emitBridgeWarn(
           'llamadart: retrying multimodal generation with CPU fallback after WebGPU workgroup limit failure.',
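Distinguishing the three warnings matters for hosts that surface them: the forced-CPU hint reads as a deliberate stability choice, while the timeout and workgroup-limit messages signal that a WebGPU attempt actually failed and is being retried on the CPU with the safer load options built by `_createCpuSafeMultimodalLoadOptions`.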
@@ -4448,7 +4532,7 @@ export class LlamaWebGpuBridge {
       if (workerTimedOut) {
         this._runtime._runtimeNotes.push('worker_fallback_timeout');
       }
-      if (forceCpuMultimodalFallback) {
+      if (shouldUseCpuMultimodalFallback) {
         this._runtime._runtimeNotes.push('worker_fallback_cpu_multimodal');
       }
     }

File: src/llama_webgpu_core.cpp (8 additions, 0 deletions)
@@ -563,6 +563,14 @@ std::string normalize_media_markers(const std::string & prompt, const size_t med
     replace_all_inplace(normalized, "<|image|>", marker);
     replace_all_inplace(normalized, "<img>", marker);
     replace_all_inplace(normalized, "<|img|>", marker);
+    replace_all_inplace(
+        normalized,
+        "<|vision_start|><|image_pad|><|vision_end|>",
+        marker);
+    replace_all_inplace(
+        normalized,
+        "<|vision_start|><|video_pad|><|vision_end|>",
+        marker);
     replace_all_inplace(normalized, "<audio>", marker);
     replace_all_inplace(normalized, "<|audio|>", marker);