Skip to content

Commit ab2a6d7

Browse files
authored
Merge pull request #6 from leehack/sync-webbridge-b9016
Sync WebGPU bridge to llama.cpp b9016
2 parents f07ee62 + d28fbd7 commit ab2a6d7

6 files changed

Lines changed: 275 additions & 43 deletions

File tree

.github/workflows/ci.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
name: Build WebGPU Bridge (WASM)
1212
runs-on: ubuntu-latest
1313
env:
14-
LLAMA_CPP_TAG: b8157
14+
LLAMA_CPP_TAG: b9016
1515
steps:
1616
- uses: actions/checkout@v4
1717

@@ -27,19 +27,26 @@ jobs:
2727
- name: Build bridge artifacts
2828
env:
2929
OUT_DIR: ${{ runner.temp }}/webgpu_bridge_dist
30+
WEBGPU_BRIDGE_BUILD_MEM64: 1
3031
run: ./scripts/build_bridge.sh
3132

3233
- name: Verify outputs
3334
run: |
3435
test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge.js"
36+
test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge_worker.js"
3537
test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.js"
3638
test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.wasm"
39+
test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.js"
40+
test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.wasm"
3741
3842
- name: Upload bridge artifacts
3943
uses: actions/upload-artifact@v4
4044
with:
4145
name: webgpu-bridge-dist
4246
path: |
4347
${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge.js
48+
${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge_worker.js
4449
${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.js
4550
${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.wasm
51+
${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.js
52+
${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.wasm

.github/workflows/publish_assets.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@ on:
1313
llama_cpp_tag:
1414
description: llama.cpp tag to build from
1515
required: true
16-
default: b8157
16+
default: b9016
1717
push:
1818
tags:
1919
- 'v*'
2020

2121
env:
2222
ASSETS_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.assets_tag || github.ref_name }}
2323
ASSETS_REPO: ${{ github.event_name == 'workflow_dispatch' && inputs.assets_repo || 'leehack/llama-web-bridge-assets' }}
24-
LLAMA_CPP_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.llama_cpp_tag || 'b8157' }}
24+
LLAMA_CPP_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.llama_cpp_tag || 'b9016' }}
2525

2626
permissions:
2727
contents: read

CMakeLists.txt

Lines changed: 4 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ find_package(Threads REQUIRED)
6161
set(MTMD_AUDIO_SRC "${LLAMA_CPP_DIR}/tools/mtmd/mtmd-audio.cpp")
6262
set(MTMD_AUDIO_PATCHED "${CMAKE_BINARY_DIR}/generated/mtmd-audio-single-thread.cpp")
6363

64+
# Collect every mtmd model implementation shipped by the llama.cpp checkout at
# LLAMA_CPP_DIR instead of hard-coding the file names.
# CONFIGURE_DEPENDS (CMake >= 3.12) makes the build re-run this glob, so an
# existing build tree notices model sources that appear or disappear when the
# llama.cpp checkout is moved to a different tag.
file(GLOB LLAMADART_MTMD_MODEL_SOURCES
  CONFIGURE_DEPENDS
  "${LLAMA_CPP_DIR}/tools/mtmd/models/*.cpp")
66+
6467
file(READ "${MTMD_AUDIO_SRC}" MTMD_AUDIO_CONTENT)
6568
string(FIND "${MTMD_AUDIO_CONTENT}" "4, // n_threads" MTMD_AUDIO_THREAD_MARKER_INDEX)
6669
if (MTMD_AUDIO_THREAD_MARKER_INDEX EQUAL -1)
@@ -74,39 +77,10 @@ string(REPLACE
7477
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/generated")
7578
file(WRITE "${MTMD_AUDIO_PATCHED}" "${MTMD_AUDIO_CONTENT}")
7679

77-
set(LLAMADART_MTMD_MODEL_SOURCES
78-
"${LLAMA_CPP_DIR}/tools/mtmd/models/cogvlm.cpp"
79-
"${LLAMA_CPP_DIR}/tools/mtmd/models/conformer.cpp"
80-
"${LLAMA_CPP_DIR}/tools/mtmd/models/glm4v.cpp"
81-
"${LLAMA_CPP_DIR}/tools/mtmd/models/internvl.cpp"
82-
"${LLAMA_CPP_DIR}/tools/mtmd/models/kimivl.cpp"
83-
"${LLAMA_CPP_DIR}/tools/mtmd/models/kimik25.cpp"
84-
"${LLAMA_CPP_DIR}/tools/mtmd/models/llama4.cpp"
85-
"${LLAMA_CPP_DIR}/tools/mtmd/models/llava.cpp"
86-
"${LLAMA_CPP_DIR}/tools/mtmd/models/minicpmv.cpp"
87-
"${LLAMA_CPP_DIR}/tools/mtmd/models/pixtral.cpp"
88-
"${LLAMA_CPP_DIR}/tools/mtmd/models/qwen2vl.cpp"
89-
"${LLAMA_CPP_DIR}/tools/mtmd/models/qwen3vl.cpp"
90-
"${LLAMA_CPP_DIR}/tools/mtmd/models/siglip.cpp"
91-
"${LLAMA_CPP_DIR}/tools/mtmd/models/whisper-enc.cpp"
92-
"${LLAMA_CPP_DIR}/tools/mtmd/models/mobilenetv5.cpp"
93-
"${LLAMA_CPP_DIR}/tools/mtmd/models/youtuvl.cpp"
94-
)
95-
96-
set(LLAMADART_MTMD_OPTIONAL_MODEL_SOURCES
97-
"${LLAMA_CPP_DIR}/tools/mtmd/models/nemotron-v2-vl.cpp"
98-
"${LLAMA_CPP_DIR}/tools/mtmd/models/paddleocr.cpp"
99-
)
100-
101-
foreach(model_source IN LISTS LLAMADART_MTMD_OPTIONAL_MODEL_SOURCES)
102-
if (EXISTS "${model_source}")
103-
list(APPEND LLAMADART_MTMD_MODEL_SOURCES "${model_source}")
104-
endif()
105-
endforeach()
106-
10780
add_library(llamadart_mtmd STATIC
10881
"${LLAMA_CPP_DIR}/tools/mtmd/mtmd.cpp"
10982
"${MTMD_AUDIO_PATCHED}"
83+
"${LLAMA_CPP_DIR}/tools/mtmd/mtmd-image.cpp"
11084
"${LLAMA_CPP_DIR}/tools/mtmd/mtmd-helper.cpp"
11185
"${LLAMA_CPP_DIR}/tools/mtmd/clip.cpp"
11286
${LLAMADART_MTMD_MODEL_SOURCES}

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ This repo includes a wasm build gate in:
5757

5858
- `.github/workflows/ci.yml`
5959

60-
It builds against pinned `llama.cpp` tag `b8157` and uploads build artifacts.
60+
It builds against pinned `llama.cpp` tag `b9016` and uploads build artifacts.
6161

6262
## Publishing
6363

@@ -93,7 +93,7 @@ Manual override example:
9393
2. Inputs:
9494
- `assets_tag`: `v0.1.5`
9595
- `assets_repo`: `leehack/llama-web-bridge-assets`
96-
- `llama_cpp_tag`: `b8157`
96+
- `llama_cpp_tag`: `b9016`
9797

9898
After publish, assets are CDN-available at:
9999

js/llama_webgpu_bridge.js

Lines changed: 158 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,44 @@ function parsePositiveInteger(value) {
108108
return Math.trunc(numeric);
109109
}
110110

111+
/**
 * Coerces a loosely typed value to an integer, truncating toward zero.
 *
 * Missing inputs (`null`, `undefined`, or a blank string) yield `fallback`.
 * Without this guard `Number(null)` and `Number('')` both evaluate to 0, so a
 * missing option would be indistinguishable from an explicit 0 and a non-zero
 * fallback (for example -1 meaning "auto") would never be returned.
 *
 * @param {*} value - Raw option value.
 * @param {number} [fallback=0] - Returned when `value` is missing or does not
 *   convert to a finite number.
 * @returns {number} The truncated integer, or `fallback`.
 */
function parseInteger(value, fallback = 0) {
  if (value === null || value === undefined) {
    return fallback;
  }
  if (typeof value === 'string' && value.trim().length === 0) {
    return fallback;
  }

  const numeric = Number(value);
  return Number.isFinite(numeric) ? Math.trunc(numeric) : fallback;
}
118+
119+
/**
 * Interprets `value` as a boolean flag.
 *
 * Booleans pass through unchanged, finite numbers are true when non-zero, and
 * every other input (including non-finite numbers and strings) yields
 * `fallback`.
 *
 * @param {*} value - Raw option value.
 * @param {boolean} [fallback=false] - Result for unrecognised inputs.
 * @returns {boolean}
 */
function parseBooleanFlag(value, fallback = false) {
  switch (typeof value) {
    case 'boolean':
      return value;
    case 'number':
      return Number.isFinite(value) ? value !== 0 : fallback;
    default:
      return fallback;
  }
}
128+
129+
/**
 * Converts an optional boolean-like value to a tri-state integer.
 *
 * @param {*} value - Raw option value.
 * @returns {number} 1 for `true` or a non-zero finite number, 0 for `false` or
 *   zero, and -1 when the value is neither a boolean nor a finite number.
 */
function parseOptionalBooleanFlag(value) {
  const isBoolean = typeof value === 'boolean';
  const isFiniteNumber = typeof value === 'number' && Number.isFinite(value);
  if (!isBoolean && !isFiniteNumber) {
    return -1;
  }

  const enabled = isBoolean ? value : value !== 0;
  return enabled ? 1 : 0;
}
138+
139+
/**
 * Parses `value` as an integer and accepts it only when it is one of the
 * `allowed` values.
 *
 * @param {*} value - Raw option value.
 * @param {number[]} allowed - Accepted integer values.
 * @param {number} fallback - Returned when `value` cannot be parsed or is not
 *   in `allowed`.
 * @returns {number}
 */
function parseEnumValue(value, allowed, fallback) {
  const candidate = parseInteger(value, fallback);
  if (!allowed.includes(candidate)) {
    return fallback;
  }
  return candidate;
}
143+
144+
/**
 * Converts `value` to a strictly positive finite number.
 *
 * @param {*} value - Raw option value.
 * @returns {number} The numeric value, or 0 when it is not finite or not
 *   greater than zero.
 */
function parsePositiveNumber(value) {
  const numeric = Number(value);
  if (!Number.isFinite(numeric)) {
    return 0;
  }
  if (numeric <= 0) {
    return 0;
  }
  return numeric;
}
148+
111149
function parseTotalFromContentRangeHeader(contentRangeHeader) {
112150
if (typeof contentRangeHeader !== 'string' || contentRangeHeader.length === 0) {
113151
return 0;
@@ -1329,6 +1367,17 @@ class LlamaWebGpuBridgeRuntime {
13291367
this._nGpuLayers = Number.isFinite(config.nGpuLayers)
13301368
? Number(config.nGpuLayers)
13311369
: -1;
1370+
this._nSeqMax = 0;
1371+
this._useMmap = false;
1372+
this._useMlock = false;
1373+
this._flashAttention = -1;
1374+
this._cacheTypeK = 1;
1375+
this._cacheTypeV = 1;
1376+
this._kvUnified = -1;
1377+
this._ropeFrequencyBase = 0;
1378+
this._ropeFrequencyScale = 0;
1379+
this._splitMode = -1;
1380+
this._mainGpu = -1;
13321381
this._isSafari = isSafariUserAgent(this._config.userAgent ?? globalThis.navigator?.userAgent ?? '');
13331382
this._coreVariant = 'uninitialized';
13341383
this._preferMemory64 = this._config.preferMemory64 !== false;
@@ -1963,6 +2012,70 @@ class LlamaWebGpuBridgeRuntime {
19632012
}
19642013
}
19652014

2015+
_resolveNativeLoadOptions(options = {}) {
2016+
this._nSeqMax = parsePositiveInteger(options.nSeqMax);
2017+
this._useMmap = parseBooleanFlag(options.useMmap, false);
2018+
this._useMlock = parseBooleanFlag(options.useMlock, false);
2019+
this._flashAttention = parseEnumValue(options.flashAttention, [-1, 0, 1], -1);
2020+
this._cacheTypeK = parseEnumValue(options.cacheTypeK, [1, 2, 8], 1);
2021+
this._cacheTypeV = parseEnumValue(options.cacheTypeV, [1, 2, 8], 1);
2022+
this._kvUnified = parseOptionalBooleanFlag(options.kvUnified);
2023+
this._ropeFrequencyBase = parsePositiveNumber(options.ropeFrequencyBase);
2024+
this._ropeFrequencyScale = parsePositiveNumber(options.ropeFrequencyScale);
2025+
this._splitMode = parseEnumValue(options.splitMode, [0, 1, 2, 3], -1);
2026+
this._mainGpu = parseInteger(options.mainGpu, -1);
2027+
if (this._mainGpu < 0) {
2028+
this._mainGpu = -1;
2029+
}
2030+
2031+
const wantsQuantizedKvCache = this._cacheTypeK !== 1 || this._cacheTypeV !== 1;
2032+
if (this._flashAttention === 0 && wantsQuantizedKvCache) {
2033+
throw new Error(
2034+
'Non-F16 KV cache requires flashAttention to be auto or enabled.',
2035+
);
2036+
}
2037+
if (this._flashAttention === -1 && wantsQuantizedKvCache) {
2038+
this._flashAttention = 1;
2039+
this._runtimeNotes.push('flash_attention:auto_enabled_for_kv_cache');
2040+
}
2041+
if (this._kvUnified < 0 && this._nSeqMax > 1) {
2042+
this._kvUnified = 1;
2043+
this._runtimeNotes.push('kv_unified:auto_enabled_for_sequences');
2044+
}
2045+
}
2046+
2047+
_nativeLoadOptionValues() {
2048+
return [
2049+
this._nSeqMax,
2050+
this._useMmap ? 1 : 0,
2051+
this._useMlock ? 1 : 0,
2052+
this._flashAttention,
2053+
this._cacheTypeK,
2054+
this._cacheTypeV,
2055+
this._kvUnified,
2056+
this._ropeFrequencyBase,
2057+
this._ropeFrequencyScale,
2058+
this._splitMode,
2059+
this._mainGpu,
2060+
];
2061+
}
2062+
2063+
_nativeLoadOptionTypes() {
2064+
return [
2065+
'number',
2066+
'number',
2067+
'number',
2068+
'number',
2069+
'number',
2070+
'number',
2071+
'number',
2072+
'number',
2073+
'number',
2074+
'number',
2075+
'number',
2076+
];
2077+
}
2078+
19662079
async _tryLoadModelFromRemoteFetchBackend(core, url, options = {}) {
19672080
if (!this._canUseRemoteFetchBackend(options)) {
19682081
return { loaded: false, sizeBytes: null };
@@ -2031,6 +2144,7 @@ class LlamaWebGpuBridgeRuntime {
20312144
'number',
20322145
'number',
20332146
'number',
2147+
...this._nativeLoadOptionTypes(),
20342148
],
20352149
[
20362150
remoteFetchUrl,
@@ -2041,6 +2155,7 @@ class LlamaWebGpuBridgeRuntime {
20412155
this._nUbatch,
20422156
this._nGpuLayers,
20432157
chunkBytes,
2158+
...this._nativeLoadOptionValues(),
20442159
],
20452160
{ async: true },
20462161
),
@@ -2926,6 +3041,8 @@ class LlamaWebGpuBridgeRuntime {
29263041
this._nUbatch = this._nBatch;
29273042
}
29283043

3044+
this._resolveNativeLoadOptions(options);
3045+
29293046
if (Number.isFinite(this._threadPoolSizeHint) && this._threadPoolSizeHint > 0) {
29303047
this._pushRuntimeNote(`thread_pool_size:${this._threadPoolSizeHint}`);
29313048
}
@@ -2947,6 +3064,9 @@ class LlamaWebGpuBridgeRuntime {
29473064
if (this._nUbatch > 0) {
29483065
this._pushRuntimeNote(`n_ubatch:${this._nUbatch}`);
29493066
}
3067+
if (this._nSeqMax > 0) {
3068+
this._pushRuntimeNote(`n_seq_max:${this._nSeqMax}`);
3069+
}
29503070
if (isCpuModelMode && !Number.isFinite(requestedBatch) && !Number.isFinite(requestedUbatch)) {
29513071
this._runtimeNotes.push('cpu_batch_tuned_default');
29523072
}
@@ -3174,7 +3294,16 @@ class LlamaWebGpuBridgeRuntime {
31743294
await core.ccall(
31753295
'llamadart_webgpu_load_model',
31763296
'number',
3177-
['string', 'number', 'number', 'number', 'number', 'number', 'number'],
3297+
[
3298+
'string',
3299+
'number',
3300+
'number',
3301+
'number',
3302+
'number',
3303+
'number',
3304+
'number',
3305+
...this._nativeLoadOptionTypes(),
3306+
],
31783307
[
31793308
this._modelPath,
31803309
this._nCtx,
@@ -3183,6 +3312,7 @@ class LlamaWebGpuBridgeRuntime {
31833312
this._nBatch,
31843313
this._nUbatch,
31853314
this._nGpuLayers,
3315+
...this._nativeLoadOptionValues(),
31863316
],
31873317
{ async: true },
31883318
),
@@ -3307,6 +3437,7 @@ class LlamaWebGpuBridgeRuntime {
33073437
'number',
33083438
'number',
33093439
'number',
3440+
...this._nativeLoadOptionTypes(),
33103441
],
33113442
[
33123443
reloadUrl,
@@ -3317,6 +3448,7 @@ class LlamaWebGpuBridgeRuntime {
33173448
this._nUbatch,
33183449
candidateLayers,
33193450
remoteFetchReloadChunkBytes,
3451+
...this._nativeLoadOptionValues(),
33203452
],
33213453
{ async: true },
33223454
),
@@ -3326,7 +3458,16 @@ class LlamaWebGpuBridgeRuntime {
33263458
await core.ccall(
33273459
'llamadart_webgpu_load_model',
33283460
'number',
3329-
['string', 'number', 'number', 'number', 'number', 'number', 'number'],
3461+
[
3462+
'string',
3463+
'number',
3464+
'number',
3465+
'number',
3466+
'number',
3467+
'number',
3468+
'number',
3469+
...this._nativeLoadOptionTypes(),
3470+
],
33303471
[
33313472
this._modelPath,
33323473
this._nCtx,
@@ -3335,6 +3476,7 @@ class LlamaWebGpuBridgeRuntime {
33353476
this._nBatch,
33363477
this._nUbatch,
33373478
candidateLayers,
3479+
...this._nativeLoadOptionValues(),
33383480
],
33393481
{ async: true },
33403482
),
@@ -4079,6 +4221,20 @@ class LlamaWebGpuBridgeRuntime {
40794221
'llamadart.webgpu.n_threads_batch': String(this._threadsBatch),
40804222
'llamadart.webgpu.n_batch': this._nBatch > 0 ? String(this._nBatch) : '',
40814223
'llamadart.webgpu.n_ubatch': this._nUbatch > 0 ? String(this._nUbatch) : '',
4224+
'llamadart.webgpu.n_seq_max': this._nSeqMax > 0 ? String(this._nSeqMax) : '',
4225+
'llamadart.webgpu.flash_attention': String(this._flashAttention),
4226+
'llamadart.webgpu.cache_type_k': String(this._cacheTypeK),
4227+
'llamadart.webgpu.cache_type_v': String(this._cacheTypeV),
4228+
'llamadart.webgpu.kv_unified':
4229+
this._kvUnified >= 0 ? String(this._kvUnified) : '',
4230+
'llamadart.webgpu.rope_freq_base':
4231+
this._ropeFrequencyBase > 0 ? String(this._ropeFrequencyBase) : '',
4232+
'llamadart.webgpu.rope_freq_scale':
4233+
this._ropeFrequencyScale > 0 ? String(this._ropeFrequencyScale) : '',
4234+
'llamadart.webgpu.split_mode':
4235+
this._splitMode >= 0 ? String(this._splitMode) : '',
4236+
'llamadart.webgpu.main_gpu':
4237+
this._mainGpu >= 0 ? String(this._mainGpu) : '',
40824238
'llamadart.webgpu.thread_pool_size':
40834239
Number.isFinite(this._threadPoolSizeHint) && this._threadPoolSizeHint > 0
40844240
? String(this._threadPoolSizeHint)

0 commit comments

Comments
 (0)