
Commit 7b312c3

Fix BERT embedding pthread pool sizing
1 parent ab2a6d7 commit 7b312c3

6 files changed: 98 additions & 18 deletions


AGENTS.md

Lines changed: 21 additions & 0 deletions
@@ -32,6 +32,18 @@ Useful environment overrides:
 - `OUT_DIR`
 - `CMAKE_BUILD_TYPE`
 
+### Local Verification Notes
+
+When validating bridge runtime changes locally, keep build/cache output outside
+the repo so generated wasm artifacts and toolchain caches do not dirty the
+checkout or hit sandboxed Homebrew/cache paths:
+
+```bash
+export CCACHE_DIR=/private/tmp/llama_web_bridge_ccache
+export EM_CACHE=/private/tmp/llama_web_bridge_emcache
+BUILD_DIR=/private/tmp/llama_web_bridge_build MEM64_BUILD_DIR=/private/tmp/llama_web_bridge_build_mem64 OUT_DIR=/private/tmp/llama_web_bridge_dist WEBGPU_BRIDGE_BUILD_MEM64=1 ./scripts/build_bridge.sh
+```
+
 ## CI / Release
 
 - CI build gate: `.github/workflows/ci.yml`
@@ -52,3 +64,12 @@ After publishing assets tag:
 1. Update/fetch pinned bridge assets in `llamadart`:
    `WEBGPU_BRIDGE_ASSETS_TAG=<tag> ./scripts/fetch_webgpu_bridge_assets.sh`
 2. Update docs/changelog in `llamadart` if behavior changed.
+
+## Regression Smoke Guidance
+
+- For pthread/runtime changes, test a BERT-class embedding model in Chromium
+  with cross-origin isolation enabled. The regression shape is:
+  `loadModelFromUrl`, `tokenize`, `embed`, and `embedBatch` on a host where
+  `navigator.hardwareConcurrency` is greater than the bridge pthread pool size.
+- Run the smoke through both direct runtime (`disableWorker: true`) and the
+  bridge worker path; both should report `n_threads` capped to the pool size.
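
The regression shape in this new guidance can be exercised with a short script. A minimal sketch follows, assuming a Promise-based runtime factory; apart from `loadModelFromUrl`, `tokenize`, `embed`, `embedBatch`, `disableWorker`, and `n_threads`, the names, option fields, and result shapes below are illustrative assumptions, not the published bridge API:

```js
// Hypothetical smoke sketch for the regression shape described above.
// createBridgeRuntime, meta.n_threads, and poolSize are assumed shapes.
async function runEmbeddingSmoke(createBridgeRuntime, modelUrl, poolSize) {
  for (const disableWorker of [true, false]) {
    // Cover both the direct runtime and the bridge worker path.
    const runtime = await createBridgeRuntime({ disableWorker });
    const meta = await runtime.loadModelFromUrl(modelUrl);

    const tokens = await runtime.tokenize('pthread pool smoke text');
    const embedding = await runtime.embed('pthread pool smoke text');
    const batch = await runtime.embedBatch(['first text', 'second text']);

    // Regression check: with hardwareConcurrency above the compiled pool,
    // the reported n_threads must stay capped to the pool size.
    if (navigator.hardwareConcurrency > poolSize && meta.n_threads > poolSize) {
      throw new Error(
        `n_threads ${meta.n_threads} exceeds pthread pool ${poolSize}`,
      );
    }
    console.log({ disableWorker, tokens: tokens.length,
      dims: embedding.length, batchSize: batch.length });
  }
}
```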

CMakeLists.txt

Lines changed: 11 additions & 2 deletions
@@ -32,6 +32,7 @@ option(LLAMADART_WEBGPU_MEM64 "Build WebGPU bridge core with wasm64/memory64" OF
 set(LLAMADART_WEBGPU_MEM64_MAX_MEMORY "12884901888" CACHE STRING "Max wasm64 linear memory in bytes")
 option(LLAMADART_WEBGPU_PTHREADS "Enable pthread support for bridge runtime" ON)
 set(LLAMADART_WEBGPU_PTHREAD_POOL_SIZE "4" CACHE STRING "PThread pool size for bridge runtime")
+set(LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT "0" CACHE STRING "PThread pool strictness for bridge runtime")
 option(LLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH "Allow wasm linear memory growth" ON)
 set(LLAMADART_WEBGPU_INITIAL_MEMORY "0" CACHE STRING "Initial wasm linear memory in bytes when growth is disabled")
 
@@ -40,6 +41,10 @@ if (LLAMADART_WEBGPU_PTHREADS)
   add_link_options("-pthread")
 endif()
 
+# Emscripten controls pthread support through compile/link flags. Newer local
+# toolchains can fail CMake's libc pthread probe before llama.cpp configures.
+set(CMAKE_HAVE_LIBC_PTHREAD ON CACHE BOOL "Emscripten pthread availability" FORCE)
+
 if (LLAMADART_WEBGPU_MEM64)
   set(LLAMA_WASM_MEM64 ON CACHE BOOL "" FORCE)
   add_compile_options("-sMEMORY64=1")
@@ -102,6 +107,10 @@ add_executable(llama_webgpu_core src/llama_webgpu_core.cpp)
 
 target_compile_features(llama_webgpu_core PRIVATE cxx_std_17)
 
+target_compile_definitions(llama_webgpu_core PRIVATE
+  LLAMADART_WEBGPU_PTHREAD_POOL_SIZE=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}
+)
+
 target_include_directories(llama_webgpu_core PRIVATE
   "${LLAMA_CPP_DIR}/include"
   "${LLAMA_CPP_DIR}/ggml/include"
@@ -120,7 +129,7 @@ set(LLAMADART_WEBGPU_LINK_OPTIONS
   "-sEXPORT_NAME=createLlamaWebGpuCoreModule"
   "-sENVIRONMENT=web,worker"
   "-sEXPORTED_RUNTIME_METHODS=['FS','ccall','UTF8ToString']"
-  "-sEXPORTED_FUNCTIONS=['_main','_llamadart_webgpu_probe','_llamadart_webgpu_supports_pthreads','_llamadart_webgpu_backends_json','_llamadart_webgpu_last_error','_llamadart_webgpu_set_log_level','_llamadart_webgpu_load_model','_llamadart_webgpu_load_model_from_url','_llamadart_webgpu_mmproj_load','_llamadart_webgpu_mmproj_free','_llamadart_webgpu_mmproj_supports_vision','_llamadart_webgpu_mmproj_supports_audio','_llamadart_webgpu_media_clear_pending','_llamadart_webgpu_media_add_file','_llamadart_webgpu_media_add_encoded','_llamadart_webgpu_media_add_rgb','_llamadart_webgpu_media_add_audio_f32','_llamadart_webgpu_tokenize_to_json','_llamadart_webgpu_last_tokens_json','_llamadart_webgpu_detokenize_from_json','_llamadart_webgpu_last_detokenized','_llamadart_webgpu_embed_to_json','_llamadart_webgpu_last_embedding_json','_llamadart_webgpu_generate','_llamadart_webgpu_begin_generation','_llamadart_webgpu_next_token','_llamadart_webgpu_last_piece','_llamadart_webgpu_end_generation','_llamadart_webgpu_request_cancel','_llamadart_webgpu_last_output','_llamadart_webgpu_get_context_size','_llamadart_webgpu_model_meta_json','_llamadart_webgpu_shutdown']"
+  "-sEXPORTED_FUNCTIONS=['_main','_llamadart_webgpu_probe','_llamadart_webgpu_supports_pthreads','_llamadart_webgpu_pthread_pool_size','_llamadart_webgpu_backends_json','_llamadart_webgpu_last_error','_llamadart_webgpu_set_log_level','_llamadart_webgpu_load_model','_llamadart_webgpu_load_model_from_url','_llamadart_webgpu_mmproj_load','_llamadart_webgpu_mmproj_free','_llamadart_webgpu_mmproj_supports_vision','_llamadart_webgpu_mmproj_supports_audio','_llamadart_webgpu_media_clear_pending','_llamadart_webgpu_media_add_file','_llamadart_webgpu_media_add_encoded','_llamadart_webgpu_media_add_rgb','_llamadart_webgpu_media_add_audio_f32','_llamadart_webgpu_tokenize_to_json','_llamadart_webgpu_last_tokens_json','_llamadart_webgpu_detokenize_from_json','_llamadart_webgpu_last_detokenized','_llamadart_webgpu_embed_to_json','_llamadart_webgpu_last_embedding_json','_llamadart_webgpu_generate','_llamadart_webgpu_begin_generation','_llamadart_webgpu_next_token','_llamadart_webgpu_last_piece','_llamadart_webgpu_end_generation','_llamadart_webgpu_request_cancel','_llamadart_webgpu_last_output','_llamadart_webgpu_get_context_size','_llamadart_webgpu_model_meta_json','_llamadart_webgpu_shutdown']"
   "-lwasmfs_fetch.js"
 )
 
@@ -141,7 +150,7 @@ endif()
 
 if (LLAMADART_WEBGPU_PTHREADS)
   list(APPEND LLAMADART_WEBGPU_LINK_OPTIONS
-    "-sPTHREAD_POOL_SIZE_STRICT=2"
+    "-sPTHREAD_POOL_SIZE_STRICT=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT}"
    "-sPTHREAD_POOL_SIZE=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}"
   )
 endif()

README.md

Lines changed: 5 additions & 2 deletions
@@ -30,14 +30,17 @@ Useful environment variables:
 - `WEBGPU_BRIDGE_MEM64_MAX_MEMORY` (optional wasm64 max linear memory bytes)
 - `WEBGPU_BRIDGE_PTHREADS` (`1`/`0`, defaults to `1`)
 - `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE` (defaults to `4`)
+- `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT` (defaults to `0`)
 
 Notes:
 
 - wasm64 builds default to `WEBGPU_BRIDGE_MEM64_MAX_MEMORY=12884901888` (12 GiB).
 - Large single-file remote model loading requires a cross-origin isolated page
   (`COOP`/`COEP`) so worker-thread runtime paths are available.
-- pthread builds enable `-sPTHREAD_POOL_SIZE_STRICT=2` so pool exhaustion
-  throws explicit errors instead of risking deadlock.
+- pthread builds preallocate `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE` workers and cap
+  bridge-selected thread counts to that compiled pool size. The linker keeps
+  `PTHREAD_POOL_SIZE_STRICT=0` as a fallback so an unexpected over-pool request
+  does not hard-abort the wasm runtime.
 
 Build outputs:
 
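
The capping behavior described in the updated README note amounts to clamping any requested thread count against the compiled pool size. A minimal sketch, assuming an instantiated core module whose `ccall` can reach the `llamadart_webgpu_pthread_pool_size` export added in this commit (the helper itself is illustrative, not bridge code):

```js
// Illustrative clamp: never let the selected thread count exceed the
// compiled pthread pool, whatever the host advertises.
function pickThreadCount(core, requested) {
  let poolSize = 1;
  try {
    // Export added in this commit; returns the compile-time pool size (>= 1).
    poolSize = Number(
      core.ccall('llamadart_webgpu_pthread_pool_size', 'number', [], []),
    );
  } catch (_) {
    poolSize = 1; // conservative fallback when the export is unreachable
  }
  const hinted = requested || navigator.hardwareConcurrency || 1;
  return Math.max(1, Math.min(hinted, Math.max(1, poolSize)));
}
```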

js/llama_webgpu_bridge.js

Lines changed: 13 additions & 0 deletions
@@ -1435,6 +1435,19 @@ class LlamaWebGpuBridgeRuntime {
       return null;
     }
 
+    try {
+      if (typeof core.ccall === 'function') {
+        const compiledPoolSize = Number(
+          core.ccall('llamadart_webgpu_pthread_pool_size', 'number', [], []),
+        );
+        if (Number.isFinite(compiledPoolSize) && compiledPoolSize > 0) {
+          return Math.max(1, Math.trunc(compiledPoolSize));
+        }
+      }
+    } catch (_) {
+      // Ignore lookup failures and fall back to runtime heuristics.
+    }
+
     try {
       const pThread = core.PThread;
       if (!pThread || typeof pThread !== 'object') {
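
The new lookup above prefers the compile-time pool size and only then falls back to inspecting `core.PThread`. A rough sketch of what such a heuristic fallback could look like, assuming the Emscripten runtime exposes its usual `PThread.unusedWorkers`/`PThread.runningWorkers` arrays (these field names are an assumption about the Emscripten glue, not part of this diff):

```js
// Hypothetical heuristic fallback: estimate the preallocated pool from
// Emscripten's PThread bookkeeping when the compiled-size export is missing.
function estimatePoolSizeFromPThread(core) {
  const pThread = core && core.PThread;
  if (!pThread || typeof pThread !== 'object') {
    return null;
  }
  const unused = Array.isArray(pThread.unusedWorkers) ? pThread.unusedWorkers.length : 0;
  const running = Array.isArray(pThread.runningWorkers) ? pThread.runningWorkers.length : 0;
  const total = unused + running;
  return total > 0 ? total : null;
}
```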

scripts/build_bridge.sh

Lines changed: 38 additions & 14 deletions
@@ -15,6 +15,7 @@ BUILD_MEM64="${WEBGPU_BRIDGE_BUILD_MEM64:-0}"
 MEM64_MAX_MEMORY="${WEBGPU_BRIDGE_MEM64_MAX_MEMORY:-12884901888}"
 ENABLE_PTHREADS="${WEBGPU_BRIDGE_PTHREADS:-1}"
 PTHREAD_POOL_SIZE="${WEBGPU_BRIDGE_PTHREAD_POOL_SIZE:-4}"
+PTHREAD_POOL_SIZE_STRICT="${WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT:-0}"
 ALLOW_MEMORY_GROWTH="${WEBGPU_BRIDGE_ALLOW_MEMORY_GROWTH:-1}"
 INITIAL_MEMORY="${WEBGPU_BRIDGE_INITIAL_MEMORY:-0}"
 
@@ -43,6 +44,7 @@ Environment variables:
   WEBGPU_BRIDGE_MEM64_MAX_MEMORY          wasm64 max linear memory bytes (default: 12884901888)
   WEBGPU_BRIDGE_PTHREADS                  Enable pthread runtime support (default: 1)
   WEBGPU_BRIDGE_PTHREAD_POOL_SIZE         PThread pool size when enabled (default: 4)
+  WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT  PThread strictness when enabled (default: 0)
   WEBGPU_BRIDGE_ALLOW_MEMORY_GROWTH       Allow wasm memory growth (default: 1)
   WEBGPU_BRIDGE_INITIAL_MEMORY            Fixed wasm memory bytes when growth disabled
 
@@ -82,6 +84,7 @@ emcmake cmake \
   -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
   -DLLAMADART_WEBGPU_PTHREADS="$CMAKE_PTHREADS" \
   -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE="$PTHREAD_POOL_SIZE" \
+  -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT="$PTHREAD_POOL_SIZE_STRICT" \
   -DLLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH="$CMAKE_ALLOW_MEMORY_GROWTH" \
   -DLLAMADART_WEBGPU_INITIAL_MEMORY="$INITIAL_MEMORY"
 
@@ -116,6 +119,7 @@ if [[ "$BUILD_MEM64" == "1" ]]; then
     -DLLAMADART_WEBGPU_MEM64_MAX_MEMORY="$MEM64_MAX_MEMORY" \
     -DLLAMADART_WEBGPU_PTHREADS="$CMAKE_PTHREADS" \
     -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE="$PTHREAD_POOL_SIZE" \
+    -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT="$PTHREAD_POOL_SIZE_STRICT" \
     -DLLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH="$CMAKE_ALLOW_MEMORY_GROWTH" \
     -DLLAMADART_WEBGPU_INITIAL_MEMORY="$INITIAL_MEMORY"
 
@@ -136,27 +140,47 @@ if [[ "$BUILD_MEM64" == "1" ]]; then
   echo "[bridge] applying wasm64 runtime bigint interop patch"
   python3 - <<'PY' "$OUT_DIR/llama_webgpu_core_mem64.js"
 from pathlib import Path
+import re
 import sys
 
 target = Path(sys.argv[1])
 text = target.read_text(encoding='utf-8', errors='ignore')
 
-replacements = {
-    "__wasmfs_read(stream.fd,dataBuffer,length)": "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))",
-    "__wasmfs_read(stream.fd,dataBuffer,BigInt(length))": "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))",
-    "__wasmfs_pread(stream.fd,dataBuffer,length,BigInt(position))": "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
-    "__wasmfs_pread(stream.fd,dataBuffer,BigInt(length),BigInt(position))": "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
-    "__wasmfs_write(stream.fd,dataBuffer,length)": "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))",
-    "__wasmfs_write(stream.fd,dataBuffer,BigInt(length))": "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))",
-    "__wasmfs_pwrite(stream.fd,dataBuffer,length,BigInt(position))": "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
-    "__wasmfs_pwrite(stream.fd,dataBuffer,BigInt(length),BigInt(position))": "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
-    "__wasmfs_mmap(length,prot,flags,stream.fd,BigInt(offset))": "__wasmfs_mmap(BigInt(length),prot,flags,stream.fd,BigInt(offset))",
-}
+def bigint_or_name(name):
+    return rf"(?:BigInt\(\s*{name}\s*\)|{name})"
+
+data_buffer = bigint_or_name("dataBuffer")
+length = bigint_or_name("length")
+position = bigint_or_name("position")
+offset = bigint_or_name("offset")
+
+replacements = [
+    (
+        rf"__wasmfs_read\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*\)",
+        "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))",
+    ),
+    (
+        rf"__wasmfs_pread\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*,\s*{position}\s*\)",
+        "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
+    ),
+    (
+        rf"__wasmfs_write\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*\)",
+        "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))",
+    ),
+    (
+        rf"__wasmfs_pwrite\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*,\s*{position}\s*\)",
+        "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
+    ),
+    (
+        rf"__wasmfs_mmap\(\s*{length}\s*,\s*prot\s*,\s*flags\s*,\s*stream\.fd\s*,\s*{offset}\s*\)",
+        "__wasmfs_mmap(BigInt(length),prot,flags,stream.fd,BigInt(offset))",
+    ),
+]
 
 changed = False
-for old, new in replacements.items():
-    if old in text:
-        text = text.replace(old, new)
+for pattern, replacement in replacements:
+    text, count = re.subn(pattern, replacement, text)
+    if count > 0:
         changed = True
 
 if not changed:
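
For context on what the regex patch above rewrites: under MEMORY64 the `__wasmfs_*` imports take 64-bit pointer/length parameters, and the WebAssembly JS API rejects plain Numbers for `i64` arguments, so the patched call sites coerce them with `BigInt(...)`. A simplified, illustrative sketch of the before/after shape (not the exact generated glue):

```js
// Illustrative only: the generated wasm64 glue (llama_webgpu_core_mem64.js)
// contains call sites shaped roughly like this before patching:
//   __wasmfs_read(stream.fd, dataBuffer, length)
// and like this afterwards:
//   __wasmfs_read(stream.fd, BigInt(dataBuffer), BigInt(length))
//
// Standalone sketch of the same coercion, with a stand-in import:
function callWasmfsRead(wasmfsRead, fd, dataBuffer, length) {
  // i64 parameters must be passed as BigInt from JS; plain Numbers throw a
  // TypeError at the wasm boundary under MEMORY64.
  return wasmfsRead(fd, BigInt(dataBuffer), BigInt(length));
}
```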

src/llama_webgpu_core.cpp

Lines changed: 10 additions & 0 deletions
@@ -1030,6 +1030,16 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_supports_pthreads() {
 #endif
 }
 
+EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_pthread_pool_size() {
+#if defined(__EMSCRIPTEN_PTHREADS__)
+    return LLAMADART_WEBGPU_PTHREAD_POOL_SIZE > 0
+        ? LLAMADART_WEBGPU_PTHREAD_POOL_SIZE
+        : 1;
+#else
+    return 1;
+#endif
+}
+
 EMSCRIPTEN_KEEPALIVE const char * llamadart_webgpu_backends_json() {
     refresh_backend_probe();
     return g_backend_json.c_str();
