
Commit 7b312c3

Fix BERT embedding pthread pool sizing
1 parent ab2a6d7 commit 7b312c3

6 files changed: 98 additions & 18 deletions


AGENTS.md

Lines changed: 21 additions & 0 deletions
@@ -32,6 +32,18 @@ Useful environment overrides:
 - `OUT_DIR`
 - `CMAKE_BUILD_TYPE`
 
+### Local Verification Notes
+
+When validating bridge runtime changes locally, keep build/cache output outside
+the repo so generated wasm artifacts and toolchain caches do not dirty the
+checkout or hit sandboxed Homebrew/cache paths:
+
+```bash
+export CCACHE_DIR=/private/tmp/llama_web_bridge_ccache
+export EM_CACHE=/private/tmp/llama_web_bridge_emcache
+BUILD_DIR=/private/tmp/llama_web_bridge_build MEM64_BUILD_DIR=/private/tmp/llama_web_bridge_build_mem64 OUT_DIR=/private/tmp/llama_web_bridge_dist WEBGPU_BRIDGE_BUILD_MEM64=1 ./scripts/build_bridge.sh
+```
+
 ## CI / Release
 
 - CI build gate: `.github/workflows/ci.yml`
@@ -52,3 +64,12 @@ After publishing assets tag:
 1. Update/fetch pinned bridge assets in `llamadart`:
    `WEBGPU_BRIDGE_ASSETS_TAG=<tag> ./scripts/fetch_webgpu_bridge_assets.sh`
 2. Update docs/changelog in `llamadart` if behavior changed.
+
+## Regression Smoke Guidance
+
+- For pthread/runtime changes, test a BERT-class embedding model in Chromium
+  with cross-origin isolation enabled. The regression shape is:
+  `loadModelFromUrl`, `tokenize`, `embed`, and `embedBatch` on a host where
+  `navigator.hardwareConcurrency` is greater than the bridge pthread pool size.
+- Run the smoke through both direct runtime (`disableWorker: true`) and the
+  bridge worker path; both should report `n_threads` capped to the pool size.
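
The regression shape in this new guidance can be exercised with a short script. A minimal sketch follows, assuming a Promise-based runtime factory; apart from `loadModelFromUrl`, `tokenize`, `embed`, `embedBatch`, `disableWorker`, and `n_threads`, the names, option fields, and result shapes below are illustrative assumptions, not the published bridge API:

```js
// Hypothetical smoke sketch for the regression shape described above.
// createBridgeRuntime, meta.n_threads, and poolSize are assumed shapes.
async function runEmbeddingSmoke(createBridgeRuntime, modelUrl, poolSize) {
  for (const disableWorker of [true, false]) {
    // Cover both the direct runtime and the bridge worker path.
    const runtime = await createBridgeRuntime({ disableWorker });
    const meta = await runtime.loadModelFromUrl(modelUrl);

    const tokens = await runtime.tokenize('pthread pool smoke text');
    const embedding = await runtime.embed('pthread pool smoke text');
    const batch = await runtime.embedBatch(['first text', 'second text']);

    // Regression check: with hardwareConcurrency above the compiled pool,
    // the reported n_threads must stay capped to the pool size.
    if (navigator.hardwareConcurrency > poolSize && meta.n_threads > poolSize) {
      throw new Error(
        `n_threads ${meta.n_threads} exceeds pthread pool ${poolSize}`,
      );
    }
    console.log({ disableWorker, tokens: tokens.length,
      dims: embedding.length, batchSize: batch.length });
  }
}
```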

CMakeLists.txt

Lines changed: 11 additions & 2 deletions
@@ -32,6 +32,7 @@ option(LLAMADART_WEBGPU_MEM64 "Build WebGPU bridge core with wasm64/memory64" OF
 set(LLAMADART_WEBGPU_MEM64_MAX_MEMORY "12884901888" CACHE STRING "Max wasm64 linear memory in bytes")
 option(LLAMADART_WEBGPU_PTHREADS "Enable pthread support for bridge runtime" ON)
 set(LLAMADART_WEBGPU_PTHREAD_POOL_SIZE "4" CACHE STRING "PThread pool size for bridge runtime")
+set(LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT "0" CACHE STRING "PThread pool strictness for bridge runtime")
 option(LLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH "Allow wasm linear memory growth" ON)
 set(LLAMADART_WEBGPU_INITIAL_MEMORY "0" CACHE STRING "Initial wasm linear memory in bytes when growth is disabled")
 
@@ -40,6 +41,10 @@ if (LLAMADART_WEBGPU_PTHREADS)
   add_link_options("-pthread")
 endif()
 
+# Emscripten controls pthread support through compile/link flags. Newer local
+# toolchains can fail CMake's libc pthread probe before llama.cpp configures.
+set(CMAKE_HAVE_LIBC_PTHREAD ON CACHE BOOL "Emscripten pthread availability" FORCE)
+
 if (LLAMADART_WEBGPU_MEM64)
   set(LLAMA_WASM_MEM64 ON CACHE BOOL "" FORCE)
   add_compile_options("-sMEMORY64=1")
@@ -102,6 +107,10 @@ add_executable(llama_webgpu_core src/llama_webgpu_core.cpp)
 
 target_compile_features(llama_webgpu_core PRIVATE cxx_std_17)
 
+target_compile_definitions(llama_webgpu_core PRIVATE
+  LLAMADART_WEBGPU_PTHREAD_POOL_SIZE=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}
+)
+
 target_include_directories(llama_webgpu_core PRIVATE
   "${LLAMA_CPP_DIR}/include"
   "${LLAMA_CPP_DIR}/ggml/include"
@@ -120,7 +129,7 @@ set(LLAMADART_WEBGPU_LINK_OPTIONS
   "-sEXPORT_NAME=createLlamaWebGpuCoreModule"
   "-sENVIRONMENT=web,worker"
   "-sEXPORTED_RUNTIME_METHODS=['FS','ccall','UTF8ToString']"
-  "-sEXPORTED_FUNCTIONS=['_main','_llamadart_webgpu_probe','_llamadart_webgpu_supports_pthreads','_llamadart_webgpu_backends_json','_llamadart_webgpu_last_error','_llamadart_webgpu_set_log_level','_llamadart_webgpu_load_model','_llamadart_webgpu_load_model_from_url','_llamadart_webgpu_mmproj_load','_llamadart_webgpu_mmproj_free','_llamadart_webgpu_mmproj_supports_vision','_llamadart_webgpu_mmproj_supports_audio','_llamadart_webgpu_media_clear_pending','_llamadart_webgpu_media_add_file','_llamadart_webgpu_media_add_encoded','_llamadart_webgpu_media_add_rgb','_llamadart_webgpu_media_add_audio_f32','_llamadart_webgpu_tokenize_to_json','_llamadart_webgpu_last_tokens_json','_llamadart_webgpu_detokenize_from_json','_llamadart_webgpu_last_detokenized','_llamadart_webgpu_embed_to_json','_llamadart_webgpu_last_embedding_json','_llamadart_webgpu_generate','_llamadart_webgpu_begin_generation','_llamadart_webgpu_next_token','_llamadart_webgpu_last_piece','_llamadart_webgpu_end_generation','_llamadart_webgpu_request_cancel','_llamadart_webgpu_last_output','_llamadart_webgpu_get_context_size','_llamadart_webgpu_model_meta_json','_llamadart_webgpu_shutdown']"
+  "-sEXPORTED_FUNCTIONS=['_main','_llamadart_webgpu_probe','_llamadart_webgpu_supports_pthreads','_llamadart_webgpu_pthread_pool_size','_llamadart_webgpu_backends_json','_llamadart_webgpu_last_error','_llamadart_webgpu_set_log_level','_llamadart_webgpu_load_model','_llamadart_webgpu_load_model_from_url','_llamadart_webgpu_mmproj_load','_llamadart_webgpu_mmproj_free','_llamadart_webgpu_mmproj_supports_vision','_llamadart_webgpu_mmproj_supports_audio','_llamadart_webgpu_media_clear_pending','_llamadart_webgpu_media_add_file','_llamadart_webgpu_media_add_encoded','_llamadart_webgpu_media_add_rgb','_llamadart_webgpu_media_add_audio_f32','_llamadart_webgpu_tokenize_to_json','_llamadart_webgpu_last_tokens_json','_llamadart_webgpu_detokenize_from_json','_llamadart_webgpu_last_detokenized','_llamadart_webgpu_embed_to_json','_llamadart_webgpu_last_embedding_json','_llamadart_webgpu_generate','_llamadart_webgpu_begin_generation','_llamadart_webgpu_next_token','_llamadart_webgpu_last_piece','_llamadart_webgpu_end_generation','_llamadart_webgpu_request_cancel','_llamadart_webgpu_last_output','_llamadart_webgpu_get_context_size','_llamadart_webgpu_model_meta_json','_llamadart_webgpu_shutdown']"
   "-lwasmfs_fetch.js"
 )
 
@@ -141,7 +150,7 @@ endif()
 
 if (LLAMADART_WEBGPU_PTHREADS)
   list(APPEND LLAMADART_WEBGPU_LINK_OPTIONS
-    "-sPTHREAD_POOL_SIZE_STRICT=2"
+    "-sPTHREAD_POOL_SIZE_STRICT=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT}"
    "-sPTHREAD_POOL_SIZE=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}"
   )
 endif()

README.md

Lines changed: 5 additions & 2 deletions
@@ -30,14 +30,17 @@ Useful environment variables:
 - `WEBGPU_BRIDGE_MEM64_MAX_MEMORY` (optional wasm64 max linear memory bytes)
 - `WEBGPU_BRIDGE_PTHREADS` (`1`/`0`, defaults to `1`)
 - `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE` (defaults to `4`)
+- `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT` (defaults to `0`)
 
 Notes:
 
 - wasm64 builds default to `WEBGPU_BRIDGE_MEM64_MAX_MEMORY=12884901888` (12 GiB).
 - Large single-file remote model loading requires a cross-origin isolated page
   (`COOP`/`COEP`) so worker-thread runtime paths are available.
-- pthread builds enable `-sPTHREAD_POOL_SIZE_STRICT=2` so pool exhaustion
-  throws explicit errors instead of risking deadlock.
+- pthread builds preallocate `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE` workers and cap
+  bridge-selected thread counts to that compiled pool size. The linker keeps
+  `PTHREAD_POOL_SIZE_STRICT=0` as a fallback so an unexpected over-pool request
+  does not hard-abort the wasm runtime.
 
 Build outputs:
 
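
The capping behavior described in the updated README note amounts to clamping any requested thread count against the compiled pool size. A minimal sketch, assuming an instantiated core module whose `ccall` can reach the `llamadart_webgpu_pthread_pool_size` export added in this commit (the helper itself is illustrative, not bridge code):

```js
// Illustrative clamp: never let the selected thread count exceed the
// compiled pthread pool, whatever the host advertises.
function pickThreadCount(core, requested) {
  let poolSize = 1;
  try {
    // Export added in this commit; returns the compile-time pool size (>= 1).
    poolSize = Number(
      core.ccall('llamadart_webgpu_pthread_pool_size', 'number', [], []),
    );
  } catch (_) {
    poolSize = 1; // conservative fallback when the export is unreachable
  }
  const hinted = requested || navigator.hardwareConcurrency || 1;
  return Math.max(1, Math.min(hinted, Math.max(1, poolSize)));
}
```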

js/llama_webgpu_bridge.js

Lines changed: 13 additions & 0 deletions
@@ -1435,6 +1435,19 @@ class LlamaWebGpuBridgeRuntime {
       return null;
     }
 
+    try {
+      if (typeof core.ccall === 'function') {
+        const compiledPoolSize = Number(
+          core.ccall('llamadart_webgpu_pthread_pool_size', 'number', [], []),
+        );
+        if (Number.isFinite(compiledPoolSize) && compiledPoolSize > 0) {
+          return Math.max(1, Math.trunc(compiledPoolSize));
+        }
+      }
+    } catch (_) {
+      // Ignore lookup failures and fall back to runtime heuristics.
+    }
+
     try {
       const pThread = core.PThread;
       if (!pThread || typeof pThread !== 'object') {
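
The new lookup above prefers the compile-time pool size and only then falls back to inspecting `core.PThread`. A rough sketch of what such a heuristic fallback could look like, assuming the Emscripten runtime exposes its usual `PThread.unusedWorkers`/`PThread.runningWorkers` arrays (these field names are an assumption about the Emscripten glue, not part of this diff):

```js
// Hypothetical heuristic fallback: estimate the preallocated pool from
// Emscripten's PThread bookkeeping when the compiled-size export is missing.
function estimatePoolSizeFromPThread(core) {
  const pThread = core && core.PThread;
  if (!pThread || typeof pThread !== 'object') {
    return null;
  }
  const unused = Array.isArray(pThread.unusedWorkers) ? pThread.unusedWorkers.length : 0;
  const running = Array.isArray(pThread.runningWorkers) ? pThread.runningWorkers.length : 0;
  const total = unused + running;
  return total > 0 ? total : null;
}
```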

scripts/build_bridge.sh

Lines changed: 38 additions & 14 deletions
@@ -15,6 +15,7 @@ BUILD_MEM64="${WEBGPU_BRIDGE_BUILD_MEM64:-0}"
 MEM64_MAX_MEMORY="${WEBGPU_BRIDGE_MEM64_MAX_MEMORY:-12884901888}"
 ENABLE_PTHREADS="${WEBGPU_BRIDGE_PTHREADS:-1}"
 PTHREAD_POOL_SIZE="${WEBGPU_BRIDGE_PTHREAD_POOL_SIZE:-4}"
+PTHREAD_POOL_SIZE_STRICT="${WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT:-0}"
 ALLOW_MEMORY_GROWTH="${WEBGPU_BRIDGE_ALLOW_MEMORY_GROWTH:-1}"
 INITIAL_MEMORY="${WEBGPU_BRIDGE_INITIAL_MEMORY:-0}"
 
@@ -43,6 +44,7 @@ Environment variables:
   WEBGPU_BRIDGE_MEM64_MAX_MEMORY          wasm64 max linear memory bytes (default: 12884901888)
   WEBGPU_BRIDGE_PTHREADS                  Enable pthread runtime support (default: 1)
   WEBGPU_BRIDGE_PTHREAD_POOL_SIZE         PThread pool size when enabled (default: 4)
+  WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT  PThread strictness when enabled (default: 0)
   WEBGPU_BRIDGE_ALLOW_MEMORY_GROWTH       Allow wasm memory growth (default: 1)
   WEBGPU_BRIDGE_INITIAL_MEMORY            Fixed wasm memory bytes when growth disabled
 
@@ -82,6 +84,7 @@ emcmake cmake \
   -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
   -DLLAMADART_WEBGPU_PTHREADS="$CMAKE_PTHREADS" \
   -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE="$PTHREAD_POOL_SIZE" \
+  -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT="$PTHREAD_POOL_SIZE_STRICT" \
   -DLLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH="$CMAKE_ALLOW_MEMORY_GROWTH" \
   -DLLAMADART_WEBGPU_INITIAL_MEMORY="$INITIAL_MEMORY"
 
@@ -116,6 +119,7 @@ if [[ "$BUILD_MEM64" == "1" ]]; then
     -DLLAMADART_WEBGPU_MEM64_MAX_MEMORY="$MEM64_MAX_MEMORY" \
     -DLLAMADART_WEBGPU_PTHREADS="$CMAKE_PTHREADS" \
     -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE="$PTHREAD_POOL_SIZE" \
+    -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT="$PTHREAD_POOL_SIZE_STRICT" \
     -DLLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH="$CMAKE_ALLOW_MEMORY_GROWTH" \
     -DLLAMADART_WEBGPU_INITIAL_MEMORY="$INITIAL_MEMORY"
 
@@ -136,27 +140,47 @@ if [[ "$BUILD_MEM64" == "1" ]]; then
   echo "[bridge] applying wasm64 runtime bigint interop patch"
   python3 - <<'PY' "$OUT_DIR/llama_webgpu_core_mem64.js"
 from pathlib import Path
+import re
 import sys
 
 target = Path(sys.argv[1])
 text = target.read_text(encoding='utf-8', errors='ignore')
 
-replacements = {
-    "__wasmfs_read(stream.fd,dataBuffer,length)": "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))",
-    "__wasmfs_read(stream.fd,dataBuffer,BigInt(length))": "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))",
-    "__wasmfs_pread(stream.fd,dataBuffer,length,BigInt(position))": "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
-    "__wasmfs_pread(stream.fd,dataBuffer,BigInt(length),BigInt(position))": "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
-    "__wasmfs_write(stream.fd,dataBuffer,length)": "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))",
-    "__wasmfs_write(stream.fd,dataBuffer,BigInt(length))": "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))",
-    "__wasmfs_pwrite(stream.fd,dataBuffer,length,BigInt(position))": "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
-    "__wasmfs_pwrite(stream.fd,dataBuffer,BigInt(length),BigInt(position))": "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
-    "__wasmfs_mmap(length,prot,flags,stream.fd,BigInt(offset))": "__wasmfs_mmap(BigInt(length),prot,flags,stream.fd,BigInt(offset))",
-}
+def bigint_or_name(name):
+    return rf"(?:BigInt\(\s*{name}\s*\)|{name})"
+
+data_buffer = bigint_or_name("dataBuffer")
+length = bigint_or_name("length")
+position = bigint_or_name("position")
+offset = bigint_or_name("offset")
+
+replacements = [
+    (
+        rf"__wasmfs_read\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*\)",
+        "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))",
+    ),
+    (
+        rf"__wasmfs_pread\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*,\s*{position}\s*\)",
+        "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
+    ),
+    (
+        rf"__wasmfs_write\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*\)",
+        "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))",
+    ),
+    (
+        rf"__wasmfs_pwrite\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*,\s*{position}\s*\)",
+        "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
+    ),
+    (
+        rf"__wasmfs_mmap\(\s*{length}\s*,\s*prot\s*,\s*flags\s*,\s*stream\.fd\s*,\s*{offset}\s*\)",
+        "__wasmfs_mmap(BigInt(length),prot,flags,stream.fd,BigInt(offset))",
+    ),
+]
 
 changed = False
-for old, new in replacements.items():
-    if old in text:
-        text = text.replace(old, new)
+for pattern, replacement in replacements:
+    text, count = re.subn(pattern, replacement, text)
+    if count > 0:
         changed = True
 
 if not changed:
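
For context on what the regex patch above rewrites: under MEMORY64 the `__wasmfs_*` imports take 64-bit pointer/length parameters, and the WebAssembly JS API rejects plain Numbers for `i64` arguments, so the patched call sites coerce them with `BigInt(...)`. A simplified, illustrative sketch of the before/after shape (not the exact generated glue):

```js
// Illustrative only: the generated wasm64 glue (llama_webgpu_core_mem64.js)
// contains call sites shaped roughly like this before patching:
//   __wasmfs_read(stream.fd, dataBuffer, length)
// and like this afterwards:
//   __wasmfs_read(stream.fd, BigInt(dataBuffer), BigInt(length))
//
// Standalone sketch of the same coercion, with a stand-in import:
function callWasmfsRead(wasmfsRead, fd, dataBuffer, length) {
  // i64 parameters must be passed as BigInt from JS; plain Numbers throw a
  // TypeError at the wasm boundary under MEMORY64.
  return wasmfsRead(fd, BigInt(dataBuffer), BigInt(length));
}
```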

src/llama_webgpu_core.cpp

Lines changed: 10 additions & 0 deletions
@@ -1030,6 +1030,16 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_supports_pthreads() {
 #endif
 }
 
+EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_pthread_pool_size() {
+#if defined(__EMSCRIPTEN_PTHREADS__)
+    return LLAMADART_WEBGPU_PTHREAD_POOL_SIZE > 0
+        ? LLAMADART_WEBGPU_PTHREAD_POOL_SIZE
+        : 1;
+#else
+    return 1;
+#endif
+}
+
 EMSCRIPTEN_KEEPALIVE const char * llamadart_webgpu_backends_json() {
     refresh_backend_probe();
     return g_backend_json.c_str();
