diff --git a/AGENTS.md b/AGENTS.md index 41d3cbf..08a321d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -32,6 +32,18 @@ Useful environment overrides: - `OUT_DIR` - `CMAKE_BUILD_TYPE` +### Local Verification Notes + +When validating bridge runtime changes locally, keep build/cache output outside +the repo so generated wasm artifacts and toolchain caches do not dirty the +checkout or hit sandboxed Homebrew/cache paths: + +```bash +export CCACHE_DIR=/private/tmp/llama_web_bridge_ccache +export EM_CACHE=/private/tmp/llama_web_bridge_emcache +BUILD_DIR=/private/tmp/llama_web_bridge_build MEM64_BUILD_DIR=/private/tmp/llama_web_bridge_build_mem64 OUT_DIR=/private/tmp/llama_web_bridge_dist WEBGPU_BRIDGE_BUILD_MEM64=1 ./scripts/build_bridge.sh +``` + ## CI / Release - CI build gate: `.github/workflows/ci.yml` @@ -52,3 +64,12 @@ After publishing assets tag: 1. Update/fetch pinned bridge assets in `llamadart`: `WEBGPU_BRIDGE_ASSETS_TAG= ./scripts/fetch_webgpu_bridge_assets.sh` 2. Update docs/changelog in `llamadart` if behavior changed. + +## Regression Smoke Guidance + +- For pthread/runtime changes, test a BERT-class embedding model in Chromium + with cross-origin isolation enabled. The regression shape is: + `loadModelFromUrl`, `tokenize`, `embed`, and `embedBatch` on a host where + `navigator.hardwareConcurrency` is greater than the bridge pthread pool size. +- Run the smoke through both direct runtime (`disableWorker: true`) and the + bridge worker path; both should report `n_threads` capped to the pool size. 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ae6fdf..c8eff32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,8 @@ option(LLAMADART_WEBGPU_MEM64 "Build WebGPU bridge core with wasm64/memory64" OF
 set(LLAMADART_WEBGPU_MEM64_MAX_MEMORY "12884901888" CACHE STRING "Max wasm64 linear memory in bytes")
 option(LLAMADART_WEBGPU_PTHREADS "Enable pthread support for bridge runtime" ON)
 set(LLAMADART_WEBGPU_PTHREAD_POOL_SIZE "4" CACHE STRING "PThread pool size for bridge runtime")
+set(LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT "0" CACHE STRING "PThread pool exhaustion policy for bridge runtime (0=ignore, 1=warn, 2=error)")
+set_property(CACHE LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT PROPERTY STRINGS 0 1 2)
 option(LLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH "Allow wasm linear memory growth" ON)
 set(LLAMADART_WEBGPU_INITIAL_MEMORY "0" CACHE STRING "Initial wasm linear memory in bytes when growth is disabled")
 
@@ -40,6 +42,24 @@ if (LLAMADART_WEBGPU_PTHREADS)
   add_link_options("-pthread")
 endif()
 
+# Emscripten controls pthread support through compile/link flags. Newer local
+# toolchains can fail CMake's libc pthread probe before llama.cpp configures.
+set(CMAKE_HAVE_LIBC_PTHREAD ON CACHE BOOL "Emscripten pthread availability" FORCE)
+
+if (LLAMADART_WEBGPU_PTHREADS)
+  # The pool size is also compiled into the C++ core as an integer macro, so
+  # reject anything that is not a plain non-negative integer at configure time.
+  if (NOT "${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}" MATCHES "^[0-9]+$")
+    message(FATAL_ERROR
+      "LLAMADART_WEBGPU_PTHREAD_POOL_SIZE must be a non-negative integer, got '${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}'")
+  endif()
+  # Emscripten only defines PTHREAD_POOL_SIZE_STRICT for the values 0, 1 and 2.
+  if (NOT "${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT}" MATCHES "^[0-2]$")
+    message(FATAL_ERROR
+      "LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT must be 0, 1, or 2, got '${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT}'")
+  endif()
+endif()
+
 if (LLAMADART_WEBGPU_MEM64)
   set(LLAMA_WASM_MEM64 ON CACHE BOOL "" FORCE)
   add_compile_options("-sMEMORY64=1")
@@ -102,6 +122,10 @@ add_executable(llama_webgpu_core src/llama_webgpu_core.cpp)
 
 target_compile_features(llama_webgpu_core PRIVATE cxx_std_17)
 
+target_compile_definitions(llama_webgpu_core PRIVATE
+  "LLAMADART_WEBGPU_PTHREAD_POOL_SIZE=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}"
+)
+
 target_include_directories(llama_webgpu_core PRIVATE
   "${LLAMA_CPP_DIR}/include"
   "${LLAMA_CPP_DIR}/ggml/include"
@@ -120,7 +144,7 @@ set(LLAMADART_WEBGPU_LINK_OPTIONS
   "-sEXPORT_NAME=createLlamaWebGpuCoreModule"
   "-sENVIRONMENT=web,worker"
   "-sEXPORTED_RUNTIME_METHODS=['FS','ccall','UTF8ToString']"
-  "-sEXPORTED_FUNCTIONS=['_main','_llamadart_webgpu_probe','_llamadart_webgpu_supports_pthreads','_llamadart_webgpu_backends_json','_llamadart_webgpu_last_error','_llamadart_webgpu_set_log_level','_llamadart_webgpu_load_model','_llamadart_webgpu_load_model_from_url','_llamadart_webgpu_mmproj_load','_llamadart_webgpu_mmproj_free','_llamadart_webgpu_mmproj_supports_vision','_llamadart_webgpu_mmproj_supports_audio','_llamadart_webgpu_media_clear_pending','_llamadart_webgpu_media_add_file','_llamadart_webgpu_media_add_encoded','_llamadart_webgpu_media_add_rgb','_llamadart_webgpu_media_add_audio_f32','_llamadart_webgpu_tokenize_to_json','_llamadart_webgpu_last_tokens_json','_llamadart_webgpu_detokenize_from_json','_llamadart_webgpu_last_detokenized','_llamadart_webgpu_embed_to_json','_llamadart_webgpu_last_embedding_json','_llamadart_webgpu_generate','_llamadart_webgpu_begin_generation','_llamadart_webgpu_next_token','_llamadart_webgpu_last_piece','_llamadart_webgpu_end_generation','_llamadart_webgpu_request_cancel','_llamadart_webgpu_last_output','_llamadart_webgpu_get_context_size','_llamadart_webgpu_model_meta_json','_llamadart_webgpu_shutdown']"
+  "-sEXPORTED_FUNCTIONS=['_main','_llamadart_webgpu_probe','_llamadart_webgpu_supports_pthreads','_llamadart_webgpu_pthread_pool_size','_llamadart_webgpu_backends_json','_llamadart_webgpu_last_error','_llamadart_webgpu_set_log_level','_llamadart_webgpu_load_model','_llamadart_webgpu_load_model_from_url','_llamadart_webgpu_mmproj_load','_llamadart_webgpu_mmproj_free','_llamadart_webgpu_mmproj_supports_vision','_llamadart_webgpu_mmproj_supports_audio','_llamadart_webgpu_media_clear_pending','_llamadart_webgpu_media_add_file','_llamadart_webgpu_media_add_encoded','_llamadart_webgpu_media_add_rgb','_llamadart_webgpu_media_add_audio_f32','_llamadart_webgpu_tokenize_to_json','_llamadart_webgpu_last_tokens_json','_llamadart_webgpu_detokenize_from_json','_llamadart_webgpu_last_detokenized','_llamadart_webgpu_embed_to_json','_llamadart_webgpu_last_embedding_json','_llamadart_webgpu_generate','_llamadart_webgpu_begin_generation','_llamadart_webgpu_next_token','_llamadart_webgpu_last_piece','_llamadart_webgpu_end_generation','_llamadart_webgpu_request_cancel','_llamadart_webgpu_last_output','_llamadart_webgpu_get_context_size','_llamadart_webgpu_model_meta_json','_llamadart_webgpu_shutdown']"
   "-lwasmfs_fetch.js"
 )
 
@@ -141,7 +165,7 @@ endif()
 
 if (LLAMADART_WEBGPU_PTHREADS)
   list(APPEND LLAMADART_WEBGPU_LINK_OPTIONS
-    "-sPTHREAD_POOL_SIZE_STRICT=2"
+    "-sPTHREAD_POOL_SIZE_STRICT=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT}"
     "-sPTHREAD_POOL_SIZE=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}"
   )
 endif()
diff --git a/README.md b/README.md
index 9c4dafa..3253d36 100644
--- a/README.md
+++ b/README.md
@@ -30,14 +30,18 @@ Useful environment variables:
 - `WEBGPU_BRIDGE_MEM64_MAX_MEMORY` (optional wasm64 max linear memory bytes)
 - `WEBGPU_BRIDGE_PTHREADS` (`1`/`0`, defaults to `1`)
 - `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE` (defaults to `4`)
+- `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT` (`0` ignore, `1` warn, `2` error on pool exhaustion; defaults to `0`)
 
 Notes:
 
 - wasm64 builds default to `WEBGPU_BRIDGE_MEM64_MAX_MEMORY=12884901888` (12 GiB).
- Large single-file remote model loading requires a cross-origin isolated page (`COOP`/`COEP`) so worker-thread runtime paths are available. -- pthread builds enable `-sPTHREAD_POOL_SIZE_STRICT=2` so pool exhaustion - throws explicit errors instead of risking deadlock. +- pthread builds preallocate `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE` workers and cap + bridge-selected thread counts to that compiled pool size. + `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT` defaults to `0` so an unexpected + over-pool request does not hard-abort the wasm runtime, but it can be + overridden for stricter local diagnostics. Build outputs: diff --git a/js/llama_webgpu_bridge.js b/js/llama_webgpu_bridge.js index 7515e32..71c5bd6 100644 --- a/js/llama_webgpu_bridge.js +++ b/js/llama_webgpu_bridge.js @@ -1435,6 +1435,19 @@ class LlamaWebGpuBridgeRuntime { return null; } + try { + if (typeof core.ccall === 'function') { + const compiledPoolSize = Number( + core.ccall('llamadart_webgpu_pthread_pool_size', 'number', [], []), + ); + if (Number.isFinite(compiledPoolSize) && compiledPoolSize > 0) { + return Math.max(1, Math.trunc(compiledPoolSize)); + } + } + } catch (_) { + // Ignore lookup failures and fall back to runtime heuristics. 
+ } + try { const pThread = core.PThread; if (!pThread || typeof pThread !== 'object') { diff --git a/scripts/build_bridge.sh b/scripts/build_bridge.sh index fde5511..eba092e 100755 --- a/scripts/build_bridge.sh +++ b/scripts/build_bridge.sh @@ -15,6 +15,7 @@ BUILD_MEM64="${WEBGPU_BRIDGE_BUILD_MEM64:-0}" MEM64_MAX_MEMORY="${WEBGPU_BRIDGE_MEM64_MAX_MEMORY:-12884901888}" ENABLE_PTHREADS="${WEBGPU_BRIDGE_PTHREADS:-1}" PTHREAD_POOL_SIZE="${WEBGPU_BRIDGE_PTHREAD_POOL_SIZE:-4}" +PTHREAD_POOL_SIZE_STRICT="${WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT:-0}" ALLOW_MEMORY_GROWTH="${WEBGPU_BRIDGE_ALLOW_MEMORY_GROWTH:-1}" INITIAL_MEMORY="${WEBGPU_BRIDGE_INITIAL_MEMORY:-0}" @@ -43,6 +44,7 @@ Environment variables: WEBGPU_BRIDGE_MEM64_MAX_MEMORY wasm64 max linear memory bytes (default: 12884901888) WEBGPU_BRIDGE_PTHREADS Enable pthread runtime support (default: 1) WEBGPU_BRIDGE_PTHREAD_POOL_SIZE PThread pool size when enabled (default: 4) + WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT PThread strictness when enabled (default: 0) WEBGPU_BRIDGE_ALLOW_MEMORY_GROWTH Allow wasm memory growth (default: 1) WEBGPU_BRIDGE_INITIAL_MEMORY Fixed wasm memory bytes when growth disabled @@ -82,6 +84,7 @@ emcmake cmake \ -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DLLAMADART_WEBGPU_PTHREADS="$CMAKE_PTHREADS" \ -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE="$PTHREAD_POOL_SIZE" \ + -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT="$PTHREAD_POOL_SIZE_STRICT" \ -DLLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH="$CMAKE_ALLOW_MEMORY_GROWTH" \ -DLLAMADART_WEBGPU_INITIAL_MEMORY="$INITIAL_MEMORY" @@ -116,6 +119,7 @@ if [[ "$BUILD_MEM64" == "1" ]]; then -DLLAMADART_WEBGPU_MEM64_MAX_MEMORY="$MEM64_MAX_MEMORY" \ -DLLAMADART_WEBGPU_PTHREADS="$CMAKE_PTHREADS" \ -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE="$PTHREAD_POOL_SIZE" \ + -DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT="$PTHREAD_POOL_SIZE_STRICT" \ -DLLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH="$CMAKE_ALLOW_MEMORY_GROWTH" \ -DLLAMADART_WEBGPU_INITIAL_MEMORY="$INITIAL_MEMORY" @@ -136,27 +140,47 @@ if [[ 
"$BUILD_MEM64" == "1" ]]; then echo "[bridge] applying wasm64 runtime bigint interop patch" python3 - <<'PY' "$OUT_DIR/llama_webgpu_core_mem64.js" from pathlib import Path +import re import sys target = Path(sys.argv[1]) text = target.read_text(encoding='utf-8', errors='ignore') -replacements = { - "__wasmfs_read(stream.fd,dataBuffer,length)": "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))", - "__wasmfs_read(stream.fd,dataBuffer,BigInt(length))": "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))", - "__wasmfs_pread(stream.fd,dataBuffer,length,BigInt(position))": "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))", - "__wasmfs_pread(stream.fd,dataBuffer,BigInt(length),BigInt(position))": "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))", - "__wasmfs_write(stream.fd,dataBuffer,length)": "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))", - "__wasmfs_write(stream.fd,dataBuffer,BigInt(length))": "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))", - "__wasmfs_pwrite(stream.fd,dataBuffer,length,BigInt(position))": "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))", - "__wasmfs_pwrite(stream.fd,dataBuffer,BigInt(length),BigInt(position))": "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))", - "__wasmfs_mmap(length,prot,flags,stream.fd,BigInt(offset))": "__wasmfs_mmap(BigInt(length),prot,flags,stream.fd,BigInt(offset))", -} +def bigint_or_name(name): + return rf"(?:BigInt\(\s*{name}\s*\)|{name})" + +data_buffer = bigint_or_name("dataBuffer") +length = bigint_or_name("length") +position = bigint_or_name("position") +offset = bigint_or_name("offset") + +replacements = [ + ( + rf"__wasmfs_read\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*\)", + "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))", + ), + ( + rf"__wasmfs_pread\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*,\s*{position}\s*\)", + 
"__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))", + ), + ( + rf"__wasmfs_write\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*\)", + "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))", + ), + ( + rf"__wasmfs_pwrite\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*,\s*{position}\s*\)", + "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))", + ), + ( + rf"__wasmfs_mmap\(\s*{length}\s*,\s*prot\s*,\s*flags\s*,\s*stream\.fd\s*,\s*{offset}\s*\)", + "__wasmfs_mmap(BigInt(length),prot,flags,stream.fd,BigInt(offset))", + ), +] changed = False -for old, new in replacements.items(): - if old in text: - text = text.replace(old, new) +for pattern, replacement in replacements: + text, count = re.subn(pattern, replacement, text) + if count > 0: changed = True if not changed: diff --git a/src/llama_webgpu_core.cpp b/src/llama_webgpu_core.cpp index a531a2f..c91d345 100644 --- a/src/llama_webgpu_core.cpp +++ b/src/llama_webgpu_core.cpp @@ -1030,6 +1030,16 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_supports_pthreads() { #endif } +EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_pthread_pool_size() { +#if defined(__EMSCRIPTEN_PTHREADS__) + return LLAMADART_WEBGPU_PTHREAD_POOL_SIZE > 0 + ? LLAMADART_WEBGPU_PTHREAD_POOL_SIZE + : 1; +#else + return 1; +#endif +} + EMSCRIPTEN_KEEPALIVE const char * llamadart_webgpu_backends_json() { refresh_backend_probe(); return g_backend_json.c_str();