Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@ Useful environment overrides:
- `OUT_DIR`
- `CMAKE_BUILD_TYPE`

### Local Verification Notes

When validating bridge runtime changes locally, keep build/cache output outside
the repo so generated wasm artifacts and toolchain caches do not dirty the
checkout or hit sandboxed Homebrew/cache paths:

```bash
export CCACHE_DIR=/private/tmp/llama_web_bridge_ccache
export EM_CACHE=/private/tmp/llama_web_bridge_emcache
BUILD_DIR=/private/tmp/llama_web_bridge_build MEM64_BUILD_DIR=/private/tmp/llama_web_bridge_build_mem64 OUT_DIR=/private/tmp/llama_web_bridge_dist WEBGPU_BRIDGE_BUILD_MEM64=1 ./scripts/build_bridge.sh
```

## CI / Release

- CI build gate: `.github/workflows/ci.yml`
Expand All @@ -52,3 +64,12 @@ After publishing assets tag:
1. Update/fetch pinned bridge assets in `llamadart`:
`WEBGPU_BRIDGE_ASSETS_TAG=<tag> ./scripts/fetch_webgpu_bridge_assets.sh`
2. Update docs/changelog in `llamadart` if behavior changed.

## Regression Smoke Guidance

- For pthread/runtime changes, test a BERT-class embedding model in Chromium
with cross-origin isolation enabled. The regression scenario is: call
`loadModelFromUrl`, `tokenize`, `embed`, and `embedBatch` on a host where
`navigator.hardwareConcurrency` exceeds the bridge pthread pool size.
- Run the smoke through both direct runtime (`disableWorker: true`) and the
bridge worker path; both should report `n_threads` capped to the pool size.
13 changes: 11 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ option(LLAMADART_WEBGPU_MEM64 "Build WebGPU bridge core with wasm64/memory64" OF
set(LLAMADART_WEBGPU_MEM64_MAX_MEMORY "12884901888" CACHE STRING "Max wasm64 linear memory in bytes")
option(LLAMADART_WEBGPU_PTHREADS "Enable pthread support for bridge runtime" ON)
set(LLAMADART_WEBGPU_PTHREAD_POOL_SIZE "4" CACHE STRING "PThread pool size for bridge runtime")
set(LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT "0" CACHE STRING "PThread pool strictness for bridge runtime")
option(LLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH "Allow wasm linear memory growth" ON)
set(LLAMADART_WEBGPU_INITIAL_MEMORY "0" CACHE STRING "Initial wasm linear memory in bytes when growth is disabled")

Expand All @@ -40,6 +41,10 @@ if (LLAMADART_WEBGPU_PTHREADS)
add_link_options("-pthread")
endif()

# Emscripten provides pthread support via compile/link flags rather than a
# separate libc probe result, and newer local toolchains can fail CMake's libc
# pthread check before llama.cpp configures, so pre-seed the probe result.
set(CMAKE_HAVE_LIBC_PTHREAD ON CACHE BOOL "Emscripten pthread availability" FORCE)

if (LLAMADART_WEBGPU_MEM64)
set(LLAMA_WASM_MEM64 ON CACHE BOOL "" FORCE)
add_compile_options("-sMEMORY64=1")
Expand Down Expand Up @@ -102,6 +107,10 @@ add_executable(llama_webgpu_core src/llama_webgpu_core.cpp)

target_compile_features(llama_webgpu_core PRIVATE cxx_std_17)

target_compile_definitions(llama_webgpu_core PRIVATE
LLAMADART_WEBGPU_PTHREAD_POOL_SIZE=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}
)

target_include_directories(llama_webgpu_core PRIVATE
"${LLAMA_CPP_DIR}/include"
"${LLAMA_CPP_DIR}/ggml/include"
Expand All @@ -120,7 +129,7 @@ set(LLAMADART_WEBGPU_LINK_OPTIONS
"-sEXPORT_NAME=createLlamaWebGpuCoreModule"
"-sENVIRONMENT=web,worker"
"-sEXPORTED_RUNTIME_METHODS=['FS','ccall','UTF8ToString']"
"-sEXPORTED_FUNCTIONS=['_main','_llamadart_webgpu_probe','_llamadart_webgpu_supports_pthreads','_llamadart_webgpu_backends_json','_llamadart_webgpu_last_error','_llamadart_webgpu_set_log_level','_llamadart_webgpu_load_model','_llamadart_webgpu_load_model_from_url','_llamadart_webgpu_mmproj_load','_llamadart_webgpu_mmproj_free','_llamadart_webgpu_mmproj_supports_vision','_llamadart_webgpu_mmproj_supports_audio','_llamadart_webgpu_media_clear_pending','_llamadart_webgpu_media_add_file','_llamadart_webgpu_media_add_encoded','_llamadart_webgpu_media_add_rgb','_llamadart_webgpu_media_add_audio_f32','_llamadart_webgpu_tokenize_to_json','_llamadart_webgpu_last_tokens_json','_llamadart_webgpu_detokenize_from_json','_llamadart_webgpu_last_detokenized','_llamadart_webgpu_embed_to_json','_llamadart_webgpu_last_embedding_json','_llamadart_webgpu_generate','_llamadart_webgpu_begin_generation','_llamadart_webgpu_next_token','_llamadart_webgpu_last_piece','_llamadart_webgpu_end_generation','_llamadart_webgpu_request_cancel','_llamadart_webgpu_last_output','_llamadart_webgpu_get_context_size','_llamadart_webgpu_model_meta_json','_llamadart_webgpu_shutdown']"
"-sEXPORTED_FUNCTIONS=['_main','_llamadart_webgpu_probe','_llamadart_webgpu_supports_pthreads','_llamadart_webgpu_pthread_pool_size','_llamadart_webgpu_backends_json','_llamadart_webgpu_last_error','_llamadart_webgpu_set_log_level','_llamadart_webgpu_load_model','_llamadart_webgpu_load_model_from_url','_llamadart_webgpu_mmproj_load','_llamadart_webgpu_mmproj_free','_llamadart_webgpu_mmproj_supports_vision','_llamadart_webgpu_mmproj_supports_audio','_llamadart_webgpu_media_clear_pending','_llamadart_webgpu_media_add_file','_llamadart_webgpu_media_add_encoded','_llamadart_webgpu_media_add_rgb','_llamadart_webgpu_media_add_audio_f32','_llamadart_webgpu_tokenize_to_json','_llamadart_webgpu_last_tokens_json','_llamadart_webgpu_detokenize_from_json','_llamadart_webgpu_last_detokenized','_llamadart_webgpu_embed_to_json','_llamadart_webgpu_last_embedding_json','_llamadart_webgpu_generate','_llamadart_webgpu_begin_generation','_llamadart_webgpu_next_token','_llamadart_webgpu_last_piece','_llamadart_webgpu_end_generation','_llamadart_webgpu_request_cancel','_llamadart_webgpu_last_output','_llamadart_webgpu_get_context_size','_llamadart_webgpu_model_meta_json','_llamadart_webgpu_shutdown']"
"-lwasmfs_fetch.js"
)

Expand All @@ -141,7 +150,7 @@ endif()

if (LLAMADART_WEBGPU_PTHREADS)
list(APPEND LLAMADART_WEBGPU_LINK_OPTIONS
"-sPTHREAD_POOL_SIZE_STRICT=2"
"-sPTHREAD_POOL_SIZE_STRICT=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT}"
"-sPTHREAD_POOL_SIZE=${LLAMADART_WEBGPU_PTHREAD_POOL_SIZE}"
)
endif()
Expand Down
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,18 @@ Useful environment variables:
- `WEBGPU_BRIDGE_MEM64_MAX_MEMORY` (optional wasm64 max linear memory bytes)
- `WEBGPU_BRIDGE_PTHREADS` (`1`/`0`, defaults to `1`)
- `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE` (defaults to `4`)
- `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT` (defaults to `0`)

Notes:

- wasm64 builds default to `WEBGPU_BRIDGE_MEM64_MAX_MEMORY=12884901888` (12 GiB).
- Large single-file remote model loading requires a cross-origin isolated page
(`COOP`/`COEP`) so worker-thread runtime paths are available.
- pthread builds enable `-sPTHREAD_POOL_SIZE_STRICT=2` so pool exhaustion
throws explicit errors instead of risking deadlock.
- pthread builds preallocate `WEBGPU_BRIDGE_PTHREAD_POOL_SIZE` workers and cap
bridge-selected thread counts to that compiled pool size.
`WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT` defaults to `0` so an unexpected
over-pool request does not hard-abort the wasm runtime, but it can be
overridden for stricter local diagnostics.

Build outputs:

Expand Down
13 changes: 13 additions & 0 deletions js/llama_webgpu_bridge.js
Original file line number Diff line number Diff line change
Expand Up @@ -1435,6 +1435,19 @@ class LlamaWebGpuBridgeRuntime {
return null;
}

try {
if (typeof core.ccall === 'function') {
const compiledPoolSize = Number(
core.ccall('llamadart_webgpu_pthread_pool_size', 'number', [], []),
);
if (Number.isFinite(compiledPoolSize) && compiledPoolSize > 0) {
return Math.max(1, Math.trunc(compiledPoolSize));
}
}
} catch (_) {
// Ignore lookup failures and fall back to runtime heuristics.
}

try {
const pThread = core.PThread;
if (!pThread || typeof pThread !== 'object') {
Expand Down
52 changes: 38 additions & 14 deletions scripts/build_bridge.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ BUILD_MEM64="${WEBGPU_BRIDGE_BUILD_MEM64:-0}"
MEM64_MAX_MEMORY="${WEBGPU_BRIDGE_MEM64_MAX_MEMORY:-12884901888}"
ENABLE_PTHREADS="${WEBGPU_BRIDGE_PTHREADS:-1}"
PTHREAD_POOL_SIZE="${WEBGPU_BRIDGE_PTHREAD_POOL_SIZE:-4}"
PTHREAD_POOL_SIZE_STRICT="${WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT:-0}"
ALLOW_MEMORY_GROWTH="${WEBGPU_BRIDGE_ALLOW_MEMORY_GROWTH:-1}"
INITIAL_MEMORY="${WEBGPU_BRIDGE_INITIAL_MEMORY:-0}"

Expand Down Expand Up @@ -43,6 +44,7 @@ Environment variables:
WEBGPU_BRIDGE_MEM64_MAX_MEMORY wasm64 max linear memory bytes (default: 12884901888)
WEBGPU_BRIDGE_PTHREADS Enable pthread runtime support (default: 1)
WEBGPU_BRIDGE_PTHREAD_POOL_SIZE PThread pool size when enabled (default: 4)
WEBGPU_BRIDGE_PTHREAD_POOL_SIZE_STRICT PThread strictness when enabled (default: 0)
WEBGPU_BRIDGE_ALLOW_MEMORY_GROWTH Allow wasm memory growth (default: 1)
WEBGPU_BRIDGE_INITIAL_MEMORY Fixed wasm memory bytes when growth disabled

Expand Down Expand Up @@ -82,6 +84,7 @@ emcmake cmake \
-DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-DLLAMADART_WEBGPU_PTHREADS="$CMAKE_PTHREADS" \
-DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE="$PTHREAD_POOL_SIZE" \
-DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT="$PTHREAD_POOL_SIZE_STRICT" \
-DLLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH="$CMAKE_ALLOW_MEMORY_GROWTH" \
-DLLAMADART_WEBGPU_INITIAL_MEMORY="$INITIAL_MEMORY"

Expand Down Expand Up @@ -116,6 +119,7 @@ if [[ "$BUILD_MEM64" == "1" ]]; then
-DLLAMADART_WEBGPU_MEM64_MAX_MEMORY="$MEM64_MAX_MEMORY" \
-DLLAMADART_WEBGPU_PTHREADS="$CMAKE_PTHREADS" \
-DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE="$PTHREAD_POOL_SIZE" \
-DLLAMADART_WEBGPU_PTHREAD_POOL_SIZE_STRICT="$PTHREAD_POOL_SIZE_STRICT" \
-DLLAMADART_WEBGPU_ALLOW_MEMORY_GROWTH="$CMAKE_ALLOW_MEMORY_GROWTH" \
-DLLAMADART_WEBGPU_INITIAL_MEMORY="$INITIAL_MEMORY"

Expand All @@ -136,27 +140,47 @@ if [[ "$BUILD_MEM64" == "1" ]]; then
echo "[bridge] applying wasm64 runtime bigint interop patch"
python3 - <<'PY' "$OUT_DIR/llama_webgpu_core_mem64.js"
from pathlib import Path
import re
import sys

target = Path(sys.argv[1])
text = target.read_text(encoding='utf-8', errors='ignore')

replacements = {
"__wasmfs_read(stream.fd,dataBuffer,length)": "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))",
"__wasmfs_read(stream.fd,dataBuffer,BigInt(length))": "__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))",
"__wasmfs_pread(stream.fd,dataBuffer,length,BigInt(position))": "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
"__wasmfs_pread(stream.fd,dataBuffer,BigInt(length),BigInt(position))": "__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
"__wasmfs_write(stream.fd,dataBuffer,length)": "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))",
"__wasmfs_write(stream.fd,dataBuffer,BigInt(length))": "__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))",
"__wasmfs_pwrite(stream.fd,dataBuffer,length,BigInt(position))": "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
"__wasmfs_pwrite(stream.fd,dataBuffer,BigInt(length),BigInt(position))": "__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
"__wasmfs_mmap(length,prot,flags,stream.fd,BigInt(offset))": "__wasmfs_mmap(BigInt(length),prot,flags,stream.fd,BigInt(offset))",
}
def bigint_or_name(name):
    # Build a regex alternative that matches either an explicit
    # BigInt(<name>) wrapper (with optional inner whitespace) or the bare
    # identifier, so already-patched call sites are matched idempotently.
    return r"(?:BigInt\(\s*" + name + r"\s*\)|" + name + ")"

data_buffer = bigint_or_name("dataBuffer")
length = bigint_or_name("length")
position = bigint_or_name("position")
offset = bigint_or_name("offset")

replacements = [
(
rf"__wasmfs_read\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*\)",
"__wasmfs_read(stream.fd,BigInt(dataBuffer),BigInt(length))",
),
(
rf"__wasmfs_pread\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*,\s*{position}\s*\)",
"__wasmfs_pread(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
),
(
rf"__wasmfs_write\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*\)",
"__wasmfs_write(stream.fd,BigInt(dataBuffer),BigInt(length))",
),
(
rf"__wasmfs_pwrite\(\s*stream\.fd\s*,\s*{data_buffer}\s*,\s*{length}\s*,\s*{position}\s*\)",
"__wasmfs_pwrite(stream.fd,BigInt(dataBuffer),BigInt(length),BigInt(position))",
),
(
rf"__wasmfs_mmap\(\s*{length}\s*,\s*prot\s*,\s*flags\s*,\s*stream\.fd\s*,\s*{offset}\s*\)",
"__wasmfs_mmap(BigInt(length),prot,flags,stream.fd,BigInt(offset))",
),
]

changed = False
for old, new in replacements.items():
if old in text:
text = text.replace(old, new)
for pattern, replacement in replacements:
text, count = re.subn(pattern, replacement, text)
if count > 0:
changed = True

if not changed:
Expand Down
10 changes: 10 additions & 0 deletions src/llama_webgpu_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1030,6 +1030,16 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_supports_pthreads() {
#endif
}

// Report the compiled pthread pool size to the JS bridge. Returns the
// build-time LLAMADART_WEBGPU_PTHREAD_POOL_SIZE, clamped to a minimum of 1;
// builds without Emscripten pthread support always report 1.
EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_pthread_pool_size() {
#if defined(__EMSCRIPTEN_PTHREADS__)
    if (LLAMADART_WEBGPU_PTHREAD_POOL_SIZE > 0) {
        return LLAMADART_WEBGPU_PTHREAD_POOL_SIZE;
    }
    // Guard against a zero/negative pool size configured at build time.
    return 1;
#else
    return 1;
#endif
}

EMSCRIPTEN_KEEPALIVE const char * llamadart_webgpu_backends_json() {
refresh_backend_probe();
return g_backend_json.c_str();
Expand Down
Loading