diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 92752c3..894f3cc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ jobs: name: Build WebGPU Bridge (WASM) runs-on: ubuntu-latest env: - LLAMA_CPP_TAG: b8157 + LLAMA_CPP_TAG: b9016 steps: - uses: actions/checkout@v4 @@ -27,13 +27,17 @@ jobs: - name: Build bridge artifacts env: OUT_DIR: ${{ runner.temp }}/webgpu_bridge_dist + WEBGPU_BRIDGE_BUILD_MEM64: 1 run: ./scripts/build_bridge.sh - name: Verify outputs run: | test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge.js" + test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge_worker.js" test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.js" test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.wasm" + test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.js" + test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.wasm" - name: Upload bridge artifacts uses: actions/upload-artifact@v4 @@ -41,5 +45,8 @@ jobs: name: webgpu-bridge-dist path: | ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge.js + ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge_worker.js ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.js ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.wasm + ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.js + ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.wasm diff --git a/.github/workflows/publish_assets.yml b/.github/workflows/publish_assets.yml index 59be4a8..c040eda 100644 --- a/.github/workflows/publish_assets.yml +++ b/.github/workflows/publish_assets.yml @@ -13,7 +13,7 @@ on: llama_cpp_tag: description: llama.cpp tag to build from required: true - default: b8157 + default: b9016 push: tags: - 'v*' @@ -21,7 +21,7 @@ on: env: ASSETS_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.assets_tag || github.ref_name }} ASSETS_REPO: ${{ github.event_name == 
'workflow_dispatch' && inputs.assets_repo || 'leehack/llama-web-bridge-assets' }} - LLAMA_CPP_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.llama_cpp_tag || 'b8157' }} + LLAMA_CPP_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.llama_cpp_tag || 'b9016' }} permissions: contents: read diff --git a/CMakeLists.txt b/CMakeLists.txt index f3f3197..3ae6fdf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,9 @@ find_package(Threads REQUIRED) set(MTMD_AUDIO_SRC "${LLAMA_CPP_DIR}/tools/mtmd/mtmd-audio.cpp") set(MTMD_AUDIO_PATCHED "${CMAKE_BINARY_DIR}/generated/mtmd-audio-single-thread.cpp") +file(GLOB LLAMADART_MTMD_MODEL_SOURCES + "${LLAMA_CPP_DIR}/tools/mtmd/models/*.cpp") + file(READ "${MTMD_AUDIO_SRC}" MTMD_AUDIO_CONTENT) string(FIND "${MTMD_AUDIO_CONTENT}" "4, // n_threads" MTMD_AUDIO_THREAD_MARKER_INDEX) if (MTMD_AUDIO_THREAD_MARKER_INDEX EQUAL -1) @@ -74,39 +77,10 @@ string(REPLACE file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/generated") file(WRITE "${MTMD_AUDIO_PATCHED}" "${MTMD_AUDIO_CONTENT}") -set(LLAMADART_MTMD_MODEL_SOURCES - "${LLAMA_CPP_DIR}/tools/mtmd/models/cogvlm.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/conformer.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/glm4v.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/internvl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/kimivl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/kimik25.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/llama4.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/llava.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/minicpmv.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/pixtral.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/qwen2vl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/qwen3vl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/siglip.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/whisper-enc.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/mobilenetv5.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/youtuvl.cpp" -) - -set(LLAMADART_MTMD_OPTIONAL_MODEL_SOURCES - 
"${LLAMA_CPP_DIR}/tools/mtmd/models/nemotron-v2-vl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/paddleocr.cpp" -) - -foreach(model_source IN LISTS LLAMADART_MTMD_OPTIONAL_MODEL_SOURCES) - if (EXISTS "${model_source}") - list(APPEND LLAMADART_MTMD_MODEL_SOURCES "${model_source}") - endif() -endforeach() - add_library(llamadart_mtmd STATIC "${LLAMA_CPP_DIR}/tools/mtmd/mtmd.cpp" "${MTMD_AUDIO_PATCHED}" + "${LLAMA_CPP_DIR}/tools/mtmd/mtmd-image.cpp" "${LLAMA_CPP_DIR}/tools/mtmd/mtmd-helper.cpp" "${LLAMA_CPP_DIR}/tools/mtmd/clip.cpp" ${LLAMADART_MTMD_MODEL_SOURCES} diff --git a/README.md b/README.md index e910e94..9c4dafa 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ This repo includes a wasm build gate in: - `.github/workflows/ci.yml` -It builds against pinned `llama.cpp` tag `b8157` and uploads build artifacts. +It builds against pinned `llama.cpp` tag `b9016` and uploads build artifacts. ## Publishing @@ -93,7 +93,7 @@ Manual override example: 2. Inputs: - `assets_tag`: `v0.1.5` - `assets_repo`: `leehack/llama-web-bridge-assets` - - `llama_cpp_tag`: `b8157` + - `llama_cpp_tag`: `b9016` After publish, assets are CDN-available at: diff --git a/js/llama_webgpu_bridge.js b/js/llama_webgpu_bridge.js index 6a24bf5..7515e32 100644 --- a/js/llama_webgpu_bridge.js +++ b/js/llama_webgpu_bridge.js @@ -108,6 +108,44 @@ function parsePositiveInteger(value) { return Math.trunc(numeric); } +function parseInteger(value, fallback = 0) { + const numeric = Number(value); + if (!Number.isFinite(numeric)) { + return fallback; + } + return Math.trunc(numeric); +} + +function parseBooleanFlag(value, fallback = false) { + if (typeof value === 'boolean') { + return value; + } + if (typeof value === 'number' && Number.isFinite(value)) { + return value !== 0; + } + return fallback; +} + +function parseOptionalBooleanFlag(value) { + if (typeof value === 'boolean') { + return value ? 1 : 0; + } + if (typeof value === 'number' && Number.isFinite(value)) { + return value !== 0 ? 
1 : 0; + } + return -1; +} + +function parseEnumValue(value, allowed, fallback) { + const parsed = parseInteger(value, fallback); + return allowed.includes(parsed) ? parsed : fallback; +} + +function parsePositiveNumber(value) { + const numeric = Number(value); + return Number.isFinite(numeric) && numeric > 0 ? numeric : 0; +} + function parseTotalFromContentRangeHeader(contentRangeHeader) { if (typeof contentRangeHeader !== 'string' || contentRangeHeader.length === 0) { return 0; @@ -1329,6 +1367,17 @@ class LlamaWebGpuBridgeRuntime { this._nGpuLayers = Number.isFinite(config.nGpuLayers) ? Number(config.nGpuLayers) : -1; + this._nSeqMax = 0; + this._useMmap = false; + this._useMlock = false; + this._flashAttention = -1; + this._cacheTypeK = 1; + this._cacheTypeV = 1; + this._kvUnified = -1; + this._ropeFrequencyBase = 0; + this._ropeFrequencyScale = 0; + this._splitMode = -1; + this._mainGpu = -1; this._isSafari = isSafariUserAgent(this._config.userAgent ?? globalThis.navigator?.userAgent ?? 
''); this._coreVariant = 'uninitialized'; this._preferMemory64 = this._config.preferMemory64 !== false; @@ -1963,6 +2012,70 @@ class LlamaWebGpuBridgeRuntime { } } + _resolveNativeLoadOptions(options = {}) { + this._nSeqMax = parsePositiveInteger(options.nSeqMax); + this._useMmap = parseBooleanFlag(options.useMmap, false); + this._useMlock = parseBooleanFlag(options.useMlock, false); + this._flashAttention = parseEnumValue(options.flashAttention, [-1, 0, 1], -1); + this._cacheTypeK = parseEnumValue(options.cacheTypeK, [1, 2, 8], 1); + this._cacheTypeV = parseEnumValue(options.cacheTypeV, [1, 2, 8], 1); + this._kvUnified = parseOptionalBooleanFlag(options.kvUnified); + this._ropeFrequencyBase = parsePositiveNumber(options.ropeFrequencyBase); + this._ropeFrequencyScale = parsePositiveNumber(options.ropeFrequencyScale); + this._splitMode = parseEnumValue(options.splitMode, [0, 1, 2, 3], -1); + this._mainGpu = parseInteger(options.mainGpu, -1); + if (this._mainGpu < 0) { + this._mainGpu = -1; + } + + const wantsQuantizedKvCache = this._cacheTypeK !== 1 || this._cacheTypeV !== 1; + if (this._flashAttention === 0 && wantsQuantizedKvCache) { + throw new Error( + 'Non-F16 KV cache requires flashAttention to be auto or enabled.', + ); + } + if (this._flashAttention === -1 && wantsQuantizedKvCache) { + this._flashAttention = 1; + this._runtimeNotes.push('flash_attention:auto_enabled_for_kv_cache'); + } + if (this._kvUnified < 0 && this._nSeqMax > 1) { + this._kvUnified = 1; + this._runtimeNotes.push('kv_unified:auto_enabled_for_sequences'); + } + } + + _nativeLoadOptionValues() { + return [ + this._nSeqMax, + this._useMmap ? 1 : 0, + this._useMlock ? 
1 : 0, + this._flashAttention, + this._cacheTypeK, + this._cacheTypeV, + this._kvUnified, + this._ropeFrequencyBase, + this._ropeFrequencyScale, + this._splitMode, + this._mainGpu, + ]; + } + + _nativeLoadOptionTypes() { + return [ + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + ]; + } + async _tryLoadModelFromRemoteFetchBackend(core, url, options = {}) { if (!this._canUseRemoteFetchBackend(options)) { return { loaded: false, sizeBytes: null }; @@ -2031,6 +2144,7 @@ class LlamaWebGpuBridgeRuntime { 'number', 'number', 'number', + ...this._nativeLoadOptionTypes(), ], [ remoteFetchUrl, @@ -2041,6 +2155,7 @@ class LlamaWebGpuBridgeRuntime { this._nUbatch, this._nGpuLayers, chunkBytes, + ...this._nativeLoadOptionValues(), ], { async: true }, ), @@ -2926,6 +3041,8 @@ class LlamaWebGpuBridgeRuntime { this._nUbatch = this._nBatch; } + this._resolveNativeLoadOptions(options); + if (Number.isFinite(this._threadPoolSizeHint) && this._threadPoolSizeHint > 0) { this._pushRuntimeNote(`thread_pool_size:${this._threadPoolSizeHint}`); } @@ -2947,6 +3064,9 @@ class LlamaWebGpuBridgeRuntime { if (this._nUbatch > 0) { this._pushRuntimeNote(`n_ubatch:${this._nUbatch}`); } + if (this._nSeqMax > 0) { + this._pushRuntimeNote(`n_seq_max:${this._nSeqMax}`); + } if (isCpuModelMode && !Number.isFinite(requestedBatch) && !Number.isFinite(requestedUbatch)) { this._runtimeNotes.push('cpu_batch_tuned_default'); } @@ -3174,7 +3294,16 @@ class LlamaWebGpuBridgeRuntime { await core.ccall( 'llamadart_webgpu_load_model', 'number', - ['string', 'number', 'number', 'number', 'number', 'number', 'number'], + [ + 'string', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + ...this._nativeLoadOptionTypes(), + ], [ this._modelPath, this._nCtx, @@ -3183,6 +3312,7 @@ class LlamaWebGpuBridgeRuntime { this._nBatch, this._nUbatch, this._nGpuLayers, + ...this._nativeLoadOptionValues(), ], { async: true }, 
), @@ -3307,6 +3437,7 @@ class LlamaWebGpuBridgeRuntime { 'number', 'number', 'number', + ...this._nativeLoadOptionTypes(), ], [ reloadUrl, @@ -3317,6 +3448,7 @@ class LlamaWebGpuBridgeRuntime { this._nUbatch, candidateLayers, remoteFetchReloadChunkBytes, + ...this._nativeLoadOptionValues(), ], { async: true }, ), @@ -3326,7 +3458,16 @@ class LlamaWebGpuBridgeRuntime { await core.ccall( 'llamadart_webgpu_load_model', 'number', - ['string', 'number', 'number', 'number', 'number', 'number', 'number'], + [ + 'string', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + ...this._nativeLoadOptionTypes(), + ], [ this._modelPath, this._nCtx, @@ -3335,6 +3476,7 @@ class LlamaWebGpuBridgeRuntime { this._nBatch, this._nUbatch, candidateLayers, + ...this._nativeLoadOptionValues(), ], { async: true }, ), @@ -4079,6 +4221,20 @@ class LlamaWebGpuBridgeRuntime { 'llamadart.webgpu.n_threads_batch': String(this._threadsBatch), 'llamadart.webgpu.n_batch': this._nBatch > 0 ? String(this._nBatch) : '', 'llamadart.webgpu.n_ubatch': this._nUbatch > 0 ? String(this._nUbatch) : '', + 'llamadart.webgpu.n_seq_max': this._nSeqMax > 0 ? String(this._nSeqMax) : '', + 'llamadart.webgpu.flash_attention': String(this._flashAttention), + 'llamadart.webgpu.cache_type_k': String(this._cacheTypeK), + 'llamadart.webgpu.cache_type_v': String(this._cacheTypeV), + 'llamadart.webgpu.kv_unified': + this._kvUnified >= 0 ? String(this._kvUnified) : '', + 'llamadart.webgpu.rope_freq_base': + this._ropeFrequencyBase > 0 ? String(this._ropeFrequencyBase) : '', + 'llamadart.webgpu.rope_freq_scale': + this._ropeFrequencyScale > 0 ? String(this._ropeFrequencyScale) : '', + 'llamadart.webgpu.split_mode': + this._splitMode >= 0 ? String(this._splitMode) : '', + 'llamadart.webgpu.main_gpu': + this._mainGpu >= 0 ? String(this._mainGpu) : '', 'llamadart.webgpu.thread_pool_size': Number.isFinite(this._threadPoolSizeHint) && this._threadPoolSizeHint > 0 ? 
String(this._threadPoolSizeHint) diff --git a/src/llama_webgpu_core.cpp b/src/llama_webgpu_core.cpp index 0bb311b..a531a2f 100644 --- a/src/llama_webgpu_core.cpp +++ b/src/llama_webgpu_core.cpp @@ -887,6 +887,17 @@ int32_t next_token_impl() { return 1; } +bool is_supported_kv_cache_type(int32_t value) { + switch (value) { + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } +} + int32_t load_model_internal( const char * model_path, int32_t n_ctx, @@ -895,12 +906,29 @@ int32_t load_model_internal( int32_t n_batch, int32_t n_ubatch, int32_t n_gpu_layers, - bool use_mmap) { + int32_t n_seq_max, + bool use_mmap, + bool use_mlock, + int32_t flash_attn_type, + int32_t type_k, + int32_t type_v, + int32_t kv_unified, + double rope_freq_base, + double rope_freq_scale, + int32_t split_mode, + int32_t main_gpu) { llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; mparams.use_mmap = use_mmap; - mparams.use_mlock = false; + mparams.use_mlock = use_mlock; mparams.vocab_only = false; + if (split_mode >= LLAMA_SPLIT_MODE_NONE && + split_mode <= LLAMA_SPLIT_MODE_TENSOR) { + mparams.split_mode = static_cast<llama_split_mode>(split_mode); + } + if (main_gpu >= 0) { + mparams.main_gpu = main_gpu; + } g_state.model = llama_model_load_from_file(model_path, mparams); if (g_state.model == nullptr) { @@ -913,6 +941,10 @@ int32_t load_model_internal( cparams.n_ctx = static_cast<uint32_t>(n_ctx); } + if (n_seq_max > 0) { + cparams.n_seq_max = static_cast<uint32_t>(n_seq_max); + } + if (n_threads > 0) { cparams.n_threads = n_threads; } @@ -939,6 +971,27 @@ int32_t load_model_internal( cparams.n_ubatch = std::min(cparams.n_batch, 512U); } + if (flash_attn_type >= LLAMA_FLASH_ATTN_TYPE_AUTO && + flash_attn_type <= LLAMA_FLASH_ATTN_TYPE_ENABLED) { + cparams.flash_attn_type = + static_cast<llama_flash_attn_type>(flash_attn_type); + } + if (is_supported_kv_cache_type(type_k)) { + cparams.type_k = static_cast<ggml_type>(type_k); + } + if 
(is_supported_kv_cache_type(type_v)) { + cparams.type_v = static_cast<ggml_type>(type_v); + } + if (kv_unified >= 0) { + cparams.kv_unified = kv_unified != 0; + } + if (rope_freq_base > 0.0) { + cparams.rope_freq_base = static_cast<float>(rope_freq_base); + } + if (rope_freq_scale > 0.0) { + cparams.rope_freq_scale = static_cast<float>(rope_freq_scale); + } + const bool enable_gpu_ops = n_gpu_layers > 0; g_model_uses_gpu_ops = enable_gpu_ops; cparams.offload_kqv = enable_gpu_ops; @@ -1005,7 +1058,18 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model( int32_t n_threads_batch, int32_t n_batch, int32_t n_ubatch, - int32_t n_gpu_layers) { + int32_t n_gpu_layers, + int32_t n_seq_max, + int32_t use_mmap, + int32_t use_mlock, + int32_t flash_attn_type, + int32_t type_k, + int32_t type_v, + int32_t kv_unified, + double rope_freq_base, + double rope_freq_scale, + int32_t split_mode, + int32_t main_gpu) { clear_error(); g_last_output.clear(); g_cancel_requested = false; @@ -1025,7 +1089,17 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model( n_batch, n_ubatch, n_gpu_layers, - false); + n_seq_max, + use_mmap != 0, + use_mlock != 0, + flash_attn_type, + type_k, + type_v, + kv_unified, + rope_freq_base, + rope_freq_scale, + split_mode, + main_gpu); } EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model_from_url( @@ -1036,7 +1110,18 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model_from_url( int32_t n_batch, int32_t n_ubatch, int32_t n_gpu_layers, - int32_t chunk_size) { + int32_t chunk_size, + int32_t n_seq_max, + int32_t use_mmap, + int32_t use_mlock, + int32_t flash_attn_type, + int32_t type_k, + int32_t type_v, + int32_t kv_unified, + double rope_freq_base, + double rope_freq_scale, + int32_t split_mode, + int32_t main_gpu) { clear_error(); g_last_output.clear(); g_cancel_requested = false; @@ -1102,7 +1187,17 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model_from_url( n_batch, n_ubatch, n_gpu_layers, - false); + n_seq_max, + use_mmap != 0, + use_mlock != 0, 
+ flash_attn_type, + type_k, + type_v, + kv_unified, + rope_freq_base, + rope_freq_scale, + split_mode, + main_gpu); if (unlink(fetch_file_path.c_str()) != 0 && errno != ENOENT) { // best-effort cleanup of temporary fetch path }