leehack
diff --git a/‎AGENTS.md‎
Lines changed: 24 additions & 0 deletions b/‎AGENTS.md‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 9 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎doc/webgpu_bridge.md‎
Lines changed: 4 additions & 4 deletions b/‎doc/webgpu_bridge.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎example/chat_app/web/index.html‎
Lines changed: 2 additions & 2 deletions b/‎example/chat_app/web/index.html‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎lib/src/backends/llama_cpp/load_param_helpers.dart‎
Lines changed: 7 additions & 30 deletions b/‎lib/src/backends/llama_cpp/load_param_helpers.dart‎
Lines changed: 7 additions & 30 deletions
diff --git a/‎lib/src/backends/webgpu/interop.dart‎
Lines changed: 9 additions & 0 deletions b/‎lib/src/backends/webgpu/interop.dart‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎lib/src/backends/webgpu/webgpu_backend.dart‎
Lines changed: 25 additions & 0 deletions b/‎lib/src/backends/webgpu/webgpu_backend.dart‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎lib/src/core/models/config/llama_cpp_param_values.dart‎
Lines changed: 38 additions & 0 deletions b/‎lib/src/core/models/config/llama_cpp_param_values.dart‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎scripts/fetch_webgpu_bridge_assets.sh‎
Lines changed: 3 additions & 3 deletions b/‎scripts/fetch_webgpu_bridge_assets.sh‎
Lines changed: 3 additions & 3 deletions
@@ -25,6 +25,30 @@ dart pub global run coverage:format_coverage --lcov --in=coverage/test --out=cov
 dart run tool/testing/check_lcov_threshold.dart coverage/lcov.info 70
 ```
 
+### Local Chat App Web E2E
+Use the real chat app path for WebGPU bridge validation after bridge/runtime
+updates. This catches issues that direct bridge probes miss.
+
+```bash
+cd example/chat_app
+flutter build web --base-href=/example/chat_app/build/web/
+cd ../..
+python3 tool/testing/serve_static_with_headers.py --directory . --port 7358
+# In a second terminal, while the static server above is still running:
+.venv-playwright/bin/python tool/testing/playwright_chat_app_real_model_smoke.py \
+  http://127.0.0.1:7358/example/chat_app/build/web/ \
+  --model-url http://127.0.0.1:7358/example/llamadart_server/models/Qwen3.5-0.8B-Q4_K_M.gguf \
+  --expect 4
+```
+
+When serving `build/web` under a repo-root path, build with the matching
+`--base-href`; otherwise Flutter resolves `flutter_bootstrap.js` and
+`webgpu_bridge/*` from `/`. On macOS headless Chromium, use the smoke script's
+default `--browser-angle auto` or pass `--browser-angle metal`; without Metal
+ANGLE the adapter may lack `shader-f16` and llama.cpp can abort in
+`ggml-webgpu` even for CPU/gpuLayers=0 runs. For larger models such as Gemma 4,
+pass `--mem64` and a smaller `--context-size` to keep the smoke test bounded.
+
 ### CI Standards
 - `dart format --output=none --set-exit-if-changed .` checks formatting
 - `dart analyze` runs the linter
 
@@ -3,6 +3,12 @@
 * **Native runtime sync**:
   * Updated native hook pinning to `leehack/llamadart-native@b9016`,
     picking up the CUDA 12.8 Blackwell-capable native bundles.
+  * Updated default web bridge asset pinning to
+    `leehack/llama-web-bridge-assets@v0.1.13` (llama.cpp `b9016`) so
+    native and web runtimes track the same upstream revision.
+  * Picked up the bridge-side Qwen UTF-8 streaming stabilization and
+    multimodal fallback narrowing, while preserving control-token output for
+    parser consumers.
 * **Load-time tuning knobs**:
   * Added `ModelParams.useMmap` (default `true`) and
     `ModelParams.useMlock` (default `false`), wired to
@@ -28,6 +34,9 @@
     `ModelParams.ropeFrequencyScale` (both nullable) for
     context-extension overrides on `llama_context_params.rope_freq_base` /
     `rope_freq_scale`. `null` keeps the model's trained values.
+  * Forwarded native-compatible `ModelParams` load tuning knobs through the
+    WebGPU bridge path, including `maxParallelSequences`, flash attention,
+    KV-cache type, KV-unified, RoPE, split-mode, and main-GPU options.
 * **GPU device selection API**:
   * Added `ModelParams.mainGpu` and wired it to llama.cpp
     `llama_model_params.main_gpu`.
 
@@ -19,7 +19,7 @@ pipelines.
    `https://cdn.jsdelivr.net/gh/leehack/llama-web-bridge-assets@<tag>/llama_webgpu_bridge.js`
 2. Local fallback: `./webgpu_bridge/llama_webgpu_bridge.js`
 
-Default pinned tag in the example is `v0.1.10`.
+Default pinned tag in the example is `v0.1.13`.
 
 For broader browser coverage in this repository, fetched/local assets are patched
 to a universal Safari-compatible gate by default (`MIN_SAFARI_VERSION=170400`).
@@ -32,7 +32,7 @@ model bytes.
 To vendor pinned assets into local app web files:
 
 ```bash
-WEBGPU_BRIDGE_ASSETS_TAG=v0.1.10 ./scripts/fetch_webgpu_bridge_assets.sh
+WEBGPU_BRIDGE_ASSETS_TAG=v0.1.13 ./scripts/fetch_webgpu_bridge_assets.sh
 ```
 
 Optional compatibility env vars:
@@ -108,7 +108,7 @@ You can override CDN source/version before the bridge loader runs:
 ```html
 <script>
   window.__llamadartBridgeAssetsRepo = 'leehack/llama-web-bridge-assets';
-  window.__llamadartBridgeAssetsTag = 'v0.1.10';
+  window.__llamadartBridgeAssetsTag = 'v0.1.13';
 </script>
 ```
 
@@ -124,7 +124,7 @@ window.LlamaWebGpuBridge = class LlamaWebGpuBridge {
 
 `WebGpuLlamaBackend` can use these methods if present:
 
-- `loadModelFromUrl(url, { nCtx, nThreads, nGpuLayers, useCache, progressCallback })`
+- `loadModelFromUrl(url, { nCtx, nThreads, nThreadsBatch, nBatch, nUbatch, nGpuLayers, nSeqMax, flashAttention, cacheTypeK, cacheTypeV, kvUnified, ropeFrequencyBase, ropeFrequencyScale, splitMode, mainGpu, useCache, forceRemoteFetchBackend, remoteFetchChunkBytes, progressCallback })`
 - `prefetchModelToCache(url, { useCache, force, cacheName, progressCallback })`
 - `evictModelFromCache(url, { cacheName })`
 - `loadMultimodalProjector(url)`
 
@@ -215,8 +215,8 @@
     const bridgeAssetsTag =
       typeof configuredTag === 'string' && configuredTag.length > 0
         ? configuredTag
-        : 'v0.1.10';
-    const localBridgeVersion = 'v0.1.10-local-20260308a';
+        : 'v0.1.13';
+    const localBridgeVersion = 'v0.1.13-local-b9016';
     window.__llamadartBridgeLocalVersion = localBridgeVersion;
 
     const localBridgeUrl = `./webgpu_bridge/llama_webgpu_bridge.js?v=${localBridgeVersion}`;
 
@@ -4,41 +4,18 @@
 
 import '../../core/models/config/flash_attention.dart';
 import '../../core/models/config/kv_cache_type.dart';
+import '../../core/models/config/llama_cpp_param_values.dart'
+    as llama_cpp_values;
 import '../../core/models/inference/model_params.dart';
 import 'bindings.dart';
 
+export '../../core/models/config/llama_cpp_param_values.dart'
+    show resolveFlashAttention;
+
 /// Maps llamadart's [KvCacheType] enum to llama.cpp's `ggml_type`. Pure
 /// switch, no side effects.
 ggml_type ggmlTypeFor(KvCacheType type) {
-  switch (type) {
-    case KvCacheType.f16:
-      return ggml_type.GGML_TYPE_F16;
-    case KvCacheType.q8_0:
-      return ggml_type.GGML_TYPE_Q8_0;
-    case KvCacheType.q4_0:
-      return ggml_type.GGML_TYPE_Q4_0;
-  }
-}
-
-/// Resolves the user-requested [FlashAttention] given the requested KV
-/// cache types. llama.cpp refuses non-F16 KV without flash attention, so
-/// `auto` is auto-promoted to `enabled` when either KV type isn't F16.
-/// Explicit `enabled` / `disabled` are passed through unchanged.
-///
-/// Pairing this with [ModelParams]'s constructor-side ArgumentError on
-/// `(non-F16 KV, FA disabled)` ensures the only ambiguous case (`auto`)
-/// gets resolved deterministically here.
-FlashAttention resolveFlashAttention({
-  required FlashAttention requested,
-  required KvCacheType cacheTypeK,
-  required KvCacheType cacheTypeV,
-}) {
-  final wantsKvQuantization =
-      cacheTypeK != KvCacheType.f16 || cacheTypeV != KvCacheType.f16;
-  if (requested == FlashAttention.auto && wantsKvQuantization) {
-    return FlashAttention.enabled;
-  }
-  return requested;
+  return ggml_type.fromValue(llama_cpp_values.ggmlTypeValueFor(type));
 }
 
 /// Applies the user-controlled fields of [params] to a freshly-defaulted
@@ -57,7 +34,7 @@ FlashAttention applyContextParams(
   llama_context_params ctxParams,
   ModelParams params,
 ) {
-  final resolvedFlashAttn = resolveFlashAttention(
+  final resolvedFlashAttn = llama_cpp_values.resolveFlashAttention(
     requested: params.flashAttention,
     cacheTypeK: params.cacheTypeK,
     cacheTypeV: params.cacheTypeV,
 
@@ -125,6 +125,15 @@ extension type WebGpuLoadModelOptions._(JSObject _) implements JSObject {
     @JS('nBatch') int? nBatch,
     @JS('nUbatch') int? nUbatch,
     @JS('nGpuLayers') int? nGpuLayers,
+    @JS('nSeqMax') int? nSeqMax,
+    @JS('flashAttention') int? flashAttention,
+    @JS('cacheTypeK') int? cacheTypeK,
+    @JS('cacheTypeV') int? cacheTypeV,
+    @JS('kvUnified') bool? kvUnified,
+    @JS('ropeFrequencyBase') double? ropeFrequencyBase,
+    @JS('ropeFrequencyScale') double? ropeFrequencyScale,
+    @JS('splitMode') int? splitMode,
+    @JS('mainGpu') int? mainGpu,
     @JS('useCache') bool? useCache,
     @JS('forceRemoteFetchBackend') bool? forceRemoteFetchBackend,
     @JS('remoteFetchThresholdBytes') int? remoteFetchThresholdBytes,
 
@@ -9,6 +9,7 @@ import 'package:web/web.dart';
 
 import '../../core/models/chat/content_part.dart';
 import '../../core/models/config/gpu_backend.dart';
+import '../../core/models/config/llama_cpp_param_values.dart';
 import '../../core/models/config/log_level.dart';
 import '../../core/models/inference/generation_params.dart';
 import '../../core/models/inference/model_params.dart';
@@ -615,6 +616,20 @@ class WebGpuLlamaBackend
     return (nBatch: tunedBatch, nUbatch: tunedUbatch);
   }
 
+  /// Computes the integer flash-attention option sent to the WebGPU bridge.
+  ///
+  /// The requested mode is first resolved against the K/V cache types of
+  /// [params] via `resolveFlashAttention`, then converted to its integer
+  /// value with `llamaFlashAttentionTypeValueFor`.
+  int _webGpuFlashAttentionValue(ModelParams params) {
+    final effectiveMode = resolveFlashAttention(
+      requested: params.flashAttention,
+      cacheTypeK: params.cacheTypeK,
+      cacheTypeV: params.cacheTypeV,
+    );
+    return llamaFlashAttentionTypeValueFor(effectiveMode);
+  }
+
+  /// Chooses the `kvUnified` option forwarded to the WebGPU bridge.
+  ///
+  /// An explicit `params.kvUnified` always wins. When it is unset, `true` is
+  /// returned if more than one parallel sequence is requested; otherwise the
+  /// result is `null`, leaving the option unspecified.
+  bool? _webGpuKvUnifiedValue(ModelParams params) {
+    final explicitChoice = params.kvUnified;
+    if (explicitChoice != null) {
+      return explicitChoice;
+    }
+    return params.maxParallelSequences > 1 ? true : null;
+  }
+
   int _resolveSafeRequestedGpuLayers({
     required String url,
     required ModelParams params,
@@ -815,6 +830,7 @@ class WebGpuLlamaBackend
     ModelParams params, {
     Function(double progress)? onProgress,
   }) async {
+    params.validate();
     _preferMemory64Override = null;
     _forceRemoteFetchBackendOverride = null;
 
@@ -931,6 +947,15 @@ class WebGpuLlamaBackend
                 ? params.microBatchSize
                 : batchTuning.nUbatch,
             nGpuLayers: attempt.gpuLayers,
+            nSeqMax: math.max(1, params.maxParallelSequences),
+            flashAttention: _webGpuFlashAttentionValue(params),
+            cacheTypeK: ggmlTypeValueFor(params.cacheTypeK),
+            cacheTypeV: ggmlTypeValueFor(params.cacheTypeV),
+            kvUnified: _webGpuKvUnifiedValue(params),
+            ropeFrequencyBase: params.ropeFrequencyBase,
+            ropeFrequencyScale: params.ropeFrequencyScale,
+            splitMode: params.splitMode.llamaCppValue,
+            mainGpu: params.mainGpu,
             useCache: true,
             forceRemoteFetchBackend: forceRemoteFetchBackend,
             remoteFetchChunkBytes: remoteFetchChunkBytesOverride,
 
@@ -0,0 +1,38 @@
+import 'flash_attention.dart';
+import 'kv_cache_type.dart';
+
+/// Returns the integer `ggml_type` value that llama.cpp uses for the given
+/// llamadart [KvCacheType].
+int ggmlTypeValueFor(KvCacheType type) {
+  switch (type) {
+    case KvCacheType.f16:
+      return 1; // GGML_TYPE_F16
+    case KvCacheType.q4_0:
+      return 2; // GGML_TYPE_Q4_0
+    case KvCacheType.q8_0:
+      return 8; // GGML_TYPE_Q8_0
+  }
+}
+
+/// Converts a llamadart [FlashAttention] setting into the integer
+/// flash-attention option value understood by llama.cpp.
+int llamaFlashAttentionTypeValueFor(FlashAttention type) {
+  switch (type) {
+    case FlashAttention.auto:
+      return -1; // LLAMA_FLASH_ATTN_TYPE_AUTO
+    case FlashAttention.disabled:
+      return 0; // LLAMA_FLASH_ATTN_TYPE_DISABLED
+    case FlashAttention.enabled:
+      return 1; // LLAMA_FLASH_ATTN_TYPE_ENABLED
+  }
+}
+
+/// Determines the effective [FlashAttention] mode for the requested KV cache
+/// types.
+///
+/// llama.cpp refuses non-F16 KV without flash attention, so an `auto` request
+/// is promoted to `enabled` whenever [cacheTypeK] or [cacheTypeV] is not F16.
+/// An explicit `enabled` or `disabled` request is returned unchanged.
+FlashAttention resolveFlashAttention({
+  required FlashAttention requested,
+  required KvCacheType cacheTypeK,
+  required KvCacheType cacheTypeV,
+}) {
+  // Only `auto` is ever rewritten; explicit choices pass straight through.
+  if (requested != FlashAttention.auto) {
+    return requested;
+  }
+  final bothCachesF16 =
+      cacheTypeK == KvCacheType.f16 && cacheTypeV == KvCacheType.f16;
+  return bothCachesF16 ? FlashAttention.auto : FlashAttention.enabled;
+}
@@ -4,7 +4,7 @@ set -euo pipefail
 ROOT_DIR="$(git rev-parse --show-toplevel)"
 OUT_DIR="${WEBGPU_BRIDGE_OUT_DIR:-$ROOT_DIR/example/chat_app/web/webgpu_bridge}"
 ASSETS_REPO="${WEBGPU_BRIDGE_ASSETS_REPO:-leehack/llama-web-bridge-assets}"
-ASSETS_TAG="${WEBGPU_BRIDGE_ASSETS_TAG:-v0.1.10}"
+ASSETS_TAG="${WEBGPU_BRIDGE_ASSETS_TAG:-v0.1.13}"
 CDN_BASE="${WEBGPU_BRIDGE_CDN_BASE:-https://cdn.jsdelivr.net/gh/${ASSETS_REPO}@${ASSETS_TAG}}"
 PATCH_SAFARI_COMPAT="${WEBGPU_BRIDGE_PATCH_SAFARI_COMPAT:-1}"
 MIN_SAFARI_VERSION="${WEBGPU_BRIDGE_MIN_SAFARI_VERSION:-170400}"
@@ -14,7 +14,7 @@ if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
 Downloads prebuilt WebGPU bridge assets into the chat_app web directory.
 
 Default source:
-  https://cdn.jsdelivr.net/gh/leehack/llama-web-bridge-assets@v0.1.10
+  https://cdn.jsdelivr.net/gh/leehack/llama-web-bridge-assets@v0.1.13
 
 Environment variables:
   WEBGPU_BRIDGE_ASSETS_REPO   Asset repo in owner/repo format
@@ -28,7 +28,7 @@ Usage:
   ./scripts/fetch_webgpu_bridge_assets.sh
 
 Examples:
-  WEBGPU_BRIDGE_ASSETS_TAG=v0.1.10 ./scripts/fetch_webgpu_bridge_assets.sh
+  WEBGPU_BRIDGE_ASSETS_TAG=v0.1.13 ./scripts/fetch_webgpu_bridge_assets.sh
   WEBGPU_BRIDGE_ASSETS_REPO=acme/llama-web-bridge-assets WEBGPU_BRIDGE_ASSETS_TAG=v2 ./scripts/fetch_webgpu_bridge_assets.sh
 USAGE
   exit 0