diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 92752c3..894f3cc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ jobs: name: Build WebGPU Bridge (WASM) runs-on: ubuntu-latest env: - LLAMA_CPP_TAG: b8157 + LLAMA_CPP_TAG: b9016 steps: - uses: actions/checkout@v4 @@ -27,13 +27,17 @@ jobs: - name: Build bridge artifacts env: OUT_DIR: ${{ runner.temp }}/webgpu_bridge_dist + WEBGPU_BRIDGE_BUILD_MEM64: 1 run: ./scripts/build_bridge.sh - name: Verify outputs run: | test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge.js" + test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge_worker.js" test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.js" test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.wasm" + test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.js" + test -f "${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.wasm" - name: Upload bridge artifacts uses: actions/upload-artifact@v4 @@ -41,5 +45,8 @@ jobs: name: webgpu-bridge-dist path: | ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge.js + ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_bridge_worker.js ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.js ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core.wasm + ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.js + ${{ runner.temp }}/webgpu_bridge_dist/llama_webgpu_core_mem64.wasm diff --git a/.github/workflows/publish_assets.yml b/.github/workflows/publish_assets.yml index 59be4a8..c040eda 100644 --- a/.github/workflows/publish_assets.yml +++ b/.github/workflows/publish_assets.yml @@ -13,7 +13,7 @@ on: llama_cpp_tag: description: llama.cpp tag to build from required: true - default: b8157 + default: b9016 push: tags: - 'v*' @@ -21,7 +21,7 @@ on: env: ASSETS_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.assets_tag || github.ref_name }} ASSETS_REPO: ${{ github.event_name == 
'workflow_dispatch' && inputs.assets_repo || 'leehack/llama-web-bridge-assets' }} - LLAMA_CPP_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.llama_cpp_tag || 'b8157' }} + LLAMA_CPP_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.llama_cpp_tag || 'b9016' }} permissions: contents: read diff --git a/CMakeLists.txt b/CMakeLists.txt index f3f3197..3ae6fdf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,9 @@ find_package(Threads REQUIRED) set(MTMD_AUDIO_SRC "${LLAMA_CPP_DIR}/tools/mtmd/mtmd-audio.cpp") set(MTMD_AUDIO_PATCHED "${CMAKE_BINARY_DIR}/generated/mtmd-audio-single-thread.cpp") +file(GLOB LLAMADART_MTMD_MODEL_SOURCES + "${LLAMA_CPP_DIR}/tools/mtmd/models/*.cpp") + file(READ "${MTMD_AUDIO_SRC}" MTMD_AUDIO_CONTENT) string(FIND "${MTMD_AUDIO_CONTENT}" "4, // n_threads" MTMD_AUDIO_THREAD_MARKER_INDEX) if (MTMD_AUDIO_THREAD_MARKER_INDEX EQUAL -1) @@ -74,39 +77,10 @@ string(REPLACE file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/generated") file(WRITE "${MTMD_AUDIO_PATCHED}" "${MTMD_AUDIO_CONTENT}") -set(LLAMADART_MTMD_MODEL_SOURCES - "${LLAMA_CPP_DIR}/tools/mtmd/models/cogvlm.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/conformer.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/glm4v.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/internvl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/kimivl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/kimik25.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/llama4.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/llava.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/minicpmv.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/pixtral.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/qwen2vl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/qwen3vl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/siglip.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/whisper-enc.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/mobilenetv5.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/youtuvl.cpp" -) - -set(LLAMADART_MTMD_OPTIONAL_MODEL_SOURCES - 
"${LLAMA_CPP_DIR}/tools/mtmd/models/nemotron-v2-vl.cpp" - "${LLAMA_CPP_DIR}/tools/mtmd/models/paddleocr.cpp" -) - -foreach(model_source IN LISTS LLAMADART_MTMD_OPTIONAL_MODEL_SOURCES) - if (EXISTS "${model_source}") - list(APPEND LLAMADART_MTMD_MODEL_SOURCES "${model_source}") - endif() -endforeach() - add_library(llamadart_mtmd STATIC "${LLAMA_CPP_DIR}/tools/mtmd/mtmd.cpp" "${MTMD_AUDIO_PATCHED}" + "${LLAMA_CPP_DIR}/tools/mtmd/mtmd-image.cpp" "${LLAMA_CPP_DIR}/tools/mtmd/mtmd-helper.cpp" "${LLAMA_CPP_DIR}/tools/mtmd/clip.cpp" ${LLAMADART_MTMD_MODEL_SOURCES} diff --git a/README.md b/README.md index e910e94..9c4dafa 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ This repo includes a wasm build gate in: - `.github/workflows/ci.yml` -It builds against pinned `llama.cpp` tag `b8157` and uploads build artifacts. +It builds against pinned `llama.cpp` tag `b9016` and uploads build artifacts. ## Publishing @@ -93,7 +93,7 @@ Manual override example: 2. Inputs: - `assets_tag`: `v0.1.5` - `assets_repo`: `leehack/llama-web-bridge-assets` - - `llama_cpp_tag`: `b8157` + - `llama_cpp_tag`: `b9016` After publish, assets are CDN-available at: diff --git a/js/llama_webgpu_bridge.js b/js/llama_webgpu_bridge.js index 6a24bf5..7515e32 100644 --- a/js/llama_webgpu_bridge.js +++ b/js/llama_webgpu_bridge.js @@ -108,6 +108,44 @@ function parsePositiveInteger(value) { return Math.trunc(numeric); } +function parseInteger(value, fallback = 0) { + const numeric = Number(value); + if (!Number.isFinite(numeric)) { + return fallback; + } + return Math.trunc(numeric); +} + +function parseBooleanFlag(value, fallback = false) { + if (typeof value === 'boolean') { + return value; + } + if (typeof value === 'number' && Number.isFinite(value)) { + return value !== 0; + } + return fallback; +} + +function parseOptionalBooleanFlag(value) { + if (typeof value === 'boolean') { + return value ? 1 : 0; + } + if (typeof value === 'number' && Number.isFinite(value)) { + return value !== 0 ? 
1 : 0; + } + return -1; +} + +function parseEnumValue(value, allowed, fallback) { + const parsed = parseInteger(value, fallback); + return allowed.includes(parsed) ? parsed : fallback; +} + +function parsePositiveNumber(value) { + const numeric = Number(value); + return Number.isFinite(numeric) && numeric > 0 ? numeric : 0; +} + function parseTotalFromContentRangeHeader(contentRangeHeader) { if (typeof contentRangeHeader !== 'string' || contentRangeHeader.length === 0) { return 0; @@ -1329,6 +1367,17 @@ class LlamaWebGpuBridgeRuntime { this._nGpuLayers = Number.isFinite(config.nGpuLayers) ? Number(config.nGpuLayers) : -1; + this._nSeqMax = 0; + this._useMmap = false; + this._useMlock = false; + this._flashAttention = -1; + this._cacheTypeK = 1; + this._cacheTypeV = 1; + this._kvUnified = -1; + this._ropeFrequencyBase = 0; + this._ropeFrequencyScale = 0; + this._splitMode = -1; + this._mainGpu = -1; this._isSafari = isSafariUserAgent(this._config.userAgent ?? globalThis.navigator?.userAgent ?? 
''); this._coreVariant = 'uninitialized'; this._preferMemory64 = this._config.preferMemory64 !== false; @@ -1963,6 +2012,70 @@ class LlamaWebGpuBridgeRuntime { } } + _resolveNativeLoadOptions(options = {}) { + this._nSeqMax = parsePositiveInteger(options.nSeqMax); + this._useMmap = parseBooleanFlag(options.useMmap, false); + this._useMlock = parseBooleanFlag(options.useMlock, false); + this._flashAttention = parseEnumValue(options.flashAttention, [-1, 0, 1], -1); + this._cacheTypeK = parseEnumValue(options.cacheTypeK, [1, 2, 8], 1); + this._cacheTypeV = parseEnumValue(options.cacheTypeV, [1, 2, 8], 1); + this._kvUnified = parseOptionalBooleanFlag(options.kvUnified); + this._ropeFrequencyBase = parsePositiveNumber(options.ropeFrequencyBase); + this._ropeFrequencyScale = parsePositiveNumber(options.ropeFrequencyScale); + this._splitMode = parseEnumValue(options.splitMode, [0, 1, 2, 3], -1); + this._mainGpu = parseInteger(options.mainGpu, -1); + if (this._mainGpu < 0) { + this._mainGpu = -1; + } + + const wantsQuantizedKvCache = this._cacheTypeK !== 1 || this._cacheTypeV !== 1; + if (this._flashAttention === 0 && wantsQuantizedKvCache) { + throw new Error( + 'Non-F16 KV cache requires flashAttention to be auto or enabled.', + ); + } + if (this._flashAttention === -1 && wantsQuantizedKvCache) { + this._flashAttention = 1; + this._runtimeNotes.push('flash_attention:auto_enabled_for_kv_cache'); + } + if (this._kvUnified < 0 && this._nSeqMax > 1) { + this._kvUnified = 1; + this._runtimeNotes.push('kv_unified:auto_enabled_for_sequences'); + } + } + + _nativeLoadOptionValues() { + return [ + this._nSeqMax, + this._useMmap ? 1 : 0, + this._useMlock ? 
1 : 0, + this._flashAttention, + this._cacheTypeK, + this._cacheTypeV, + this._kvUnified, + this._ropeFrequencyBase, + this._ropeFrequencyScale, + this._splitMode, + this._mainGpu, + ]; + } + + _nativeLoadOptionTypes() { + return [ + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + ]; + } + async _tryLoadModelFromRemoteFetchBackend(core, url, options = {}) { if (!this._canUseRemoteFetchBackend(options)) { return { loaded: false, sizeBytes: null }; @@ -2031,6 +2144,7 @@ class LlamaWebGpuBridgeRuntime { 'number', 'number', 'number', + ...this._nativeLoadOptionTypes(), ], [ remoteFetchUrl, @@ -2041,6 +2155,7 @@ class LlamaWebGpuBridgeRuntime { this._nUbatch, this._nGpuLayers, chunkBytes, + ...this._nativeLoadOptionValues(), ], { async: true }, ), @@ -2926,6 +3041,8 @@ class LlamaWebGpuBridgeRuntime { this._nUbatch = this._nBatch; } + this._resolveNativeLoadOptions(options); + if (Number.isFinite(this._threadPoolSizeHint) && this._threadPoolSizeHint > 0) { this._pushRuntimeNote(`thread_pool_size:${this._threadPoolSizeHint}`); } @@ -2947,6 +3064,9 @@ class LlamaWebGpuBridgeRuntime { if (this._nUbatch > 0) { this._pushRuntimeNote(`n_ubatch:${this._nUbatch}`); } + if (this._nSeqMax > 0) { + this._pushRuntimeNote(`n_seq_max:${this._nSeqMax}`); + } if (isCpuModelMode && !Number.isFinite(requestedBatch) && !Number.isFinite(requestedUbatch)) { this._runtimeNotes.push('cpu_batch_tuned_default'); } @@ -3174,7 +3294,16 @@ class LlamaWebGpuBridgeRuntime { await core.ccall( 'llamadart_webgpu_load_model', 'number', - ['string', 'number', 'number', 'number', 'number', 'number', 'number'], + [ + 'string', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + ...this._nativeLoadOptionTypes(), + ], [ this._modelPath, this._nCtx, @@ -3183,6 +3312,7 @@ class LlamaWebGpuBridgeRuntime { this._nBatch, this._nUbatch, this._nGpuLayers, + ...this._nativeLoadOptionValues(), ], { async: true }, 
), @@ -3307,6 +3437,7 @@ class LlamaWebGpuBridgeRuntime { 'number', 'number', 'number', + ...this._nativeLoadOptionTypes(), ], [ reloadUrl, @@ -3317,6 +3448,7 @@ class LlamaWebGpuBridgeRuntime { this._nUbatch, candidateLayers, remoteFetchReloadChunkBytes, + ...this._nativeLoadOptionValues(), ], { async: true }, ), @@ -3326,7 +3458,16 @@ class LlamaWebGpuBridgeRuntime { await core.ccall( 'llamadart_webgpu_load_model', 'number', - ['string', 'number', 'number', 'number', 'number', 'number', 'number'], + [ + 'string', + 'number', + 'number', + 'number', + 'number', + 'number', + 'number', + ...this._nativeLoadOptionTypes(), + ], [ this._modelPath, this._nCtx, @@ -3335,6 +3476,7 @@ class LlamaWebGpuBridgeRuntime { this._nBatch, this._nUbatch, candidateLayers, + ...this._nativeLoadOptionValues(), ], { async: true }, ), @@ -4079,6 +4221,20 @@ class LlamaWebGpuBridgeRuntime { 'llamadart.webgpu.n_threads_batch': String(this._threadsBatch), 'llamadart.webgpu.n_batch': this._nBatch > 0 ? String(this._nBatch) : '', 'llamadart.webgpu.n_ubatch': this._nUbatch > 0 ? String(this._nUbatch) : '', + 'llamadart.webgpu.n_seq_max': this._nSeqMax > 0 ? String(this._nSeqMax) : '', + 'llamadart.webgpu.flash_attention': String(this._flashAttention), + 'llamadart.webgpu.cache_type_k': String(this._cacheTypeK), + 'llamadart.webgpu.cache_type_v': String(this._cacheTypeV), + 'llamadart.webgpu.kv_unified': + this._kvUnified >= 0 ? String(this._kvUnified) : '', + 'llamadart.webgpu.rope_freq_base': + this._ropeFrequencyBase > 0 ? String(this._ropeFrequencyBase) : '', + 'llamadart.webgpu.rope_freq_scale': + this._ropeFrequencyScale > 0 ? String(this._ropeFrequencyScale) : '', + 'llamadart.webgpu.split_mode': + this._splitMode >= 0 ? String(this._splitMode) : '', + 'llamadart.webgpu.main_gpu': + this._mainGpu >= 0 ? String(this._mainGpu) : '', 'llamadart.webgpu.thread_pool_size': Number.isFinite(this._threadPoolSizeHint) && this._threadPoolSizeHint > 0 ? 
String(this._threadPoolSizeHint) diff --git a/src/llama_webgpu_core.cpp b/src/llama_webgpu_core.cpp index 0bb311b..a531a2f 100644 --- a/src/llama_webgpu_core.cpp +++ b/src/llama_webgpu_core.cpp @@ -887,6 +887,17 @@ int32_t next_token_impl() { return 1; } +bool is_supported_kv_cache_type(int32_t value) { + switch (value) { + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } +} + int32_t load_model_internal( const char * model_path, int32_t n_ctx, @@ -895,12 +906,29 @@ int32_t load_model_internal( int32_t n_batch, int32_t n_ubatch, int32_t n_gpu_layers, - bool use_mmap) { + int32_t n_seq_max, + bool use_mmap, + bool use_mlock, + int32_t flash_attn_type, + int32_t type_k, + int32_t type_v, + int32_t kv_unified, + double rope_freq_base, + double rope_freq_scale, + int32_t split_mode, + int32_t main_gpu) { llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; mparams.use_mmap = use_mmap; - mparams.use_mlock = false; + mparams.use_mlock = use_mlock; mparams.vocab_only = false; + if (split_mode >= LLAMA_SPLIT_MODE_NONE && + split_mode <= LLAMA_SPLIT_MODE_TENSOR) { + mparams.split_mode = static_cast<llama_split_mode>(split_mode); + } + if (main_gpu >= 0) { + mparams.main_gpu = main_gpu; + } g_state.model = llama_model_load_from_file(model_path, mparams); if (g_state.model == nullptr) { @@ -913,6 +941,10 @@ int32_t load_model_internal( cparams.n_ctx = static_cast<uint32_t>(n_ctx); } + if (n_seq_max > 0) { + cparams.n_seq_max = static_cast<uint32_t>(n_seq_max); + } + if (n_threads > 0) { cparams.n_threads = n_threads; } @@ -939,6 +971,27 @@ int32_t load_model_internal( cparams.n_ubatch = std::min(cparams.n_batch, 512U); } + if (flash_attn_type >= LLAMA_FLASH_ATTN_TYPE_AUTO && + flash_attn_type <= LLAMA_FLASH_ATTN_TYPE_ENABLED) { + cparams.flash_attn_type = + static_cast<llama_flash_attn_type>(flash_attn_type); + } + if (is_supported_kv_cache_type(type_k)) { + cparams.type_k = static_cast<ggml_type>(type_k); + } + if 
(is_supported_kv_cache_type(type_v)) { + cparams.type_v = static_cast<ggml_type>(type_v); + } + if (kv_unified >= 0) { + cparams.kv_unified = kv_unified != 0; + } + if (rope_freq_base > 0.0) { + cparams.rope_freq_base = static_cast<float>(rope_freq_base); + } + if (rope_freq_scale > 0.0) { + cparams.rope_freq_scale = static_cast<float>(rope_freq_scale); + } + const bool enable_gpu_ops = n_gpu_layers > 0; g_model_uses_gpu_ops = enable_gpu_ops; cparams.offload_kqv = enable_gpu_ops; @@ -1005,7 +1058,18 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model( int32_t n_threads_batch, int32_t n_batch, int32_t n_ubatch, - int32_t n_gpu_layers) { + int32_t n_gpu_layers, + int32_t n_seq_max, + int32_t use_mmap, + int32_t use_mlock, + int32_t flash_attn_type, + int32_t type_k, + int32_t type_v, + int32_t kv_unified, + double rope_freq_base, + double rope_freq_scale, + int32_t split_mode, + int32_t main_gpu) { clear_error(); g_last_output.clear(); g_cancel_requested = false; @@ -1025,7 +1089,17 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model( n_batch, n_ubatch, n_gpu_layers, - false); + n_seq_max, + use_mmap != 0, + use_mlock != 0, + flash_attn_type, + type_k, + type_v, + kv_unified, + rope_freq_base, + rope_freq_scale, + split_mode, + main_gpu); } EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model_from_url( @@ -1036,7 +1110,18 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model_from_url( int32_t n_batch, int32_t n_ubatch, int32_t n_gpu_layers, - int32_t chunk_size) { + int32_t chunk_size, + int32_t n_seq_max, + int32_t use_mmap, + int32_t use_mlock, + int32_t flash_attn_type, + int32_t type_k, + int32_t type_v, + int32_t kv_unified, + double rope_freq_base, + double rope_freq_scale, + int32_t split_mode, + int32_t main_gpu) { clear_error(); g_last_output.clear(); g_cancel_requested = false; @@ -1102,7 +1187,17 @@ EMSCRIPTEN_KEEPALIVE int32_t llamadart_webgpu_load_model_from_url( n_batch, n_ubatch, n_gpu_layers, - false); + n_seq_max, + use_mmap != 0, + use_mlock != 0, 
+ flash_attn_type, + type_k, + type_v, + kv_unified, + rope_freq_base, + rope_freq_scale, + split_mode, + main_gpu); if (unlink(fetch_file_path.c_str()) != 0 && errno != ENOENT) { // best-effort cleanup of temporary fetch path }