From e94d13cf6959a8b8b16f3be90a56e996aba07f96 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Wed, 8 Apr 2026 09:19:57 -0700
Subject: [PATCH 01/10] Update register tiling matmul to use f32 accumulation

---
 .../ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl   | 12 ++++++------
 .../wgsl-shaders/mul_mat_subgroup_matrix.wgsl        |  3 +++
 2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl
index b1da421a691..ee37e6d249c 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl
@@ -4,14 +4,14 @@ enable f16;
 #include "mul_mat_decls.tmpl"
 
 #ifdef VEC
-fn store_val(acc: array<array<f16, TILE_N>, TILE_M>, tn: u32, tm: u32) -> vec4<f32> {
-    return vec4<f32>(f32(acc[tm][tn]), f32(acc[tm + 1][tn]), f32(acc[tm + 2][tn]), f32(acc[tm + 3][tn]));
+fn store_val(acc: array<array<f32, TILE_N>, TILE_M>, tn: u32, tm: u32) -> vec4<f32> {
+    return vec4<f32>(acc[tm][tn], acc[tm + 1][tn], acc[tm + 2][tn], acc[tm + 3][tn]);
 }
 #endif
 
 #ifdef SCALAR
-fn store_val(acc: array<array<f16, TILE_N>, TILE_M>, tn: u32, tm: u32) -> f32 {
-    return f32(acc[tm][tn]);
+fn store_val(acc: array<array<f32, TILE_N>, TILE_M>, tn: u32, tm: u32) -> f32 {
+    return acc[tm][tn];
 }
 #endif
 
@@ -98,7 +98,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
     let offset_m = wg_m * WORKGROUP_SIZE_M * TILE_M;
     let offset_n = wg_n * WORKGROUP_SIZE_N * TILE_N;
 
-    var acc: array<array<f16, TILE_N>, TILE_M>;
+    var acc: array<array<f32, TILE_N>, TILE_M>;
 
     for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) {
 
@@ -122,7 +122,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                 let src1_idx = src1_n * TILE_K + k_inner;
                 let src1_val = shmem[TILE_SRC0_SHMEM + src1_idx];
                 for (var tm = 0u; tm < TILE_M; tm++) {
-                      acc[tm][tn] += src0_tile[tm] * src1_val;
+                      acc[tm][tn] += f32(src0_tile[tm]) * f32(src1_val);
                 }
             }
         }
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl
index 9f9ef279f29..4151ce430b0 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl
@@ -6,6 +6,9 @@ enable chromium_experimental_subgroup_matrix;
 #include "common_decls.tmpl"
 #include "mul_mat_decls.tmpl"
 
+// TODO: this shader path does not work with some models like qwen2.5 on Metal devices; f16 accumulation causes NaNs.
+// See https://github.com/ggml-org/llama.cpp/issues/21602
+
 #ifdef VEC
 fn store_dst(shmem_idx: u32, dst_idx: u32) {
     dst[dst_idx] = vec4<f32>(

From af4c1d516ffc6c8d5e394d7b5439ffe627aa0a03 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Wed, 8 Apr 2026 11:41:12 -0700
Subject: [PATCH 02/10] fix profiling code

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index b8df0f4dd05..fff91f3e1c3 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -535,7 +535,7 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) {
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
 static void ggml_backend_webgpu_collect_profile_futures(webgpu_global_context &             ctx,
-                                                        const std::vector<webgpu_command> & commands,
+                                                        const std::vector<webgpu_encoded_op> & commands,
                                                         std::vector<wgpu::FutureWaitInfo> & futures) {
     for (const auto & command : commands) {
         auto label   = command.pipeline_name;

From ac5267d4421147b71a8ae2cd1f611bb58bc44fff Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Thu, 9 Apr 2026 13:21:24 -0700
Subject: [PATCH 03/10] Fix register tiling matmul for Chrome, I'm blaming Dawn

---
 .../wgsl-shaders/mul_mat_decls.tmpl           | 35 ++++++++-----------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
index ea91c13468f..d74f87d89fb 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
@@ -502,12 +502,6 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
         let d = load_src0_f16_at(block_byte_base);
         let dmin = load_src0_f16_at(block_byte_base + 2u);
 
-        // Load packed scales
-        var scale_vals: array<u32, 3>;
-        for (var i: u32 = 0u; i < 3u; i++) {
-            scale_vals[i] = load_src0_u32_at(block_byte_base + 4u + 4u * i);
-        }
-
         // Map k_in_block to loop structure:
         // Outer loop over 64-element groups (alternating q_b_idx)
         // Inner loop over 2 shifts per group
@@ -523,15 +517,17 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
         var sc: u32;
         var mn: u32;
 
+        let scale_base = block_byte_base + 4u;
+
         if (is < 4u) {
-            let sc_byte = get_byte(scale_vals[is / 4u], is % 4u);
-            let min_byte = get_byte(scale_vals[(is + 4u) / 4u], is % 4u);
+            let sc_byte = get_byte(load_src0_u32_at(scale_base), is % 4u);
+            let min_byte = get_byte(load_src0_u32_at(scale_base + 4), is % 4u);
             sc = sc_byte & 63u;
             mn = min_byte & 63u;
         } else {
-            let sc_min_lo = get_byte(scale_vals[(is + 4u) / 4u], (is + 4u) % 4u);
-            let sc_hi = get_byte(scale_vals[(is - 4u) / 4u], (is - 4u) % 4u);
-            let min_hi = get_byte(scale_vals[is / 4u], is % 4u);
+            let sc_min_lo = get_byte(load_src0_u32_at(scale_base + 8), (is + 4u) % 4u);
+            let sc_hi = get_byte(load_src0_u32_at(scale_base), (is - 4u) % 4u);
+            let min_hi = get_byte(load_src0_u32_at(scale_base + 4), is % 4u);
 
             sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
             mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);
@@ -578,11 +574,6 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
         let d = load_src0_f16_at(block_byte_base);
         let dmin = load_src0_f16_at(block_byte_base + 2u);
 
-        // Load packed scales
-        var scale_vals: array<u32, 3>;
-        for (var i: u32 = 0u; i < 3u; i++) {
-            scale_vals[i] = load_src0_u32_at(block_byte_base + 4u + 4u * i);
-        }
 
         // The original loop processes elements in groups of 64
         // Each group of 64: q_b_idx cycles through [0,32,64,96], shift cycles [0,4]
@@ -603,15 +594,17 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
         var sc: u32;
         var mn: u32;
 
+        let scale_base = block_byte_base + 4u;
+
         if (is < 4u) {
-            let sc_byte = get_byte(scale_vals[is / 4u], is % 4u);
-            let min_byte = get_byte(scale_vals[(is + 4u) / 4u], is % 4u);
+            let sc_byte = get_byte(load_src0_u32_at(scale_base), is % 4u);
+            let min_byte = get_byte(load_src0_u32_at(scale_base + 4), is % 4u);
             sc = sc_byte & 63u;
             mn = min_byte & 63u;
         } else {
-            let sc_min_lo = get_byte(scale_vals[(is + 4u) / 4u], (is + 4u) % 4u);
-            let sc_hi = get_byte(scale_vals[(is - 4u) / 4u], (is - 4u) % 4u);
-            let min_hi = get_byte(scale_vals[is / 4u], is % 4u);
+            let sc_min_lo = get_byte(load_src0_u32_at(scale_base + 8), (is + 4u) % 4u);
+            let sc_hi = get_byte(load_src0_u32_at(scale_base), (is - 4u) % 4u);
+            let min_hi = get_byte(load_src0_u32_at(scale_base + 4), is % 4u);
 
             sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
             mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);

From 4edf91b8eb23dd1ea9594d7b353d9d0776e5aa6a Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Fri, 10 Apr 2026 19:21:06 -0700
Subject: [PATCH 04/10] Update batch tuning value for iOS

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index fff91f3e1c3..b077296267f 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -79,7 +79,7 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim
 
 /* Constants */
 
-#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 32u
+#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 64u
 #define WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN      10u
 #define WEBGPU_RUNTIME_WAIT_TIMEOUT_MS           30000u
 #define WEBGPU_RUNTIME_WAIT_TIMEOUT_NS           (WEBGPU_RUNTIME_WAIT_TIMEOUT_MS * 1e6)
@@ -437,34 +437,25 @@ static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status,
 }
 
 #ifdef __EMSCRIPTEN__
-// iOS browsers seem to have very strict limits on the number of in-flight GPU commands, so we need to throttle to avoid failures.
 EM_JS(int, ggml_webgpu_is_ios_browser, (), {
     const ua = navigator.userAgent;
     return (ua.includes('iPhone') || ua.includes('iPad')) ? 1 : 0;
 });
 #endif
 
-static uint32_t ggml_backend_webgpu_get_max_inflight_batches(const wgpu::AdapterInfo & info) {
+// TODO: these next two functions may want tuning across different platforms and workloads.
+static uint32_t ggml_backend_webgpu_get_max_inflight_batches() {
 #ifdef __EMSCRIPTEN__
+    // iOS has very strict limits on the number of in-flight GPU commands,
+    // so we need to throttle to avoid failures.
     if (ggml_webgpu_is_ios_browser()) {
         return 1;
     }
-#else
-    GGML_UNUSED(info);
 #endif
-
     return UINT32_MAX;
 }
 
-static uint32_t ggml_backend_webgpu_get_command_submit_batch_size(const wgpu::AdapterInfo & info) {
-#ifdef __EMSCRIPTEN__
-    if (ggml_webgpu_is_ios_browser()) {
-        return 16;
-    }
-#else
-    GGML_UNUSED(info);
-#endif
-
+static uint32_t ggml_backend_webgpu_get_command_submit_batch_size() {
     return WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
 }
 

From 354cb5c021048676691f39e4211e344cbcfdfd5f Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Fri, 10 Apr 2026 19:29:49 -0700
Subject: [PATCH 05/10] compile fix

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index b077296267f..79ffbea63d8 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -3428,8 +3428,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
     }
 #endif
     ctx->webgpu_global_ctx->adapter.GetInfo(&info);
-    ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size(info);
-    ctx->webgpu_global_ctx->max_inflight_batches      = ggml_backend_webgpu_get_max_inflight_batches(info);
+    ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size();
+    ctx->webgpu_global_ctx->max_inflight_batches      = ggml_backend_webgpu_get_max_inflight_batches();
     wgpu::SupportedFeatures features;
     ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
     // we require f16 support

From 0928d310729900635b6295e691ff3e2e862a7055 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Sun, 12 Apr 2026 20:48:32 -0700
Subject: [PATCH 06/10] Fix use of new load function

---
 .../wgsl-shaders/mul_mat_decls.tmpl           | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
index a3b17b69878..56a76a6e6c4 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
@@ -520,14 +520,14 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
         let scale_base = block_byte_base + 4u;
 
         if (is < 4u) {
-            let sc_byte = get_byte(load_src0_u32_at(scale_base), is % 4u);
-            let min_byte = get_byte(load_src0_u32_at(scale_base + 4), is % 4u);
+            let sc_byte = get_byte(load_u32_at(&src0, scale_base), is % 4u);
+            let min_byte = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u);
             sc = sc_byte & 63u;
             mn = min_byte & 63u;
         } else {
-            let sc_min_lo = get_byte(load_src0_u32_at(scale_base + 8), (is + 4u) % 4u);
-            let sc_hi = get_byte(load_src0_u32_at(scale_base), (is - 4u) % 4u);
-            let min_hi = get_byte(load_src0_u32_at(scale_base + 4), is % 4u);
+            let sc_min_lo = get_byte(load_u32_at(&src0, scale_base + 8), (is + 4u) % 4u);
+            let sc_hi = get_byte(load_u32_at(&src0, scale_base), (is - 4u) % 4u);
+            let min_hi = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u);
 
             sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
             mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);
@@ -597,14 +597,14 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
         let scale_base = block_byte_base + 4u;
 
         if (is < 4u) {
-            let sc_byte = get_byte(load_src0_u32_at(scale_base), is % 4u);
-            let min_byte = get_byte(load_src0_u32_at(scale_base + 4), is % 4u);
+            let sc_byte = get_byte(load_u32_at(&src0, scale_base), is % 4u);
+            let min_byte = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u);
             sc = sc_byte & 63u;
             mn = min_byte & 63u;
         } else {
-            let sc_min_lo = get_byte(load_src0_u32_at(scale_base + 8), (is + 4u) % 4u);
-            let sc_hi = get_byte(load_src0_u32_at(scale_base), (is - 4u) % 4u);
-            let min_hi = get_byte(load_src0_u32_at(scale_base + 4), is % 4u);
+            let sc_min_lo = get_byte(load_u32_at(&src0, scale_base + 8), (is + 4u) % 4u);
+            let sc_hi = get_byte(load_u32_at(&src0, scale_base), (is - 4u) % 4u);
+            let min_hi = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u);
 
             sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
             mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);

From 0dfdf15717439626205da8473d7da79a141ef431 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Mon, 13 Apr 2026 11:17:28 -0700
Subject: [PATCH 07/10] Move to a single query set for GPU profiling

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 341 ++++++++++++---------------
 1 file changed, 154 insertions(+), 187 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index a64f0774995..b9bac216b87 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -73,8 +73,8 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim
 #endif  // GGML_WEBGPU_CPU_PROFILE
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
-#    define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS       32
-#    define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16  // e.g. enough for two timestamps
+#    define WEBGPU_MAX_PROFILE_QUERY_COUNT        4096u
+#    define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES (WEBGPU_MAX_PROFILE_QUERY_COUNT * sizeof(uint64_t))
 #endif
 
 /* Constants */
@@ -159,75 +159,10 @@ struct webgpu_param_arena {
     ~webgpu_param_arena() { this->cleanup(); }
 };
 
-#ifdef GGML_WEBGPU_GPU_PROFILE
-struct webgpu_gpu_profile_bufs {
-    wgpu::Buffer   host_buf;
-    wgpu::Buffer   dev_buf;
-    wgpu::QuerySet query_set;
-};
-
-// Holds a pool of parameter buffers for WebGPU operations
-struct webgpu_gpu_profile_buf_pool {
-    std::vector<webgpu_gpu_profile_bufs> free;
-
-    std::mutex mutex;
-
-    std::condition_variable cv;
-
-    void init(wgpu::Device      device,
-              int               num_bufs,
-              size_t            buf_size,
-              wgpu::BufferUsage dev_buf_usage,
-              wgpu::BufferUsage host_buf_usage) {
-        for (int i = 0; i < num_bufs; i++) {
-            wgpu::Buffer host_buf;
-            wgpu::Buffer dev_buf;
-            ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_profile_buf");
-            ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_profile_buf");
-            // Create a query set for 2 timestamps
-            wgpu::QuerySetDescriptor ts_query_set_desc = {};
-
-            ts_query_set_desc.type      = wgpu::QueryType::Timestamp;
-            ts_query_set_desc.count     = 2;
-            wgpu::QuerySet ts_query_set = device.CreateQuerySet(&ts_query_set_desc);
-
-            free.push_back({ host_buf, dev_buf, ts_query_set });
-        }
-    }
-
-    webgpu_gpu_profile_bufs alloc_bufs() {
-        std::unique_lock<std::mutex> lock(mutex);
-        cv.wait(lock, [this] { return !free.empty(); });
-        webgpu_gpu_profile_bufs bufs = free.back();
-        free.pop_back();
-        return bufs;
-    }
-
-    void free_bufs(std::vector<webgpu_gpu_profile_bufs> bufs) {
-        std::lock_guard<std::mutex> lock(mutex);
-        free.insert(free.end(), bufs.begin(), bufs.end());
-        cv.notify_all();
-    }
-
-    void cleanup() {
-        std::lock_guard<std::mutex> lock(mutex);
-        for (auto & bufs : free) {
-            bufs.host_buf.Destroy();
-            bufs.dev_buf.Destroy();
-            bufs.query_set.Destroy();
-        }
-        free.clear();
-    }
-
-    ~webgpu_gpu_profile_buf_pool() { this->cleanup(); }
-};
-#endif
-
 struct webgpu_encoded_op {
     uint32_t num_kernels = 0;
 #ifdef GGML_WEBGPU_GPU_PROFILE
-    webgpu_gpu_profile_bufs timestamp_query_bufs;
-    std::string             pipeline_name;
+    std::string pipeline_name;
 #endif
 };
 
@@ -256,7 +191,7 @@ struct webgpu_global_context_struct {
     webgpu_capabilities  capabilities;
     // Shared buffer to move data from device to host
     wgpu::Buffer         get_tensor_staging_buf;
-    // Global mutex for pipeline and staging buffer, will be refactored to exclude pipeline caches.
+    // Global mutex for get_tensor
     std::recursive_mutex mutex;
 
     wgpu::Buffer    memset_params_buf;
@@ -272,8 +207,6 @@ struct webgpu_global_context_struct {
 #ifdef GGML_WEBGPU_GPU_PROFILE
     // Profiling: per-shader GPU time in ms
     std::unordered_map<std::string, double> shader_gpu_time_ms;
-    // Profiling: pool of timestamp query buffers (one per operation)
-    webgpu_gpu_profile_buf_pool             timestamp_query_buf_pool;
 #endif
 
 #ifdef GGML_WEBGPU_DEBUG
@@ -317,6 +250,38 @@ struct webgpu_context_struct {
     wgpu::Buffer       set_rows_host_error_buf;
 
     size_t memset_bytes_per_thread;
+
+#ifdef GGML_WEBGPU_GPU_PROFILE
+    wgpu::Buffer   profile_timestamp_dev_buf;
+    wgpu::Buffer   profile_timestamp_host_buf;
+    wgpu::QuerySet profile_timestamp_query_set;
+    uint32_t       profile_timestamp_query_count = 0;
+#endif
+
+    ~webgpu_context_struct() {
+#ifdef GGML_WEBGPU_GPU_PROFILE
+        if (this->profile_timestamp_host_buf) {
+            this->profile_timestamp_host_buf.Destroy();
+            this->profile_timestamp_host_buf = nullptr;
+        }
+        if (this->profile_timestamp_dev_buf) {
+            this->profile_timestamp_dev_buf.Destroy();
+            this->profile_timestamp_dev_buf = nullptr;
+        }
+        if (this->profile_timestamp_query_set) {
+            this->profile_timestamp_query_set.Destroy();
+            this->profile_timestamp_query_set = nullptr;
+        }
+#endif
+        if (this->set_rows_host_error_buf) {
+            this->set_rows_host_error_buf.Destroy();
+            this->set_rows_host_error_buf = nullptr;
+        }
+        if (this->set_rows_dev_error_buf) {
+            this->set_rows_dev_error_buf.Destroy();
+            this->set_rows_dev_error_buf = nullptr;
+        }
+    }
 };
 
 typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
@@ -399,24 +364,6 @@ static void ggml_webgpu_create_buffer(wgpu::Device &    device,
 
 /** WebGPU Actions */
 
-#ifdef GGML_WEBGPU_GPU_PROFILE
-static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context &             ctx,
-                                                     std::vector<wgpu::FutureWaitInfo> & futures) {
-    if (futures.empty()) {
-        return;
-    }
-
-    constexpr size_t max_futures_per_wait = 64;
-
-    while (!futures.empty()) {
-        ctx->instance.WaitAny(std::min(max_futures_per_wait, futures.size()), futures.data(), UINT64_MAX);
-        futures.erase(std::remove_if(futures.begin(), futures.end(),
-                                     [](const wgpu::FutureWaitInfo & info) { return info.completed; }),
-                      futures.end());
-    }
-}
-#endif
-
 template <typename T>
 static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status,
                                                   T                callback_status,
@@ -528,36 +475,8 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) {
 }
 #endif
 
-#ifdef GGML_WEBGPU_GPU_PROFILE
-static void ggml_backend_webgpu_collect_profile_futures(webgpu_global_context &                ctx,
-                                                        const std::vector<webgpu_encoded_op> & commands,
-                                                        std::vector<wgpu::FutureWaitInfo> &    futures) {
-    for (const auto & command : commands) {
-        auto label   = command.pipeline_name;
-        auto ts_bufs = command.timestamp_query_bufs;
-
-        wgpu::Future f = ts_bufs.host_buf.MapAsync(
-            wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
-            [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
-                if (status != wgpu::MapAsyncStatus::Success) {
-                    GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str());
-                } else {
-                    const uint64_t * ts_data    = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange();
-                    // WebGPU timestamps are in ns; convert to ms
-                    double           elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6;
-                    ctx->shader_gpu_time_ms[label] += elapsed_ms;
-                }
-                // We can't unmap in here due to WebGPU reentrancy limitations.
-                ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs });
-            });
-        futures.push_back({ f });
-    }
-}
-#endif
-
 static webgpu_encoded_op ggml_backend_webgpu_build_multi(
-    webgpu_global_context &                                ctx,
-    webgpu_param_arena &                                   param_arena,
+    webgpu_context &                                       ctx,
     wgpu::CommandEncoder &                                 encoder,
     const std::vector<webgpu_pipeline> &                   pipelines,
     const std::vector<std::vector<uint32_t>> &             params_list,
@@ -574,37 +493,36 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(
 
     for (size_t i = 0; i < pipelines.size(); i++) {
         const size_t param_size   = params_list[i].size() * sizeof(uint32_t);
-        const size_t param_offset = param_arena.alloc_slot(param_size);
+        const size_t param_offset = ctx->param_arena.alloc_slot(param_size);
 
         std::vector<wgpu::BindGroupEntry> entries            = bind_group_entries_list[i];
         uint32_t                          params_binding_num = entries.size();
         entries.push_back({ .binding = params_binding_num,
-                            .buffer  = param_arena.buffer,
+                            .buffer  = ctx->param_arena.buffer,
                             .offset  = param_offset,
-                            .size    = param_arena.slot_size });
+                            .size    = ctx->param_arena.slot_size });
 
         wgpu::BindGroupDescriptor bind_group_desc;
         bind_group_desc.layout     = pipelines[i].pipeline.GetBindGroupLayout(0);
         bind_group_desc.entryCount = entries.size();
         bind_group_desc.entries    = entries.data();
         bind_group_desc.label      = pipelines[i].name.c_str();
-        bind_groups.push_back(ctx->device.CreateBindGroup(&bind_group_desc));
+        bind_groups.push_back(ctx->global_ctx->device.CreateBindGroup(&bind_group_desc));
         param_offsets.push_back(param_offset);
     }
 
     for (size_t i = 0; i < param_offsets.size(); i++) {
-        ctx->queue.WriteBuffer(param_arena.buffer, param_offsets[i], params_list[i].data(),
-                               params_list[i].size() * sizeof(uint32_t));
+        ctx->global_ctx->queue.WriteBuffer(ctx->param_arena.buffer, param_offsets[i], params_list[i].data(),
+                                           params_list[i].size() * sizeof(uint32_t));
     }
 #ifdef GGML_WEBGPU_GPU_PROFILE
-    webgpu_gpu_profile_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs();
-    if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) {
-        ts_bufs.host_buf.Unmap();
-    }
+    GGML_ASSERT(ctx->profile_timestamp_query_count + 2 <= WEBGPU_MAX_PROFILE_QUERY_COUNT);
+    const uint32_t query_begin = ctx->profile_timestamp_query_count++;
+    const uint32_t query_end   = ctx->profile_timestamp_query_count++;
 
-    wgpu::PassTimestampWrites   ts_writes = { .querySet                  = ts_bufs.query_set,
-                                              .beginningOfPassWriteIndex = 0,
-                                              .endOfPassWriteIndex       = 1 };
+    wgpu::PassTimestampWrites   ts_writes = { .querySet                  = ctx->profile_timestamp_query_set,
+                                              .beginningOfPassWriteIndex = query_begin,
+                                              .endOfPassWriteIndex       = query_end };
     wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };
     wgpu::ComputePassEncoder    pass      = encoder.BeginComputePass(&pass_desc);
 #else
@@ -618,23 +536,19 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(
     pass.End();
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
-    encoder.ResolveQuerySet(ts_bufs.query_set, 0, 2, ts_bufs.dev_buf, 0);
-    encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize());
-    result.timestamp_query_bufs = ts_bufs;
-    result.pipeline_name        = pipelines.front().name;
+    result.pipeline_name = pipelines.front().name;
 #endif
     return result;
 }
 
-static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_global_context &           ctx,
-                                                   webgpu_param_arena &              param_arena,
+static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_context &                  ctx,
                                                    wgpu::CommandEncoder &            encoder,
                                                    webgpu_pipeline &                 pipeline,
                                                    std::vector<uint32_t>             params,
                                                    std::vector<wgpu::BindGroupEntry> bind_group_entries,
                                                    uint32_t                          wg_x,
                                                    uint32_t                          wg_y = 1) {
-    return ggml_backend_webgpu_build_multi(ctx, param_arena, encoder,
+    return ggml_backend_webgpu_build_multi(ctx, encoder,
                                            {
                                                pipeline
     },
@@ -642,6 +556,30 @@ static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_global_context &
                                            { { wg_x, wg_y } });
 }
 
+#ifdef GGML_WEBGPU_GPU_PROFILE
+static void ggml_backend_webgpu_collect_profile_results(webgpu_context &                   ctx,
+                                                        const std::vector<std::string> & pipeline_names) {
+    if (pipeline_names.empty()) {
+        return;
+    }
+
+    const size_t mapped_size = ctx->profile_timestamp_query_count * sizeof(uint64_t);
+    GGML_ASSERT(ctx->profile_timestamp_query_count == 2 * pipeline_names.size());
+
+    ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->profile_timestamp_host_buf, wgpu::MapMode::Read, 0,
+                                   mapped_size);
+    const uint64_t * ts_data = (const uint64_t *) ctx->profile_timestamp_host_buf.GetConstMappedRange(0, mapped_size);
+
+    for (size_t i = 0; i < pipeline_names.size(); ++i) {
+        // WebGPU timestamps are in ns; convert to ms.
+        const double elapsed_ms = double(ts_data[2 * i + 1] - ts_data[2 * i]) * 1e-6;
+        ctx->global_ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms;
+    }
+
+    ctx->profile_timestamp_host_buf.Unmap();
+}
+#endif
+
 static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx,
                                               wgpu::Buffer &          buf,
                                               uint32_t                value,
@@ -829,7 +767,7 @@ static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context &       ctx,
     };
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_set(webgpu_context &       ctx,
@@ -895,7 +833,7 @@ static webgpu_encoded_op ggml_webgpu_set(webgpu_context &       ctx,
                         .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_pad(webgpu_context &       ctx,
@@ -953,7 +891,7 @@ static webgpu_encoded_op ggml_webgpu_pad(webgpu_context &       ctx,
     };
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context &       ctx,
@@ -1015,7 +953,7 @@ static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context &       ctx,
 
     const uint32_t wg_x = CEIL_DIV((uint32_t) src1->ne[0], decisions->wg_size);
     const uint32_t wg_y = (uint32_t) (dst->ne[2] * dst->ne[3]);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x, wg_y);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y);
 }
 
 static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context &       ctx,
@@ -1072,7 +1010,7 @@ static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context &       ctx,
 
     const uint32_t wg_x = CEIL_DIV((uint32_t) src0->ne[1], decisions->block_size);
     const uint32_t wg_y = token_tiles * (uint32_t) dst->ne[2];
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x, wg_y);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y);
 }
 
 static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context &       ctx,
@@ -1158,7 +1096,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context &       ctx,
          .size    = ggml_webgpu_tensor_binding_size(ctx, dst)  }
     };
 
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, h, n_seqs);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, h, n_seqs);
 }
 
 static std::optional<webgpu_encoded_op> ggml_webgpu_set_rows(webgpu_context &       ctx,
@@ -1228,7 +1166,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_set_rows(webgpu_context &
         threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3];
     }
     uint32_t wg_x = CEIL_DIV(threads, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x, 1);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, 1);
 }
 
 // Workgroup size is a common constant
@@ -1295,7 +1233,7 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context &       ctx,
     uint32_t total_threads  = float_parallel ? blocks_per_row * total_rows : total_rows;
     uint32_t wg_x           = CEIL_DIV(total_threads, decisions->wg_size);
 
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context &       ctx,
@@ -1441,7 +1379,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context &       ctx,
         compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
     }
 
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x, wg_y);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y);
 }
 
 static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context &       ctx,
@@ -1597,7 +1535,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context &       ctx,
     entries_list.push_back(std::move(main_entries));
     workgroups_list.push_back({ wg_x, wg_y });
 
-    return ggml_backend_webgpu_build_multi(ctx->global_ctx, ctx->param_arena, encoder, pipelines, params_list,
+    return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list,
                                            entries_list, workgroups_list);
 }
 
@@ -1923,11 +1861,11 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context &       ctx,
             workgroups_list.push_back({ (uint32_t) nrows, 1u });
         }
 
-        return ggml_backend_webgpu_build_multi(ctx->global_ctx, ctx->param_arena, encoder, pipelines, params_list,
+        return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list,
                                                entries_list, workgroups_list);
     }
 
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 #endif  // __EMSCRIPTEN__
 
@@ -2009,7 +1947,7 @@ static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context &       ctx,
     }
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context &       ctx,
@@ -2112,7 +2050,7 @@ static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context &       ctx,
     }
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_concat(webgpu_context &       ctx,
@@ -2169,7 +2107,7 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context &       ctx,
     webgpu_pipeline pipeline  = ctx->shader_lib->get_concat_pipeline(shader_lib_ctx);
     auto *          decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
     uint32_t        wg_x      = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context &       ctx,
@@ -2214,7 +2152,7 @@ static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context &       ctx,
     webgpu_pipeline pipeline  = ctx->shader_lib->get_repeat_pipeline(shader_lib_ctx);
     auto *          decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
     uint32_t        wg_x      = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context &       ctx,
@@ -2260,7 +2198,7 @@ static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context &       ctx,
     };
 
     webgpu_pipeline pipeline = ctx->shader_lib->get_row_norm_pipeline(shader_lib_ctx);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries,
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries,
                                      ggml_nrows(src));
 }
 
@@ -2366,7 +2304,7 @@ static webgpu_encoded_op ggml_webgpu_rope(webgpu_context &       ctx,
     }
 
     uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_glu(webgpu_context &       ctx,
@@ -2432,7 +2370,7 @@ static webgpu_encoded_op ggml_webgpu_glu(webgpu_context &       ctx,
                         .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
 
     uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_scale(webgpu_context &       ctx,
@@ -2486,7 +2424,7 @@ static webgpu_encoded_op ggml_webgpu_scale(webgpu_context &       ctx,
     }
 
     uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context &       ctx,
@@ -2570,7 +2508,7 @@ static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context &       ctx,
                             .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
     }
 
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries,
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries,
                                      ggml_nrows(dst));
 }
 
@@ -2599,7 +2537,7 @@ static webgpu_encoded_op ggml_webgpu_argmax(webgpu_context &       ctx,
 
     webgpu_pipeline pipeline = ctx->shader_lib->get_argmax_pipeline(shader_lib_ctx);
     uint32_t        wg_x     = ggml_nelements(dst);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context &       ctx,
@@ -2696,7 +2634,7 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context &       ctx,
     workgroups_list.push_back({ wg_x_init, wg_y_init });
 
     if (merge_passes == 0) {
-        return ggml_backend_webgpu_build_multi(ctx->global_ctx, ctx->param_arena, encoder, pipelines, params_list,
+        return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list,
                                                entries_list, workgroups_list);
     }
 
@@ -2758,7 +2696,7 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context &       ctx,
         in_is_tmp = !in_is_tmp;
     }
 
-    return ggml_backend_webgpu_build_multi(ctx->global_ctx, ctx->param_arena, encoder, pipelines, params_list,
+    return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list,
                                            entries_list, workgroups_list);
 }
 
@@ -2790,7 +2728,7 @@ static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context &       ctx,
 
     webgpu_pipeline pipeline = ctx->shader_lib->get_cumsum_pipeline(shader_lib_ctx);
     uint32_t        wg_x     = ggml_nrows(dst);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context &       ctx,
@@ -2825,7 +2763,7 @@ static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context &       ctx,
     webgpu_pipeline pipeline = ctx->shader_lib->get_sum_rows_pipeline(shader_lib_ctx);
 
     uint32_t wg_x = total_sum ? 1 : ggml_nrows(dst);
-    return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
 }
 
 // Returns the encoded command, or std::nullopt if the operation is a no-op
@@ -2937,13 +2875,20 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
 
     std::vector<webgpu_encoded_op> commands;
 #ifdef GGML_WEBGPU_GPU_PROFILE
-    std::vector<wgpu::FutureWaitInfo> profile_futures;
+    std::vector<std::string> profile_pipeline_names;
 #endif
     uint32_t             num_batched_kernels  = 0;
     uint32_t             num_inflight_batches = 0;
     bool                 contains_set_rows    = false;
     wgpu::CommandEncoder batch_encoder        = ctx->global_ctx->device.CreateCommandEncoder();
 
+#ifdef GGML_WEBGPU_GPU_PROFILE
+    ctx->profile_timestamp_query_count = 0;
+    if (ctx->profile_timestamp_host_buf.GetMapState() == wgpu::BufferMapState::Mapped) {
+        ctx->profile_timestamp_host_buf.Unmap();
+    }
+#endif
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) {
             contains_set_rows = true;
@@ -2951,37 +2896,52 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
         if (auto cmd = ggml_webgpu_encode_node(ctx, batch_encoder, cgraph->nodes[i])) {
             commands.push_back(*cmd);
             num_batched_kernels += cmd.value().num_kernels;
+#ifdef GGML_WEBGPU_GPU_PROFILE
+            profile_pipeline_names.push_back(cmd->pipeline_name);
+#endif
         }
 
         if (num_batched_kernels >= ctx->global_ctx->command_submit_batch_size) {
             num_batched_kernels                = 0;
             wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
             ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
-#ifdef GGML_WEBGPU_GPU_PROFILE
-            ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
-#endif
             ctx->param_arena.reset();
             commands.clear();
             batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();
         }
     }
-    if (!commands.empty()) {
+    if (num_batched_kernels > 0) {
         wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
         ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
-#ifdef GGML_WEBGPU_GPU_PROFILE
-        ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
-#endif
         ctx->param_arena.reset();
         commands.clear();
     }
 
-    // If there are SET_ROWS operations in this graph, copy the error buffers to the host for checking.
-    if (contains_set_rows) {
+#ifdef GGML_WEBGPU_GPU_PROFILE
+    const size_t profile_buf_size = ctx->profile_timestamp_query_count * sizeof(uint64_t);
+#endif
+
+    // Copy any post-graph bookkeeping buffers to the host for checking.
+    if (contains_set_rows
+#ifdef GGML_WEBGPU_GPU_PROFILE
+        || ctx->profile_timestamp_query_count > 0
+#endif
+    ) {
         wgpu::CommandEncoder encoder = ctx->global_ctx->device.CreateCommandEncoder();
-        encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0,
-                                   ctx->set_rows_host_error_buf.GetSize());
-        wgpu::CommandBuffer set_rows_commands = encoder.Finish();
-        ggml_backend_webgpu_submit_commands(ctx, set_rows_commands, num_inflight_batches);
+        if (contains_set_rows) {
+            encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0,
+                                       ctx->set_rows_host_error_buf.GetSize());
+        }
+#ifdef GGML_WEBGPU_GPU_PROFILE
+        if (ctx->profile_timestamp_query_count > 0) {
+            encoder.ResolveQuerySet(ctx->profile_timestamp_query_set, 0, ctx->profile_timestamp_query_count,
+                                    ctx->profile_timestamp_dev_buf, 0);
+            encoder.CopyBufferToBuffer(ctx->profile_timestamp_dev_buf, 0, ctx->profile_timestamp_host_buf, 0,
+                                       profile_buf_size);
+        }
+#endif
+        wgpu::CommandBuffer post_graph_commands = encoder.Finish();
+        ggml_backend_webgpu_submit_commands(ctx, post_graph_commands, num_inflight_batches);
     }
 
     ggml_backend_webgpu_wait_queue(ctx->global_ctx);
@@ -2997,7 +2957,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
     }
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
-    ggml_backend_webgpu_wait_profile_futures(ctx->global_ctx, profile_futures);
+    ggml_backend_webgpu_collect_profile_results(ctx, profile_pipeline_names);
 #endif
     WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx->global_ctx);
     return GGML_STATUS_SUCCESS;
@@ -3539,14 +3499,6 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
                               "memset_params_buf");
     ctx->webgpu_global_ctx->queue = ctx->webgpu_global_ctx->device.GetQueue();
 
-#ifdef GGML_WEBGPU_GPU_PROFILE
-    // Initialize buffer pool for timestamp queries, used for profiling
-    ctx->webgpu_global_ctx->timestamp_query_buf_pool.init(
-        ctx->webgpu_global_ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
-        wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc,
-        wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst);
-#endif
-
     GGML_LOG_INFO(
         "ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | "
         "device_desc: %s\n",
@@ -3571,6 +3523,21 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
                               WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
                               wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");
 
+#ifdef GGML_WEBGPU_GPU_PROFILE
+    ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf,
+                              WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
+                              wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc,
+                              "profile_timestamp_dev_buf");
+    ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_host_buf,
+                              WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
+                              wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
+                              "profile_timestamp_host_buf");
+    wgpu::QuerySetDescriptor query_set_desc = {};
+    query_set_desc.type                     = wgpu::QueryType::Timestamp;
+    query_set_desc.count                    = WEBGPU_MAX_PROFILE_QUERY_COUNT;
+    webgpu_ctx->profile_timestamp_query_set = webgpu_ctx->global_ctx->device.CreateQuerySet(&query_set_desc);
+#endif
+
 #ifdef GGML_WEBGPU_DEBUG
     // Initialize debug buffers
     ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->global_ctx->debug_host_buf,

From 55c05a9af5dc4ac273e1699def185b50b73fef0b Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Mon, 13 Apr 2026 13:37:29 -0700
Subject: [PATCH 08/10] Move to batching compute passes when not profiling

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 513 +++++++++++++--------------
 1 file changed, 239 insertions(+), 274 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index b9bac216b87..f062e3e298f 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -245,9 +245,11 @@ struct webgpu_context_struct {
 
     std::unique_ptr<ggml_webgpu_shader_lib> shader_lib;
 
-    webgpu_param_arena param_arena;
-    wgpu::Buffer       set_rows_dev_error_buf;
-    wgpu::Buffer       set_rows_host_error_buf;
+    webgpu_param_arena       param_arena;
+    wgpu::Buffer             set_rows_dev_error_buf;
+    wgpu::Buffer             set_rows_host_error_buf;
+    wgpu::CommandEncoder     active_command_encoder;
+    wgpu::ComputePassEncoder active_compute_pass;
 
     size_t memset_bytes_per_thread;
 
@@ -477,7 +479,6 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) {
 
 static webgpu_encoded_op ggml_backend_webgpu_build_multi(
     webgpu_context &                                       ctx,
-    wgpu::CommandEncoder &                                 encoder,
     const std::vector<webgpu_pipeline> &                   pipelines,
     const std::vector<std::vector<uint32_t>> &             params_list,
     const std::vector<std::vector<wgpu::BindGroupEntry>> & bind_group_entries_list,
@@ -515,25 +516,35 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(
         ctx->global_ctx->queue.WriteBuffer(ctx->param_arena.buffer, param_offsets[i], params_list[i].data(),
                                            params_list[i].size() * sizeof(uint32_t));
     }
+
+    bool                     own_pass = false;
+    wgpu::ComputePassEncoder pass     = ctx->active_compute_pass;
 #ifdef GGML_WEBGPU_GPU_PROFILE
     GGML_ASSERT(ctx->profile_timestamp_query_count + 2 <= WEBGPU_MAX_PROFILE_QUERY_COUNT);
     const uint32_t query_begin = ctx->profile_timestamp_query_count++;
     const uint32_t query_end   = ctx->profile_timestamp_query_count++;
-
-    wgpu::PassTimestampWrites   ts_writes = { .querySet                  = ctx->profile_timestamp_query_set,
-                                              .beginningOfPassWriteIndex = query_begin,
-                                              .endOfPassWriteIndex       = query_end };
-    wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };
-    wgpu::ComputePassEncoder    pass      = encoder.BeginComputePass(&pass_desc);
+#endif
+    if (!pass) {
+        own_pass = true;
+#ifdef GGML_WEBGPU_GPU_PROFILE
+        wgpu::PassTimestampWrites   ts_writes = { .querySet                  = ctx->profile_timestamp_query_set,
+                                                  .beginningOfPassWriteIndex = query_begin,
+                                                  .endOfPassWriteIndex       = query_end };
+        wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };
+        pass                                  = ctx->active_command_encoder.BeginComputePass(&pass_desc);
 #else
-    wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
+        pass = ctx->active_command_encoder.BeginComputePass();
 #endif
+    }
+
     for (size_t i = 0; i < pipelines.size(); i++) {
         pass.SetPipeline(pipelines[i].pipeline);
         pass.SetBindGroup(0, bind_groups[i]);
         pass.DispatchWorkgroups(workgroups_list[i].first, workgroups_list[i].second, 1);
     }
-    pass.End();
+    if (own_pass) {
+        pass.End();
+    }
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
     result.pipeline_name = pipelines.front().name;
@@ -542,13 +553,12 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(
 }
 
 static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_context &                  ctx,
-                                                   wgpu::CommandEncoder &            encoder,
                                                    webgpu_pipeline &                 pipeline,
                                                    std::vector<uint32_t>             params,
                                                    std::vector<wgpu::BindGroupEntry> bind_group_entries,
                                                    uint32_t                          wg_x,
                                                    uint32_t                          wg_y = 1) {
-    return ggml_backend_webgpu_build_multi(ctx, encoder,
+    return ggml_backend_webgpu_build_multi(ctx,
                                            {
                                                pipeline
     },
@@ -556,30 +566,6 @@ static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_context &
                                            { { wg_x, wg_y } });
 }
 
-#ifdef GGML_WEBGPU_GPU_PROFILE
-static void ggml_backend_webgpu_collect_profile_results(webgpu_context &                   ctx,
-                                                        const std::vector<std::string> & pipeline_names) {
-    if (pipeline_names.empty()) {
-        return;
-    }
-
-    const size_t mapped_size = ctx->profile_timestamp_query_count * sizeof(uint64_t);
-    GGML_ASSERT(ctx->profile_timestamp_query_count == 2 * pipeline_names.size());
-
-    ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->profile_timestamp_host_buf, wgpu::MapMode::Read, 0,
-                                   mapped_size);
-    const uint64_t * ts_data = (const uint64_t *) ctx->profile_timestamp_host_buf.GetConstMappedRange(0, mapped_size);
-
-    for (size_t i = 0; i < pipeline_names.size(); ++i) {
-        // WebGPU timestamps are in ns; convert to ms.
-        const double elapsed_ms = double(ts_data[2 * i + 1] - ts_data[2 * i]) * 1e-6;
-        ctx->global_ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms;
-    }
-
-    ctx->profile_timestamp_host_buf.Unmap();
-}
-#endif
-
 static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx,
                                               wgpu::Buffer &          buf,
                                               uint32_t                value,
@@ -726,10 +712,7 @@ static binary_overlap_flags ggml_webgpu_detect_binary_overlap(ggml_tensor * src0
     return flags;
 }
 
-static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context &       ctx,
-                                         wgpu::CommandEncoder & encoder,
-                                         ggml_tensor *          src,
-                                         ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
         .src0        = src,
         .dst         = dst,
@@ -767,14 +750,13 @@ static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context &       ctx,
     };
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_set(webgpu_context &       ctx,
-                                         wgpu::CommandEncoder & encoder,
-                                         ggml_tensor *          src0,
-                                         ggml_tensor *          src1,
-                                         ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx,
+                                         ggml_tensor *    src0,
+                                         ggml_tensor *    src1,
+                                         ggml_tensor *    dst) {
     const bool inplace = ggml_webgpu_tensor_equal(src0, dst);
 
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
@@ -833,13 +815,10 @@ static webgpu_encoded_op ggml_webgpu_set(webgpu_context &       ctx,
                         .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_pad(webgpu_context &       ctx,
-                                         wgpu::CommandEncoder & encoder,
-                                         ggml_tensor *          src,
-                                         ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_pad(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
         .src0 = src, .dst = dst, .max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup
     };
@@ -891,14 +870,13 @@ static webgpu_encoded_op ggml_webgpu_pad(webgpu_context &       ctx,
     };
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context &       ctx,
-                                               wgpu::CommandEncoder & encoder,
-                                               ggml_tensor *          src0,
-                                               ggml_tensor *          src1,
-                                               ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx,
+                                               ggml_tensor *    src0,
+                                               ggml_tensor *    src1,
+                                               ggml_tensor *    dst) {
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
         .src0               = src0,
         .src1               = src1,
@@ -953,14 +931,13 @@ static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context &       ctx,
 
     const uint32_t wg_x = CEIL_DIV((uint32_t) src1->ne[0], decisions->wg_size);
     const uint32_t wg_y = (uint32_t) (dst->ne[2] * dst->ne[3]);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
 
-static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context &       ctx,
-                                              wgpu::CommandEncoder & encoder,
-                                              ggml_tensor *          src0,
-                                              ggml_tensor *          src1,
-                                              ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx,
+                                              ggml_tensor *    src0,
+                                              ggml_tensor *    src1,
+                                              ggml_tensor *    dst) {
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
         .src0        = src0,
         .src1        = src1,
@@ -1010,18 +987,17 @@ static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context &       ctx,
 
     const uint32_t wg_x = CEIL_DIV((uint32_t) src0->ne[1], decisions->block_size);
     const uint32_t wg_y = token_tiles * (uint32_t) dst->ne[2];
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y);
-}
-
-static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context &       ctx,
-                                                     wgpu::CommandEncoder & encoder,
-                                                     ggml_tensor *          src0,
-                                                     ggml_tensor *          src1,
-                                                     ggml_tensor *          src2,
-                                                     ggml_tensor *          src3,
-                                                     ggml_tensor *          src4,
-                                                     ggml_tensor *          src5,
-                                                     ggml_tensor *          dst) {
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
+}
+
+static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx,
+                                                     ggml_tensor *    src0,
+                                                     ggml_tensor *    src1,
+                                                     ggml_tensor *    src2,
+                                                     ggml_tensor *    src3,
+                                                     ggml_tensor *    src4,
+                                                     ggml_tensor *    src5,
+                                                     ggml_tensor *    dst) {
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
         .src0        = src0,
         .src1        = src1,
@@ -1096,14 +1072,13 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context &       ctx,
          .size    = ggml_webgpu_tensor_binding_size(ctx, dst)  }
     };
 
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, h, n_seqs);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, h, n_seqs);
 }
 
-static std::optional<webgpu_encoded_op> ggml_webgpu_set_rows(webgpu_context &       ctx,
-                                                             wgpu::CommandEncoder & encoder,
-                                                             ggml_tensor *          src,
-                                                             ggml_tensor *          idx,
-                                                             ggml_tensor *          dst) {
+static std::optional<webgpu_encoded_op> ggml_webgpu_set_rows(webgpu_context & ctx,
+                                                             ggml_tensor *    src,
+                                                             ggml_tensor *    idx,
+                                                             ggml_tensor *    dst) {
     // For set rows specifically, we need to check if src and idx are empty
     // tensors.
     if (ggml_is_empty(src) || ggml_is_empty(idx)) {
@@ -1166,7 +1141,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_set_rows(webgpu_context &
         threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3];
     }
     uint32_t wg_x = CEIL_DIV(threads, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, 1);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, 1);
 }
 
 // Workgroup size is a common constant
@@ -1177,11 +1152,10 @@ static std::vector<wgpu::ConstantEntry> ggml_webgpu_wg_size_entry(uint32_t wg_si
     return constants;
 }
 
-static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context &       ctx,
-                                              wgpu::CommandEncoder & encoder,
-                                              ggml_tensor *          src,
-                                              ggml_tensor *          idx,
-                                              ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx,
+                                              ggml_tensor *    src,
+                                              ggml_tensor *    idx,
+                                              ggml_tensor *    dst) {
     const bool float_parallel = src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16 || src->type == GGML_TYPE_I32;
 
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
@@ -1233,14 +1207,13 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context &       ctx,
     uint32_t total_threads  = float_parallel ? blocks_per_row * total_rows : total_rows;
     uint32_t wg_x           = CEIL_DIV(total_threads, decisions->wg_size);
 
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context &       ctx,
-                                             wgpu::CommandEncoder & encoder,
-                                             ggml_tensor *          src0,
-                                             ggml_tensor *          src1,
-                                             ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
+                                             ggml_tensor *    src0,
+                                             ggml_tensor *    src1,
+                                             ggml_tensor *    dst) {
     // Determine if this is a mat-vec operation
     bool is_vec = (dst->ne[1] == 1);
 
@@ -1379,15 +1352,14 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context &       ctx,
         compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
     }
 
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
 }
 
-static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context &       ctx,
-                                                wgpu::CommandEncoder & encoder,
-                                                ggml_tensor *          src0,
-                                                ggml_tensor *          src1,
-                                                ggml_tensor *          src2,
-                                                ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
+                                                ggml_tensor *    src0,
+                                                ggml_tensor *    src1,
+                                                ggml_tensor *    src2,
+                                                ggml_tensor *    dst) {
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
         .src0        = src0,
         .src1        = src1,
@@ -1535,19 +1507,17 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context &       ctx,
     entries_list.push_back(std::move(main_entries));
     workgroups_list.push_back({ wg_x, wg_y });
 
-    return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list,
-                                           entries_list, workgroups_list);
+    return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list);
 }
 
 #ifndef __EMSCRIPTEN__
-static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context &       ctx,
-                                                wgpu::CommandEncoder & encoder,
-                                                ggml_tensor *          Q,
-                                                ggml_tensor *          K,
-                                                ggml_tensor *          V,
-                                                ggml_tensor *          mask,
-                                                ggml_tensor *          sinks,
-                                                ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
+                                                ggml_tensor *    Q,
+                                                ggml_tensor *    K,
+                                                ggml_tensor *    V,
+                                                ggml_tensor *    mask,
+                                                ggml_tensor *    sinks,
+                                                ggml_tensor *    dst) {
     float scale = *(float *) dst->op_params;
     float max_bias;
     memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
@@ -1861,18 +1831,14 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context &       ctx,
             workgroups_list.push_back({ (uint32_t) nrows, 1u });
         }
 
-        return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list,
-                                               entries_list, workgroups_list);
+        return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list);
     }
 
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 #endif  // __EMSCRIPTEN__
 
-static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context &       ctx,
-                                              wgpu::CommandEncoder & encoder,
-                                              ggml_tensor *          src,
-                                              ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     bool is_unary = dst->op == GGML_OP_UNARY;
     bool inplace  = ggml_webgpu_tensor_equal(src, dst) || (dst->op == GGML_OP_FILL);
 
@@ -1947,14 +1913,13 @@ static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context &       ctx,
     }
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context &       ctx,
-                                               wgpu::CommandEncoder & encoder,
-                                               ggml_tensor *          src0,
-                                               ggml_tensor *          src1,
-                                               ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context & ctx,
+                                               ggml_tensor *    src0,
+                                               ggml_tensor *    src1,
+                                               ggml_tensor *    dst) {
     binary_overlap_flags flags = ggml_webgpu_detect_binary_overlap(src0, src1, dst);
 
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
@@ -2050,14 +2015,13 @@ static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context &       ctx,
     }
 
     uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_concat(webgpu_context &       ctx,
-                                            wgpu::CommandEncoder & encoder,
-                                            ggml_tensor *          src0,
-                                            ggml_tensor *          src1,
-                                            ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx,
+                                            ggml_tensor *    src0,
+                                            ggml_tensor *    src1,
+                                            ggml_tensor *    dst) {
     uint32_t ne  = (uint32_t) ggml_nelements(dst);
     uint32_t dim = (uint32_t) dst->op_params[0];
 
@@ -2107,13 +2071,10 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context &       ctx,
     webgpu_pipeline pipeline  = ctx->shader_lib->get_concat_pipeline(shader_lib_ctx);
     auto *          decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
     uint32_t        wg_x      = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context &       ctx,
-                                            wgpu::CommandEncoder & encoder,
-                                            ggml_tensor *          src0,
-                                            ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * dst) {
     uint32_t ne = (uint32_t) ggml_nelements(dst);
 
     std::vector<uint32_t> params = { ne,
@@ -2152,13 +2113,10 @@ static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context &       ctx,
     webgpu_pipeline pipeline  = ctx->shader_lib->get_repeat_pipeline(shader_lib_ctx);
     auto *          decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
     uint32_t        wg_x      = CEIL_DIV(ne, decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context &       ctx,
-                                              wgpu::CommandEncoder & encoder,
-                                              ggml_tensor *          src,
-                                              ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     bool inplace = ggml_webgpu_tensor_equal(src, dst);
 
     std::vector<uint32_t> params = {
@@ -2198,16 +2156,14 @@ static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context &       ctx,
     };
 
     webgpu_pipeline pipeline = ctx->shader_lib->get_row_norm_pipeline(shader_lib_ctx);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries,
-                                     ggml_nrows(src));
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, ggml_nrows(src));
 }
 
-static webgpu_encoded_op ggml_webgpu_rope(webgpu_context &       ctx,
-                                          wgpu::CommandEncoder & encoder,
-                                          ggml_tensor *          src0,
-                                          ggml_tensor *          src1,
-                                          ggml_tensor *          src2,
-                                          ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_rope(webgpu_context & ctx,
+                                          ggml_tensor *    src0,
+                                          ggml_tensor *    src1,
+                                          ggml_tensor *    src2,
+                                          ggml_tensor *    dst) {
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
         .src0        = src0,
         .src1        = src1,
@@ -2304,14 +2260,13 @@ static webgpu_encoded_op ggml_webgpu_rope(webgpu_context &       ctx,
     }
 
     uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_glu(webgpu_context &       ctx,
-                                         wgpu::CommandEncoder & encoder,
-                                         ggml_tensor *          src0,
-                                         ggml_tensor *          src1,
-                                         ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_glu(webgpu_context & ctx,
+                                         ggml_tensor *    src0,
+                                         ggml_tensor *    src1,
+                                         ggml_tensor *    dst) {
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
         .src0        = src0,
         .src1        = src1,
@@ -2370,13 +2325,10 @@ static webgpu_encoded_op ggml_webgpu_glu(webgpu_context &       ctx,
                         .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
 
     uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_scale(webgpu_context &       ctx,
-                                           wgpu::CommandEncoder & encoder,
-                                           ggml_tensor *          src,
-                                           ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     bool inplace = ggml_webgpu_tensor_equal(src, dst);
 
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
@@ -2424,15 +2376,14 @@ static webgpu_encoded_op ggml_webgpu_scale(webgpu_context &       ctx,
     }
 
     uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context &       ctx,
-                                              wgpu::CommandEncoder & encoder,
-                                              ggml_tensor *          src0,
-                                              ggml_tensor *          src1,
-                                              ggml_tensor *          src2,
-                                              ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context & ctx,
+                                              ggml_tensor *    src0,
+                                              ggml_tensor *    src1,
+                                              ggml_tensor *    src2,
+                                              ggml_tensor *    dst) {
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
         .src0        = src0,
         .src1        = src1,
@@ -2508,14 +2459,10 @@ static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context &       ctx,
                             .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
     }
 
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries,
-                                     ggml_nrows(dst));
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, ggml_nrows(dst));
 }
 
-static webgpu_encoded_op ggml_webgpu_argmax(webgpu_context &       ctx,
-                                            wgpu::CommandEncoder & encoder,
-                                            ggml_tensor *          src,
-                                            ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_argmax(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     std::vector<uint32_t> params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
                                      (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
                                      (uint32_t) src->ne[0] };
@@ -2537,13 +2484,10 @@ static webgpu_encoded_op ggml_webgpu_argmax(webgpu_context &       ctx,
 
     webgpu_pipeline pipeline = ctx->shader_lib->get_argmax_pipeline(shader_lib_ctx);
     uint32_t        wg_x     = ggml_nelements(dst);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context &       ctx,
-                                             wgpu::CommandEncoder & encoder,
-                                             ggml_tensor *          src,
-                                             ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     bool is_top_k = dst->op == GGML_OP_TOP_K;
 
     ggml_webgpu_shader_lib_context shader_lib_ctx = {
@@ -2634,8 +2578,7 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context &       ctx,
     workgroups_list.push_back({ wg_x_init, wg_y_init });
 
     if (merge_passes == 0) {
-        return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list,
-                                               entries_list, workgroups_list);
+        return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list);
     }
 
     bool     in_is_tmp = start_in_tmp;
@@ -2696,14 +2639,10 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context &       ctx,
         in_is_tmp = !in_is_tmp;
     }
 
-    return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list,
-                                           entries_list, workgroups_list);
+    return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list);
 }
 
-static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context &       ctx,
-                                            wgpu::CommandEncoder & encoder,
-                                            ggml_tensor *          src,
-                                            ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     std::vector<uint32_t> params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
                                      (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
                                      (uint32_t) src->ne[0] };
@@ -2728,13 +2667,10 @@ static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context &       ctx,
 
     webgpu_pipeline pipeline = ctx->shader_lib->get_cumsum_pipeline(shader_lib_ctx);
     uint32_t        wg_x     = ggml_nrows(dst);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
-static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context &       ctx,
-                                              wgpu::CommandEncoder & encoder,
-                                              ggml_tensor *          src,
-                                              ggml_tensor *          dst) {
+static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
     bool                  total_sum = dst->op == GGML_OP_SUM;
     std::vector<uint32_t> params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
                                      (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
@@ -2763,13 +2699,11 @@ static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context &       ctx,
     webgpu_pipeline pipeline = ctx->shader_lib->get_sum_rows_pipeline(shader_lib_ctx);
 
     uint32_t wg_x = total_sum ? 1 : ggml_nrows(dst);
-    return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x);
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
 // Returns the encoded command, or std::nullopt if the operation is a no-op
-static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context         ctx,
-                                                                wgpu::CommandEncoder & encoder,
-                                                                ggml_tensor *          node) {
+static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
     if (ggml_is_empty(node)) {
         return std::nullopt;
     }
@@ -2792,20 +2726,20 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context
             return std::nullopt;
         case GGML_OP_CPY:
         case GGML_OP_CONT:
-            return ggml_webgpu_cpy(ctx, encoder, src0, node);
+            return ggml_webgpu_cpy(ctx, src0, node);
         case GGML_OP_SET:
-            return ggml_webgpu_set(ctx, encoder, src0, src1, node);
+            return ggml_webgpu_set(ctx, src0, src1, node);
         case GGML_OP_SET_ROWS:
-            return ggml_webgpu_set_rows(ctx, encoder, src0, src1, node);
+            return ggml_webgpu_set_rows(ctx, src0, src1, node);
         case GGML_OP_GET_ROWS:
-            return ggml_webgpu_get_rows(ctx, encoder, src0, src1, node);
+            return ggml_webgpu_get_rows(ctx, src0, src1, node);
         case GGML_OP_MUL_MAT:
-            return ggml_webgpu_mul_mat(ctx, encoder, src0, src1, node);
+            return ggml_webgpu_mul_mat(ctx, src0, src1, node);
         case GGML_OP_MUL_MAT_ID:
-            return ggml_webgpu_mul_mat_id(ctx, encoder, src0, src1, src2, node);
+            return ggml_webgpu_mul_mat_id(ctx, src0, src1, src2, node);
         case GGML_OP_FLASH_ATTN_EXT:
 #ifndef __EMSCRIPTEN__
-            return ggml_webgpu_flash_attn(ctx, encoder, src0, src1, src2, node->src[3], node->src[4], node);
+            return ggml_webgpu_flash_attn(ctx, src0, src1, src2, node->src[3], node->src[4], node);
 #else
             return std::nullopt;
 #endif
@@ -2813,22 +2747,22 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
-            return ggml_webgpu_binary_op(ctx, encoder, src0, src1, node);
+            return ggml_webgpu_binary_op(ctx, src0, src1, node);
         case GGML_OP_CONCAT:
-            return ggml_webgpu_concat(ctx, encoder, src0, src1, node);
+            return ggml_webgpu_concat(ctx, src0, src1, node);
         case GGML_OP_REPEAT:
-            return ggml_webgpu_repeat(ctx, encoder, src0, node);
+            return ggml_webgpu_repeat(ctx, src0, node);
         case GGML_OP_RMS_NORM:
         case GGML_OP_L2_NORM:
-            return ggml_webgpu_row_norm(ctx, encoder, src0, node);
+            return ggml_webgpu_row_norm(ctx, src0, node);
         case GGML_OP_ROPE:
-            return ggml_webgpu_rope(ctx, encoder, src0, src1, src2, node);
+            return ggml_webgpu_rope(ctx, src0, src1, src2, node);
         case GGML_OP_GLU:
-            return ggml_webgpu_glu(ctx, encoder, src0, src1, node);
+            return ggml_webgpu_glu(ctx, src0, src1, node);
         case GGML_OP_SCALE:
-            return ggml_webgpu_scale(ctx, encoder, src0, node);
+            return ggml_webgpu_scale(ctx, src0, node);
         case GGML_OP_SOFT_MAX:
-            return ggml_webgpu_soft_max(ctx, encoder, src0, src1, src2, node);
+            return ggml_webgpu_soft_max(ctx, src0, src1, src2, node);
         case GGML_OP_UNARY:
         case GGML_OP_CLAMP:
         case GGML_OP_FILL:
@@ -2839,32 +2773,80 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode_node(webgpu_context
         case GGML_OP_COS:
         case GGML_OP_DIAG:
         case GGML_OP_TRI:
-            return ggml_webgpu_unary_op(ctx, encoder, src0, node);
+            return ggml_webgpu_unary_op(ctx, src0, node);
         case GGML_OP_SOLVE_TRI:
-            return ggml_webgpu_solve_tri(ctx, encoder, src0, src1, node);
+            return ggml_webgpu_solve_tri(ctx, src0, src1, node);
         case GGML_OP_SSM_CONV:
-            return ggml_webgpu_ssm_conv(ctx, encoder, src0, src1, node);
+            return ggml_webgpu_ssm_conv(ctx, src0, src1, node);
         case GGML_OP_GATED_DELTA_NET:
-            return ggml_webgpu_gated_delta_net(ctx, encoder, src0, src1, src2, node->src[3], node->src[4], node->src[5],
-                                               node);
+            return ggml_webgpu_gated_delta_net(ctx, src0, src1, src2, node->src[3], node->src[4], node->src[5], node);
         case GGML_OP_PAD:
-            return ggml_webgpu_pad(ctx, encoder, src0, node);
+            return ggml_webgpu_pad(ctx, src0, node);
         case GGML_OP_ARGMAX:
-            return ggml_webgpu_argmax(ctx, encoder, src0, node);
+            return ggml_webgpu_argmax(ctx, src0, node);
         case GGML_OP_ARGSORT:
         case GGML_OP_TOP_K:
             // we reuse the same argsort implementation for top_k
-            return ggml_webgpu_argsort(ctx, encoder, src0, node);
+            return ggml_webgpu_argsort(ctx, src0, node);
         case GGML_OP_CUMSUM:
-            return ggml_webgpu_cumsum(ctx, encoder, src0, node);
+            return ggml_webgpu_cumsum(ctx, src0, node);
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
-            return ggml_webgpu_sum_rows(ctx, encoder, src0, node);
+            return ggml_webgpu_sum_rows(ctx, src0, node);
         default:
             return std::nullopt;
     }
 }
 
+#ifdef GGML_WEBGPU_GPU_PROFILE
+static void ggml_backend_webgpu_collect_profile_results(webgpu_context &                 ctx,
+                                                        const std::vector<std::string> & pipeline_names,
+                                                        uint32_t &                       num_inflight_batches) {
+    if (pipeline_names.empty()) {  // no pipeline names recorded: nothing to resolve or report
+        return;
+    }
+    // Resolve the timestamp query set into the device buffer, then copy it into the buffer that is mapped below.
+    wgpu::CommandEncoder encoder = ctx->global_ctx->device.CreateCommandEncoder();
+    encoder.ResolveQuerySet(ctx->profile_timestamp_query_set, 0, ctx->profile_timestamp_query_count,
+                            ctx->profile_timestamp_dev_buf, 0);
+    encoder.CopyBufferToBuffer(ctx->profile_timestamp_dev_buf, 0, ctx->profile_timestamp_host_buf, 0,
+                               ctx->profile_timestamp_query_count * sizeof(uint64_t));
+
+    wgpu::CommandBuffer profile_commands = encoder.Finish();
+    ggml_backend_webgpu_submit_commands(ctx, profile_commands, num_inflight_batches);
+
+    const size_t mapped_size = ctx->profile_timestamp_query_count * sizeof(uint64_t);
+    GGML_ASSERT(ctx->profile_timestamp_query_count == 2 * pipeline_names.size());  // two timestamps per name, read as [2*i] and [2*i + 1]
+
+    ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->profile_timestamp_host_buf, wgpu::MapMode::Read, 0,
+                                   mapped_size);
+    const uint64_t * ts_data = (const uint64_t *) ctx->profile_timestamp_host_buf.GetConstMappedRange(0, mapped_size);
+
+    for (size_t i = 0; i < pipeline_names.size(); ++i) {
+        // WebGPU timestamps are in ns; convert to ms.
+        const double elapsed_ms = double(ts_data[2 * i + 1] - ts_data[2 * i]) * 1e-6;
+        ctx->global_ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms;  // accumulated (+=) per pipeline name
+    }
+
+    ctx->profile_timestamp_host_buf.Unmap();  // ts_data must not be dereferenced after this point
+}
+#endif
+
+static void ggml_backend_webgpu_check_set_rows(webgpu_context & ctx, uint32_t & num_inflight_batches) {  // reads back the SET_ROWS error buffer and aborts if it is non-zero
+    wgpu::CommandEncoder encoder = ctx->global_ctx->device.CreateCommandEncoder();
+    encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0,
+                               ctx->set_rows_host_error_buf.GetSize());  // copy size is taken from the host buffer
+    wgpu::CommandBuffer commands = encoder.Finish();
+    ggml_backend_webgpu_submit_commands(ctx, commands, num_inflight_batches);
+    ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->set_rows_host_error_buf, wgpu::MapMode::Read, 0,
+                                   ctx->set_rows_host_error_buf.GetSize());
+    const uint32_t * error_data = (const uint32_t *) ctx->set_rows_host_error_buf.GetConstMappedRange();
+    if (*error_data) {  // only the first 32-bit word of the mapped range is inspected
+        GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported.");
+    }
+    ctx->set_rows_host_error_buf.Unmap();  // reached only when no error was flagged
+}
+
 static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)");
 
@@ -2874,26 +2856,28 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
     WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute);
 
     std::vector<webgpu_encoded_op> commands;
+
+    uint32_t num_batched_kernels  = 0;
+    uint32_t num_inflight_batches = 0;
+    bool     contains_set_rows    = false;
+    bool     batch_compute_passes = true;
+
 #ifdef GGML_WEBGPU_GPU_PROFILE
+    ctx->profile_timestamp_query_count = 0;
+    batch_compute_passes               = false;
     std::vector<std::string> profile_pipeline_names;
 #endif
-    uint32_t             num_batched_kernels  = 0;
-    uint32_t             num_inflight_batches = 0;
-    bool                 contains_set_rows    = false;
-    wgpu::CommandEncoder batch_encoder        = ctx->global_ctx->device.CreateCommandEncoder();
 
-#ifdef GGML_WEBGPU_GPU_PROFILE
-    ctx->profile_timestamp_query_count = 0;
-    if (ctx->profile_timestamp_host_buf.GetMapState() == wgpu::BufferMapState::Mapped) {
-        ctx->profile_timestamp_host_buf.Unmap();
+    ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
+    if (batch_compute_passes) {
+        ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
     }
-#endif
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) {
             contains_set_rows = true;
         }
-        if (auto cmd = ggml_webgpu_encode_node(ctx, batch_encoder, cgraph->nodes[i])) {
+        if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) {
             commands.push_back(*cmd);
             num_batched_kernels += cmd.value().num_kernels;
 #ifdef GGML_WEBGPU_GPU_PROFILE
@@ -2902,63 +2886,46 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
         }
 
         if (num_batched_kernels >= ctx->global_ctx->command_submit_batch_size) {
+            if (ctx->active_compute_pass) {
+                ctx->active_compute_pass.End();
+            }
             num_batched_kernels                = 0;
-            wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
+            wgpu::CommandBuffer batch_commands = ctx->active_command_encoder.Finish();
             ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
+
+            // reset state for next batch
+            ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
+            if (batch_compute_passes) {
+                ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
+            }
             ctx->param_arena.reset();
             commands.clear();
-            batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();
         }
     }
+
+    if (ctx->active_compute_pass) {
+        ctx->active_compute_pass.End();
+        ctx->active_compute_pass = nullptr;
+    }
+
     if (num_batched_kernels > 0) {
-        wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
+        wgpu::CommandBuffer batch_commands = ctx->active_command_encoder.Finish();
         ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
         ctx->param_arena.reset();
         commands.clear();
     }
+    ctx->active_command_encoder = nullptr;
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
-    const size_t profile_buf_size = ctx->profile_timestamp_query_count * sizeof(uint64_t);
+    ggml_backend_webgpu_collect_profile_results(ctx, profile_pipeline_names, num_inflight_batches);
 #endif
 
-    // Copy any post-graph bookkeeping buffers to the host for checking.
-    if (contains_set_rows
-#ifdef GGML_WEBGPU_GPU_PROFILE
-        || ctx->profile_timestamp_query_count > 0
-#endif
-    ) {
-        wgpu::CommandEncoder encoder = ctx->global_ctx->device.CreateCommandEncoder();
-        if (contains_set_rows) {
-            encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0,
-                                       ctx->set_rows_host_error_buf.GetSize());
-        }
-#ifdef GGML_WEBGPU_GPU_PROFILE
-        if (ctx->profile_timestamp_query_count > 0) {
-            encoder.ResolveQuerySet(ctx->profile_timestamp_query_set, 0, ctx->profile_timestamp_query_count,
-                                    ctx->profile_timestamp_dev_buf, 0);
-            encoder.CopyBufferToBuffer(ctx->profile_timestamp_dev_buf, 0, ctx->profile_timestamp_host_buf, 0,
-                                       profile_buf_size);
-        }
-#endif
-        wgpu::CommandBuffer post_graph_commands = encoder.Finish();
-        ggml_backend_webgpu_submit_commands(ctx, post_graph_commands, num_inflight_batches);
+    if (contains_set_rows) {
+        ggml_backend_webgpu_check_set_rows(ctx, num_inflight_batches);
     }
 
     ggml_backend_webgpu_wait_queue(ctx->global_ctx);
 
-    if (contains_set_rows) {
-        ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->set_rows_host_error_buf, wgpu::MapMode::Read, 0,
-                                       ctx->set_rows_host_error_buf.GetSize());
-        const uint32_t * error_data = (const uint32_t *) ctx->set_rows_host_error_buf.GetConstMappedRange();
-        if (*error_data) {
-            GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported.");
-        }
-        ctx->set_rows_host_error_buf.Unmap();
-    }
-
-#ifdef GGML_WEBGPU_GPU_PROFILE
-    ggml_backend_webgpu_collect_profile_results(ctx, profile_pipeline_names);
-#endif
     WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx->global_ctx);
     return GGML_STATUS_SUCCESS;
 }
@@ -3524,14 +3491,12 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
                               wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
-    ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf,
-                              WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
-                              wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc,
-                              "profile_timestamp_dev_buf");
+    ggml_webgpu_create_buffer(
+        webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
+        wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, "profile_timestamp_dev_buf");
     ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_host_buf,
                               WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
-                              wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
-                              "profile_timestamp_host_buf");
+                              wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "profile_timestamp_host_buf");
     wgpu::QuerySetDescriptor query_set_desc = {};
     query_set_desc.type                     = wgpu::QueryType::Timestamp;
     query_set_desc.count                    = WEBGPU_MAX_PROFILE_QUERY_COUNT;

From 6468f3930c992fe8c3dcbcd63e552594da75538f Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Mon, 13 Apr 2026 13:51:41 -0700
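Subject: [PATCH 09/10] Refactor build_multi

Replace the four parallel vectors taken by
ggml_backend_webgpu_build_multi (pipelines, params, bind group entries,
workgroups) with a single vector of webgpu_dispatch_desc, and update
ggml_backend_webgpu_build and the mul_mat_id, flash_attn and argsort
callers to match.

With GGML_WEBGPU_GPU_PROFILE defined, each dispatch is now encoded in
its own timestamped compute pass and its pipeline name is recorded, so
webgpu_encoded_op carries a vector of pipeline names. Otherwise all
dispatches are encoded on ctx->active_compute_pass.
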
---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 175 ++++++++++++---------------
 1 file changed, 77 insertions(+), 98 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index f062e3e298f..f2cdc327e7c 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -162,10 +162,17 @@ struct webgpu_param_arena {
 struct webgpu_encoded_op {
     uint32_t num_kernels = 0;
 #ifdef GGML_WEBGPU_GPU_PROFILE
-    std::string pipeline_name;
+    std::vector<std::string> pipeline_names;
 #endif
 };
 
+struct webgpu_dispatch_desc {
+    webgpu_pipeline                   pipeline;
+    std::vector<uint32_t>             params;
+    std::vector<wgpu::BindGroupEntry> bind_group_entries;
+    std::pair<uint32_t, uint32_t>     workgroups = { 1, 1 };
+};
+
 struct webgpu_capabilities {
     wgpu::Limits limits;
     bool         supports_subgroup_matrix = false;
@@ -477,26 +484,19 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) {
 }
 #endif
 
-static webgpu_encoded_op ggml_backend_webgpu_build_multi(
-    webgpu_context &                                       ctx,
-    const std::vector<webgpu_pipeline> &                   pipelines,
-    const std::vector<std::vector<uint32_t>> &             params_list,
-    const std::vector<std::vector<wgpu::BindGroupEntry>> & bind_group_entries_list,
-    const std::vector<std::pair<uint32_t, uint32_t>> &     workgroups_list) {
-    GGML_ASSERT(pipelines.size() == params_list.size());
-    GGML_ASSERT(pipelines.size() == bind_group_entries_list.size());
-    GGML_ASSERT(pipelines.size() == workgroups_list.size());
-
+static webgpu_encoded_op ggml_backend_webgpu_build_multi(webgpu_context &                          ctx,
+                                                         const std::vector<webgpu_dispatch_desc> & dispatches) {
     webgpu_encoded_op            result = {};
     std::vector<wgpu::BindGroup> bind_groups;
     std::vector<size_t>          param_offsets;
-    result.num_kernels = pipelines.size();
+    result.num_kernels = dispatches.size();
 
-    for (size_t i = 0; i < pipelines.size(); i++) {
-        const size_t param_size   = params_list[i].size() * sizeof(uint32_t);
-        const size_t param_offset = ctx->param_arena.alloc_slot(param_size);
+    for (size_t i = 0; i < dispatches.size(); i++) {
+        const webgpu_dispatch_desc & dispatch     = dispatches[i];
+        const size_t                 param_size   = dispatch.params.size() * sizeof(uint32_t);
+        const size_t                 param_offset = ctx->param_arena.alloc_slot(param_size);
 
-        std::vector<wgpu::BindGroupEntry> entries            = bind_group_entries_list[i];
+        std::vector<wgpu::BindGroupEntry> entries            = dispatch.bind_group_entries;
         uint32_t                          params_binding_num = entries.size();
         entries.push_back({ .binding = params_binding_num,
                             .buffer  = ctx->param_arena.buffer,
@@ -504,51 +504,44 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(
                             .size    = ctx->param_arena.slot_size });
 
         wgpu::BindGroupDescriptor bind_group_desc;
-        bind_group_desc.layout     = pipelines[i].pipeline.GetBindGroupLayout(0);
+        bind_group_desc.layout     = dispatch.pipeline.pipeline.GetBindGroupLayout(0);
         bind_group_desc.entryCount = entries.size();
         bind_group_desc.entries    = entries.data();
-        bind_group_desc.label      = pipelines[i].name.c_str();
+        bind_group_desc.label      = dispatch.pipeline.name.c_str();
         bind_groups.push_back(ctx->global_ctx->device.CreateBindGroup(&bind_group_desc));
         param_offsets.push_back(param_offset);
     }
 
     for (size_t i = 0; i < param_offsets.size(); i++) {
-        ctx->global_ctx->queue.WriteBuffer(ctx->param_arena.buffer, param_offsets[i], params_list[i].data(),
-                                           params_list[i].size() * sizeof(uint32_t));
+        ctx->global_ctx->queue.WriteBuffer(ctx->param_arena.buffer, param_offsets[i], dispatches[i].params.data(),
+                                           dispatches[i].params.size() * sizeof(uint32_t));
     }
 
-    bool                     own_pass = false;
-    wgpu::ComputePassEncoder pass     = ctx->active_compute_pass;
-#ifdef GGML_WEBGPU_GPU_PROFILE
-    GGML_ASSERT(ctx->profile_timestamp_query_count + 2 <= WEBGPU_MAX_PROFILE_QUERY_COUNT);
-    const uint32_t query_begin = ctx->profile_timestamp_query_count++;
-    const uint32_t query_end   = ctx->profile_timestamp_query_count++;
-#endif
-    if (!pass) {
-        own_pass = true;
 #ifdef GGML_WEBGPU_GPU_PROFILE
-        wgpu::PassTimestampWrites   ts_writes = { .querySet                  = ctx->profile_timestamp_query_set,
-                                                  .beginningOfPassWriteIndex = query_begin,
-                                                  .endOfPassWriteIndex       = query_end };
-        wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };
-        pass                                  = ctx->active_command_encoder.BeginComputePass(&pass_desc);
-#else
-        pass = ctx->active_command_encoder.BeginComputePass();
-#endif
-    }
-
-    for (size_t i = 0; i < pipelines.size(); i++) {
-        pass.SetPipeline(pipelines[i].pipeline);
+    for (size_t i = 0; i < dispatches.size(); i++) {
+        GGML_ASSERT(ctx->profile_timestamp_query_count + 2 <= WEBGPU_MAX_PROFILE_QUERY_COUNT);
+        const uint32_t              query_begin = ctx->profile_timestamp_query_count++;
+        const uint32_t              query_end   = ctx->profile_timestamp_query_count++;
+        wgpu::PassTimestampWrites   ts_writes   = { .querySet                  = ctx->profile_timestamp_query_set,
+                                                    .beginningOfPassWriteIndex = query_begin,
+                                                    .endOfPassWriteIndex       = query_end };
+        wgpu::ComputePassDescriptor pass_desc   = { .timestampWrites = &ts_writes };
+        wgpu::ComputePassEncoder    pass        = ctx->active_command_encoder.BeginComputePass(&pass_desc);
+
+        pass.SetPipeline(dispatches[i].pipeline.pipeline);
         pass.SetBindGroup(0, bind_groups[i]);
-        pass.DispatchWorkgroups(workgroups_list[i].first, workgroups_list[i].second, 1);
-    }
-    if (own_pass) {
+        pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
         pass.End();
+        result.pipeline_names.push_back(dispatches[i].pipeline.name);
+    }
+#else
+    for (size_t i = 0; i < dispatches.size(); i++) {
+        ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
+        ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
+        ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
     }
-
-#ifdef GGML_WEBGPU_GPU_PROFILE
-    result.pipeline_name = pipelines.front().name;
 #endif
+
     return result;
 }
 
@@ -558,12 +551,10 @@ static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_context &
                                                    std::vector<wgpu::BindGroupEntry> bind_group_entries,
                                                    uint32_t                          wg_x,
                                                    uint32_t                          wg_y = 1) {
-    return ggml_backend_webgpu_build_multi(ctx,
-                                           {
-                                               pipeline
-    },
-                                           { std::move(params) }, { std::move(bind_group_entries) },
-                                           { { wg_x, wg_y } });
+    return ggml_backend_webgpu_build_multi(
+        ctx, {
+                 { pipeline, std::move(params), std::move(bind_group_entries), { wg_x, wg_y } },
+    });
 }
 
 static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx,
@@ -1371,10 +1362,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
     // Get or create pipeline
     webgpu_pipeline gather_pipeline, main_pipeline;
 
-    std::vector<webgpu_pipeline>                   pipelines;
-    std::vector<std::vector<uint32_t>>             params_list;
-    std::vector<std::vector<wgpu::BindGroupEntry>> entries_list;
-    std::vector<std::pair<uint32_t, uint32_t>>     workgroups_list;
+    std::vector<webgpu_dispatch_desc> dispatches;
 
     gather_pipeline = ctx->shader_lib->get_mul_mat_id_gather_pipeline(shader_lib_ctx);
     main_pipeline   = ctx->shader_lib->get_mul_mat_id_pipeline(shader_lib_ctx);
@@ -1434,10 +1422,9 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
     const uint32_t gather_wg_x     = std::min(gather_total_wg, max_wg_per_dim);
     const uint32_t gather_wg_y     = CEIL_DIV(gather_total_wg, gather_wg_x);
 
-    pipelines.push_back(gather_pipeline);
-    params_list.push_back(std::move(gather_params));
-    entries_list.push_back(std::move(gather_entries));
-    workgroups_list.push_back({ gather_wg_x, gather_wg_y });
+    dispatches.push_back({
+        gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, gather_wg_y }
+    });
 
     // params for mul_mat_id.wgsl
     std::vector<uint32_t> main_params = {
@@ -1502,12 +1489,11 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
 
     compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
 
-    pipelines.push_back(main_pipeline);
-    params_list.push_back(std::move(main_params));
-    entries_list.push_back(std::move(main_entries));
-    workgroups_list.push_back({ wg_x, wg_y });
+    dispatches.push_back({
+        main_pipeline, std::move(main_params), std::move(main_entries), { wg_x, wg_y }
+    });
 
-    return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list);
+    return ggml_backend_webgpu_build_multi(ctx, dispatches);
 }
 
 #ifndef __EMSCRIPTEN__
@@ -1809,29 +1795,26 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
 
         const uint64_t split_wg_total = (uint64_t) wg_x * nwg;
         GGML_ASSERT(split_wg_total <= UINT32_MAX);
-        std::vector<webgpu_pipeline>                   pipelines;
-        std::vector<std::vector<uint32_t>>             params_list;
-        std::vector<std::vector<wgpu::BindGroupEntry>> entries_list;
-        std::vector<std::pair<uint32_t, uint32_t>>     workgroups_list;
+        std::vector<webgpu_dispatch_desc> dispatches;
 
         if (use_blk) {
-            pipelines.push_back(blk_pipeline);
-            params_list.push_back(std::move(blk_params));
-            entries_list.push_back(std::move(blk_entries));
-            workgroups_list.push_back({ blk_nblk0, blk_nblk1 * blk_batch_count });
+            dispatches.push_back({
+                blk_pipeline,
+                std::move(blk_params),
+                std::move(blk_entries),
+                { blk_nblk0, blk_nblk1 * blk_batch_count }
+            });
         }
-        pipelines.push_back(pipeline);
-        params_list.push_back(std::move(split_params));
-        entries_list.push_back(std::move(split_entries));
-        workgroups_list.push_back({ (uint32_t) split_wg_total, 1u });
+        dispatches.push_back({
+            pipeline, std::move(split_params), std::move(split_entries), { (uint32_t) split_wg_total, 1u }
+        });
         if (use_vec_reduce) {
-            pipelines.push_back(reduce_pipeline);
-            params_list.push_back(std::move(reduce_params));
-            entries_list.push_back(std::move(reduce_entries));
-            workgroups_list.push_back({ (uint32_t) nrows, 1u });
+            dispatches.push_back({
+                reduce_pipeline, std::move(reduce_params), std::move(reduce_entries), { (uint32_t) nrows, 1u }
+            });
         }
 
-        return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list);
+        return ggml_backend_webgpu_build_multi(ctx, dispatches);
     }
 
     return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
@@ -2545,10 +2528,7 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor *
     const uint32_t stride_idx2 = out_ne0 * (uint32_t) dst->ne[1];
     const uint32_t stride_idx3 = stride_idx2 * (uint32_t) dst->ne[2];
 
-    std::vector<webgpu_pipeline>                   pipelines;
-    std::vector<std::vector<uint32_t>>             params_list;
-    std::vector<std::vector<wgpu::BindGroupEntry>> entries_list;
-    std::vector<std::pair<uint32_t, uint32_t>>     workgroups_list;
+    std::vector<webgpu_dispatch_desc> dispatches;
 
     const uint32_t init_offset       = start_in_tmp ? offset_tmp : offset_dst;
     const size_t   init_align_offset = start_in_tmp ? tmp_offset : ggml_webgpu_tensor_align_offset(ctx, dst);
@@ -2572,13 +2552,12 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor *
         { .binding = 1, .buffer = ggml_webgpu_tensor_buf(dst), .offset = init_align_offset, .size = init_binding_size }
     };
 
-    pipelines.push_back(argsort_pipeline);
-    params_list.push_back(std::move(init_params));
-    entries_list.push_back(std::move(init_entries));
-    workgroups_list.push_back({ wg_x_init, wg_y_init });
+    dispatches.push_back({
+        argsort_pipeline, std::move(init_params), std::move(init_entries), { wg_x_init, wg_y_init }
+    });
 
     if (merge_passes == 0) {
-        return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list);
+        return ggml_backend_webgpu_build_multi(ctx, dispatches);
     }
 
     bool     in_is_tmp = start_in_tmp;
@@ -2630,16 +2609,15 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor *
         const uint32_t total_wg_merge = nm * nrows;
         const uint32_t wg_x_merge     = std::min(total_wg_merge, max_wg);
         const uint32_t wg_y_merge     = CEIL_DIV(total_wg_merge, wg_x_merge);
-        workgroups_list.push_back({ wg_x_merge, wg_y_merge });
-        pipelines.push_back(argsort_merge_pipeline);
-        params_list.push_back(std::move(merge_params));
-        entries_list.push_back(std::move(merge_entries));
+        dispatches.push_back({
+            argsort_merge_pipeline, std::move(merge_params), std::move(merge_entries), { wg_x_merge, wg_y_merge }
+        });
 
         len <<= 1;
         in_is_tmp = !in_is_tmp;
     }
 
-    return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list);
+    return ggml_backend_webgpu_build_multi(ctx, dispatches);
 }
 
 static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
@@ -2881,7 +2859,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
             commands.push_back(*cmd);
             num_batched_kernels += cmd.value().num_kernels;
 #ifdef GGML_WEBGPU_GPU_PROFILE
-            profile_pipeline_names.push_back(cmd->pipeline_name);
+            profile_pipeline_names.insert(profile_pipeline_names.end(), cmd->pipeline_names.begin(),
+                                          cmd->pipeline_names.end());
 #endif
         }
 

From aa2b38054819744305f0e0c8c734de59c0630a43 Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Mon, 13 Apr 2026 15:11:06 -0700
Subject: [PATCH 10/10] Remove iOS throttling now that we're batching compute
 passes

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index f2cdc327e7c..c460df5952b 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -392,22 +392,8 @@ static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status,
     }
 }
 
-#ifdef __EMSCRIPTEN__
-EM_JS(int, ggml_webgpu_is_ios_browser, (), {
-    const ua = navigator.userAgent;
-    return (ua.includes('iPhone') || ua.includes('iPad')) ? 1 : 0;
-});
-#endif
-
 // TODO: these next two functions may want tuning across different platforms and workloads,
 static uint32_t ggml_backend_webgpu_get_max_inflight_batches() {
-#ifdef __EMSCRIPTEN__
-    // iOS has very strict limits on the number of in-flight GPU commands,
-    // so we need to throttle to avoid failures.
-    if (ggml_webgpu_is_ios_browser()) {
-        return 1;
-    }
-#endif
     return UINT32_MAX;
 }