From e94d13cf6959a8b8b16f3be90a56e996aba07f96 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 8 Apr 2026 09:19:57 -0700 Subject: [PATCH 01/10] Update register tiling matmul to use f32 accumulation --- .../ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl | 12 ++++++------ .../wgsl-shaders/mul_mat_subgroup_matrix.wgsl | 3 +++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl index b1da421a691..ee37e6d249c 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl @@ -4,14 +4,14 @@ enable f16; #include "mul_mat_decls.tmpl" #ifdef VEC -fn store_val(acc: array, TILE_M>, tn: u32, tm: u32) -> vec4 { - return vec4(f32(acc[tm][tn]), f32(acc[tm + 1][tn]), f32(acc[tm + 2][tn]), f32(acc[tm + 3][tn])); +fn store_val(acc: array, TILE_M>, tn: u32, tm: u32) -> vec4 { + return vec4(acc[tm][tn], acc[tm + 1][tn], acc[tm + 2][tn], acc[tm + 3][tn]); } #endif #ifdef SCALAR -fn store_val(acc: array, TILE_M>, tn: u32, tm: u32) -> f32 { - return f32(acc[tm][tn]); +fn store_val(acc: array, TILE_M>, tn: u32, tm: u32) -> f32 { + return acc[tm][tn]; } #endif @@ -98,7 +98,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let offset_m = wg_m * WORKGROUP_SIZE_M * TILE_M; let offset_n = wg_n * WORKGROUP_SIZE_N * TILE_N; - var acc: array, TILE_M>; + var acc: array, TILE_M>; for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) { @@ -122,7 +122,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let src1_idx = src1_n * TILE_K + k_inner; let src1_val = shmem[TILE_SRC0_SHMEM + src1_idx]; for (var tm = 0u; tm < TILE_M; tm++) { - acc[tm][tn] += src0_tile[tm] * src1_val; + acc[tm][tn] += f32(src0_tile[tm]) * f32(src1_val); } } } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl index 9f9ef279f29..4151ce430b0 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl @@ -6,6 +6,9 @@ enable chromium_experimental_subgroup_matrix; #include "common_decls.tmpl" #include "mul_mat_decls.tmpl" +// TODO: this shader path does not work with some models like qwen2.5 on Metal devices, f16 accumulation causes NaNs. +// See https://github.com/ggml-org/llama.cpp/issues/21602 + #ifdef VEC fn store_dst(shmem_idx: u32, dst_idx: u32) { dst[dst_idx] = vec4( From af4c1d516ffc6c8d5e394d7b5439ffe627aa0a03 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 8 Apr 2026 11:41:12 -0700 Subject: [PATCH 02/10] fix profiling code --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b8df0f4dd05..fff91f3e1c3 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -535,7 +535,7 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) { #ifdef GGML_WEBGPU_GPU_PROFILE static void ggml_backend_webgpu_collect_profile_futures(webgpu_global_context & ctx, - const std::vector & commands, + const std::vector & commands, std::vector & futures) { for (const auto & command : commands) { auto label = command.pipeline_name; From ac5267d4421147b71a8ae2cd1f611bb58bc44fff Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Thu, 9 Apr 2026 13:21:24 -0700 Subject: [PATCH 03/10] Fix register tiling matmul for chrome, i'm blaming dawn --- .../wgsl-shaders/mul_mat_decls.tmpl | 35 ++++++++----------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl index ea91c13468f..d74f87d89fb 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl @@ -502,12 +502,6 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let d = load_src0_f16_at(block_byte_base); let dmin = load_src0_f16_at(block_byte_base + 2u); - // Load packed scales - var scale_vals: array; - for (var i: u32 = 0u; i < 3u; i++) { - scale_vals[i] = load_src0_u32_at(block_byte_base + 4u + 4u * i); - } - // Map k_in_block to loop structure: // Outer loop over 64-element groups (alternating q_b_idx) // Inner loop over 2 shifts per group @@ -523,15 +517,17 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 var sc: u32; var mn: u32; + let scale_base = block_byte_base + 4u; + if (is < 4u) { - let sc_byte = get_byte(scale_vals[is / 4u], is % 4u); - let min_byte = get_byte(scale_vals[(is + 4u) / 4u], is % 4u); + let sc_byte = get_byte(load_src0_u32_at(scale_base), is % 4u); + let min_byte = get_byte(load_src0_u32_at(scale_base + 4), is % 4u); sc = sc_byte & 63u; mn = min_byte & 63u; } else { - let sc_min_lo = get_byte(scale_vals[(is + 4u) / 4u], (is + 4u) % 4u); - let sc_hi = get_byte(scale_vals[(is - 4u) / 4u], (is - 4u) % 4u); - let min_hi = get_byte(scale_vals[is / 4u], is % 4u); + let sc_min_lo = get_byte(load_src0_u32_at(scale_base + 8), (is + 4u) % 4u); + let sc_hi = get_byte(load_src0_u32_at(scale_base), (is - 4u) % 4u); + let min_hi = get_byte(load_src0_u32_at(scale_base + 4), is % 4u); sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u); mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u); @@ -578,11 +574,6 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let d = load_src0_f16_at(block_byte_base); let dmin = load_src0_f16_at(block_byte_base + 2u); - // Load packed scales - var scale_vals: array; - for (var i: u32 = 0u; i < 3u; i++) { - scale_vals[i] = load_src0_u32_at(block_byte_base + 4u + 4u * i); - } // The original loop processes elements in groups of 64 // Each group of 64: q_b_idx cycles through [0,32,64,96], shift cycles [0,4] @@ -603,15 +594,17 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 var sc: u32; var mn: u32; + let scale_base = block_byte_base + 4u; + if (is < 4u) { - let sc_byte = get_byte(scale_vals[is / 4u], is % 4u); - let min_byte = get_byte(scale_vals[(is + 4u) / 4u], is % 4u); + let sc_byte = get_byte(load_src0_u32_at(scale_base), is % 4u); + let min_byte = get_byte(load_src0_u32_at(scale_base + 4), is % 4u); sc = sc_byte & 63u; mn = min_byte & 63u; } else { - let sc_min_lo = get_byte(scale_vals[(is + 4u) / 4u], (is + 4u) % 4u); - let sc_hi = get_byte(scale_vals[(is - 4u) / 4u], (is - 4u) % 4u); - let min_hi = get_byte(scale_vals[is / 4u], is % 4u); + let sc_min_lo = get_byte(load_src0_u32_at(scale_base + 8), (is + 4u) % 4u); + let sc_hi = get_byte(load_src0_u32_at(scale_base), (is - 4u) % 4u); + let min_hi = get_byte(load_src0_u32_at(scale_base + 4), is % 4u); sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u); mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u); From 4edf91b8eb23dd1ea9594d7b353d9d0776e5aa6a Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Fri, 10 Apr 2026 19:21:06 -0700 Subject: [PATCH 04/10] Update batch tuning value for iOS --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index fff91f3e1c3..b077296267f 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -79,7 +79,7 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim /* Constants */ -#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 32u +#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 64u #define WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN 10u #define WEBGPU_RUNTIME_WAIT_TIMEOUT_MS 30000u #define WEBGPU_RUNTIME_WAIT_TIMEOUT_NS (WEBGPU_RUNTIME_WAIT_TIMEOUT_MS * 1e6) @@ -437,34 +437,25 @@ static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status, } #ifdef __EMSCRIPTEN__ -// iOS browsers seem to have very strict limits on the number of in-flight GPU commands, so we need to throttle to avoid failures. EM_JS(int, ggml_webgpu_is_ios_browser, (), { const ua = navigator.userAgent; return (ua.includes('iPhone') || ua.includes('iPad')) ? 1 : 0; }); #endif -static uint32_t ggml_backend_webgpu_get_max_inflight_batches(const wgpu::AdapterInfo & info) { +// TODO: these next two functions may want tuning across different platforms and workloads, +static uint32_t ggml_backend_webgpu_get_max_inflight_batches() { #ifdef __EMSCRIPTEN__ + // iOS has very strict limits on the number of in-flight GPU commands, + // so we need to throttle to avoid failures. if (ggml_webgpu_is_ios_browser()) { return 1; } -#else - GGML_UNUSED(info); #endif - return UINT32_MAX; } -static uint32_t ggml_backend_webgpu_get_command_submit_batch_size(const wgpu::AdapterInfo & info) { -#ifdef __EMSCRIPTEN__ - if (ggml_webgpu_is_ios_browser()) { - return 16; - } -#else - GGML_UNUSED(info); -#endif - +static uint32_t ggml_backend_webgpu_get_command_submit_batch_size() { return WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE; } From 354cb5c021048676691f39e4211e344cbcfdfd5f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Fri, 10 Apr 2026 19:29:49 -0700 Subject: [PATCH 05/10] compile fix --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b077296267f..79ffbea63d8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -3428,8 +3428,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { } #endif ctx->webgpu_global_ctx->adapter.GetInfo(&info); - ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size(info); - ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches(info); + ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size(); + ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches(); wgpu::SupportedFeatures features; ctx->webgpu_global_ctx->adapter.GetFeatures(&features); // we require f16 support From 0928d310729900635b6295e691ff3e2e862a7055 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 12 Apr 2026 20:48:32 -0700 Subject: [PATCH 06/10] Fix use of new load function --- .../wgsl-shaders/mul_mat_decls.tmpl | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl index a3b17b69878..56a76a6e6c4 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl @@ -520,14 +520,14 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let scale_base = block_byte_base + 4u; if (is < 4u) { - let sc_byte = get_byte(load_src0_u32_at(scale_base), is % 4u); - let min_byte = get_byte(load_src0_u32_at(scale_base + 4), is % 4u); + let sc_byte = get_byte(load_u32_at(&src0, scale_base), is % 4u); + let min_byte = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u); sc = sc_byte & 63u; mn = min_byte & 63u; } else { - let sc_min_lo = get_byte(load_src0_u32_at(scale_base + 8), (is + 4u) % 4u); - let sc_hi = get_byte(load_src0_u32_at(scale_base), (is - 4u) % 4u); - let min_hi = get_byte(load_src0_u32_at(scale_base + 4), is % 4u); + let sc_min_lo = get_byte(load_u32_at(&src0, scale_base + 8), (is + 4u) % 4u); + let sc_hi = get_byte(load_u32_at(&src0, scale_base), (is - 4u) % 4u); + let min_hi = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u); sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u); mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u); @@ -597,14 +597,14 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let scale_base = block_byte_base + 4u; if (is < 4u) { - let sc_byte = get_byte(load_src0_u32_at(scale_base), is % 4u); - let min_byte = get_byte(load_src0_u32_at(scale_base + 4), is % 4u); + let sc_byte = get_byte(load_u32_at(&src0, scale_base), is % 4u); + let min_byte = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u); sc = sc_byte & 63u; mn = min_byte & 63u; } else { - let sc_min_lo = get_byte(load_src0_u32_at(scale_base + 8), (is + 4u) % 4u); - let sc_hi = get_byte(load_src0_u32_at(scale_base), (is - 4u) % 4u); - let min_hi = get_byte(load_src0_u32_at(scale_base + 4), is % 4u); + let sc_min_lo = get_byte(load_u32_at(&src0, scale_base + 8), (is + 4u) % 4u); + let sc_hi = get_byte(load_u32_at(&src0, scale_base), (is - 4u) % 4u); + let min_hi = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u); sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u); mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u); From 0dfdf15717439626205da8473d7da79a141ef431 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 13 Apr 2026 11:17:28 -0700 Subject: [PATCH 07/10] Move to a single query set for GPU profiling --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 341 ++++++++++++--------------- 1 file changed, 154 insertions(+), 187 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index a64f0774995..b9bac216b87 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -73,8 +73,8 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim #endif // GGML_WEBGPU_CPU_PROFILE #ifdef GGML_WEBGPU_GPU_PROFILE -# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 32 -# define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps +# define WEBGPU_MAX_PROFILE_QUERY_COUNT 4096u +# define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES (WEBGPU_MAX_PROFILE_QUERY_COUNT * sizeof(uint64_t)) #endif /* Constants */ @@ -159,75 +159,10 @@ struct webgpu_param_arena { ~webgpu_param_arena() { this->cleanup(); } }; -#ifdef GGML_WEBGPU_GPU_PROFILE -struct webgpu_gpu_profile_bufs { - wgpu::Buffer host_buf; - wgpu::Buffer dev_buf; - wgpu::QuerySet query_set; -}; - -// Holds a pool of parameter buffers for WebGPU operations -struct webgpu_gpu_profile_buf_pool { - std::vector free; - - std::mutex mutex; - - std::condition_variable cv; - - void init(wgpu::Device device, - int num_bufs, - size_t buf_size, - wgpu::BufferUsage dev_buf_usage, - wgpu::BufferUsage host_buf_usage) { - for (int i = 0; i < num_bufs; i++) { - wgpu::Buffer host_buf; - wgpu::Buffer dev_buf; - ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_profile_buf"); - ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_profile_buf"); - // Create a query set for 2 timestamps - wgpu::QuerySetDescriptor ts_query_set_desc = {}; - - ts_query_set_desc.type = wgpu::QueryType::Timestamp; - ts_query_set_desc.count = 2; - wgpu::QuerySet ts_query_set = device.CreateQuerySet(&ts_query_set_desc); - - free.push_back({ host_buf, dev_buf, ts_query_set }); - } - } - - webgpu_gpu_profile_bufs alloc_bufs() { - std::unique_lock lock(mutex); - cv.wait(lock, [this] { return !free.empty(); }); - webgpu_gpu_profile_bufs bufs = free.back(); - free.pop_back(); - return bufs; - } - - void free_bufs(std::vector bufs) { - std::lock_guard lock(mutex); - free.insert(free.end(), bufs.begin(), bufs.end()); - cv.notify_all(); - } - - void cleanup() { - std::lock_guard lock(mutex); - for (auto & bufs : free) { - bufs.host_buf.Destroy(); - bufs.dev_buf.Destroy(); - bufs.query_set.Destroy(); - } - free.clear(); - } - - ~webgpu_gpu_profile_buf_pool() { this->cleanup(); } -}; -#endif - struct webgpu_encoded_op { uint32_t num_kernels = 0; #ifdef GGML_WEBGPU_GPU_PROFILE - webgpu_gpu_profile_bufs timestamp_query_bufs; - std::string pipeline_name; + std::string pipeline_name; #endif }; @@ -256,7 +191,7 @@ struct webgpu_global_context_struct { webgpu_capabilities capabilities; // Shared buffer to move data from device to host wgpu::Buffer get_tensor_staging_buf; - // Global mutex for pipeline and staging buffer, will be refactored to exclude pipeline caches. + // Global mutex for get_tensor std::recursive_mutex mutex; wgpu::Buffer memset_params_buf; @@ -272,8 +207,6 @@ struct webgpu_global_context_struct { #ifdef GGML_WEBGPU_GPU_PROFILE // Profiling: per-shader GPU time in ms std::unordered_map shader_gpu_time_ms; - // Profiling: pool of timestamp query buffers (one per operation) - webgpu_gpu_profile_buf_pool timestamp_query_buf_pool; #endif #ifdef GGML_WEBGPU_DEBUG @@ -317,6 +250,38 @@ struct webgpu_context_struct { wgpu::Buffer set_rows_host_error_buf; size_t memset_bytes_per_thread; + +#ifdef GGML_WEBGPU_GPU_PROFILE + wgpu::Buffer profile_timestamp_dev_buf; + wgpu::Buffer profile_timestamp_host_buf; + wgpu::QuerySet profile_timestamp_query_set; + uint32_t profile_timestamp_query_count = 0; +#endif + + ~webgpu_context_struct() { +#ifdef GGML_WEBGPU_GPU_PROFILE + if (this->profile_timestamp_host_buf) { + this->profile_timestamp_host_buf.Destroy(); + this->profile_timestamp_host_buf = nullptr; + } + if (this->profile_timestamp_dev_buf) { + this->profile_timestamp_dev_buf.Destroy(); + this->profile_timestamp_dev_buf = nullptr; + } + if (this->profile_timestamp_query_set) { + this->profile_timestamp_query_set.Destroy(); + this->profile_timestamp_query_set = nullptr; + } +#endif + if (this->set_rows_host_error_buf) { + this->set_rows_host_error_buf.Destroy(); + this->set_rows_host_error_buf = nullptr; + } + if (this->set_rows_dev_error_buf) { + this->set_rows_dev_error_buf.Destroy(); + this->set_rows_dev_error_buf = nullptr; + } + } }; typedef std::shared_ptr webgpu_context; @@ -399,24 +364,6 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** WebGPU Actions */ -#ifdef GGML_WEBGPU_GPU_PROFILE -static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context & ctx, - std::vector & futures) { - if (futures.empty()) { - return; - } - - constexpr size_t max_futures_per_wait = 64; - - while (!futures.empty()) { - ctx->instance.WaitAny(std::min(max_futures_per_wait, futures.size()), futures.data(), UINT64_MAX); - futures.erase(std::remove_if(futures.begin(), futures.end(), - [](const wgpu::FutureWaitInfo & info) { return info.completed; }), - futures.end()); - } -} -#endif - template static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status, T callback_status, @@ -528,36 +475,8 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) { } #endif -#ifdef GGML_WEBGPU_GPU_PROFILE -static void ggml_backend_webgpu_collect_profile_futures(webgpu_global_context & ctx, - const std::vector & commands, - std::vector & futures) { - for (const auto & command : commands) { - auto label = command.pipeline_name; - auto ts_bufs = command.timestamp_query_bufs; - - wgpu::Future f = ts_bufs.host_buf.MapAsync( - wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, - [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) { - if (status != wgpu::MapAsyncStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str()); - } else { - const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); - // WebGPU timestamps are in ns; convert to ms - double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; - ctx->shader_gpu_time_ms[label] += elapsed_ms; - } - // We can't unmap in here due to WebGPU reentrancy limitations. - ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs }); - }); - futures.push_back({ f }); - } -} -#endif - static webgpu_encoded_op ggml_backend_webgpu_build_multi( - webgpu_global_context & ctx, - webgpu_param_arena & param_arena, + webgpu_context & ctx, wgpu::CommandEncoder & encoder, const std::vector & pipelines, const std::vector> & params_list, @@ -574,37 +493,36 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi( for (size_t i = 0; i < pipelines.size(); i++) { const size_t param_size = params_list[i].size() * sizeof(uint32_t); - const size_t param_offset = param_arena.alloc_slot(param_size); + const size_t param_offset = ctx->param_arena.alloc_slot(param_size); std::vector entries = bind_group_entries_list[i]; uint32_t params_binding_num = entries.size(); entries.push_back({ .binding = params_binding_num, - .buffer = param_arena.buffer, + .buffer = ctx->param_arena.buffer, .offset = param_offset, - .size = param_arena.slot_size }); + .size = ctx->param_arena.slot_size }); wgpu::BindGroupDescriptor bind_group_desc; bind_group_desc.layout = pipelines[i].pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = entries.size(); bind_group_desc.entries = entries.data(); bind_group_desc.label = pipelines[i].name.c_str(); - bind_groups.push_back(ctx->device.CreateBindGroup(&bind_group_desc)); + bind_groups.push_back(ctx->global_ctx->device.CreateBindGroup(&bind_group_desc)); param_offsets.push_back(param_offset); } for (size_t i = 0; i < param_offsets.size(); i++) { - ctx->queue.WriteBuffer(param_arena.buffer, param_offsets[i], params_list[i].data(), - params_list[i].size() * sizeof(uint32_t)); + ctx->global_ctx->queue.WriteBuffer(ctx->param_arena.buffer, param_offsets[i], params_list[i].data(), + params_list[i].size() * sizeof(uint32_t)); } #ifdef GGML_WEBGPU_GPU_PROFILE - webgpu_gpu_profile_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); - if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { - ts_bufs.host_buf.Unmap(); - } + GGML_ASSERT(ctx->profile_timestamp_query_count + 2 <= WEBGPU_MAX_PROFILE_QUERY_COUNT); + const uint32_t query_begin = ctx->profile_timestamp_query_count++; + const uint32_t query_end = ctx->profile_timestamp_query_count++; - wgpu::PassTimestampWrites ts_writes = { .querySet = ts_bufs.query_set, - .beginningOfPassWriteIndex = 0, - .endOfPassWriteIndex = 1 }; + wgpu::PassTimestampWrites ts_writes = { .querySet = ctx->profile_timestamp_query_set, + .beginningOfPassWriteIndex = query_begin, + .endOfPassWriteIndex = query_end }; wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes }; wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&pass_desc); #else @@ -618,23 +536,19 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi( pass.End(); #ifdef GGML_WEBGPU_GPU_PROFILE - encoder.ResolveQuerySet(ts_bufs.query_set, 0, 2, ts_bufs.dev_buf, 0); - encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize()); - result.timestamp_query_bufs = ts_bufs; - result.pipeline_name = pipelines.front().name; + result.pipeline_name = pipelines.front().name; #endif return result; } -static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_global_context & ctx, - webgpu_param_arena & param_arena, +static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_context & ctx, wgpu::CommandEncoder & encoder, webgpu_pipeline & pipeline, std::vector params, std::vector bind_group_entries, uint32_t wg_x, uint32_t wg_y = 1) { - return ggml_backend_webgpu_build_multi(ctx, param_arena, encoder, + return ggml_backend_webgpu_build_multi(ctx, encoder, { pipeline }, @@ -642,6 +556,30 @@ static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_global_context & { { wg_x, wg_y } }); } +#ifdef GGML_WEBGPU_GPU_PROFILE +static void ggml_backend_webgpu_collect_profile_results(webgpu_context & ctx, + const std::vector & pipeline_names) { + if (pipeline_names.empty()) { + return; + } + + const size_t mapped_size = ctx->profile_timestamp_query_count * sizeof(uint64_t); + GGML_ASSERT(ctx->profile_timestamp_query_count == 2 * pipeline_names.size()); + + ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->profile_timestamp_host_buf, wgpu::MapMode::Read, 0, + mapped_size); + const uint64_t * ts_data = (const uint64_t *) ctx->profile_timestamp_host_buf.GetConstMappedRange(0, mapped_size); + + for (size_t i = 0; i < pipeline_names.size(); ++i) { + // WebGPU timestamps are in ns; convert to ms. + const double elapsed_ms = double(ts_data[2 * i + 1] - ts_data[2 * i]) * 1e-6; + ctx->global_ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms; + } + + ctx->profile_timestamp_host_buf.Unmap(); +} +#endif + static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx, wgpu::Buffer & buf, uint32_t value, @@ -829,7 +767,7 @@ static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context & ctx, }; uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx, @@ -895,7 +833,7 @@ static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_pad(webgpu_context & ctx, @@ -953,7 +891,7 @@ static webgpu_encoded_op ggml_webgpu_pad(webgpu_context & ctx, }; uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx, @@ -1015,7 +953,7 @@ static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx, const uint32_t wg_x = CEIL_DIV((uint32_t) src1->ne[0], decisions->wg_size); const uint32_t wg_y = (uint32_t) (dst->ne[2] * dst->ne[3]); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x, wg_y); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y); } static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx, @@ -1072,7 +1010,7 @@ static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx, const uint32_t wg_x = CEIL_DIV((uint32_t) src0->ne[1], decisions->block_size); const uint32_t wg_y = token_tiles * (uint32_t) dst->ne[2]; - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x, wg_y); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y); } static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, @@ -1158,7 +1096,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) } }; - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, h, n_seqs); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, h, n_seqs); } static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, @@ -1228,7 +1166,7 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3]; } uint32_t wg_x = CEIL_DIV(threads, decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x, 1); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, 1); } // Workgroup size is a common constant @@ -1295,7 +1233,7 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx, uint32_t total_threads = float_parallel ? blocks_per_row * total_rows : total_rows; uint32_t wg_x = CEIL_DIV(total_threads, decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, @@ -1441,7 +1379,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y); } - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x, wg_y); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y); } static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, @@ -1597,7 +1535,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, entries_list.push_back(std::move(main_entries)); workgroups_list.push_back({ wg_x, wg_y }); - return ggml_backend_webgpu_build_multi(ctx->global_ctx, ctx->param_arena, encoder, pipelines, params_list, + return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list, entries_list, workgroups_list); } @@ -1923,11 +1861,11 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, workgroups_list.push_back({ (uint32_t) nrows, 1u }); } - return ggml_backend_webgpu_build_multi(ctx->global_ctx, ctx->param_arena, encoder, pipelines, params_list, + return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list, entries_list, workgroups_list); } - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } #endif // __EMSCRIPTEN__ @@ -2009,7 +1947,7 @@ static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, } uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context & ctx, @@ -2112,7 +2050,7 @@ static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context & ctx, } uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx, @@ -2169,7 +2107,7 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_concat_pipeline(shader_lib_ctx); auto * decisions = static_cast(pipeline.context.get()); uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, @@ -2214,7 +2152,7 @@ static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_repeat_pipeline(shader_lib_ctx); auto * decisions = static_cast(pipeline.context.get()); uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, @@ -2260,7 +2198,7 @@ static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, }; webgpu_pipeline pipeline = ctx->shader_lib->get_row_norm_pipeline(shader_lib_ctx); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, ggml_nrows(src)); } @@ -2366,7 +2304,7 @@ static webgpu_encoded_op ggml_webgpu_rope(webgpu_context & ctx, } uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_glu(webgpu_context & ctx, @@ -2432,7 +2370,7 @@ static webgpu_encoded_op ggml_webgpu_glu(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_scale(webgpu_context & ctx, @@ -2486,7 +2424,7 @@ static webgpu_encoded_op ggml_webgpu_scale(webgpu_context & ctx, } uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context & ctx, @@ -2570,7 +2508,7 @@ static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, ggml_nrows(dst)); } @@ -2599,7 +2537,7 @@ static webgpu_encoded_op ggml_webgpu_argmax(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_argmax_pipeline(shader_lib_ctx); uint32_t wg_x = ggml_nelements(dst); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, @@ -2696,7 +2634,7 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, workgroups_list.push_back({ wg_x_init, wg_y_init }); if (merge_passes == 0) { - return ggml_backend_webgpu_build_multi(ctx->global_ctx, ctx->param_arena, encoder, pipelines, params_list, + return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list, entries_list, workgroups_list); } @@ -2758,7 +2696,7 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, in_is_tmp = !in_is_tmp; } - return ggml_backend_webgpu_build_multi(ctx->global_ctx, ctx->param_arena, encoder, pipelines, params_list, + return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list, entries_list, workgroups_list); } @@ -2790,7 +2728,7 @@ static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_cumsum_pipeline(shader_lib_ctx); uint32_t wg_x = ggml_nrows(dst); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, @@ -2825,7 +2763,7 @@ static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_sum_rows_pipeline(shader_lib_ctx); uint32_t wg_x = total_sum ? 1 : ggml_nrows(dst); - return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_arena, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); } // Returns the encoded command, or std::nullopt if the operation is a no-op @@ -2937,13 +2875,20 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str std::vector commands; #ifdef GGML_WEBGPU_GPU_PROFILE - std::vector profile_futures; + std::vector profile_pipeline_names; #endif uint32_t num_batched_kernels = 0; uint32_t num_inflight_batches = 0; bool contains_set_rows = false; wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder(); +#ifdef GGML_WEBGPU_GPU_PROFILE + ctx->profile_timestamp_query_count = 0; + if (ctx->profile_timestamp_host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { + ctx->profile_timestamp_host_buf.Unmap(); + } +#endif + for (int i = 0; i < cgraph->n_nodes; i++) { if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) { contains_set_rows = true; @@ -2951,37 +2896,52 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str if (auto cmd = ggml_webgpu_encode_node(ctx, batch_encoder, cgraph->nodes[i])) { commands.push_back(*cmd); num_batched_kernels += cmd.value().num_kernels; +#ifdef GGML_WEBGPU_GPU_PROFILE + profile_pipeline_names.push_back(cmd->pipeline_name); +#endif } if (num_batched_kernels >= ctx->global_ctx->command_submit_batch_size) { num_batched_kernels = 0; wgpu::CommandBuffer batch_commands = batch_encoder.Finish(); ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches); -#ifdef GGML_WEBGPU_GPU_PROFILE - ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures); -#endif ctx->param_arena.reset(); commands.clear(); batch_encoder = ctx->global_ctx->device.CreateCommandEncoder(); } } - if (!commands.empty()) { + if (num_batched_kernels > 0) { wgpu::CommandBuffer batch_commands = batch_encoder.Finish(); ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches); -#ifdef GGML_WEBGPU_GPU_PROFILE - ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures); -#endif ctx->param_arena.reset(); commands.clear(); } - // If there are SET_ROWS operations in this graph, copy the error buffers to the host for checking. - if (contains_set_rows) { +#ifdef GGML_WEBGPU_GPU_PROFILE + const size_t profile_buf_size = ctx->profile_timestamp_query_count * sizeof(uint64_t); +#endif + + // Copy any post-graph bookkeeping buffers to the host for checking. + if (contains_set_rows +#ifdef GGML_WEBGPU_GPU_PROFILE + || ctx->profile_timestamp_query_count > 0 +#endif + ) { wgpu::CommandEncoder encoder = ctx->global_ctx->device.CreateCommandEncoder(); - encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0, - ctx->set_rows_host_error_buf.GetSize()); - wgpu::CommandBuffer set_rows_commands = encoder.Finish(); - ggml_backend_webgpu_submit_commands(ctx, set_rows_commands, num_inflight_batches); + if (contains_set_rows) { + encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0, + ctx->set_rows_host_error_buf.GetSize()); + } +#ifdef GGML_WEBGPU_GPU_PROFILE + if (ctx->profile_timestamp_query_count > 0) { + encoder.ResolveQuerySet(ctx->profile_timestamp_query_set, 0, ctx->profile_timestamp_query_count, + ctx->profile_timestamp_dev_buf, 0); + encoder.CopyBufferToBuffer(ctx->profile_timestamp_dev_buf, 0, ctx->profile_timestamp_host_buf, 0, + profile_buf_size); + } +#endif + wgpu::CommandBuffer post_graph_commands = encoder.Finish(); + ggml_backend_webgpu_submit_commands(ctx, post_graph_commands, num_inflight_batches); } ggml_backend_webgpu_wait_queue(ctx->global_ctx); @@ -2997,7 +2957,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str } #ifdef GGML_WEBGPU_GPU_PROFILE - ggml_backend_webgpu_wait_profile_futures(ctx->global_ctx, profile_futures); + ggml_backend_webgpu_collect_profile_results(ctx, profile_pipeline_names); #endif WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx->global_ctx); return GGML_STATUS_SUCCESS; @@ -3539,14 +3499,6 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { "memset_params_buf"); ctx->webgpu_global_ctx->queue = ctx->webgpu_global_ctx->device.GetQueue(); -#ifdef GGML_WEBGPU_GPU_PROFILE - // Initialize buffer pool for timestamp queries, used for profiling - ctx->webgpu_global_ctx->timestamp_query_buf_pool.init( - ctx->webgpu_global_ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, - wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, - wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst); -#endif - GGML_LOG_INFO( "ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | " "device_desc: %s\n", @@ -3571,6 +3523,21 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) { WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf"); +#ifdef GGML_WEBGPU_GPU_PROFILE + ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, + WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, + wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, + "profile_timestamp_dev_buf"); + ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_host_buf, + WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, + "profile_timestamp_host_buf"); + wgpu::QuerySetDescriptor query_set_desc = {}; + query_set_desc.type = wgpu::QueryType::Timestamp; + query_set_desc.count = WEBGPU_MAX_PROFILE_QUERY_COUNT; + webgpu_ctx->profile_timestamp_query_set = webgpu_ctx->global_ctx->device.CreateQuerySet(&query_set_desc); +#endif + #ifdef GGML_WEBGPU_DEBUG // Initialize debug buffers ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->global_ctx->debug_host_buf, From 55c05a9af5dc4ac273e1699def185b50b73fef0b Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 13 Apr 2026 13:37:29 -0700 Subject: [PATCH 08/10] Move to batching compute passes when not profiling --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 513 +++++++++++++-------------- 1 file changed, 239 insertions(+), 274 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b9bac216b87..f062e3e298f 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -245,9 +245,11 @@ struct webgpu_context_struct { std::unique_ptr shader_lib; - webgpu_param_arena param_arena; - wgpu::Buffer set_rows_dev_error_buf; - wgpu::Buffer set_rows_host_error_buf; + webgpu_param_arena param_arena; + wgpu::Buffer set_rows_dev_error_buf; + wgpu::Buffer set_rows_host_error_buf; + wgpu::CommandEncoder active_command_encoder; + wgpu::ComputePassEncoder active_compute_pass; size_t memset_bytes_per_thread; @@ -477,7 +479,6 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) { static webgpu_encoded_op ggml_backend_webgpu_build_multi( webgpu_context & ctx, - wgpu::CommandEncoder & encoder, const std::vector & pipelines, const std::vector> & params_list, const std::vector> & bind_group_entries_list, @@ -515,25 +516,35 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi( ctx->global_ctx->queue.WriteBuffer(ctx->param_arena.buffer, param_offsets[i], params_list[i].data(), params_list[i].size() * sizeof(uint32_t)); } + + bool own_pass = false; + wgpu::ComputePassEncoder pass = ctx->active_compute_pass; #ifdef GGML_WEBGPU_GPU_PROFILE GGML_ASSERT(ctx->profile_timestamp_query_count + 2 <= WEBGPU_MAX_PROFILE_QUERY_COUNT); const uint32_t query_begin = ctx->profile_timestamp_query_count++; const uint32_t query_end = ctx->profile_timestamp_query_count++; - - wgpu::PassTimestampWrites ts_writes = { .querySet = ctx->profile_timestamp_query_set, - .beginningOfPassWriteIndex = query_begin, - .endOfPassWriteIndex = query_end }; - wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes }; - wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&pass_desc); +#endif + if (!pass) { + own_pass = true; +#ifdef GGML_WEBGPU_GPU_PROFILE + wgpu::PassTimestampWrites ts_writes = { .querySet = ctx->profile_timestamp_query_set, + .beginningOfPassWriteIndex = query_begin, + .endOfPassWriteIndex = query_end }; + wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes }; + pass = ctx->active_command_encoder.BeginComputePass(&pass_desc); #else - wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); + pass = ctx->active_command_encoder.BeginComputePass(); #endif + } + for (size_t i = 0; i < pipelines.size(); i++) { pass.SetPipeline(pipelines[i].pipeline); pass.SetBindGroup(0, bind_groups[i]); pass.DispatchWorkgroups(workgroups_list[i].first, workgroups_list[i].second, 1); } - pass.End(); + if (own_pass) { + pass.End(); + } #ifdef GGML_WEBGPU_GPU_PROFILE result.pipeline_name = pipelines.front().name; @@ -542,13 +553,12 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi( } static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, webgpu_pipeline & pipeline, std::vector params, std::vector bind_group_entries, uint32_t wg_x, uint32_t wg_y = 1) { - return ggml_backend_webgpu_build_multi(ctx, encoder, + return ggml_backend_webgpu_build_multi(ctx, { pipeline }, @@ -556,30 +566,6 @@ static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_context & { { wg_x, wg_y } }); } -#ifdef GGML_WEBGPU_GPU_PROFILE -static void ggml_backend_webgpu_collect_profile_results(webgpu_context & ctx, - const std::vector & pipeline_names) { - if (pipeline_names.empty()) { - return; - } - - const size_t mapped_size = ctx->profile_timestamp_query_count * sizeof(uint64_t); - GGML_ASSERT(ctx->profile_timestamp_query_count == 2 * pipeline_names.size()); - - ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->profile_timestamp_host_buf, wgpu::MapMode::Read, 0, - mapped_size); - const uint64_t * ts_data = (const uint64_t *) ctx->profile_timestamp_host_buf.GetConstMappedRange(0, mapped_size); - - for (size_t i = 0; i < pipeline_names.size(); ++i) { - // WebGPU timestamps are in ns; convert to ms. - const double elapsed_ms = double(ts_data[2 * i + 1] - ts_data[2 * i]) * 1e-6; - ctx->global_ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms; - } - - ctx->profile_timestamp_host_buf.Unmap(); -} -#endif - static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx, wgpu::Buffer & buf, uint32_t value, @@ -726,10 +712,7 @@ static binary_overlap_flags ggml_webgpu_detect_binary_overlap(ggml_tensor * src0 return flags; } -static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { ggml_webgpu_shader_lib_context shader_lib_ctx = { .src0 = src, .dst = dst, @@ -767,14 +750,13 @@ static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context & ctx, }; uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { const bool inplace = ggml_webgpu_tensor_equal(src0, dst); ggml_webgpu_shader_lib_context shader_lib_ctx = { @@ -833,13 +815,10 @@ static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_pad(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_pad(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { ggml_webgpu_shader_lib_context shader_lib_ctx = { .src0 = src, .dst = dst, .max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup }; @@ -891,14 +870,13 @@ static webgpu_encoded_op ggml_webgpu_pad(webgpu_context & ctx, }; uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { ggml_webgpu_shader_lib_context shader_lib_ctx = { .src0 = src0, .src1 = src1, @@ -953,14 +931,13 @@ static webgpu_encoded_op ggml_webgpu_solve_tri(webgpu_context & ctx, const uint32_t wg_x = CEIL_DIV((uint32_t) src1->ne[0], decisions->wg_size); const uint32_t wg_y = (uint32_t) (dst->ne[2] * dst->ne[3]); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); } -static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { ggml_webgpu_shader_lib_context shader_lib_ctx = { .src0 = src0, .src1 = src1, @@ -1010,18 +987,17 @@ static webgpu_encoded_op ggml_webgpu_ssm_conv(webgpu_context & ctx, const uint32_t wg_x = CEIL_DIV((uint32_t) src0->ne[1], decisions->block_size); const uint32_t wg_y = token_tiles * (uint32_t) dst->ne[2]; - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y); -} - -static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * src2, - ggml_tensor * src3, - ggml_tensor * src4, - ggml_tensor * src5, - ggml_tensor * dst) { + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); +} + +static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * src2, + ggml_tensor * src3, + ggml_tensor * src4, + ggml_tensor * src5, + ggml_tensor * dst) { ggml_webgpu_shader_lib_context shader_lib_ctx = { .src0 = src0, .src1 = src1, @@ -1096,14 +1072,13 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) } }; - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, h, n_seqs); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, h, n_seqs); } -static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * idx, - ggml_tensor * dst) { +static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, + ggml_tensor * src, + ggml_tensor * idx, + ggml_tensor * dst) { // For set rows specifically, we need to check if src and idx are empty // tensors. if (ggml_is_empty(src) || ggml_is_empty(idx)) { @@ -1166,7 +1141,7 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3]; } uint32_t wg_x = CEIL_DIV(threads, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, 1); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, 1); } // Workgroup size is a common constant @@ -1177,11 +1152,10 @@ static std::vector ggml_webgpu_wg_size_entry(uint32_t wg_si return constants; } -static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * idx, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx, + ggml_tensor * src, + ggml_tensor * idx, + ggml_tensor * dst) { const bool float_parallel = src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16 || src->type == GGML_TYPE_I32; ggml_webgpu_shader_lib_context shader_lib_ctx = { @@ -1233,14 +1207,13 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx, uint32_t total_threads = float_parallel ? blocks_per_row * total_rows : total_rows; uint32_t wg_x = CEIL_DIV(total_threads, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { // Determine if this is a mat-vec operation bool is_vec = (dst->ne[1] == 1); @@ -1379,15 +1352,14 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y); } - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x, wg_y); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); } -static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * src2, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * src2, + ggml_tensor * dst) { ggml_webgpu_shader_lib_context shader_lib_ctx = { .src0 = src0, .src1 = src1, @@ -1535,19 +1507,17 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, entries_list.push_back(std::move(main_entries)); workgroups_list.push_back({ wg_x, wg_y }); - return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list, - entries_list, workgroups_list); + return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list); } #ifndef __EMSCRIPTEN__ -static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * Q, - ggml_tensor * K, - ggml_tensor * V, - ggml_tensor * mask, - ggml_tensor * sinks, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, + ggml_tensor * Q, + ggml_tensor * K, + ggml_tensor * V, + ggml_tensor * mask, + ggml_tensor * sinks, + ggml_tensor * dst) { float scale = *(float *) dst->op_params; float max_bias; memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); @@ -1861,18 +1831,14 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, workgroups_list.push_back({ (uint32_t) nrows, 1u }); } - return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list, - entries_list, workgroups_list); + return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list); } - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } #endif // __EMSCRIPTEN__ -static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { bool is_unary = dst->op == GGML_OP_UNARY; bool inplace = ggml_webgpu_tensor_equal(src, dst) || (dst->op == GGML_OP_FILL); @@ -1947,14 +1913,13 @@ static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, } uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { binary_overlap_flags flags = ggml_webgpu_detect_binary_overlap(src0, src1, dst); ggml_webgpu_shader_lib_context shader_lib_ctx = { @@ -2050,14 +2015,13 @@ static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context & ctx, } uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { uint32_t ne = (uint32_t) ggml_nelements(dst); uint32_t dim = (uint32_t) dst->op_params[0]; @@ -2107,13 +2071,10 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_concat_pipeline(shader_lib_ctx); auto * decisions = static_cast(pipeline.context.get()); uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * dst) { uint32_t ne = (uint32_t) ggml_nelements(dst); std::vector params = { ne, @@ -2152,13 +2113,10 @@ static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_repeat_pipeline(shader_lib_ctx); auto * decisions = static_cast(pipeline.context.get()); uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { bool inplace = ggml_webgpu_tensor_equal(src, dst); std::vector params = { @@ -2198,16 +2156,14 @@ static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, }; webgpu_pipeline pipeline = ctx->shader_lib->get_row_norm_pipeline(shader_lib_ctx); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, - ggml_nrows(src)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, ggml_nrows(src)); } -static webgpu_encoded_op ggml_webgpu_rope(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * src2, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_rope(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * src2, + ggml_tensor * dst) { ggml_webgpu_shader_lib_context shader_lib_ctx = { .src0 = src0, .src1 = src1, @@ -2304,14 +2260,13 @@ static webgpu_encoded_op ggml_webgpu_rope(webgpu_context & ctx, } uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_glu(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_glu(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { ggml_webgpu_shader_lib_context shader_lib_ctx = { .src0 = src0, .src1 = src1, @@ -2370,13 +2325,10 @@ static webgpu_encoded_op ggml_webgpu_glu(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_scale(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { bool inplace = ggml_webgpu_tensor_equal(src, dst); ggml_webgpu_shader_lib_context shader_lib_ctx = { @@ -2424,15 +2376,14 @@ static webgpu_encoded_op ggml_webgpu_scale(webgpu_context & ctx, } uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * src2, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * src2, + ggml_tensor * dst) { ggml_webgpu_shader_lib_context shader_lib_ctx = { .src0 = src0, .src1 = src1, @@ -2508,14 +2459,10 @@ static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, - ggml_nrows(dst)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, ggml_nrows(dst)); } -static webgpu_encoded_op ggml_webgpu_argmax(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_argmax(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), (uint32_t) src->ne[0] }; @@ -2537,13 +2484,10 @@ static webgpu_encoded_op ggml_webgpu_argmax(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_argmax_pipeline(shader_lib_ctx); uint32_t wg_x = ggml_nelements(dst); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { bool is_top_k = dst->op == GGML_OP_TOP_K; ggml_webgpu_shader_lib_context shader_lib_ctx = { @@ -2634,8 +2578,7 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, workgroups_list.push_back({ wg_x_init, wg_y_init }); if (merge_passes == 0) { - return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list, - entries_list, workgroups_list); + return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list); } bool in_is_tmp = start_in_tmp; @@ -2696,14 +2639,10 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, in_is_tmp = !in_is_tmp; } - return ggml_backend_webgpu_build_multi(ctx, encoder, pipelines, params_list, - entries_list, workgroups_list); + return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list); } -static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), (uint32_t) src->ne[0] }; @@ -2728,13 +2667,10 @@ static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_cumsum_pipeline(shader_lib_ctx); uint32_t wg_x = ggml_nrows(dst); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * src, - ggml_tensor * dst) { +static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { bool total_sum = dst->op == GGML_OP_SUM; std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), @@ -2763,13 +2699,11 @@ static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->shader_lib->get_sum_rows_pipeline(shader_lib_ctx); uint32_t wg_x = total_sum ? 1 : ggml_nrows(dst); - return ggml_backend_webgpu_build(ctx, encoder, pipeline, params, entries, wg_x); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } // Returns the encoded command, or std::nullopt if the operation is a no-op -static std::optional ggml_webgpu_encode_node(webgpu_context ctx, - wgpu::CommandEncoder & encoder, - ggml_tensor * node) { +static std::optional ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { if (ggml_is_empty(node)) { return std::nullopt; } @@ -2792,20 +2726,20 @@ static std::optional ggml_webgpu_encode_node(webgpu_context return std::nullopt; case GGML_OP_CPY: case GGML_OP_CONT: - return ggml_webgpu_cpy(ctx, encoder, src0, node); + return ggml_webgpu_cpy(ctx, src0, node); case GGML_OP_SET: - return ggml_webgpu_set(ctx, encoder, src0, src1, node); + return ggml_webgpu_set(ctx, src0, src1, node); case GGML_OP_SET_ROWS: - return ggml_webgpu_set_rows(ctx, encoder, src0, src1, node); + return ggml_webgpu_set_rows(ctx, src0, src1, node); case GGML_OP_GET_ROWS: - return ggml_webgpu_get_rows(ctx, encoder, src0, src1, node); + return ggml_webgpu_get_rows(ctx, src0, src1, node); case GGML_OP_MUL_MAT: - return ggml_webgpu_mul_mat(ctx, encoder, src0, src1, node); + return ggml_webgpu_mul_mat(ctx, src0, src1, node); case GGML_OP_MUL_MAT_ID: - return ggml_webgpu_mul_mat_id(ctx, encoder, src0, src1, src2, node); + return ggml_webgpu_mul_mat_id(ctx, src0, src1, src2, node); case GGML_OP_FLASH_ATTN_EXT: #ifndef __EMSCRIPTEN__ - return ggml_webgpu_flash_attn(ctx, encoder, src0, src1, src2, node->src[3], node->src[4], node); + return ggml_webgpu_flash_attn(ctx, src0, src1, src2, node->src[3], node->src[4], node); #else return std::nullopt; #endif @@ -2813,22 +2747,22 @@ static std::optional ggml_webgpu_encode_node(webgpu_context case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: - return ggml_webgpu_binary_op(ctx, encoder, src0, src1, node); + return ggml_webgpu_binary_op(ctx, src0, src1, node); case GGML_OP_CONCAT: - return ggml_webgpu_concat(ctx, encoder, src0, src1, node); + return ggml_webgpu_concat(ctx, src0, src1, node); case GGML_OP_REPEAT: - return ggml_webgpu_repeat(ctx, encoder, src0, node); + return ggml_webgpu_repeat(ctx, src0, node); case GGML_OP_RMS_NORM: case GGML_OP_L2_NORM: - return ggml_webgpu_row_norm(ctx, encoder, src0, node); + return ggml_webgpu_row_norm(ctx, src0, node); case GGML_OP_ROPE: - return ggml_webgpu_rope(ctx, encoder, src0, src1, src2, node); + return ggml_webgpu_rope(ctx, src0, src1, src2, node); case GGML_OP_GLU: - return ggml_webgpu_glu(ctx, encoder, src0, src1, node); + return ggml_webgpu_glu(ctx, src0, src1, node); case GGML_OP_SCALE: - return ggml_webgpu_scale(ctx, encoder, src0, node); + return ggml_webgpu_scale(ctx, src0, node); case GGML_OP_SOFT_MAX: - return ggml_webgpu_soft_max(ctx, encoder, src0, src1, src2, node); + return ggml_webgpu_soft_max(ctx, src0, src1, src2, node); case GGML_OP_UNARY: case GGML_OP_CLAMP: case GGML_OP_FILL: @@ -2839,32 +2773,80 @@ static std::optional ggml_webgpu_encode_node(webgpu_context case GGML_OP_COS: case GGML_OP_DIAG: case GGML_OP_TRI: - return ggml_webgpu_unary_op(ctx, encoder, src0, node); + return ggml_webgpu_unary_op(ctx, src0, node); case GGML_OP_SOLVE_TRI: - return ggml_webgpu_solve_tri(ctx, encoder, src0, src1, node); + return ggml_webgpu_solve_tri(ctx, src0, src1, node); case GGML_OP_SSM_CONV: - return ggml_webgpu_ssm_conv(ctx, encoder, src0, src1, node); + return ggml_webgpu_ssm_conv(ctx, src0, src1, node); case GGML_OP_GATED_DELTA_NET: - return ggml_webgpu_gated_delta_net(ctx, encoder, src0, src1, src2, node->src[3], node->src[4], node->src[5], - node); + return ggml_webgpu_gated_delta_net(ctx, src0, src1, src2, node->src[3], node->src[4], node->src[5], node); case GGML_OP_PAD: - return ggml_webgpu_pad(ctx, encoder, src0, node); + return ggml_webgpu_pad(ctx, src0, node); case GGML_OP_ARGMAX: - return ggml_webgpu_argmax(ctx, encoder, src0, node); + return ggml_webgpu_argmax(ctx, src0, node); case GGML_OP_ARGSORT: case GGML_OP_TOP_K: // we reuse the same argsort implementation for top_k - return ggml_webgpu_argsort(ctx, encoder, src0, node); + return ggml_webgpu_argsort(ctx, src0, node); case GGML_OP_CUMSUM: - return ggml_webgpu_cumsum(ctx, encoder, src0, node); + return ggml_webgpu_cumsum(ctx, src0, node); case GGML_OP_SUM: case GGML_OP_SUM_ROWS: - return ggml_webgpu_sum_rows(ctx, encoder, src0, node); + return ggml_webgpu_sum_rows(ctx, src0, node); default: return std::nullopt; } } +#ifdef GGML_WEBGPU_GPU_PROFILE +static void ggml_backend_webgpu_collect_profile_results(webgpu_context & ctx, + const std::vector & pipeline_names, + uint32_t & num_inflight_batches) { + if (pipeline_names.empty()) { + return; + } + + wgpu::CommandEncoder encoder = ctx->global_ctx->device.CreateCommandEncoder(); + encoder.ResolveQuerySet(ctx->profile_timestamp_query_set, 0, ctx->profile_timestamp_query_count, + ctx->profile_timestamp_dev_buf, 0); + encoder.CopyBufferToBuffer(ctx->profile_timestamp_dev_buf, 0, ctx->profile_timestamp_host_buf, 0, + ctx->profile_timestamp_query_count * sizeof(uint64_t)); + + wgpu::CommandBuffer profile_commands = encoder.Finish(); + ggml_backend_webgpu_submit_commands(ctx, profile_commands, num_inflight_batches); + + const size_t mapped_size = ctx->profile_timestamp_query_count * sizeof(uint64_t); + GGML_ASSERT(ctx->profile_timestamp_query_count == 2 * pipeline_names.size()); + + ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->profile_timestamp_host_buf, wgpu::MapMode::Read, 0, + mapped_size); + const uint64_t * ts_data = (const uint64_t *) ctx->profile_timestamp_host_buf.GetConstMappedRange(0, mapped_size); + + for (size_t i = 0; i < pipeline_names.size(); ++i) { + // WebGPU timestamps are in ns; convert to ms. + const double elapsed_ms = double(ts_data[2 * i + 1] - ts_data[2 * i]) * 1e-6; + ctx->global_ctx->shader_gpu_time_ms[pipeline_names[i]] += elapsed_ms; + } + + ctx->profile_timestamp_host_buf.Unmap(); +} +#endif + +static void ggml_backend_webgpu_check_set_rows(webgpu_context & ctx, uint32_t & num_inflight_batches) { + wgpu::CommandEncoder encoder = ctx->global_ctx->device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0, + ctx->set_rows_host_error_buf.GetSize()); + wgpu::CommandBuffer commands = encoder.Finish(); + ggml_backend_webgpu_submit_commands(ctx, commands, num_inflight_batches); + ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->set_rows_host_error_buf, wgpu::MapMode::Read, 0, + ctx->set_rows_host_error_buf.GetSize()); + const uint32_t * error_data = (const uint32_t *) ctx->set_rows_host_error_buf.GetConstMappedRange(); + if (*error_data) { + GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported."); + } + ctx->set_rows_host_error_buf.Unmap(); +} + static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)"); @@ -2874,26 +2856,28 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); std::vector commands; + + uint32_t num_batched_kernels = 0; + uint32_t num_inflight_batches = 0; + bool contains_set_rows = false; + bool batch_compute_passes = true; + #ifdef GGML_WEBGPU_GPU_PROFILE + ctx->profile_timestamp_query_count = 0; + batch_compute_passes = false; std::vector profile_pipeline_names; #endif - uint32_t num_batched_kernels = 0; - uint32_t num_inflight_batches = 0; - bool contains_set_rows = false; - wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder(); -#ifdef GGML_WEBGPU_GPU_PROFILE - ctx->profile_timestamp_query_count = 0; - if (ctx->profile_timestamp_host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { - ctx->profile_timestamp_host_buf.Unmap(); + ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder(); + if (batch_compute_passes) { + ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass(); } -#endif for (int i = 0; i < cgraph->n_nodes; i++) { if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) { contains_set_rows = true; } - if (auto cmd = ggml_webgpu_encode_node(ctx, batch_encoder, cgraph->nodes[i])) { + if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { commands.push_back(*cmd); num_batched_kernels += cmd.value().num_kernels; #ifdef GGML_WEBGPU_GPU_PROFILE @@ -2902,63 +2886,46 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str } if (num_batched_kernels >= ctx->global_ctx->command_submit_batch_size) { + if (ctx->active_compute_pass) { + ctx->active_compute_pass.End(); + } num_batched_kernels = 0; - wgpu::CommandBuffer batch_commands = batch_encoder.Finish(); + wgpu::CommandBuffer batch_commands = ctx->active_command_encoder.Finish(); ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches); + + // reset state for next batch + ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder(); + if (batch_compute_passes) { + ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass(); + } ctx->param_arena.reset(); commands.clear(); - batch_encoder = ctx->global_ctx->device.CreateCommandEncoder(); } } + + if (ctx->active_compute_pass) { + ctx->active_compute_pass.End(); + ctx->active_compute_pass = nullptr; + } + if (num_batched_kernels > 0) { - wgpu::CommandBuffer batch_commands = batch_encoder.Finish(); + wgpu::CommandBuffer batch_commands = ctx->active_command_encoder.Finish(); ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches); ctx->param_arena.reset(); commands.clear(); } + ctx->active_command_encoder = nullptr; #ifdef GGML_WEBGPU_GPU_PROFILE - const size_t profile_buf_size = ctx->profile_timestamp_query_count * sizeof(uint64_t); + ggml_backend_webgpu_collect_profile_results(ctx, profile_pipeline_names, num_inflight_batches); #endif - // Copy any post-graph bookkeeping buffers to the host for checking. - if (contains_set_rows -#ifdef GGML_WEBGPU_GPU_PROFILE - || ctx->profile_timestamp_query_count > 0 -#endif - ) { - wgpu::CommandEncoder encoder = ctx->global_ctx->device.CreateCommandEncoder(); - if (contains_set_rows) { - encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0, - ctx->set_rows_host_error_buf.GetSize()); - } -#ifdef GGML_WEBGPU_GPU_PROFILE - if (ctx->profile_timestamp_query_count > 0) { - encoder.ResolveQuerySet(ctx->profile_timestamp_query_set, 0, ctx->profile_timestamp_query_count, - ctx->profile_timestamp_dev_buf, 0); - encoder.CopyBufferToBuffer(ctx->profile_timestamp_dev_buf, 0, ctx->profile_timestamp_host_buf, 0, - profile_buf_size); - } -#endif - wgpu::CommandBuffer post_graph_commands = encoder.Finish(); - ggml_backend_webgpu_submit_commands(ctx, post_graph_commands, num_inflight_batches); + if (contains_set_rows) { + ggml_backend_webgpu_check_set_rows(ctx, num_inflight_batches); } ggml_backend_webgpu_wait_queue(ctx->global_ctx); - if (contains_set_rows) { - ggml_backend_webgpu_map_buffer(ctx->global_ctx, ctx->set_rows_host_error_buf, wgpu::MapMode::Read, 0, - ctx->set_rows_host_error_buf.GetSize()); - const uint32_t * error_data = (const uint32_t *) ctx->set_rows_host_error_buf.GetConstMappedRange(); - if (*error_data) { - GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported."); - } - ctx->set_rows_host_error_buf.Unmap(); - } - -#ifdef GGML_WEBGPU_GPU_PROFILE - ggml_backend_webgpu_collect_profile_results(ctx, profile_pipeline_names); -#endif WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx->global_ctx); return GGML_STATUS_SUCCESS; } @@ -3524,14 +3491,12 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) { wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf"); #ifdef GGML_WEBGPU_GPU_PROFILE - ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, - WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, - wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, - "profile_timestamp_dev_buf"); + ggml_webgpu_create_buffer( + webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, + wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, "profile_timestamp_dev_buf"); ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_host_buf, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, - "profile_timestamp_host_buf"); + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "profile_timestamp_host_buf"); wgpu::QuerySetDescriptor query_set_desc = {}; query_set_desc.type = wgpu::QueryType::Timestamp; query_set_desc.count = WEBGPU_MAX_PROFILE_QUERY_COUNT; From 6468f3930c992fe8c3dcbcd63e552594da75538f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 13 Apr 2026 13:51:41 -0700 Subject: [PATCH 09/10] Refactor build_multi --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 175 ++++++++++++--------------- 1 file changed, 77 insertions(+), 98 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index f062e3e298f..f2cdc327e7c 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -162,10 +162,17 @@ struct webgpu_param_arena { struct webgpu_encoded_op { uint32_t num_kernels = 0; #ifdef GGML_WEBGPU_GPU_PROFILE - std::string pipeline_name; + std::vector pipeline_names; #endif }; +struct webgpu_dispatch_desc { + webgpu_pipeline pipeline; + std::vector params; + std::vector bind_group_entries; + std::pair workgroups = { 1, 1 }; +}; + struct webgpu_capabilities { wgpu::Limits limits; bool supports_subgroup_matrix = false; @@ -477,26 +484,19 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) { } #endif -static webgpu_encoded_op ggml_backend_webgpu_build_multi( - webgpu_context & ctx, - const std::vector & pipelines, - const std::vector> & params_list, - const std::vector> & bind_group_entries_list, - const std::vector> & workgroups_list) { - GGML_ASSERT(pipelines.size() == params_list.size()); - GGML_ASSERT(pipelines.size() == bind_group_entries_list.size()); - GGML_ASSERT(pipelines.size() == workgroups_list.size()); - +static webgpu_encoded_op ggml_backend_webgpu_build_multi(webgpu_context & ctx, + const std::vector & dispatches) { webgpu_encoded_op result = {}; std::vector bind_groups; std::vector param_offsets; - result.num_kernels = pipelines.size(); + result.num_kernels = dispatches.size(); - for (size_t i = 0; i < pipelines.size(); i++) { - const size_t param_size = params_list[i].size() * sizeof(uint32_t); - const size_t param_offset = ctx->param_arena.alloc_slot(param_size); + for (size_t i = 0; i < dispatches.size(); i++) { + const webgpu_dispatch_desc & dispatch = dispatches[i]; + const size_t param_size = dispatch.params.size() * sizeof(uint32_t); + const size_t param_offset = ctx->param_arena.alloc_slot(param_size); - std::vector entries = bind_group_entries_list[i]; + std::vector entries = dispatch.bind_group_entries; uint32_t params_binding_num = entries.size(); entries.push_back({ .binding = params_binding_num, .buffer = ctx->param_arena.buffer, @@ -504,51 +504,44 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi( .size = ctx->param_arena.slot_size }); wgpu::BindGroupDescriptor bind_group_desc; - bind_group_desc.layout = pipelines[i].pipeline.GetBindGroupLayout(0); + bind_group_desc.layout = dispatch.pipeline.pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = entries.size(); bind_group_desc.entries = entries.data(); - bind_group_desc.label = pipelines[i].name.c_str(); + bind_group_desc.label = dispatch.pipeline.name.c_str(); bind_groups.push_back(ctx->global_ctx->device.CreateBindGroup(&bind_group_desc)); param_offsets.push_back(param_offset); } for (size_t i = 0; i < param_offsets.size(); i++) { - ctx->global_ctx->queue.WriteBuffer(ctx->param_arena.buffer, param_offsets[i], params_list[i].data(), - params_list[i].size() * sizeof(uint32_t)); + ctx->global_ctx->queue.WriteBuffer(ctx->param_arena.buffer, param_offsets[i], dispatches[i].params.data(), + dispatches[i].params.size() * sizeof(uint32_t)); } - bool own_pass = false; - wgpu::ComputePassEncoder pass = ctx->active_compute_pass; -#ifdef GGML_WEBGPU_GPU_PROFILE - GGML_ASSERT(ctx->profile_timestamp_query_count + 2 <= WEBGPU_MAX_PROFILE_QUERY_COUNT); - const uint32_t query_begin = ctx->profile_timestamp_query_count++; - const uint32_t query_end = ctx->profile_timestamp_query_count++; -#endif - if (!pass) { - own_pass = true; #ifdef GGML_WEBGPU_GPU_PROFILE - wgpu::PassTimestampWrites ts_writes = { .querySet = ctx->profile_timestamp_query_set, - .beginningOfPassWriteIndex = query_begin, - .endOfPassWriteIndex = query_end }; - wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes }; - pass = ctx->active_command_encoder.BeginComputePass(&pass_desc); -#else - pass = ctx->active_command_encoder.BeginComputePass(); -#endif - } - - for (size_t i = 0; i < pipelines.size(); i++) { - pass.SetPipeline(pipelines[i].pipeline); + for (size_t i = 0; i < dispatches.size(); i++) { + GGML_ASSERT(ctx->profile_timestamp_query_count + 2 <= WEBGPU_MAX_PROFILE_QUERY_COUNT); + const uint32_t query_begin = ctx->profile_timestamp_query_count++; + const uint32_t query_end = ctx->profile_timestamp_query_count++; + wgpu::PassTimestampWrites ts_writes = { .querySet = ctx->profile_timestamp_query_set, + .beginningOfPassWriteIndex = query_begin, + .endOfPassWriteIndex = query_end }; + wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes }; + wgpu::ComputePassEncoder pass = ctx->active_command_encoder.BeginComputePass(&pass_desc); + + pass.SetPipeline(dispatches[i].pipeline.pipeline); pass.SetBindGroup(0, bind_groups[i]); - pass.DispatchWorkgroups(workgroups_list[i].first, workgroups_list[i].second, 1); - } - if (own_pass) { + pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1); pass.End(); + result.pipeline_names.push_back(dispatches[i].pipeline.name); + } +#else + for (size_t i = 0; i < dispatches.size(); i++) { + ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline); + ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]); + ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1); } - -#ifdef GGML_WEBGPU_GPU_PROFILE - result.pipeline_name = pipelines.front().name; #endif + return result; } @@ -558,12 +551,10 @@ static webgpu_encoded_op ggml_backend_webgpu_build(webgpu_context & std::vector bind_group_entries, uint32_t wg_x, uint32_t wg_y = 1) { - return ggml_backend_webgpu_build_multi(ctx, - { - pipeline - }, - { std::move(params) }, { std::move(bind_group_entries) }, - { { wg_x, wg_y } }); + return ggml_backend_webgpu_build_multi( + ctx, { + { pipeline, std::move(params), std::move(bind_group_entries), { wg_x, wg_y } }, + }); } static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx, @@ -1371,10 +1362,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, // Get or create pipeline webgpu_pipeline gather_pipeline, main_pipeline; - std::vector pipelines; - std::vector> params_list; - std::vector> entries_list; - std::vector> workgroups_list; + std::vector dispatches; gather_pipeline = ctx->shader_lib->get_mul_mat_id_gather_pipeline(shader_lib_ctx); main_pipeline = ctx->shader_lib->get_mul_mat_id_pipeline(shader_lib_ctx); @@ -1434,10 +1422,9 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, const uint32_t gather_wg_x = std::min(gather_total_wg, max_wg_per_dim); const uint32_t gather_wg_y = CEIL_DIV(gather_total_wg, gather_wg_x); - pipelines.push_back(gather_pipeline); - params_list.push_back(std::move(gather_params)); - entries_list.push_back(std::move(gather_entries)); - workgroups_list.push_back({ gather_wg_x, gather_wg_y }); + dispatches.push_back({ + gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, gather_wg_y } + }); // params for mul_mat_id.wgsl std::vector main_params = { @@ -1502,12 +1489,11 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y); - pipelines.push_back(main_pipeline); - params_list.push_back(std::move(main_params)); - entries_list.push_back(std::move(main_entries)); - workgroups_list.push_back({ wg_x, wg_y }); + dispatches.push_back({ + main_pipeline, std::move(main_params), std::move(main_entries), { wg_x, wg_y } + }); - return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list); + return ggml_backend_webgpu_build_multi(ctx, dispatches); } #ifndef __EMSCRIPTEN__ @@ -1809,29 +1795,26 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, const uint64_t split_wg_total = (uint64_t) wg_x * nwg; GGML_ASSERT(split_wg_total <= UINT32_MAX); - std::vector pipelines; - std::vector> params_list; - std::vector> entries_list; - std::vector> workgroups_list; + std::vector dispatches; if (use_blk) { - pipelines.push_back(blk_pipeline); - params_list.push_back(std::move(blk_params)); - entries_list.push_back(std::move(blk_entries)); - workgroups_list.push_back({ blk_nblk0, blk_nblk1 * blk_batch_count }); + dispatches.push_back({ + blk_pipeline, + std::move(blk_params), + std::move(blk_entries), + { blk_nblk0, blk_nblk1 * blk_batch_count } + }); } - pipelines.push_back(pipeline); - params_list.push_back(std::move(split_params)); - entries_list.push_back(std::move(split_entries)); - workgroups_list.push_back({ (uint32_t) split_wg_total, 1u }); + dispatches.push_back({ + pipeline, std::move(split_params), std::move(split_entries), { (uint32_t) split_wg_total, 1u } + }); if (use_vec_reduce) { - pipelines.push_back(reduce_pipeline); - params_list.push_back(std::move(reduce_params)); - entries_list.push_back(std::move(reduce_entries)); - workgroups_list.push_back({ (uint32_t) nrows, 1u }); + dispatches.push_back({ + reduce_pipeline, std::move(reduce_params), std::move(reduce_entries), { (uint32_t) nrows, 1u } + }); } - return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list); + return ggml_backend_webgpu_build_multi(ctx, dispatches); } return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); @@ -2545,10 +2528,7 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * const uint32_t stride_idx2 = out_ne0 * (uint32_t) dst->ne[1]; const uint32_t stride_idx3 = stride_idx2 * (uint32_t) dst->ne[2]; - std::vector pipelines; - std::vector> params_list; - std::vector> entries_list; - std::vector> workgroups_list; + std::vector dispatches; const uint32_t init_offset = start_in_tmp ? offset_tmp : offset_dst; const size_t init_align_offset = start_in_tmp ? tmp_offset : ggml_webgpu_tensor_align_offset(ctx, dst); @@ -2572,13 +2552,12 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * { .binding = 1, .buffer = ggml_webgpu_tensor_buf(dst), .offset = init_align_offset, .size = init_binding_size } }; - pipelines.push_back(argsort_pipeline); - params_list.push_back(std::move(init_params)); - entries_list.push_back(std::move(init_entries)); - workgroups_list.push_back({ wg_x_init, wg_y_init }); + dispatches.push_back({ + argsort_pipeline, std::move(init_params), std::move(init_entries), { wg_x_init, wg_y_init } + }); if (merge_passes == 0) { - return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list); + return ggml_backend_webgpu_build_multi(ctx, dispatches); } bool in_is_tmp = start_in_tmp; @@ -2630,16 +2609,15 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * const uint32_t total_wg_merge = nm * nrows; const uint32_t wg_x_merge = std::min(total_wg_merge, max_wg); const uint32_t wg_y_merge = CEIL_DIV(total_wg_merge, wg_x_merge); - workgroups_list.push_back({ wg_x_merge, wg_y_merge }); - pipelines.push_back(argsort_merge_pipeline); - params_list.push_back(std::move(merge_params)); - entries_list.push_back(std::move(merge_entries)); + dispatches.push_back({ + argsort_merge_pipeline, std::move(merge_params), std::move(merge_entries), { wg_x_merge, wg_y_merge } + }); len <<= 1; in_is_tmp = !in_is_tmp; } - return ggml_backend_webgpu_build_multi(ctx, pipelines, params_list, entries_list, workgroups_list); + return ggml_backend_webgpu_build_multi(ctx, dispatches); } static webgpu_encoded_op ggml_webgpu_cumsum(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { @@ -2881,7 +2859,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str commands.push_back(*cmd); num_batched_kernels += cmd.value().num_kernels; #ifdef GGML_WEBGPU_GPU_PROFILE - profile_pipeline_names.push_back(cmd->pipeline_name); + profile_pipeline_names.insert(profile_pipeline_names.end(), cmd->pipeline_names.begin(), + cmd->pipeline_names.end()); #endif } From aa2b38054819744305f0e0c8c734de59c0630a43 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 13 Apr 2026 15:11:06 -0700 Subject: [PATCH 10/10] remove iOS throttling now that we're batching compute passes --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index f2cdc327e7c..c460df5952b 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -392,22 +392,8 @@ static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status, } } -#ifdef __EMSCRIPTEN__ -EM_JS(int, ggml_webgpu_is_ios_browser, (), { - const ua = navigator.userAgent; - return (ua.includes('iPhone') || ua.includes('iPad')) ? 1 : 0; -}); -#endif - // TODO: these next two functions may want tuning across different platforms and workloads, static uint32_t ggml_backend_webgpu_get_max_inflight_batches() { -#ifdef __EMSCRIPTEN__ - // iOS has very strict limits on the number of in-flight GPU commands, - // so we need to throttle to avoid failures. - if (ggml_webgpu_is_ios_browser()) { - return 1; - } -#endif return UINT32_MAX; }