Skip to content

Commit 69e22cb

Browse files
reeselevine and ArberSephirotheca
authored and committed
ggml-webgpu: parameterize submission size and add iOS specific limits (ggml-org#21533)
* Work towards removing bitcast
* Move rest of existing types over
* Add timeout back to wait and remove synchronous set_tensor/memset_tensor
* Move to unpackf16 for wider compatibility
* Cleanup
* Remove deadlock condition in free_bufs
* Start work on removing parameter buffer pools
* Simplify and optimize further
* Simplify profile futures
* Fix stride
* Try using a single command buffer per batch
* Formatting
* Add parameters for different browsers' in-flight submissions
* Update handling of batch size too
* Throttle iOS as much as possible
* Increase timeout for llvmpipe testing
1 parent 1455ede commit 69e22cb

1 file changed

Lines changed: 113 additions & 35 deletions

File tree

ggml/src/ggml-webgpu/ggml-webgpu.cpp

Lines changed: 113 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include <webgpu/webgpu_cpp.h>
1717

1818
#include <atomic>
19-
#include <condition_variable>
2019
#include <cstdint>
2120
#include <cstring>
2221
#ifdef GGML_WEBGPU_GPU_PROFILE
@@ -25,7 +24,6 @@
2524
#if defined(GGML_WEBGPU_DEBUG) || defined(GGML_WEBGPU_CPU_PROFILE) || defined(GGML_WEBGPU_GPU_PROFILE)
2625
# include <iostream>
2726
#endif
28-
#include <map>
2927
#include <memory>
3028
#include <mutex>
3129
#include <optional>
@@ -81,13 +79,13 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim
8179

8280
/* Constants */
8381

84-
#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 32u
85-
#define WEBGPU_NUM_PARAM_SLOTS \
86-
(WEBGPU_COMMAND_SUBMIT_BATCH_SIZE + 10) // a few extra for safety, since some operations may need multiple slots
87-
#define WEBGPU_WAIT_ANY_TIMEOUT_MS 100
88-
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
89-
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
90-
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
82+
#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 32u
83+
#define WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN 10u
84+
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_MS 30000u
85+
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_NS (WEBGPU_RUNTIME_WAIT_TIMEOUT_MS * 1e6)
86+
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
87+
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
88+
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
9189

9290
// For operations which process a row in parallel, this seems like a reasonable
9391
// default
@@ -252,6 +250,8 @@ struct webgpu_global_context_struct {
252250
wgpu::Adapter adapter;
253251
wgpu::Device device;
254252
wgpu::Queue queue;
253+
uint32_t command_submit_batch_size = WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
254+
uint32_t max_inflight_batches = UINT32_MAX;
255255

256256
webgpu_capabilities capabilities;
257257
// Shared buffer to move data from device to host
@@ -417,31 +417,104 @@ static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context &
417417
}
418418
#endif
419419

420+
template <typename T>
421+
static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status,
422+
T callback_status,
423+
T success_status,
424+
const char * wait_name,
425+
const char * failure_name,
426+
const char * callback_message) {
427+
if (wait_status == wgpu::WaitStatus::TimedOut) {
428+
GGML_ABORT("ggml_webgpu: %s timed out after %u ms\n", wait_name, WEBGPU_RUNTIME_WAIT_TIMEOUT_MS);
429+
}
430+
if (wait_status == wgpu::WaitStatus::Error) {
431+
GGML_ABORT("ggml_webgpu: %s failed\n", wait_name);
432+
}
433+
if (callback_status != success_status) {
434+
GGML_ABORT("ggml_webgpu: %s failed with status %d: %s\n", failure_name, static_cast<int>(callback_status),
435+
callback_message);
436+
}
437+
}
438+
439+
#ifdef __EMSCRIPTEN__
440+
// iOS browsers seem to have very strict limits on the number of in-flight GPU commands, so we need to throttle to avoid failures.
441+
EM_JS(int, ggml_webgpu_is_ios_browser, (), {
442+
const ua = navigator.userAgent;
443+
return (ua.includes('iPhone') || ua.includes('iPad')) ? 1 : 0;
444+
});
445+
#endif
446+
447+
static uint32_t ggml_backend_webgpu_get_max_inflight_batches(const wgpu::AdapterInfo & info) {
448+
#ifdef __EMSCRIPTEN__
449+
if (ggml_webgpu_is_ios_browser()) {
450+
return 1;
451+
}
452+
#else
453+
GGML_UNUSED(info);
454+
#endif
455+
456+
return UINT32_MAX;
457+
}
458+
459+
static uint32_t ggml_backend_webgpu_get_command_submit_batch_size(const wgpu::AdapterInfo & info) {
460+
#ifdef __EMSCRIPTEN__
461+
if (ggml_webgpu_is_ios_browser()) {
462+
return 16;
463+
}
464+
#else
465+
GGML_UNUSED(info);
466+
#endif
467+
468+
return WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
469+
}
470+
420471
static void ggml_backend_webgpu_wait_queue(webgpu_global_context & ctx) {
421-
ctx->instance.WaitAny(
422-
ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
423-
[](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
424-
if (status != wgpu::QueueWorkDoneStatus::Success) {
425-
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
426-
std::string(message).c_str());
427-
}
428-
}),
429-
UINT64_MAX);
472+
wgpu::QueueWorkDoneStatus callback_status = wgpu::QueueWorkDoneStatus::Error;
473+
std::string callback_message;
474+
475+
const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
476+
ctx->queue.OnSubmittedWorkDone(
477+
wgpu::CallbackMode::AllowSpontaneous,
478+
[&callback_status, &callback_message](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
479+
callback_status = status;
480+
callback_message = std::string(message);
481+
}),
482+
WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
483+
484+
ggml_backend_webgpu_check_wait_status(wait_status, callback_status, wgpu::QueueWorkDoneStatus::Success,
485+
"Queue wait", "Queue work", callback_message.c_str());
430486
}
431487

432488
static void ggml_backend_webgpu_map_buffer(webgpu_global_context & ctx,
433489
wgpu::Buffer & buffer,
434490
wgpu::MapMode mode,
435491
size_t offset,
436492
size_t size) {
437-
ctx->instance.WaitAny(buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
438-
[](wgpu::MapAsyncStatus status, wgpu::StringView message) {
439-
if (status != wgpu::MapAsyncStatus::Success) {
440-
GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n",
441-
message.data);
442-
}
443-
}),
444-
UINT64_MAX);
493+
wgpu::MapAsyncStatus callback_status = wgpu::MapAsyncStatus::Error;
494+
std::string callback_message;
495+
496+
const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
497+
buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
498+
[&callback_status, &callback_message](wgpu::MapAsyncStatus status, wgpu::StringView message) {
499+
callback_status = status;
500+
callback_message = std::string(message);
501+
}),
502+
WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
503+
504+
ggml_backend_webgpu_check_wait_status(wait_status, callback_status, wgpu::MapAsyncStatus::Success,
505+
"Buffer map wait", "Buffer map", callback_message.c_str());
506+
}
507+
508+
static void ggml_backend_webgpu_submit_commands(webgpu_context & ctx,
509+
const wgpu::CommandBuffer commands,
510+
uint32_t & num_inflight_batches) {
511+
if (num_inflight_batches >= ctx->global_ctx->max_inflight_batches) {
512+
ggml_backend_webgpu_wait_queue(ctx->global_ctx);
513+
num_inflight_batches = 0;
514+
}
515+
516+
ctx->global_ctx->queue.Submit(1, &commands);
517+
num_inflight_batches++;
445518
}
446519

447520
#ifdef GGML_WEBGPU_DEBUG
@@ -2871,9 +2944,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
28712944
#ifdef GGML_WEBGPU_GPU_PROFILE
28722945
std::vector<wgpu::FutureWaitInfo> profile_futures;
28732946
#endif
2874-
uint32_t num_batched_kernels = 0;
2875-
bool contains_set_rows = false;
2876-
wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();
2947+
uint32_t num_batched_kernels = 0;
2948+
uint32_t num_inflight_batches = 0;
2949+
bool contains_set_rows = false;
2950+
wgpu::CommandEncoder batch_encoder = ctx->global_ctx->device.CreateCommandEncoder();
28772951

28782952
for (int i = 0; i < cgraph->n_nodes; i++) {
28792953
if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) {
@@ -2884,10 +2958,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
28842958
num_batched_kernels += cmd.value().num_kernels;
28852959
}
28862960

2887-
if (num_batched_kernels >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
2961+
if (num_batched_kernels >= ctx->global_ctx->command_submit_batch_size) {
28882962
num_batched_kernels = 0;
28892963
wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
2890-
ctx->global_ctx->queue.Submit(1, &batch_commands);
2964+
ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
28912965
#ifdef GGML_WEBGPU_GPU_PROFILE
28922966
ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
28932967
#endif
@@ -2898,7 +2972,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
28982972
}
28992973
if (!commands.empty()) {
29002974
wgpu::CommandBuffer batch_commands = batch_encoder.Finish();
2901-
ctx->global_ctx->queue.Submit(1, &batch_commands);
2975+
ggml_backend_webgpu_submit_commands(ctx, batch_commands, num_inflight_batches);
29022976
#ifdef GGML_WEBGPU_GPU_PROFILE
29032977
ggml_backend_webgpu_collect_profile_futures(ctx->global_ctx, commands, profile_futures);
29042978
#endif
@@ -2912,7 +2986,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
29122986
encoder.CopyBufferToBuffer(ctx->set_rows_dev_error_buf, 0, ctx->set_rows_host_error_buf, 0,
29132987
ctx->set_rows_host_error_buf.GetSize());
29142988
wgpu::CommandBuffer set_rows_commands = encoder.Finish();
2915-
ctx->global_ctx->queue.Submit(1, &set_rows_commands);
2989+
ggml_backend_webgpu_submit_commands(ctx, set_rows_commands, num_inflight_batches);
29162990
}
29172991

29182992
ggml_backend_webgpu_wait_queue(ctx->global_ctx);
@@ -3363,6 +3437,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
33633437
}
33643438
#endif
33653439
ctx->webgpu_global_ctx->adapter.GetInfo(&info);
3440+
ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size(info);
3441+
ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches(info);
33663442
wgpu::SupportedFeatures features;
33673443
ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
33683444
// we require f16 support
@@ -3483,8 +3559,10 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
34833559
webgpu_context webgpu_ctx = std::make_shared<webgpu_context_struct>();
34843560
webgpu_ctx->global_ctx = dev_ctx->webgpu_global_ctx;
34853561
webgpu_ctx->shader_lib = std::make_unique<ggml_webgpu_shader_lib>(dev_ctx->webgpu_global_ctx->device);
3486-
webgpu_ctx->param_arena.init(webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES, WEBGPU_NUM_PARAM_SLOTS,
3487-
webgpu_ctx->global_ctx->capabilities.limits.minUniformBufferOffsetAlignment);
3562+
webgpu_ctx->param_arena.init(
3563+
webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES,
3564+
webgpu_ctx->global_ctx->command_submit_batch_size + WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN,
3565+
webgpu_ctx->global_ctx->capabilities.limits.minUniformBufferOffsetAlignment);
34883566
ggml_webgpu_create_buffer(webgpu_ctx->global_ctx->device, webgpu_ctx->set_rows_dev_error_buf,
34893567
WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
34903568
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, "set_rows_dev_error_buf");

0 commit comments

Comments (0)