1616#include < webgpu/webgpu_cpp.h>
1717
1818#include < atomic>
19- #include < condition_variable>
2019#include < cstdint>
2120#include < cstring>
2221#ifdef GGML_WEBGPU_GPU_PROFILE
2524#if defined(GGML_WEBGPU_DEBUG) || defined(GGML_WEBGPU_CPU_PROFILE) || defined(GGML_WEBGPU_GPU_PROFILE)
2625# include < iostream>
2726#endif
28- #include < map>
2927#include < memory>
3028#include < mutex>
3129#include < optional>
@@ -81,13 +79,13 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim
8179
8280/* Constants */
8381
84- #define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 32u
85- #define WEBGPU_NUM_PARAM_SLOTS \
86- (WEBGPU_COMMAND_SUBMIT_BATCH_SIZE + 10 ) // a few extra for safety, since some operations may need multiple slots
87- #define WEBGPU_WAIT_ANY_TIMEOUT_MS 100
88- #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
89- #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
90- #define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
82+ #define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 32u
83+ #define WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN 10u
84+ # define WEBGPU_RUNTIME_WAIT_TIMEOUT_MS 30000u
85+ #define WEBGPU_RUNTIME_WAIT_TIMEOUT_NS (WEBGPU_RUNTIME_WAIT_TIMEOUT_MS * 1e6 )
86+ #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
87+ #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
88+ #define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
9189
9290// For operations which process a row in parallel, this seems like a reasonable
9391// default
@@ -252,6 +250,8 @@ struct webgpu_global_context_struct {
252250 wgpu::Adapter adapter;
253251 wgpu::Device device;
254252 wgpu::Queue queue;
253+ uint32_t command_submit_batch_size = WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
254+ uint32_t max_inflight_batches = UINT32_MAX;
255255
256256 webgpu_capabilities capabilities;
257257 // Shared buffer to move data from device to host
@@ -417,31 +417,104 @@ static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context &
417417}
418418#endif
419419
420+ template <typename T>
421+ static void ggml_backend_webgpu_check_wait_status (wgpu::WaitStatus wait_status,
422+ T callback_status,
423+ T success_status,
424+ const char * wait_name,
425+ const char * failure_name,
426+ const char * callback_message) {
427+ if (wait_status == wgpu::WaitStatus::TimedOut) {
428+ GGML_ABORT (" ggml_webgpu: %s timed out after %u ms\n " , wait_name, WEBGPU_RUNTIME_WAIT_TIMEOUT_MS);
429+ }
430+ if (wait_status == wgpu::WaitStatus::Error) {
431+ GGML_ABORT (" ggml_webgpu: %s failed\n " , wait_name);
432+ }
433+ if (callback_status != success_status) {
434+ GGML_ABORT (" ggml_webgpu: %s failed with status %d: %s\n " , failure_name, static_cast <int >(callback_status),
435+ callback_message);
436+ }
437+ }
438+
#ifdef __EMSCRIPTEN__
// iOS browsers seem to have very strict limits on the number of in-flight GPU commands, so we need to throttle to avoid failures.
// Note: since iPadOS 13, Safari on iPad reports a desktop ("Macintosh") user agent by
// default, so checking for 'iPad' alone misses modern iPads. A Macintosh UA combined
// with multi-touch support (maxTouchPoints > 1) identifies those devices; real Macs
// report 0 touch points.
EM_JS(int, ggml_webgpu_is_ios_browser, (), {
    const ua = navigator.userAgent;
    if (ua.includes('iPhone') || ua.includes('iPad')) {
        return 1;
    }
    if (ua.includes('Macintosh') && navigator.maxTouchPoints > 1) {
        return 1;  // iPadOS 13+ masquerading as macOS
    }
    return 0;
});
#endif
446+
447+ static uint32_t ggml_backend_webgpu_get_max_inflight_batches (const wgpu::AdapterInfo & info) {
448+ #ifdef __EMSCRIPTEN__
449+ if (ggml_webgpu_is_ios_browser ()) {
450+ return 1 ;
451+ }
452+ #else
453+ GGML_UNUSED (info);
454+ #endif
455+
456+ return UINT32_MAX;
457+ }
458+
459+ static uint32_t ggml_backend_webgpu_get_command_submit_batch_size (const wgpu::AdapterInfo & info) {
460+ #ifdef __EMSCRIPTEN__
461+ if (ggml_webgpu_is_ios_browser ()) {
462+ return 16 ;
463+ }
464+ #else
465+ GGML_UNUSED (info);
466+ #endif
467+
468+ return WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
469+ }
470+
420471static void ggml_backend_webgpu_wait_queue (webgpu_global_context & ctx) {
421- ctx->instance .WaitAny (
422- ctx->queue .OnSubmittedWorkDone (wgpu::CallbackMode::AllowSpontaneous,
423- [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
424- if (status != wgpu::QueueWorkDoneStatus::Success) {
425- GGML_LOG_ERROR (" ggml_webgpu: Failed to submit commands: %s\n " ,
426- std::string (message).c_str ());
427- }
428- }),
429- UINT64_MAX);
472+ wgpu::QueueWorkDoneStatus callback_status = wgpu::QueueWorkDoneStatus::Error;
473+ std::string callback_message;
474+
475+ const wgpu::WaitStatus wait_status = ctx->instance .WaitAny (
476+ ctx->queue .OnSubmittedWorkDone (
477+ wgpu::CallbackMode::AllowSpontaneous,
478+ [&callback_status, &callback_message](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
479+ callback_status = status;
480+ callback_message = std::string (message);
481+ }),
482+ WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
483+
484+ ggml_backend_webgpu_check_wait_status (wait_status, callback_status, wgpu::QueueWorkDoneStatus::Success,
485+ " Queue wait" , " Queue work" , callback_message.c_str ());
430486}
431487
432488static void ggml_backend_webgpu_map_buffer (webgpu_global_context & ctx,
433489 wgpu::Buffer & buffer,
434490 wgpu::MapMode mode,
435491 size_t offset,
436492 size_t size) {
437- ctx->instance .WaitAny (buffer.MapAsync (mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
438- [](wgpu::MapAsyncStatus status, wgpu::StringView message) {
439- if (status != wgpu::MapAsyncStatus::Success) {
440- GGML_LOG_ERROR (" ggml_webgpu: Failed to map buffer: %s\n " ,
441- message.data );
442- }
443- }),
444- UINT64_MAX);
493+ wgpu::MapAsyncStatus callback_status = wgpu::MapAsyncStatus::Error;
494+ std::string callback_message;
495+
496+ const wgpu::WaitStatus wait_status = ctx->instance .WaitAny (
497+ buffer.MapAsync (mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
498+ [&callback_status, &callback_message](wgpu::MapAsyncStatus status, wgpu::StringView message) {
499+ callback_status = status;
500+ callback_message = std::string (message);
501+ }),
502+ WEBGPU_RUNTIME_WAIT_TIMEOUT_NS);
503+
504+ ggml_backend_webgpu_check_wait_status (wait_status, callback_status, wgpu::MapAsyncStatus::Success,
505+ " Buffer map wait" , " Buffer map" , callback_message.c_str ());
506+ }
507+
508+ static void ggml_backend_webgpu_submit_commands (webgpu_context & ctx,
509+ const wgpu::CommandBuffer commands,
510+ uint32_t & num_inflight_batches) {
511+ if (num_inflight_batches >= ctx->global_ctx ->max_inflight_batches ) {
512+ ggml_backend_webgpu_wait_queue (ctx->global_ctx );
513+ num_inflight_batches = 0 ;
514+ }
515+
516+ ctx->global_ctx ->queue .Submit (1 , &commands);
517+ num_inflight_batches++;
445518}
446519
447520#ifdef GGML_WEBGPU_DEBUG
@@ -2871,9 +2944,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
28712944#ifdef GGML_WEBGPU_GPU_PROFILE
28722945 std::vector<wgpu::FutureWaitInfo> profile_futures;
28732946#endif
2874- uint32_t num_batched_kernels = 0 ;
2875- bool contains_set_rows = false ;
2876- wgpu::CommandEncoder batch_encoder = ctx->global_ctx ->device .CreateCommandEncoder ();
2947+ uint32_t num_batched_kernels = 0 ;
2948+ uint32_t num_inflight_batches = 0 ;
2949+ bool contains_set_rows = false ;
2950+ wgpu::CommandEncoder batch_encoder = ctx->global_ctx ->device .CreateCommandEncoder ();
28772951
28782952 for (int i = 0 ; i < cgraph->n_nodes ; i++) {
28792953 if (cgraph->nodes [i]->op == GGML_OP_SET_ROWS) {
@@ -2884,10 +2958,10 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
28842958 num_batched_kernels += cmd.value ().num_kernels ;
28852959 }
28862960
2887- if (num_batched_kernels >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE ) {
2961+ if (num_batched_kernels >= ctx-> global_ctx -> command_submit_batch_size ) {
28882962 num_batched_kernels = 0 ;
28892963 wgpu::CommandBuffer batch_commands = batch_encoder.Finish ();
2890- ctx-> global_ctx -> queue . Submit ( 1 , & batch_commands);
2964+ ggml_backend_webgpu_submit_commands (ctx, batch_commands, num_inflight_batches );
28912965#ifdef GGML_WEBGPU_GPU_PROFILE
28922966 ggml_backend_webgpu_collect_profile_futures (ctx->global_ctx , commands, profile_futures);
28932967#endif
@@ -2898,7 +2972,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
28982972 }
28992973 if (!commands.empty ()) {
29002974 wgpu::CommandBuffer batch_commands = batch_encoder.Finish ();
2901- ctx-> global_ctx -> queue . Submit ( 1 , & batch_commands);
2975+ ggml_backend_webgpu_submit_commands (ctx, batch_commands, num_inflight_batches );
29022976#ifdef GGML_WEBGPU_GPU_PROFILE
29032977 ggml_backend_webgpu_collect_profile_futures (ctx->global_ctx , commands, profile_futures);
29042978#endif
@@ -2912,7 +2986,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
29122986 encoder.CopyBufferToBuffer (ctx->set_rows_dev_error_buf , 0 , ctx->set_rows_host_error_buf , 0 ,
29132987 ctx->set_rows_host_error_buf .GetSize ());
29142988 wgpu::CommandBuffer set_rows_commands = encoder.Finish ();
2915- ctx-> global_ctx -> queue . Submit ( 1 , & set_rows_commands);
2989+ ggml_backend_webgpu_submit_commands (ctx, set_rows_commands, num_inflight_batches );
29162990 }
29172991
29182992 ggml_backend_webgpu_wait_queue (ctx->global_ctx );
@@ -3363,6 +3437,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
33633437 }
33643438#endif
33653439 ctx->webgpu_global_ctx ->adapter .GetInfo (&info);
3440+ ctx->webgpu_global_ctx ->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size (info);
3441+ ctx->webgpu_global_ctx ->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches (info);
33663442 wgpu::SupportedFeatures features;
33673443 ctx->webgpu_global_ctx ->adapter .GetFeatures (&features);
33683444 // we require f16 support
@@ -3483,8 +3559,10 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
34833559 webgpu_context webgpu_ctx = std::make_shared<webgpu_context_struct>();
34843560 webgpu_ctx->global_ctx = dev_ctx->webgpu_global_ctx ;
34853561 webgpu_ctx->shader_lib = std::make_unique<ggml_webgpu_shader_lib>(dev_ctx->webgpu_global_ctx ->device );
3486- webgpu_ctx->param_arena .init (webgpu_ctx->global_ctx ->device , WEBGPU_PARAMS_BUF_SIZE_BYTES, WEBGPU_NUM_PARAM_SLOTS,
3487- webgpu_ctx->global_ctx ->capabilities .limits .minUniformBufferOffsetAlignment );
3562+ webgpu_ctx->param_arena .init (
3563+ webgpu_ctx->global_ctx ->device , WEBGPU_PARAMS_BUF_SIZE_BYTES,
3564+ webgpu_ctx->global_ctx ->command_submit_batch_size + WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN,
3565+ webgpu_ctx->global_ctx ->capabilities .limits .minUniformBufferOffsetAlignment );
34883566 ggml_webgpu_create_buffer (webgpu_ctx->global_ctx ->device , webgpu_ctx->set_rows_dev_error_buf ,
34893567 WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
34903568 wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, " set_rows_dev_error_buf" );
0 commit comments