Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
e391405
ggml(webgpu): fix the busy-polls in Emscripten in the waitAny after …
Constannnnnt Mar 23, 2026
eb62cae
Merge branch 'master' of github.com:ggml-org/llama.cpp into constant/…
jjhartmann Apr 2, 2026
d24af3b
Merge branch 'master' of github.com:ggml-org/llama.cpp into constant/…
jjhartmann Apr 3, 2026
43dfbdf
Merge with upstream
jjhartmann Apr 3, 2026
09c49b3
Fix GET_ROWS packed integer NaN when using f16 as memory buffer in sh…
jjhartmann Apr 4, 2026
4cc515f
Update Unary wgsl EXP and EXPM1 for f16 stability
jjhartmann Apr 5, 2026
b86e765
Fix GET_ROWS IQ4_XS struct for NaN f16 canonicalization
jjhartmann Apr 5, 2026
ae9dac6
Fix numerical precision for unary sqrt when working with f16
jjhartmann Apr 5, 2026
518e315
Fix NaN canonicalization for packed integers using f16
jjhartmann Apr 5, 2026
956d910
Update err threshold for binary div ops when using f16
jjhartmann Apr 5, 2026
2747c8f
backend: Keep one Dawn/WebGPU instance alive for the lifetime of the …
Constannnnnt Apr 5, 2026
e4a97ce
merge: quant fix + static backend
Constannnnnt Apr 5, 2026
34e9216
merge: merge with upstream master
Constannnnnt Apr 6, 2026
f272f06
merge: merge with upstream master and uncomment webgpu logs
Constannnnnt Apr 7, 2026
7a8d382
clean: uncomment existing code logs
Constannnnnt Apr 7, 2026
f599675
clean: clean the unnecessary debug info
Constannnnnt Apr 7, 2026
f501dc7
Refactor and generalize dequant helpers
jjhartmann Apr 9, 2026
018d470
Remove deprecated quant structs
jjhartmann Apr 9, 2026
bc8b42e
Refactor shader defines to reduce repetition
jjhartmann Apr 9, 2026
b149e92
Remove error override for F16 type
jjhartmann Apr 9, 2026
e364006
Merge pull request #1 from noumena-labs/jeremy/dev/llama.cpp-pr-21521…
jjhartmann Apr 9, 2026
71dee38
merge with upstream
Constannnnnt Apr 9, 2026
f1ba334
fix: fix the accidental removal of the proper initialization of ctx
Constannnnnt Apr 9, 2026
41e0a26
clean: clean legacy and format code
Constannnnnt Apr 9, 2026
7b09fbf
fix: did not modify tests ops
Constannnnnt Apr 9, 2026
fa0f181
merge: fix the test change and merge with upstream
Constannnnnt Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 52 additions & 3 deletions ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,32 @@ class ggml_webgpu_shader_lib {
std::string type_upper = type_str;
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);

switch (key.src_type)
{
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_NL:
{
// Quantized types using u32 buffers for portability.
defines.push_back("SRC_TYPE=u32");
defines.push_back("U32_DEQUANT_HELPERS");
break;
}
default:
{
defines.push_back(std::string("SRC_TYPE=") + type_str);
}
}

defines.push_back("BYTE_HELPERS");
defines.push_back(type_upper + "_T");
defines.push_back(type_upper);
Expand All @@ -1125,7 +1151,6 @@ class ggml_webgpu_shader_lib {
variant += "_";
variant += type_str;

defines.push_back(std::string("SRC_TYPE=") + type_str);
defines.push_back("DST_TYPE=f32");

if ((key.src_type >= GGML_TYPE_Q4_0 && key.src_type <= GGML_TYPE_Q8_1) ||
Expand Down Expand Up @@ -1593,11 +1618,35 @@ class ggml_webgpu_shader_lib {
break;
default:
{
// quantized types
std::string type_upper = src0_name;
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);

defines.push_back(std::string("SRC0_TYPE=") + src0_name);
switch (context.src0->type)
{
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_NL:
{
// Quantized types using u32 buffers for portability.
defines.push_back("SRC0_TYPE=u32");
defines.push_back("U32_DEQUANT_HELPERS");
break;
}
default:
{
defines.push_back(std::string("SRC0_TYPE=") + src0_name);
}
}

defines.push_back("BYTE_HELPERS");
defines.push_back(type_upper + "_T");
defines.push_back(type_upper);
Expand Down
37 changes: 28 additions & 9 deletions ggml/src/ggml-webgpu/ggml-webgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim

/* End Constants */

// Returns the wgpu::CallbackMode used for every async WebGPU operation in this
// backend (queue waits, buffer maps, adapter/device requests), so the choice is
// made in one place per build target.
static inline wgpu::CallbackMode ggml_webgpu_callback_mode() {
#ifdef __EMSCRIPTEN__
// Emscripten: deliver callbacks when events are processed. NOTE(review): per
// the commit message this avoids busy-polling inside WaitAny — confirm.
return wgpu::CallbackMode::AllowProcessEvents;
#else
// Native (Dawn): callbacks are allowed to fire spontaneously.
return wgpu::CallbackMode::AllowSpontaneous;
#endif
}

// This is a "fake" base pointer, since WebGPU buffers do not have pointers to
// their locations.
static void * const webgpu_ptr_base = (void *) (uintptr_t) 0x1000; // NOLINT
Expand Down Expand Up @@ -474,7 +482,7 @@ static void ggml_backend_webgpu_wait_queue(webgpu_global_context & ctx) {

const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
ctx->queue.OnSubmittedWorkDone(
wgpu::CallbackMode::AllowSpontaneous,
ggml_webgpu_callback_mode(),
[&callback_status, &callback_message](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
callback_status = status;
callback_message = std::string(message);
Expand All @@ -494,7 +502,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_global_context & ctx,
std::string callback_message;

const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
buffer.MapAsync(mode, offset, size, ggml_webgpu_callback_mode(),
[&callback_status, &callback_message](wgpu::MapAsyncStatus status, wgpu::StringView message) {
callback_status = status;
callback_message = std::string(message);
Expand Down Expand Up @@ -526,7 +534,11 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) {
encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize());
wgpu::CommandBuffer commands = encoder.Finish();
ctx->queue.Submit(1, &commands);
ggml_backend_webgpu_map_buffer(ctx, ctx->debug_host_buf, wgpu::MapMode::Read, 0, ctx->debug_host_buf.GetSize());
if (!ggml_backend_webgpu_map_buffer(ctx, ctx->debug_host_buf, wgpu::MapMode::Read, 0,
ctx->debug_host_buf.GetSize())) {
GGML_LOG_ERROR("ggml_webgpu: Debug buffer map failed\n");
return;
}
const float * debug_data = (const float *) ctx->debug_host_buf.GetConstMappedRange();
std::cout << "debug[0]: " << debug_data[0] << "\n";
ctx->debug_host_buf.Unmap();
Expand All @@ -542,7 +554,7 @@ static void ggml_backend_webgpu_collect_profile_futures(webgpu_global_context &
auto ts_bufs = command.timestamp_query_bufs;

wgpu::Future f = ts_bufs.host_buf.MapAsync(
wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), ggml_webgpu_callback_mode(),
[ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
if (status != wgpu::MapAsyncStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str());
Expand Down Expand Up @@ -3420,7 +3432,7 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {

ctx->webgpu_global_ctx->instance.WaitAny(
ctx->webgpu_global_ctx->instance.RequestAdapter(
&options, wgpu::CallbackMode::AllowSpontaneous,
&options, ggml_webgpu_callback_mode(),
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
if (status != wgpu::RequestAdapterStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
Expand Down Expand Up @@ -3491,8 +3503,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
dev_desc.requiredFeatures = required_features.data();
dev_desc.requiredFeatureCount = required_features.size();
dev_desc.SetDeviceLostCallback(
wgpu::CallbackMode::AllowSpontaneous,
[](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
ggml_webgpu_callback_mode(),
[ctx](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
if (reason == wgpu::DeviceLostReason::Destroyed) {
return;
}
Expand Down Expand Up @@ -3525,7 +3537,7 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {

ctx->webgpu_global_ctx->instance.WaitAny(
ctx->webgpu_global_ctx->adapter.RequestDevice(
&dev_desc, wgpu::CallbackMode::AllowSpontaneous,
&dev_desc, ggml_webgpu_callback_mode(),
[ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
if (status != wgpu::RequestDeviceStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", std::string(message).c_str());
Expand Down Expand Up @@ -4046,6 +4058,13 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
ctx.name = GGML_WEBGPU_NAME;
ctx.device_count = 0;

// Keep one Dawn/WebGPU instance alive for the lifetime of the static backend
// registry. Recreating it on repeated registry lookups can invalidate
// adapter/device references that are still held by the backend/device layer.
if (ctx.webgpu_global_ctx != nullptr && ctx.webgpu_global_ctx->instance != nullptr) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change generally seems reasonable, but is it actually a real problem? My understanding is that normally, the reg function is only called once by llama.cpp, and I haven't seen any errors related to this running in different contexts or in the browser.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I added this was that in the inference system we built around llama.cpp and wasm, we tested a scenario where a new model was loaded to replace the model already in the system, and the system crashed. My understanding of this issue at the time was that resources like buffers and pipelines are tied to the wgpu::Device and wgpu::Instance that created them. When I switched the model, llama.cpp created a new device, and the system either tried to use an old buffer with the new device or tried to clean up old resources using the new context. Therefore, to prevent this, I think it is safer to force llama.cpp to reuse the existing instance, device, etc. for the entire lifetime of the system.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok thanks for the context, this makes sense!

return &reg;
}

wgpu::InstanceDescriptor instance_descriptor{};
std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
instance_descriptor.requiredFeatures = instance_features.data();
Expand All @@ -4063,11 +4082,11 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
ctx.webgpu_global_ctx = webgpu_global_context(new webgpu_global_context_struct());
ctx.webgpu_global_ctx->instance = std::move(inst);

// Probe for adapter support
wgpu::Adapter adapter;
if (ctx.webgpu_global_ctx->instance != nullptr) {
wgpu::RequestAdapterOptions options = {};

// probe for adapter support
ctx.webgpu_global_ctx->instance.WaitAny(
ctx.webgpu_global_ctx->instance.RequestAdapter(
&options, wgpu::CallbackMode::AllowSpontaneous,
Expand Down
139 changes: 30 additions & 109 deletions ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -9,36 +9,44 @@ fn get_byte_i32(value: u32, index: u32) -> i32 {
#endif

#ifdef U32_DEQUANT_HELPERS
fn load_src0_u16_at(byte_offset: u32) -> u32 {
let word = src0[byte_offset / 4u];
let shift = (byte_offset & 2u) * 8u;
Comment thread
reeselevine marked this conversation as resolved.
return (word >> shift) & 0xFFFFu;
// Loads the u16 that starts at `byte_offset` inside `buf`, zero-extended into
// the low 16 bits of the returned u32. Assumes the usual little-endian packing
// of bytes into u32 words (ggml host data) — TODO confirm at call sites.
// `byte_offset` is expected to be 2-byte aligned: only bit 1 selects the
// half-word, so an odd offset reads the same half-word as offset - 1.
fn load_u16_at(
buf: ptr<storage, array<u32>, read_write>,
byte_offset: u32) -> u32 {
// Word containing the target half-word.
let word = buf[byte_offset / 4];
// (byte_offset & 2) * 8 == 0 for the low half-word, 16 for the high one.
let shift = (byte_offset & 0x2) * 8;
return (word >> shift) & 0xFFFF;
}

fn load_src0_u32_at(byte_offset: u32) -> u32 {
let word_idx = byte_offset / 4u;
let shift = (byte_offset & 3u) * 8u;
let lo = src0[word_idx];
if (shift == 0u) {
return lo;
}
let hi = src0[word_idx + 1u];
return (lo >> shift) | (hi << (32u - shift));
// Loads a u32 starting at an arbitrary (possibly unaligned) `byte_offset`
// inside `buf`, stitching it together from the two overlapping u32 words.
fn load_u32_at(
buf: ptr<storage, array<u32>, read_write>,
byte_offset: u32) -> u32 {
let word_idx = byte_offset / 4;
// Number of bits the target value is displaced within the first word.
let shift = (byte_offset & 0x3) * 8;
let lo = buf[word_idx];
// NOTE(review): this second load happens even when shift == 0, so at the very
// last word of the buffer it indexes one word past the data; WGSL robustness
// clamps the access and `select` discards the value — confirm callers never
// rely on the out-of-range word.
let hi = buf[word_idx + 1];
// WGSL shifts use only the low 5 bits of the amount, so at shift == 0 the
// expression would be `lo | (hi << 0)` — wrong. The select below returns the
// aligned word `lo` directly in that case.
let shifted = (lo >> shift) | (hi << (32 - shift));
return select(shifted, lo, shift == 0);
}

fn load_src0_f16_at(byte_offset: u32) -> f16 {
let packed = unpack2x16float(load_src0_u16_at(byte_offset));
// Loads the f16 stored at (2-byte aligned) `byte_offset` inside `buf`.
// load_u16_at zero-extends the half-word into the low 16 bits, so component 0
// of unpack2x16float is the requested value; it is decoded to f32 and then
// narrowed back to f16.
fn load_f16_at(
buf: ptr<storage, array<u32>, read_write>,
byte_offset: u32) -> f16 {
let packed = unpack2x16float(load_u16_at(buf, byte_offset));
return f16(packed[0]);
}
#endif

#ifdef Q4_0_T
struct q4_0 {
d: f16,
qs: array<f16, 8>
};
// Loads the f16 stored at (2-byte aligned) `byte_offset` inside `buf` and
// widens it to f32 without ever materializing an f16 value (same half-word
// extraction as load_u16_at, inlined here).
fn load_f16_as_f32_at(
buf: ptr<storage, array<u32>, read_write>,
byte_offset: u32) -> f32 {
let word = buf[byte_offset / 4];
// 0 for the low half-word, 16 for the high one.
let shift = (byte_offset & 0x2) * 8;
let d_bits = (word >> shift) & 0xFFFF;
// Component 0 decodes the low 16 bits, which hold the extracted half-word.
return unpack2x16float(d_bits)[0];
}
#endif



#ifdef Q4_1_T
struct q4_1 {
d: f16,
Expand All @@ -47,13 +55,6 @@ struct q4_1 {
};
#endif

#ifdef Q5_0_T
struct q5_0 {
d: f16,
qh: array<f16, 2>,
qs: array<f16, 8>
};
#endif

#ifdef Q5_1_T
struct q5_1 {
Expand All @@ -64,12 +65,6 @@ struct q5_1 {
};
#endif

#ifdef Q8_0_T
struct q8_0 {
d: f16,
qs: array<f16, 16>
};
#endif

#ifdef Q8_1_T
struct q8_1 {
Expand All @@ -88,14 +83,6 @@ struct q2_K {
};
#endif

#ifdef Q3_K_T
struct q3_K {
hmask: array<f16, 16>,
qs: array<f16, 32>,
scales: array<f16, 6>,
d: f16
};
#endif

#if defined(Q4_K_SCALE_MIN) || defined(Q5_K_SCALE_MIN)
fn get_scale_min(is: u32, scales: array<u32, 3>) -> vec2<f32> {
Expand Down Expand Up @@ -132,64 +119,6 @@ struct q5_K {
};
#endif

#ifdef Q6_K_T
struct q6_K {
ql: array<f16, 64>,
qh: array<f16, 32>,
scales: array<f16, 8>,
d: f16
};
#endif

#ifdef IQ2_XXS_T
struct iq2_xxs {
d: f16,
qs: array<f16, 32>
};
#endif

#ifdef IQ2_XS_T
struct iq2_xs {
d: f16,
qs: array<f16, 32>,
scales: array<f16, 4>
};
#endif

#ifdef IQ2_S_T
struct iq2_s {
d: f16,
qs: array<f16, 32>,
qh: array<f16, 4>,
scales: array<f16, 4>
};
#endif

#ifdef IQ3_XXS_T
struct iq3_xxs {
d: f16,
qs: array<f16, 48>
};
#endif

#ifdef IQ3_S_T
struct iq3_s {
d: f16,
qs: array<f16, 32>,
qh: array<f16, 4>,
signs: array<f16, 16>,
scales: array<f16, 2>
};
#endif

#ifdef IQ1_S_T
struct iq1_s {
d: f16,
qs: array<f16, 16>,
qh: array<f16, 8>
};
#endif

#ifdef IQ1_M_T
struct iq1_m {
qs: array<u32, 8>,
Expand All @@ -198,17 +127,9 @@ struct iq1_m {
};
#endif

#ifdef IQ4_NL_T
struct iq4_nl {
d: f16,
qs: array<f16, 8>,
};
#endif

#ifdef IQ4_XS_T
struct iq4_xs {
d: f16,
scales_h: f16,
d_scales_h: u32,
scales_l: u32,
qs: array<u32, 32>
};
Expand Down
Loading
Loading