Skip to content

Commit 5b0746f

Browse files
reeselevine authored and mengqin committed
ggml-webgpu: Update register tiling matmul to use f32 accumulation (ggml-org#21644)
* Update register tiling matmul to use f32 accumulation
* fix profiling code
* Fix register tiling matmul for chrome, i'm blaming dawn
* Update batch tuning value for iOS
* compile fix
* Fix use of new load function
1 parent a9ed796 commit 5b0746f

4 files changed

Lines changed: 40 additions & 61 deletions

File tree

ggml/src/ggml-webgpu/ggml-webgpu.cpp

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim
7979

8080
/* Constants */
8181

82-
#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 32u
82+
#define WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE 64u
8383
#define WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN 10u
8484
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_MS 30000u
8585
#define WEBGPU_RUNTIME_WAIT_TIMEOUT_NS (WEBGPU_RUNTIME_WAIT_TIMEOUT_MS * 1e6)
@@ -97,14 +97,6 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim
9797

9898
/* End Constants */
9999

100-
static inline wgpu::CallbackMode ggml_webgpu_callback_mode() {
101-
#ifdef __EMSCRIPTEN__
102-
return wgpu::CallbackMode::AllowProcessEvents;
103-
#else
104-
return wgpu::CallbackMode::AllowSpontaneous;
105-
#endif
106-
}
107-
108100
// This is a "fake" base pointer, since WebGPU buffers do not have pointers to
109101
// their locations.
110102
static void * const webgpu_ptr_base = (void *) (uintptr_t) 0x1000; // NOLINT
@@ -445,34 +437,25 @@ static void ggml_backend_webgpu_check_wait_status(wgpu::WaitStatus wait_status,
445437
}
446438

447439
#ifdef __EMSCRIPTEN__
448-
// iOS browsers seem to have very strict limits on the number of in-flight GPU commands, so we need to throttle to avoid failures.
449440
EM_JS(int, ggml_webgpu_is_ios_browser, (), {
450441
const ua = navigator.userAgent;
451442
return (ua.includes('iPhone') || ua.includes('iPad')) ? 1 : 0;
452443
});
453444
#endif
454445

455-
static uint32_t ggml_backend_webgpu_get_max_inflight_batches(const wgpu::AdapterInfo & info) {
446+
// TODO: these next two functions may want tuning across different platforms and workloads.
447+
static uint32_t ggml_backend_webgpu_get_max_inflight_batches() {
456448
#ifdef __EMSCRIPTEN__
449+
// iOS has very strict limits on the number of in-flight GPU commands,
450+
// so we need to throttle to avoid failures.
457451
if (ggml_webgpu_is_ios_browser()) {
458452
return 1;
459453
}
460-
#else
461-
GGML_UNUSED(info);
462454
#endif
463-
464455
return UINT32_MAX;
465456
}
466457

467-
static uint32_t ggml_backend_webgpu_get_command_submit_batch_size(const wgpu::AdapterInfo & info) {
468-
#ifdef __EMSCRIPTEN__
469-
if (ggml_webgpu_is_ios_browser()) {
470-
return 16;
471-
}
472-
#else
473-
GGML_UNUSED(info);
474-
#endif
475-
458+
static uint32_t ggml_backend_webgpu_get_command_submit_batch_size() {
476459
return WEBGPU_DEFAULT_COMMAND_SUBMIT_BATCH_SIZE;
477460
}
478461

@@ -482,7 +465,7 @@ static void ggml_backend_webgpu_wait_queue(webgpu_global_context & ctx) {
482465

483466
const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
484467
ctx->queue.OnSubmittedWorkDone(
485-
ggml_webgpu_callback_mode(),
468+
wgpu::CallbackMode::AllowSpontaneous,
486469
[&callback_status, &callback_message](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
487470
callback_status = status;
488471
callback_message = std::string(message);
@@ -502,7 +485,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_global_context & ctx,
502485
std::string callback_message;
503486

504487
const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
505-
buffer.MapAsync(mode, offset, size, ggml_webgpu_callback_mode(),
488+
buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
506489
[&callback_status, &callback_message](wgpu::MapAsyncStatus status, wgpu::StringView message) {
507490
callback_status = status;
508491
callback_message = std::string(message);
@@ -542,15 +525,15 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) {
542525
#endif
543526

544527
#ifdef GGML_WEBGPU_GPU_PROFILE
545-
static void ggml_backend_webgpu_collect_profile_futures(webgpu_global_context & ctx,
546-
const std::vector<webgpu_command> & commands,
547-
std::vector<wgpu::FutureWaitInfo> & futures) {
528+
static void ggml_backend_webgpu_collect_profile_futures(webgpu_global_context & ctx,
529+
const std::vector<webgpu_encoded_op> & commands,
530+
std::vector<wgpu::FutureWaitInfo> & futures) {
548531
for (const auto & command : commands) {
549532
auto label = command.pipeline_name;
550533
auto ts_bufs = command.timestamp_query_bufs;
551534

552535
wgpu::Future f = ts_bufs.host_buf.MapAsync(
553-
wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), ggml_webgpu_callback_mode(),
536+
wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
554537
[ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
555538
if (status != wgpu::MapAsyncStatus::Success) {
556539
GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str());
@@ -3428,7 +3411,7 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
34283411

34293412
ctx->webgpu_global_ctx->instance.WaitAny(
34303413
ctx->webgpu_global_ctx->instance.RequestAdapter(
3431-
&options, ggml_webgpu_callback_mode(),
3414+
&options, wgpu::CallbackMode::AllowSpontaneous,
34323415
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
34333416
if (status != wgpu::RequestAdapterStatus::Success) {
34343417
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
@@ -3449,8 +3432,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
34493432
}
34503433
#endif
34513434
ctx->webgpu_global_ctx->adapter.GetInfo(&info);
3452-
ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size(info);
3453-
ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches(info);
3435+
ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size();
3436+
ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches();
34543437
wgpu::SupportedFeatures features;
34553438
ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
34563439
// we require f16 support
@@ -3501,7 +3484,7 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
35013484
dev_desc.requiredFeatures = required_features.data();
35023485
dev_desc.requiredFeatureCount = required_features.size();
35033486
dev_desc.SetDeviceLostCallback(
3504-
ggml_webgpu_callback_mode(),
3487+
wgpu::CallbackMode::AllowSpontaneous,
35053488
[ctx](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
35063489
if (reason == wgpu::DeviceLostReason::Destroyed) {
35073490
return;
@@ -3535,7 +3518,7 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
35353518

35363519
ctx->webgpu_global_ctx->instance.WaitAny(
35373520
ctx->webgpu_global_ctx->adapter.RequestDevice(
3538-
&dev_desc, ggml_webgpu_callback_mode(),
3521+
&dev_desc, wgpu::CallbackMode::AllowSpontaneous,
35393522
[ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
35403523
if (status != wgpu::RequestDeviceStatus::Success) {
35413524
GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", std::string(message).c_str());

ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -502,12 +502,6 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
502502
let d = load_f16_at(&src0, block_byte_base);
503503
let dmin = load_f16_at(&src0, block_byte_base + 2u);
504504

505-
// Load packed scales
506-
var scale_vals: array<u32, 3>;
507-
for (var i: u32 = 0u; i < 3u; i++) {
508-
scale_vals[i] = load_u32_at(&src0, block_byte_base + 4u + 4u * i);
509-
}
510-
511505
// Map k_in_block to loop structure:
512506
// Outer loop over 64-element groups (alternating q_b_idx)
513507
// Inner loop over 2 shifts per group
@@ -523,15 +517,17 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
523517
var sc: u32;
524518
var mn: u32;
525519

520+
let scale_base = block_byte_base + 4u;
521+
526522
if (is < 4u) {
527-
let sc_byte = get_byte(scale_vals[is / 4u], is % 4u);
528-
let min_byte = get_byte(scale_vals[(is + 4u) / 4u], is % 4u);
523+
let sc_byte = get_byte(load_u32_at(&src0, scale_base), is % 4u);
524+
let min_byte = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u);
529525
sc = sc_byte & 63u;
530526
mn = min_byte & 63u;
531527
} else {
532-
let sc_min_lo = get_byte(scale_vals[(is + 4u) / 4u], (is + 4u) % 4u);
533-
let sc_hi = get_byte(scale_vals[(is - 4u) / 4u], (is - 4u) % 4u);
534-
let min_hi = get_byte(scale_vals[is / 4u], is % 4u);
528+
let sc_min_lo = get_byte(load_u32_at(&src0, scale_base + 8), (is + 4u) % 4u);
529+
let sc_hi = get_byte(load_u32_at(&src0, scale_base), (is - 4u) % 4u);
530+
let min_hi = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u);
535531

536532
sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
537533
mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);
@@ -578,11 +574,6 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
578574
let d = load_f16_at(&src0, block_byte_base);
579575
let dmin = load_f16_at(&src0, block_byte_base + 2u);
580576

581-
// Load packed scales
582-
var scale_vals: array<u32, 3>;
583-
for (var i: u32 = 0u; i < 3u; i++) {
584-
scale_vals[i] = load_u32_at(&src0, block_byte_base + 4u + 4u * i);
585-
}
586577

587578
// The original loop processes elements in groups of 64
588579
// Each group of 64: q_b_idx cycles through [0,32,64,96], shift cycles [0,4]
@@ -603,15 +594,17 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
603594
var sc: u32;
604595
var mn: u32;
605596

597+
let scale_base = block_byte_base + 4u;
598+
606599
if (is < 4u) {
607-
let sc_byte = get_byte(scale_vals[is / 4u], is % 4u);
608-
let min_byte = get_byte(scale_vals[(is + 4u) / 4u], is % 4u);
600+
let sc_byte = get_byte(load_u32_at(&src0, scale_base), is % 4u);
601+
let min_byte = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u);
609602
sc = sc_byte & 63u;
610603
mn = min_byte & 63u;
611604
} else {
612-
let sc_min_lo = get_byte(scale_vals[(is + 4u) / 4u], (is + 4u) % 4u);
613-
let sc_hi = get_byte(scale_vals[(is - 4u) / 4u], (is - 4u) % 4u);
614-
let min_hi = get_byte(scale_vals[is / 4u], is % 4u);
605+
let sc_min_lo = get_byte(load_u32_at(&src0, scale_base + 8), (is + 4u) % 4u);
606+
let sc_hi = get_byte(load_u32_at(&src0, scale_base), (is - 4u) % 4u);
607+
let min_hi = get_byte(load_u32_at(&src0, scale_base + 4), is % 4u);
615608

616609
sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
617610
mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);

ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@ enable f16;
44
#include "mul_mat_decls.tmpl"
55

66
#ifdef VEC
7-
fn store_val(acc: array<array<f16, TILE_N>, TILE_M>, tn: u32, tm: u32) -> vec4<f32> {
8-
return vec4<f32>(f32(acc[tm][tn]), f32(acc[tm + 1][tn]), f32(acc[tm + 2][tn]), f32(acc[tm + 3][tn]));
7+
fn store_val(acc: array<array<f32, TILE_N>, TILE_M>, tn: u32, tm: u32) -> vec4<f32> {
8+
return vec4<f32>(acc[tm][tn], acc[tm + 1][tn], acc[tm + 2][tn], acc[tm + 3][tn]);
99
}
1010
#endif
1111

1212
#ifdef SCALAR
13-
fn store_val(acc: array<array<f16, TILE_N>, TILE_M>, tn: u32, tm: u32) -> f32 {
14-
return f32(acc[tm][tn]);
13+
fn store_val(acc: array<array<f32, TILE_N>, TILE_M>, tn: u32, tm: u32) -> f32 {
14+
return acc[tm][tn];
1515
}
1616
#endif
1717

@@ -98,7 +98,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
9898
let offset_m = wg_m * WORKGROUP_SIZE_M * TILE_M;
9999
let offset_n = wg_n * WORKGROUP_SIZE_N * TILE_N;
100100

101-
var acc: array<array<f16, TILE_N>, TILE_M>;
101+
var acc: array<array<f32, TILE_N>, TILE_M>;
102102

103103
for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) {
104104

@@ -122,7 +122,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
122122
let src1_idx = src1_n * TILE_K + k_inner;
123123
let src1_val = shmem[TILE_SRC0_SHMEM + src1_idx];
124124
for (var tm = 0u; tm < TILE_M; tm++) {
125-
acc[tm][tn] += src0_tile[tm] * src1_val;
125+
acc[tm][tn] += f32(src0_tile[tm]) * f32(src1_val);
126126
}
127127
}
128128
}

ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ enable chromium_experimental_subgroup_matrix;
66
#include "common_decls.tmpl"
77
#include "mul_mat_decls.tmpl"
88

9+
// TODO: this shader path does not work with some models like qwen2.5 on Metal devices; f16 accumulation causes NaNs.
10+
// See https://github.com/ggml-org/llama.cpp/issues/21602
11+
912
#ifdef VEC
1013
fn store_dst(shmem_idx: u32, dst_idx: u32) {
1114
dst[dst_idx] = vec4<f32>(

0 commit comments

Comments
 (0)