Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
e391405
ggml(webgpu): fix the busy-polls in Emscripten in the waitAny after …
Constannnnnt Mar 23, 2026
eb62cae
Merge branch 'master' of github.com:ggml-org/llama.cpp into constant/…
jjhartmann Apr 2, 2026
d24af3b
Merge branch 'master' of github.com:ggml-org/llama.cpp into constant/…
jjhartmann Apr 3, 2026
43dfbdf
Merge with upstream
jjhartmann Apr 3, 2026
09c49b3
Fix GET_ROWS packed integer NaN when using f16 as memory buffer in sh…
jjhartmann Apr 4, 2026
4cc515f
Update Unary wgsl EXP and EXPM1 for f16 stability
jjhartmann Apr 5, 2026
b86e765
Fix GET_ROWS IQ4_XS struct for NaN f16 canonicalization
jjhartmann Apr 5, 2026
ae9dac6
Fix numerical precision for unary sqrt when working with f16
jjhartmann Apr 5, 2026
518e315
Fix NaN canonicalization for packed integers using f16
jjhartmann Apr 5, 2026
956d910
Update err threshold for binary div ops when using f16
jjhartmann Apr 5, 2026
2747c8f
backend: Keep one Dawn/WebGPU instance alive for the lifetime of the …
Constannnnnt Apr 5, 2026
e4a97ce
merge: quant fix + static backend
Constannnnnt Apr 5, 2026
34e9216
merge: merge with upstream master
Constannnnnt Apr 6, 2026
f272f06
merge: merge with upstream master and uncomment webgpu logs
Constannnnnt Apr 7, 2026
7a8d382
clean: uncomment existing code logs
Constannnnnt Apr 7, 2026
f599675
clean: clean the unnecessary debug info
Constannnnnt Apr 7, 2026
f501dc7
Refactor and generalize dequant helpers
jjhartmann Apr 9, 2026
018d470
Remove deprecated quant structs
jjhartmann Apr 9, 2026
bc8b42e
Refactor shader defines to reduce repetition
jjhartmann Apr 9, 2026
b149e92
Remove error override for F16 type
jjhartmann Apr 9, 2026
e364006
Merge pull request #1 from noumena-labs/jeremy/dev/llama.cpp-pr-21521…
jjhartmann Apr 9, 2026
71dee38
merge with upstream
Constannnnnt Apr 9, 2026
f1ba334
fix: fix the accidental removal of the proper initialization of ctx
Constannnnnt Apr 9, 2026
41e0a26
clean: clean legacy and format code
Constannnnnt Apr 9, 2026
7b09fbf
fix: did not modify tests ops
Constannnnnt Apr 9, 2026
fa0f181
merge: fix the test change and merge with upstream
Constannnnnt Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 52 additions & 3 deletions ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,32 @@ class ggml_webgpu_shader_lib {
std::string type_upper = type_str;
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);

switch (key.src_type)
{
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_NL:
{
// Quantized types using u32 buffers for portability.
defines.push_back("SRC_TYPE=u32");
defines.push_back("U32_DEQUANT_HELPERS");
break;
}
default:
{
defines.push_back(std::string("SRC_TYPE=") + type_str);
}
}

defines.push_back("BYTE_HELPERS");
defines.push_back(type_upper + "_T");
defines.push_back(type_upper);
Expand All @@ -1125,7 +1151,6 @@ class ggml_webgpu_shader_lib {
variant += "_";
variant += type_str;

defines.push_back(std::string("SRC_TYPE=") + type_str);
defines.push_back("DST_TYPE=f32");

if ((key.src_type >= GGML_TYPE_Q4_0 && key.src_type <= GGML_TYPE_Q8_1) ||
Expand Down Expand Up @@ -1593,11 +1618,35 @@ class ggml_webgpu_shader_lib {
break;
default:
{
// quantized types
std::string type_upper = src0_name;
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);

defines.push_back(std::string("SRC0_TYPE=") + src0_name);
switch (context.src0->type)
{
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_NL:
{
// Quantized types using u32 buffers for portability.
defines.push_back("SRC0_TYPE=u32");
defines.push_back("U32_DEQUANT_HELPERS");
break;
}
default:
{
defines.push_back(std::string("SRC0_TYPE=") + src0_name);
}
}

defines.push_back("BYTE_HELPERS");
defines.push_back(type_upper + "_T");
defines.push_back(type_upper);
Expand Down
37 changes: 28 additions & 9 deletions ggml/src/ggml-webgpu/ggml-webgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ static inline void compute_2d_workgroups(uint32_t total_wg, uint32_t max_per_dim

/* End Constants */

// Returns the wgpu::CallbackMode used for every async WebGPU operation in this
// backend (queue waits, buffer maps, adapter/device requests), so the choice is
// made in one place per build target.
static inline wgpu::CallbackMode ggml_webgpu_callback_mode() {
#ifdef __EMSCRIPTEN__
// Emscripten: deliver callbacks when events are processed. NOTE(review): per
// the commit message this avoids busy-polling inside WaitAny — confirm.
return wgpu::CallbackMode::AllowProcessEvents;
#else
// Native (Dawn): callbacks are allowed to fire spontaneously.
return wgpu::CallbackMode::AllowSpontaneous;
#endif
}

// This is a "fake" base pointer, since WebGPU buffers do not have pointers to
// their locations.
static void * const webgpu_ptr_base = (void *) (uintptr_t) 0x1000; // NOLINT
Expand Down Expand Up @@ -474,7 +482,7 @@ static void ggml_backend_webgpu_wait_queue(webgpu_global_context & ctx) {

const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
ctx->queue.OnSubmittedWorkDone(
wgpu::CallbackMode::AllowSpontaneous,
ggml_webgpu_callback_mode(),
[&callback_status, &callback_message](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
callback_status = status;
callback_message = std::string(message);
Expand All @@ -494,7 +502,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_global_context & ctx,
std::string callback_message;

const wgpu::WaitStatus wait_status = ctx->instance.WaitAny(
buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
buffer.MapAsync(mode, offset, size, ggml_webgpu_callback_mode(),
[&callback_status, &callback_message](wgpu::MapAsyncStatus status, wgpu::StringView message) {
callback_status = status;
callback_message = std::string(message);
Expand Down Expand Up @@ -526,7 +534,11 @@ static void ggml_backend_webgpu_debug(webgpu_global_context & ctx) {
encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize());
wgpu::CommandBuffer commands = encoder.Finish();
ctx->queue.Submit(1, &commands);
ggml_backend_webgpu_map_buffer(ctx, ctx->debug_host_buf, wgpu::MapMode::Read, 0, ctx->debug_host_buf.GetSize());
if (!ggml_backend_webgpu_map_buffer(ctx, ctx->debug_host_buf, wgpu::MapMode::Read, 0,
ctx->debug_host_buf.GetSize())) {
GGML_LOG_ERROR("ggml_webgpu: Debug buffer map failed\n");
return;
}
const float * debug_data = (const float *) ctx->debug_host_buf.GetConstMappedRange();
std::cout << "debug[0]: " << debug_data[0] << "\n";
ctx->debug_host_buf.Unmap();
Expand All @@ -542,7 +554,7 @@ static void ggml_backend_webgpu_collect_profile_futures(webgpu_global_context &
auto ts_bufs = command.timestamp_query_bufs;

wgpu::Future f = ts_bufs.host_buf.MapAsync(
wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), ggml_webgpu_callback_mode(),
[ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
if (status != wgpu::MapAsyncStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str());
Expand Down Expand Up @@ -3420,7 +3432,7 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {

ctx->webgpu_global_ctx->instance.WaitAny(
ctx->webgpu_global_ctx->instance.RequestAdapter(
&options, wgpu::CallbackMode::AllowSpontaneous,
&options, ggml_webgpu_callback_mode(),
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
if (status != wgpu::RequestAdapterStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
Expand Down Expand Up @@ -3491,8 +3503,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
dev_desc.requiredFeatures = required_features.data();
dev_desc.requiredFeatureCount = required_features.size();
dev_desc.SetDeviceLostCallback(
wgpu::CallbackMode::AllowSpontaneous,
[](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
ggml_webgpu_callback_mode(),
[ctx](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
if (reason == wgpu::DeviceLostReason::Destroyed) {
return;
}
Expand Down Expand Up @@ -3525,7 +3537,7 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {

ctx->webgpu_global_ctx->instance.WaitAny(
ctx->webgpu_global_ctx->adapter.RequestDevice(
&dev_desc, wgpu::CallbackMode::AllowSpontaneous,
&dev_desc, ggml_webgpu_callback_mode(),
[ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
if (status != wgpu::RequestDeviceStatus::Success) {
GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", std::string(message).c_str());
Expand Down Expand Up @@ -4046,6 +4058,13 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
ctx.name = GGML_WEBGPU_NAME;
ctx.device_count = 0;

// Keep one Dawn/WebGPU instance alive for the lifetime of the static backend
// registry. Recreating it on repeated registry lookups can invalidate
// adapter/device references that are still held by the backend/device layer.
if (ctx.webgpu_global_ctx != nullptr && ctx.webgpu_global_ctx->instance != nullptr) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change generally seems reasonable, but is it actually a real problem? My understanding is that normally, the reg function is only called once by llama.cpp, and I haven't seen any errors related to this running in different contexts or in the browser.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I added this was that in the inference system we built around llama.cpp and wasm, we tested a scenario where a new model was loaded to replace the model already in the system, and the system crashed. My understanding of this issue at the time was that resources like buffers and pipelines are tied to the wgpu::Device and wgpu::Instance that created them. When I switched the model, llama.cpp created a new device, and the system either tried to use an old buffer with the new device or tried to clean up old resources using the new context. Therefore, to prevent this, I think it is safer to force llama.cpp to reuse the existing instance, device, etc. for the entire lifetime of the system.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok thanks for the context, this makes sense!

return &reg;
}

wgpu::InstanceDescriptor instance_descriptor{};
std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
instance_descriptor.requiredFeatures = instance_features.data();
Expand All @@ -4063,11 +4082,11 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
ctx.webgpu_global_ctx = webgpu_global_context(new webgpu_global_context_struct());
ctx.webgpu_global_ctx->instance = std::move(inst);

// Probe for adapter support
wgpu::Adapter adapter;
if (ctx.webgpu_global_ctx->instance != nullptr) {
wgpu::RequestAdapterOptions options = {};

// probe for adapter support
ctx.webgpu_global_ctx->instance.WaitAny(
ctx.webgpu_global_ctx->instance.RequestAdapter(
&options, wgpu::CallbackMode::AllowSpontaneous,
Expand Down
139 changes: 30 additions & 109 deletions ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -9,36 +9,44 @@ fn get_byte_i32(value: u32, index: u32) -> i32 {
#endif

#ifdef U32_DEQUANT_HELPERS
fn load_src0_u16_at(byte_offset: u32) -> u32 {
let word = src0[byte_offset / 4u];
let shift = (byte_offset & 2u) * 8u;
Comment thread
reeselevine marked this conversation as resolved.
return (word >> shift) & 0xFFFFu;
// Loads the u16 that starts at `byte_offset` inside `buf`, zero-extended into
// the low 16 bits of the returned u32. Assumes the usual little-endian packing
// of bytes into u32 words (ggml host data) — TODO confirm at call sites.
// `byte_offset` is expected to be 2-byte aligned: only bit 1 selects the
// half-word, so an odd offset reads the same half-word as offset - 1.
fn load_u16_at(
buf: ptr<storage, array<u32>, read_write>,
byte_offset: u32) -> u32 {
// Word containing the target half-word.
let word = buf[byte_offset / 4];
// (byte_offset & 2) * 8 == 0 for the low half-word, 16 for the high one.
let shift = (byte_offset & 0x2) * 8;
return (word >> shift) & 0xFFFF;
}

fn load_src0_u32_at(byte_offset: u32) -> u32 {
let word_idx = byte_offset / 4u;
let shift = (byte_offset & 3u) * 8u;
let lo = src0[word_idx];
if (shift == 0u) {
return lo;
}
let hi = src0[word_idx + 1u];
return (lo >> shift) | (hi << (32u - shift));
// Loads a u32 starting at an arbitrary (possibly unaligned) `byte_offset`
// inside `buf`, stitching it together from the two overlapping u32 words.
fn load_u32_at(
buf: ptr<storage, array<u32>, read_write>,
byte_offset: u32) -> u32 {
let word_idx = byte_offset / 4;
// Number of bits the target value is displaced within the first word.
let shift = (byte_offset & 0x3) * 8;
let lo = buf[word_idx];
// NOTE(review): this second load happens even when shift == 0, so at the very
// last word of the buffer it indexes one word past the data; WGSL robustness
// clamps the access and `select` discards the value — confirm callers never
// rely on the out-of-range word.
let hi = buf[word_idx + 1];
// WGSL shifts use only the low 5 bits of the amount, so at shift == 0 the
// expression would be `lo | (hi << 0)` — wrong. The select below returns the
// aligned word `lo` directly in that case.
let shifted = (lo >> shift) | (hi << (32 - shift));
return select(shifted, lo, shift == 0);
}

fn load_src0_f16_at(byte_offset: u32) -> f16 {
let packed = unpack2x16float(load_src0_u16_at(byte_offset));
// Loads the f16 stored at (2-byte aligned) `byte_offset` inside `buf`.
// load_u16_at zero-extends the half-word into the low 16 bits, so component 0
// of unpack2x16float is the requested value; it is decoded to f32 and then
// narrowed back to f16.
fn load_f16_at(
buf: ptr<storage, array<u32>, read_write>,
byte_offset: u32) -> f16 {
let packed = unpack2x16float(load_u16_at(buf, byte_offset));
return f16(packed[0]);
}
#endif

#ifdef Q4_0_T
struct q4_0 {
d: f16,
qs: array<f16, 8>
};
// Loads the f16 stored at (2-byte aligned) `byte_offset` inside `buf` and
// widens it to f32 without ever materializing an f16 value (same half-word
// extraction as load_u16_at, inlined here).
fn load_f16_as_f32_at(
buf: ptr<storage, array<u32>, read_write>,
byte_offset: u32) -> f32 {
let word = buf[byte_offset / 4];
// 0 for the low half-word, 16 for the high one.
let shift = (byte_offset & 0x2) * 8;
let d_bits = (word >> shift) & 0xFFFF;
// Component 0 decodes the low 16 bits, which hold the extracted half-word.
return unpack2x16float(d_bits)[0];
}
#endif



#ifdef Q4_1_T
struct q4_1 {
d: f16,
Expand All @@ -47,13 +55,6 @@ struct q4_1 {
};
#endif

#ifdef Q5_0_T
struct q5_0 {
d: f16,
qh: array<f16, 2>,
qs: array<f16, 8>
};
#endif

#ifdef Q5_1_T
struct q5_1 {
Expand All @@ -64,12 +65,6 @@ struct q5_1 {
};
#endif

#ifdef Q8_0_T
struct q8_0 {
d: f16,
qs: array<f16, 16>
};
#endif

#ifdef Q8_1_T
struct q8_1 {
Expand All @@ -88,14 +83,6 @@ struct q2_K {
};
#endif

#ifdef Q3_K_T
struct q3_K {
hmask: array<f16, 16>,
qs: array<f16, 32>,
scales: array<f16, 6>,
d: f16
};
#endif

#if defined(Q4_K_SCALE_MIN) || defined(Q5_K_SCALE_MIN)
fn get_scale_min(is: u32, scales: array<u32, 3>) -> vec2<f32> {
Expand Down Expand Up @@ -132,64 +119,6 @@ struct q5_K {
};
#endif

#ifdef Q6_K_T
struct q6_K {
ql: array<f16, 64>,
qh: array<f16, 32>,
scales: array<f16, 8>,
d: f16
};
#endif

#ifdef IQ2_XXS_T
struct iq2_xxs {
d: f16,
qs: array<f16, 32>
};
#endif

#ifdef IQ2_XS_T
struct iq2_xs {
d: f16,
qs: array<f16, 32>,
scales: array<f16, 4>
};
#endif

#ifdef IQ2_S_T
struct iq2_s {
d: f16,
qs: array<f16, 32>,
qh: array<f16, 4>,
scales: array<f16, 4>
};
#endif

#ifdef IQ3_XXS_T
struct iq3_xxs {
d: f16,
qs: array<f16, 48>
};
#endif

#ifdef IQ3_S_T
struct iq3_s {
d: f16,
qs: array<f16, 32>,
qh: array<f16, 4>,
signs: array<f16, 16>,
scales: array<f16, 2>
};
#endif

#ifdef IQ1_S_T
struct iq1_s {
d: f16,
qs: array<f16, 16>,
qh: array<f16, 8>
};
#endif

#ifdef IQ1_M_T
struct iq1_m {
qs: array<u32, 8>,
Expand All @@ -198,17 +127,9 @@ struct iq1_m {
};
#endif

#ifdef IQ4_NL_T
struct iq4_nl {
d: f16,
qs: array<f16, 8>,
};
#endif

#ifdef IQ4_XS_T
struct iq4_xs {
d: f16,
scales_h: f16,
d_scales_h: u32,
scales_l: u32,
qs: array<u32, 32>
};
Expand Down
Loading
Loading