ArberSephirotheca
diff --git a/‎.github/workflows/build.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/build.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/ops.md‎
Lines changed: 3 additions & 3 deletions b/‎docs/ops.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎docs/ops/WebGPU.csv‎
Lines changed: 5737 additions & 5728 deletions b/‎docs/ops/WebGPU.csv‎
Lines changed: 5737 additions & 5728 deletions
diff --git a/‎ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp‎
Lines changed: 66 additions & 2 deletions b/‎ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp‎
Lines changed: 66 additions & 2 deletions
diff --git a/‎ggml/src/ggml-webgpu/ggml-webgpu.cpp‎
Lines changed: 59 additions & 2 deletions b/‎ggml/src/ggml-webgpu/ggml-webgpu.cpp‎
Lines changed: 59 additions & 2 deletions
diff --git a/‎ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl‎
Lines changed: 64 additions & 0 deletions b/‎ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl‎
Lines changed: 7 additions & 0 deletions b/‎ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl‎
Lines changed: 7 additions & 0 deletions
@@ -456,7 +456,8 @@ jobs:
         run: |
           cd build
           # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 900
+          # test-backend-ops is too slow on llvmpipe, skip it
+          ctest -L main -E test-backend-ops --verbose --timeout 900
 
   ubuntu-24-webgpu-wasm:
     runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
 
@@ -18,7 +18,7 @@ Legend:
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
@@ -71,7 +71,7 @@ Legend:
 |                 MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                             NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
@@ -118,5 +118,5 @@ Legend:
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
@@ -495,6 +495,22 @@ struct ggml_webgpu_binary_pipeline_key_hash {
     }
 };
 
+/* Add_Id */
+
+struct ggml_webgpu_add_id_pipeline_key {
+    bool inplace;
+
+    bool operator==(const ggml_webgpu_add_id_pipeline_key & other) const { return inplace == other.inplace; }
+};
+
+struct ggml_webgpu_add_id_pipeline_key_hash {
+    size_t operator()(const ggml_webgpu_add_id_pipeline_key & key) const {
+        size_t seed = 0;
+        ggml_webgpu_hash_combine(seed, key.inplace);
+        return seed;
+    }
+};
+
 /** Unary **/
 
 struct ggml_webgpu_unary_pipeline_key {
@@ -1058,7 +1074,9 @@ class ggml_webgpu_shader_lib {
     std::unordered_map<ggml_webgpu_pad_pipeline_key, webgpu_pipeline, ggml_webgpu_pad_pipeline_key_hash>
         pad_pipelines;              // circular/non-circular
     std::unordered_map<ggml_webgpu_binary_pipeline_key, webgpu_pipeline, ggml_webgpu_binary_pipeline_key_hash>
-        binary_pipelines;           // type/op/inplace/overlap
+        binary_pipelines;           // type/op/inplace/overlap/src_overlap
+    std::unordered_map<ggml_webgpu_add_id_pipeline_key, webgpu_pipeline, ggml_webgpu_add_id_pipeline_key_hash>
+        add_id_pipelines;           // inplace
     std::unordered_map<ggml_webgpu_concat_pipeline_key, webgpu_pipeline, ggml_webgpu_concat_pipeline_key_hash>
         concat_pipelines;           // type
     std::unordered_map<ggml_webgpu_repeat_pipeline_key, webgpu_pipeline, ggml_webgpu_repeat_pipeline_key_hash>
@@ -1433,6 +1451,7 @@ class ggml_webgpu_shader_lib {
                         case GGML_TYPE_IQ3_S:
                         case GGML_TYPE_IQ1_S:
                         case GGML_TYPE_IQ4_NL:
+                        case GGML_TYPE_MXFP4:
                             {
                                 // Quantized types using u32 buffers for portability.
                                 defines.push_back("SRC_TYPE=u32");
@@ -1451,6 +1470,7 @@ class ggml_webgpu_shader_lib {
                     defines.push_back(type_upper + "_SCALE_MIN");
                     defines.push_back(type_upper + "_TABLES");
                     defines.push_back(type_upper + "_GRID");
+                    defines.push_back(type_upper + "_LUT");
 
                     variant += "_";
                     variant += type_str;
@@ -1460,7 +1480,7 @@ class ggml_webgpu_shader_lib {
                     if (key.src_type == GGML_TYPE_Q1_0) {
                         defines.push_back("BLOCK_SIZE=128u");
                     } else if ((key.src_type >= GGML_TYPE_Q4_0 && key.src_type <= GGML_TYPE_Q8_1) ||
-                               key.src_type == GGML_TYPE_IQ4_NL) {
+                               key.src_type == GGML_TYPE_IQ4_NL || key.src_type == GGML_TYPE_MXFP4) {
                         defines.push_back("BLOCK_SIZE=32u");
                     } else if (key.src_type >= GGML_TYPE_Q2_K) {
                         defines.push_back("BLOCK_SIZE=256u");
@@ -1774,6 +1794,9 @@ class ggml_webgpu_shader_lib {
                             defines.push_back(type_upper + "_GRID");
                             defines.push_back(type_upper + "_TABLES");
                             break;
+                        case GGML_TYPE_MXFP4:
+                            defines.push_back(type_upper + "_LUT");
+                            break;
                         default:
                             break;
                     }
@@ -1908,6 +1931,9 @@ class ggml_webgpu_shader_lib {
                             defines.push_back(type_upper + "_GRID");
                             defines.push_back(type_upper + "_TABLES");
                             break;
+                        case GGML_TYPE_MXFP4:
+                            defines.push_back(type_upper + "_LUT");
+                            break;
                         default:
                             break;
                     }
@@ -2042,6 +2068,7 @@ class ggml_webgpu_shader_lib {
                         case GGML_TYPE_IQ3_S:
                         case GGML_TYPE_IQ1_S:
                         case GGML_TYPE_IQ4_NL:
+                        case GGML_TYPE_MXFP4:
                             {
                                 // Quantized types using u32 buffers for portability.
                                 defines.push_back("SRC0_TYPE=u32");
@@ -2169,6 +2196,9 @@ class ggml_webgpu_shader_lib {
                             defines.push_back(type_upper + "_GRID");
                             defines.push_back(type_upper + "_TABLES");
                             break;
+                        case GGML_TYPE_MXFP4:
+                            defines.push_back(type_upper + "_LUT");
+                            break;
                         default:
                             break;
                     }
@@ -2286,6 +2316,9 @@ class ggml_webgpu_shader_lib {
                             defines.push_back(type_upper + "_GRID");
                             defines.push_back(type_upper + "_TABLES");
                             break;
+                        case GGML_TYPE_MXFP4:
+                            defines.push_back(type_upper + "_LUT");
+                            break;
                         default:
                             break;
                     }
@@ -2503,6 +2536,37 @@ class ggml_webgpu_shader_lib {
         return binary_pipelines[key];
     }
 
+    webgpu_pipeline get_add_id_pipeline(const ggml_webgpu_shader_lib_context & context) {
+        ggml_webgpu_add_id_pipeline_key key = {};
+        key.inplace                         = ggml_webgpu_tensor_equal(context.src0, context.dst);
+
+        auto it = add_id_pipelines.find(key);
+        if (it != add_id_pipelines.end()) {
+            return it->second;
+        }
+
+        std::vector<std::string> defines;
+        std::string              variant    = "add_id";
+        const char *             shader_src = wgsl_add_id;
+
+        if (key.inplace) {
+            defines.push_back("INPLACE");
+            variant += "_inplace";
+        }
+
+        defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
+
+        auto processed              = preprocessor.preprocess(shader_src, defines);
+        auto pipeline_decisions     = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+        pipeline_decisions->wg_size = context.max_wg_size;
+        pipeline_decisions->inplace = key.inplace;
+
+        webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
+        pipeline.context         = pipeline_decisions;
+        add_id_pipelines[key]    = pipeline;
+        return pipeline;
+    }
+
     webgpu_pipeline get_concat_pipeline(const ggml_webgpu_shader_lib_context & context) {
         ggml_webgpu_concat_pipeline_key key = {};
         key.type                            = context.dst->type;
 
@@ -1411,8 +1411,6 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
                 case GGML_TYPE_Q3_K:
                 case GGML_TYPE_Q2_K:
                 case GGML_TYPE_Q1_0:
-                    use_fast = true;
-                    break;
                 case GGML_TYPE_IQ1_S:
                 case GGML_TYPE_IQ1_M:
                 case GGML_TYPE_IQ2_XXS:
@@ -1422,6 +1420,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
                 case GGML_TYPE_IQ3_S:
                 case GGML_TYPE_IQ4_NL:
                 case GGML_TYPE_IQ4_XS:
+                case GGML_TYPE_MXFP4:
                     use_fast = true;
                     break;
                 default:
@@ -2145,6 +2144,56 @@ static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context & ctx,
     return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
+static webgpu_encoded_op ggml_webgpu_add_id(webgpu_context & ctx,
+                                            ggml_tensor *    src0,
+                                            ggml_tensor *    src1,
+                                            ggml_tensor *    src2,
+                                            ggml_tensor *    dst) {
+    ggml_webgpu_shader_lib_context shader_lib_ctx = {};
+    shader_lib_ctx.src0                           = src0;
+    shader_lib_ctx.src1                           = src1;
+    shader_lib_ctx.src2                           = src2;
+    shader_lib_ctx.dst                            = dst;
+    shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
+
+    webgpu_pipeline pipeline = ctx->shader_lib->get_add_id_pipeline(shader_lib_ctx);
+
+    auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
+
+    std::vector<uint32_t> params = {
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src2) / ggml_type_size(src2->type)),
+        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
+        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
+        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
+        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
+        (uint32_t) (src2->nb[0] / ggml_type_size(src2->type)),
+        (uint32_t) (src2->nb[1] / ggml_type_size(src2->type)),
+        (uint32_t) dst->ne[0],
+        (uint32_t) dst->ne[1],
+        (uint32_t) dst->ne[2],
+    };
+
+    std::vector<wgpu::BindGroupEntry> entries;
+
+    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0));
+    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1));
+    entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, src2));
+
+    if (!decisions->inplace) {
+        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 3, dst));
+    }
+
+    uint32_t       wg_x           = 1;
+    uint32_t       wg_y           = 1;
+    uint32_t       total_wg       = ggml_nrows(dst);
+    const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+    compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
+
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
+}
+
 static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx,
                                             ggml_tensor *    src0,
                                             ggml_tensor *    src1,
@@ -2918,6 +2967,8 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
         case GGML_OP_MUL:
         case GGML_OP_DIV:
             return ggml_webgpu_binary_op(ctx, src0, src1, node);
+        case GGML_OP_ADD_ID:
+            return ggml_webgpu_add_id(ctx, src0, src1, src2, node);
         case GGML_OP_CONCAT:
             return ggml_webgpu_concat(ctx, src0, src1, node);
         case GGML_OP_REPEAT:
@@ -3867,6 +3918,7 @@ static bool ggml_webgpu_supported_qtype(ggml_type type) {
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_IQ4_NL:
         case GGML_TYPE_IQ4_XS:
+        case GGML_TYPE_MXFP4:
             return true;
         default:
             return false;
@@ -3905,6 +3957,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
             supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) &&
                           (src1->type == op->type);
             break;
+        case GGML_OP_ADD_ID:
+            supports_op = src0->type == GGML_TYPE_F32;
+            break;
         case GGML_OP_CONCAT:
             supports_op = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
             break;
@@ -3962,6 +4017,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                             case GGML_TYPE_IQ1_M:
                             case GGML_TYPE_IQ4_NL:
                             case GGML_TYPE_IQ4_XS:
+                            case GGML_TYPE_MXFP4:
                                 supports_op = true;
                                 break;
                             default:
@@ -4001,6 +4057,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                         case GGML_TYPE_IQ3_S:
                         case GGML_TYPE_IQ4_NL:
                         case GGML_TYPE_IQ4_XS:
+                        case GGML_TYPE_MXFP4:
                             supports_op = true;
                             break;
                         default:
 
@@ -0,0 +1,64 @@
+struct Params {
+    offset_src0: u32,
+    offset_src1: u32,
+    offset_ids: u32,
+    offset_dst: u32,
+
+    nb01: u32,
+    nb02: u32,
+    nb11: u32,
+    nb20: u32,
+    nb21: u32,
+
+    ne0: u32,
+    ne1: u32,
+    ne2: u32,
+};
+
+@group(0) @binding(0) var<storage, read_write> src0: array<f32>; // [n_embd, n_experts_used, n_token]
+@group(0) @binding(1) var<storage, read_write> src1: array<f32>; // [n_embd, n_experts]
+@group(0) @binding(2) var<storage, read_write> ids:  array<i32>; // [n_experts_used, n_token]
+
+#ifdef INPLACE
+
+@group(0) @binding(3)
+var<uniform> params: Params;
+
+#else
+
+@group(0) @binding(3)
+var<storage, read_write> dst: array<f32>;
+
+@group(0) @binding(4)
+var<uniform> params: Params;
+
+#endif
+
+@compute @workgroup_size(WG_SIZE)
+fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
+        @builtin(num_workgroups) num_wg: vec3<u32>,
+        @builtin(local_invocation_id) local_id: vec3<u32>) {
+
+    let wg_linear = wg_id.x + wg_id.y * num_wg.x;
+
+    if (wg_linear < params.ne1 * params.ne2) {
+        let thread_id = local_id.x;
+        let i2 = wg_linear / params.ne1;
+        let i1 = wg_linear % params.ne1;
+
+        let i11 = u32(ids[params.offset_ids + i1 * params.nb20 + i2 * params.nb21]);
+
+        let src0_row = params.offset_src0 + i1 * params.nb01 + i2 * params.nb02;
+        let src1_row = params.offset_src1 + i11 * params.nb11;
+        let dst_row = params.offset_dst + i1 * params.ne0 + i2 * (params.ne0 * params.ne1);
+
+        for (var i = thread_id;i < params.ne0; i += WG_SIZE) {
+#ifdef INPLACE
+            src0[src0_row + i] = src0[src0_row + i] + src1[src1_row + i];
+#else
+            dst[dst_row + i] = src0[src0_row + i] + src1[src1_row + i];
+#endif
+        }
+    }
+
+}
@@ -896,3 +896,10 @@ const kvalues_iq4nl = array<i32, 16>(
 );
 
 #endif
+
+#ifdef MXFP4_LUT
+const kvalues_mxfp4 = array<i32, 16>(
+    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12
+);
+#endif
+