
Commit 3731c76

ngxson authored and xiukeding committed
mtmd: add clip_graph::build_mm() (ggml-org#20751)
* clip: add build_mm()
* apply to all models
* add TODO for bias overload
1 parent 4af0d28 commit 3731c76

15 files changed: 75 additions & 66 deletions

tools/mtmd/clip-graph.h

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,11 @@ struct clip_graph {
     virtual ~clip_graph() = default;
     virtual ggml_cgraph * build() = 0;

+    // wrapper around ggml_mul_mat, allow hooking (e.g. LoRA, clamping) depending on the model
+    // tensor w should be the weight matrix, and tensor x should be the input
+    virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const;
+    // TODO: build_mm(w, b, x) to support bias
+
     //
     // utility functions
     //
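The virtual hook lets a model customize every matmul without touching the shared graph builders. As a minimal sketch of what an override could look like, a hypothetical subclass might clamp activations to the FP16 range (the clip_graph_example struct and the clamp are illustrative assumptions, not part of this commit):

// hypothetical subclass: route every matmul through a clamp,
// the kind of model-specific hook the comment above alludes to
struct clip_graph_example : clip_graph {
    using clip_graph::clip_graph; // inherit the base constructor

    ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override {
        ggml_tensor * cur = ggml_mul_mat(ctx0, w, x);
        // keep activations inside the FP16 representable range
        return ggml_clamp(ctx0, cur, -65504.0f, 65504.0f);
    }

    ggml_cgraph * build() override {
        // ... build the model graph; shared helpers call build_mm() ...
        return gf;
    }
};

Because build_vit(), build_ffn() and build_attn() below now call build_mm() instead of ggml_mul_mat() directly, such an override applies to all projections in the shared code paths automatically.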

tools/mtmd/clip.cpp

Lines changed: 12 additions & 8 deletions
@@ -255,6 +255,10 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
     gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
 }

+ggml_tensor * clip_graph::build_mm(ggml_tensor * w, ggml_tensor * x) const {
+    return ggml_mul_mat(ctx0, w, x);
+}
+
 void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
     if (il >= 0) {
         ggml_format_name(cur, "%s-%d", name, il);

@@ -326,7 +330,7 @@ ggml_tensor * clip_graph::build_vit(
         ggml_tensor * Vcur = nullptr;
         if (layer.qkv_w != nullptr) {
             // fused qkv
-            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+            cur = build_mm(layer.qkv_w, cur);
             if (layer.qkv_b != nullptr) {
                 cur = ggml_add(ctx0, cur, layer.qkv_b);
             }

@@ -360,17 +364,17 @@ ggml_tensor * clip_graph::build_vit(

         } else {
             // separate q, k, v
-            Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+            Qcur = build_mm(layer.q_w, cur);
             if (layer.q_b) {
                 Qcur = ggml_add(ctx0, Qcur, layer.q_b);
             }

-            Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+            Kcur = build_mm(layer.k_w, cur);
             if (layer.k_b) {
                 Kcur = ggml_add(ctx0, Kcur, layer.k_b);
             }

-            Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+            Vcur = build_mm(layer.v_w, cur);
             if (layer.v_b) {
                 Vcur = ggml_add(ctx0, Vcur, layer.v_b);
             }

@@ -517,7 +521,7 @@ ggml_tensor * clip_graph::build_ffn(
         ffn_op_type type_op,
         int il) const {

-    ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
+    ggml_tensor * tmp = up ? build_mm(up, cur) : cur;
     cb(tmp, "ffn_up", il);

     if (up_b) {

@@ -526,7 +530,7 @@ ggml_tensor * clip_graph::build_ffn(
     }

     if (gate) {
-        cur = ggml_mul_mat(ctx0, gate, cur);
+        cur = build_mm(gate, cur);
         cb(cur, "ffn_gate", il);

         if (gate_b) {

@@ -580,7 +584,7 @@ ggml_tensor * clip_graph::build_ffn(
     }

     if (down) {
-        cur = ggml_mul_mat(ctx0, down, cur);
+        cur = build_mm(down, cur);
     }

     if (down_b) {

@@ -646,7 +650,7 @@ ggml_tensor * clip_graph::build_attn(
     cb(cur, "kqv_out", il);

     if (wo) {
-        cur = ggml_mul_mat(ctx0, wo, cur);
+        cur = build_mm(wo, cur);
     }

     if (wo_b) {
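The TODO in clip-graph.h points at a follow-up overload that folds the bias add into the same hook, so call sites like the fused qkv projection above become a single call. A possible shape for it, assuming it reuses the two-argument virtual hook for the matmul itself (a sketch of the TODO, not code from this commit):

// hypothetical build_mm(w, b, x) overload sketched from the TODO;
// b may be nullptr for layers that have no bias tensor
ggml_tensor * clip_graph::build_mm(ggml_tensor * w, ggml_tensor * b, ggml_tensor * x) const {
    ggml_tensor * cur = build_mm(w, x); // matmul still goes through the virtual hook
    if (b != nullptr) {
        cur = ggml_add(ctx0, cur, b);
    }
    return cur;
}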

tools/mtmd/models/cogvlm.cpp

Lines changed: 5 additions & 5 deletions
@@ -19,7 +19,7 @@ ggml_cgraph * clip_graph_cogvlm::build() {
         auto & layer = model.layers[il];
         ggml_tensor * cur = inpL;

-        cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+        cur = build_mm(layer.qkv_w, cur);

         cur = ggml_add(ctx0, cur, layer.qkv_b);

@@ -67,7 +67,7 @@ ggml_cgraph * clip_graph_cogvlm::build() {
             ggml_row_size(inpL->type, n_embd), 0);

     // Multiply with mm_model_proj
-    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+    cur = build_mm(model.mm_model_proj, cur);

     // Apply layernorm, weight, bias
     cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);

@@ -76,16 +76,16 @@ ggml_cgraph * clip_graph_cogvlm::build() {
     cur = ggml_gelu_inplace(ctx0, cur);

     // Branch 1: multiply with mm_h_to_4h_w
-    ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
+    ggml_tensor * h_to_4h = build_mm(model.mm_h_to_4h_w, cur);

     // Branch 2: multiply with mm_gate_w
-    ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
+    ggml_tensor * gate = build_mm(model.mm_gate_w, cur);

     // Apply silu
     gate = ggml_swiglu_split(ctx0, gate, h_to_4h);

     // Apply mm_4h_to_h_w
-    cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
+    cur = build_mm(model.mm_4h_to_h_w, gate);

     // Concatenate with boi and eoi
     cur = ggml_concat(ctx0, model.mm_boi, cur, 1);

tools/mtmd/models/conformer.cpp

Lines changed: 8 additions & 8 deletions
@@ -56,7 +56,7 @@ ggml_cgraph * clip_graph_conformer::build() {
         cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);

         // calculate out
-        cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
+        cur = build_mm(model.pre_encode_out_w, cur);
         cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
         cb(cur, "conformer.pre_encode.out", -1);
     }

@@ -87,7 +87,7 @@ ggml_cgraph * clip_graph_conformer::build() {
         cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_self_att", il);

-        ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+        ggml_tensor * Qcur = build_mm(layer.q_w, cur);
         Qcur = ggml_add(ctx0, Qcur, layer.q_b);
         Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
         ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);

@@ -96,12 +96,12 @@ ggml_cgraph * clip_graph_conformer::build() {
         Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);

         // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
-        ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+        ggml_tensor * Kcur = build_mm(layer.k_w, cur);
         Kcur = ggml_add(ctx0, Kcur, layer.k_b);
         Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
         Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));

-        ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+        ggml_tensor * Vcur = build_mm(layer.v_w, cur);
         Vcur = ggml_add(ctx0, Vcur, layer.v_b);
         Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
         Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));

@@ -111,7 +111,7 @@ ggml_cgraph * clip_graph_conformer::build() {
         matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
         cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);

-        auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
+        auto * p = build_mm(layer.linear_pos_w, pos_emb);
         cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
         p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
         p = ggml_permute(ctx0, p, 0, 2, 1, 3);

@@ -143,7 +143,7 @@ ggml_cgraph * clip_graph_conformer::build() {
         x = ggml_permute(ctx0, x, 2, 0, 1, 3);
         x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);

-        ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
+        ggml_tensor * out = build_mm(layer.o_w, x);
         out = ggml_add(ctx0, out, layer.o_b);
         cb(out, "conformer.layers.{}.self_attn.linear_out", il);

@@ -157,7 +157,7 @@ ggml_cgraph * clip_graph_conformer::build() {
         // conv
         {
             auto * x = cur;
-            x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
+            x = build_mm(layer.conv_pw1_w, x);
             x = ggml_add(ctx0, x, layer.conv_pw1_b);
             cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);

@@ -181,7 +181,7 @@ ggml_cgraph * clip_graph_conformer::build() {
             x = ggml_silu(ctx0, x);

             // pointwise_conv2
-            x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
+            x = build_mm(layer.conv_pw2_w, x);
             x = ggml_add(ctx0, x, layer.conv_pw2_b);

             cur = x;

tools/mtmd/models/glm4v.cpp

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ ggml_cgraph * clip_graph_glm4v::build() {

     // FC projector
     {
-        cur = ggml_mul_mat(ctx0, model.projection, cur);
+        cur = build_mm(model.projection, cur);
         // default LayerNorm (post_projection_norm)
         cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
         cur = ggml_gelu_erf(ctx0, cur);

tools/mtmd/models/llama4.cpp

Lines changed: 4 additions & 4 deletions
@@ -22,7 +22,7 @@ ggml_cgraph * clip_graph_llama4::build() {
         ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
                 patch_size, patch_size, 3, n_embd);
         inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
-        inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+        inp = build_mm(model.patch_embeddings_0, inp);
         inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
         cb(inp, "patch_conv", -1);
     }

@@ -78,15 +78,15 @@ ggml_cgraph * clip_graph_llama4::build() {

     // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
     {
-        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
+        cur = build_mm(model.mm_model_mlp_1_w, cur);
         cur = ggml_gelu(ctx0, cur);
-        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
+        cur = build_mm(model.mm_model_mlp_2_w, cur);
         cur = ggml_gelu(ctx0, cur);
         cb(cur, "adapter_mlp", -1);
     }

     // Llama4MultiModalProjector
-    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+    cur = build_mm(model.mm_model_proj, cur);
     cb(cur, "projected", -1);

     // build the graph

tools/mtmd/models/llava.cpp

Lines changed: 21 additions & 21 deletions
@@ -70,17 +70,17 @@ ggml_cgraph * clip_graph_llava::build() {

         // self-attention
         {
-            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+            ggml_tensor * Qcur = build_mm(layer.q_w, cur);
             if (layer.q_b) {
                 Qcur = ggml_add(ctx0, Qcur, layer.q_b);
             }

-            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+            ggml_tensor * Kcur = build_mm(layer.k_w, cur);
             if (layer.k_b) {
                 Kcur = ggml_add(ctx0, Kcur, layer.k_b);
             }

-            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+            ggml_tensor * Vcur = build_mm(layer.v_w, cur);
             if (layer.v_b) {
                 Vcur = ggml_add(ctx0, Vcur, layer.v_b);
             }

@@ -164,17 +164,17 @@ ggml_cgraph * clip_graph_llava::build() {

     // llava projector
     if (proj_type == PROJECTOR_TYPE_MLP) {
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = build_mm(model.mm_0_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

         embeddings = ggml_gelu(ctx0, embeddings);
         if (model.mm_2_w) {
-            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+            embeddings = build_mm(model.mm_2_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
         }
     }
     else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = build_mm(model.mm_0_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
         // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
         // First LayerNorm

@@ -186,7 +186,7 @@ ggml_cgraph * clip_graph_llava::build() {
         embeddings = ggml_gelu(ctx0, embeddings);

         // Second linear layer
-        embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+        embeddings = build_mm(model.mm_3_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);

         // Second LayerNorm

@@ -197,10 +197,10 @@ ggml_cgraph * clip_graph_llava::build() {
     else if (proj_type == PROJECTOR_TYPE_LDP) {
         // MobileVLM projector
         int n_patch = 24;
-        ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+        ggml_tensor * mlp_1 = build_mm(model.mm_model_mlp_1_w, embeddings);
         mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
         mlp_1 = ggml_gelu(ctx0, mlp_1);
-        ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+        ggml_tensor * mlp_3 = build_mm(model.mm_model_mlp_3_w, mlp_1);
         mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
         // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]

@@ -229,10 +229,10 @@ ggml_cgraph * clip_graph_llava::build() {
         // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
         // pointwise conv
         block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
-        block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+        block_1 = build_mm(model.mm_model_block_1_block_1_fc1_w, block_1);
         block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
         block_1 = ggml_relu(ctx0, block_1);
-        block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+        block_1 = build_mm(model.mm_model_block_1_block_1_fc2_w, block_1);
         block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
         block_1 = ggml_hardsigmoid(ctx0, block_1);
         // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]

@@ -244,7 +244,7 @@ ggml_cgraph * clip_graph_llava::build() {
         block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));

         // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
-        block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+        block_1 = build_mm(model.mm_model_block_1_block_2_0_w, block_1);
         block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);

         // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]

@@ -277,10 +277,10 @@ ggml_cgraph * clip_graph_llava::build() {
         // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
         // pointwise conv
         block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
-        block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+        block_1 = build_mm(model.mm_model_block_2_block_1_fc1_w, block_1);
         block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
         block_1 = ggml_relu(ctx0, block_1);
-        block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+        block_1 = build_mm(model.mm_model_block_2_block_1_fc2_w, block_1);
         block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
         block_1 = ggml_hardsigmoid(ctx0, block_1);

@@ -292,7 +292,7 @@ ggml_cgraph * clip_graph_llava::build() {
         block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
         block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
         // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
-        block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+        block_1 = build_mm(model.mm_model_block_2_block_2_0_w, block_1);
         block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);

@@ -307,10 +307,10 @@ ggml_cgraph * clip_graph_llava::build() {
     else if (proj_type == PROJECTOR_TYPE_LDPV2)
     {
         int n_patch = 24;
-        ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+        ggml_tensor * mlp_0 = build_mm(model.mm_model_mlp_0_w, embeddings);
         mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
         mlp_0 = ggml_gelu(ctx0, mlp_0);
-        ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+        ggml_tensor * mlp_2 = build_mm(model.mm_model_mlp_2_w, mlp_0);
         mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
         // mlp_2 ne = [2048, 576, 1, 1]
         // // AVG Pool Layer 2*2, strides = 2

@@ -344,15 +344,15 @@ ggml_cgraph * clip_graph_llava::build() {
         embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
         // GLU
         {
-            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+            embeddings = build_mm(model.mm_model_mlp_0_w, embeddings);
             embeddings = ggml_norm(ctx0, embeddings, eps);
             embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
             embeddings = ggml_gelu_inplace(ctx0, embeddings);
             ggml_tensor * x = embeddings;
-            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
-            x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, x);
+            embeddings = build_mm(model.mm_model_mlp_2_w, embeddings);
+            x = build_mm(model.mm_model_mlp_1_w, x);
             embeddings = ggml_swiglu_split(ctx0, embeddings, x);
-            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+            embeddings = build_mm(model.mm_model_mlp_3_w, embeddings);
         }
         // arrangement of BOI/EOI token embeddings
         // note: these embeddings are not present in text model, hence we cannot process them as text tokens
