Skip to content

Commit bb5cef1

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .devops/nix/package.nix
#   ci/run.sh
#   ggml/src/ggml-cpu/amx/amx.cpp
#   ggml/src/ggml-webgpu/ggml-webgpu.cpp
#   ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl
#   tools/server/README.md
2 parents f2b9b93 + a23b9bd commit bb5cef1

12 files changed

Lines changed: 170 additions & 101 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 10 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -894,6 +894,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
894894
if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
895895
# ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
896896
res = "llada-moe"
897+
if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
898+
# ref: https://huggingface.co/ibm-granite/granite-docling-258M
899+
res = "granite-docling"
897900

898901
if res is None:
899902
logger.warning("\n")
@@ -1328,6 +1331,7 @@ def __init__(self, *args, **kwargs):
13281331
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
13291332

13301333
# load preprocessor config
1334+
self.preprocessor_config = {}
13311335
if not self.is_mistral_format:
13321336
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
13331337
self.preprocessor_config = json.load(f)
@@ -1350,7 +1354,8 @@ def set_gguf_parameters(self):
13501354
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
13511355

13521356
# vision config
1353-
self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
1357+
self.image_size = self.find_vparam(["image_size"])
1358+
self.gguf_writer.add_vision_image_size(self.image_size)
13541359
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
13551360
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
13561361
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
@@ -2383,6 +2388,10 @@ def set_gguf_parameters(self):
23832388
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
23842389
self.gguf_writer.add_vision_use_gelu(True)
23852390

2391+
# Add the preprocessor longest edge size
2392+
preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
2393+
self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
2394+
23862395
def tensor_force_quant(self, name, new_name, bid, n_dims):
23872396
if ".embeddings." in name:
23882397
return gguf.GGMLQuantizationType.F32

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -140,6 +140,7 @@ class TOKENIZER_TYPE(IntEnum):
140140
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
141141
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
142142
{"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
143+
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
143144
]
144145

145146
# some models are known to be broken upstream, so we will skip them as exceptions

ggml/src/ggml-cpu/vec.h

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -654,11 +654,11 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
654654
}
655655
// leftovers
656656
// maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
657-
if (np < n) {
658-
svbool_t pg = svwhilelt_b32(np, n);
659-
ay1 = svld1_f32(pg, y + np);
657+
for (int i = np; i < n; i += ggml_f32_epr) {
658+
svbool_t pg = svwhilelt_b32(i, n);
659+
ay1 = svld1_f32(pg, y + i);
660660
ay1 = svmul_f32_m(pg, ay1, vx);
661-
svst1_f32(pg, y + np, ay1);
661+
svst1_f32(pg, y + i, ay1);
662662
}
663663
#elif defined(__riscv_v_intrinsic)
664664
for (int i = 0, avl; i < n; i += avl) {

ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -300,6 +300,7 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
300300
workgroupBarrier();
301301
}
302302
let row_max = scratch[0];
303+
workgroupBarrier();
303304

304305
var sum = 0.0f;
305306
col = lid.x;

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -261,6 +261,7 @@ class Clip:
261261

262262
class ClipVision:
263263
IMAGE_SIZE = "clip.vision.image_size"
264+
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
264265
PATCH_SIZE = "clip.vision.patch_size"
265266
EMBEDDING_LENGTH = "clip.vision.embedding_length"
266267
FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -1037,6 +1037,9 @@ def add_vision_attention_layernorm_eps(self, value: float) -> None:
10371037
def add_vision_image_size(self, value: int) -> None:
10381038
self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
10391039

1040+
def add_vision_preproc_image_size(self, value: int) -> None:
1041+
self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
1042+
10401043
def add_vision_image_mean(self, values: Sequence[float]) -> None:
10411044
self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
10421045

src/llama-vocab.cpp

Lines changed: 5 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -572,6 +572,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
572572
case LLAMA_VOCAB_PRE_TYPE_OLMO:
573573
case LLAMA_VOCAB_PRE_TYPE_JAIS:
574574
case LLAMA_VOCAB_PRE_TYPE_TRILLION:
575+
case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
575576
regex_exprs = {
576577
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
577578
};
@@ -2197,6 +2198,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
21972198
tokenizer_pre == "trillion") {
21982199
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
21992200
clean_spaces = false;
2201+
} else if (
2202+
tokenizer_pre == "granite-docling") {
2203+
pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
2204+
clean_spaces = false;
22002205
} else if (
22012206
tokenizer_pre == "bailingmoe" ||
22022207
tokenizer_pre == "llada-moe") {

src/llama-vocab.h

Lines changed: 41 additions & 40 deletions
Original file line number · Diff line number · Diff line change
@@ -9,46 +9,47 @@
99

1010
// pre-tokenization types
1111
enum llama_vocab_pre_type {
12-
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
13-
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
14-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
15-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
16-
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
17-
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
18-
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
19-
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
20-
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
21-
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
22-
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
23-
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
24-
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
25-
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
26-
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
27-
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
28-
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
29-
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
30-
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
31-
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
32-
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
33-
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
34-
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
35-
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
36-
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
37-
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
38-
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
39-
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
40-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
41-
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
42-
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
43-
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
44-
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
45-
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
46-
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
47-
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
48-
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
49-
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
50-
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
51-
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
12+
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
13+
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
14+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
15+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
16+
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
17+
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
18+
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
19+
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
20+
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
21+
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
22+
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
23+
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
24+
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
25+
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
26+
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
27+
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
28+
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
29+
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
30+
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
31+
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
32+
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
33+
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
34+
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
35+
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
36+
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
37+
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
38+
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
39+
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
40+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
41+
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
42+
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
43+
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
44+
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
45+
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
46+
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
47+
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
48+
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
49+
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
50+
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
51+
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
52+
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
5253
};
5354

5455
struct LLM_KV;

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -31,6 +31,7 @@
3131

3232
// vision-specific
3333
#define KEY_IMAGE_SIZE "clip.vision.image_size"
34+
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
3435
#define KEY_PATCH_SIZE "clip.vision.patch_size"
3536
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
3637
#define KEY_IMAGE_STD "clip.vision.image_std"

tools/mtmd/clip.cpp

Lines changed: 48 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -188,7 +188,9 @@ struct clip_hparams {
188188
int32_t projection_dim;
189189
int32_t n_head;
190190
int32_t n_layer;
191-
int32_t proj_scale_factor = 0; // idefics3
191+
// idefics3
192+
int32_t preproc_image_size = 0;
193+
int32_t proj_scale_factor = 0;
192194

193195
float image_mean[3];
194196
float image_std[3];
@@ -2289,6 +2291,7 @@ struct clip_model_loader {
22892291

22902292
if (is_vision) {
22912293
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
2294+
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
22922295
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
22932296
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
22942297
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
@@ -3726,10 +3729,51 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
37263729
// res_imgs->data[0] = *res;
37273730
res_imgs->entries.push_back(std::move(img_f32));
37283731
return true;
3729-
}
3730-
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
3732+
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
3733+
// The refined size has two steps:
3734+
// 1. Resize w/ aspect-ratio preserving such that the longer side is
3735+
// the preprocessor longest size
3736+
// 2. Resize w/out preserving aspect ratio such that both sides are
3737+
// multiples of image_size (always rounding up)
3738+
//
3739+
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
3740+
const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
3741+
original_size, params.image_size, params.preproc_image_size);
3742+
3743+
llava_uhd::slice_instructions instructions;
3744+
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
3745+
instructions.refined_size = refined_size;
3746+
instructions.grid_size = clip_image_size{
3747+
static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
3748+
static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
3749+
};
3750+
for (int y = 0; y < refined_size.height; y += params.image_size) {
3751+
for (int x = 0; x < refined_size.width; x += params.image_size) {
3752+
instructions.slices.push_back(llava_uhd::slice_coordinates{
3753+
/* x */x,
3754+
/* y */y,
3755+
/* size */clip_image_size{
3756+
std::min(params.image_size, refined_size.width - x),
3757+
std::min(params.image_size, refined_size.height - y)
3758+
}
3759+
});
3760+
}
3761+
}
3762+
auto imgs = llava_uhd::slice_image(img, instructions);
3763+
3764+
// cast and normalize to f32
3765+
for (size_t i = 0; i < imgs.size(); ++i) {
3766+
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3767+
clip_image_f32_ptr res(clip_image_f32_init());
3768+
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3769+
res_imgs->entries.push_back(std::move(res));
3770+
}
3771+
3772+
res_imgs->grid_x = instructions.grid_size.width;
3773+
res_imgs->grid_y = instructions.grid_size.height;
3774+
return true;
3775+
} else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
37313776
|| ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
3732-
|| ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
37333777
|| ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
37343778
) {
37353779
clip_image_u8 resized_image;

0 commit comments

Comments (0)