Skip to content

Commit bb5cef1

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .devops/nix/package.nix
#   ci/run.sh
#   ggml/src/ggml-cpu/amx/amx.cpp
#   ggml/src/ggml-webgpu/ggml-webgpu.cpp
#   ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl
#   tools/server/README.md
2 parents f2b9b93 + a23b9bd commit bb5cef1

12 files changed

Lines changed: 170 additions & 101 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 10 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -894,6 +894,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
894894
if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
895895
# ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
896896
res = "llada-moe"
897+
if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
898+
# ref: https://huggingface.co/ibm-granite/granite-docling-258M
899+
res = "granite-docling"
897900

898901
if res is None:
899902
logger.warning("\n")
@@ -1328,6 +1331,7 @@ def __init__(self, *args, **kwargs):
13281331
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
13291332

13301333
# load preprocessor config
1334+
self.preprocessor_config = {}
13311335
if not self.is_mistral_format:
13321336
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
13331337
self.preprocessor_config = json.load(f)
@@ -1350,7 +1354,8 @@ def set_gguf_parameters(self):
13501354
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
13511355

13521356
# vision config
1353-
self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
1357+
self.image_size = self.find_vparam(["image_size"])
1358+
self.gguf_writer.add_vision_image_size(self.image_size)
13541359
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
13551360
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
13561361
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
@@ -2383,6 +2388,10 @@ def set_gguf_parameters(self):
23832388
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
23842389
self.gguf_writer.add_vision_use_gelu(True)
23852390

2391+
# Add the preprocessor longest edge size
2392+
preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
2393+
self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
2394+
23862395
def tensor_force_quant(self, name, new_name, bid, n_dims):
23872396
if ".embeddings." in name:
23882397
return gguf.GGMLQuantizationType.F32

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -140,6 +140,7 @@ class TOKENIZER_TYPE(IntEnum):
140140
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
141141
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
142142
{"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
143+
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
143144
]
144145

145146
# some models are known to be broken upstream, so we will skip them as exceptions

ggml/src/ggml-cpu/vec.h

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -654,11 +654,11 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
654654
}
655655
// leftovers
656656
// maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
657-
if (np < n) {
658-
svbool_t pg = svwhilelt_b32(np, n);
659-
ay1 = svld1_f32(pg, y + np);
657+
for (int i = np; i < n; i += ggml_f32_epr) {
658+
svbool_t pg = svwhilelt_b32(i, n);
659+
ay1 = svld1_f32(pg, y + i);
660660
ay1 = svmul_f32_m(pg, ay1, vx);
661-
svst1_f32(pg, y + np, ay1);
661+
svst1_f32(pg, y + i, ay1);
662662
}
663663
#elif defined(__riscv_v_intrinsic)
664664
for (int i = 0, avl; i < n; i += avl) {

ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -300,6 +300,7 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
300300
workgroupBarrier();
301301
}
302302
let row_max = scratch[0];
303+
workgroupBarrier();
303304

304305
var sum = 0.0f;
305306
col = lid.x;

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -261,6 +261,7 @@ class Clip:
261261

262262
class ClipVision:
263263
IMAGE_SIZE = "clip.vision.image_size"
264+
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
264265
PATCH_SIZE = "clip.vision.patch_size"
265266
EMBEDDING_LENGTH = "clip.vision.embedding_length"
266267
FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -1037,6 +1037,9 @@ def add_vision_attention_layernorm_eps(self, value: float) -> None:
10371037
def add_vision_image_size(self, value: int) -> None:
10381038
self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
10391039

1040+
def add_vision_preproc_image_size(self, value: int) -> None:
1041+
self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
1042+
10401043
def add_vision_image_mean(self, values: Sequence[float]) -> None:
10411044
self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
10421045

src/llama-vocab.cpp

Lines changed: 5 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -572,6 +572,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
572572
case LLAMA_VOCAB_PRE_TYPE_OLMO:
573573
case LLAMA_VOCAB_PRE_TYPE_JAIS:
574574
case LLAMA_VOCAB_PRE_TYPE_TRILLION:
575+
case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
575576
regex_exprs = {
576577
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
577578
};
@@ -2197,6 +2198,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
21972198
tokenizer_pre == "trillion") {
21982199
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
21992200
clean_spaces = false;
2201+
} else if (
2202+
tokenizer_pre == "granite-docling") {
2203+
pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
2204+
clean_spaces = false;
22002205
} else if (
22012206
tokenizer_pre == "bailingmoe" ||
22022207
tokenizer_pre == "llada-moe") {

src/llama-vocab.h

Lines changed: 41 additions & 40 deletions
Original file line number · Diff line number · Diff line change
@@ -9,46 +9,47 @@
99

1010
// pre-tokenization types
1111
enum llama_vocab_pre_type {
12-
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
13-
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
14-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
15-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
16-
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
17-
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
18-
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
19-
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
20-
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
21-
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
22-
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
23-
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
24-
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
25-
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
26-
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
27-
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
28-
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
29-
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
30-
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
31-
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
32-
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
33-
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
34-
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
35-
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
36-
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
37-
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
38-
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
39-
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
40-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
41-
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
42-
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
43-
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
44-
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
45-
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
46-
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
47-
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
48-
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
49-
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
50-
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
51-
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
12+
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
13+
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
14+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
15+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
16+
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
17+
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
18+
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
19+
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
20+
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
21+
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
22+
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
23+
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
24+
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
25+
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
26+
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
27+
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
28+
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
29+
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
30+
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
31+
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
32+
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
33+
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
34+
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
35+
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
36+
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
37+
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
38+
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
39+
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
40+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
41+
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
42+
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
43+
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
44+
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
45+
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
46+
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
47+
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
48+
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
49+
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
50+
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
51+
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
52+
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
5253
};
5354

5455
struct LLM_KV;

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -31,6 +31,7 @@
3131

3232
// vision-specific
3333
#define KEY_IMAGE_SIZE "clip.vision.image_size"
34+
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
3435
#define KEY_PATCH_SIZE "clip.vision.patch_size"
3536
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
3637
#define KEY_IMAGE_STD "clip.vision.image_std"

tools/mtmd/clip.cpp

Lines changed: 48 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -188,7 +188,9 @@ struct clip_hparams {
188188
int32_t projection_dim;
189189
int32_t n_head;
190190
int32_t n_layer;
191-
int32_t proj_scale_factor = 0; // idefics3
191+
// idefics3
192+
int32_t preproc_image_size = 0;
193+
int32_t proj_scale_factor = 0;
192194

193195
float image_mean[3];
194196
float image_std[3];
@@ -2289,6 +2291,7 @@ struct clip_model_loader {
22892291

22902292
if (is_vision) {
22912293
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
2294+
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
22922295
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
22932296
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
22942297
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
@@ -3726,10 +3729,51 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
37263729
// res_imgs->data[0] = *res;
37273730
res_imgs->entries.push_back(std::move(img_f32));
37283731
return true;
3729-
}
3730-
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
3732+
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
3733+
// The refined size has two steps:
3734+
// 1. Resize w/ aspect-ratio preserving such that the longer side is
3735+
// the preprocessor longest size
3736+
// 2. Resize w/out preserving aspect ratio such that both sides are
3737+
// multiples of image_size (always rounding up)
3738+
//
3739+
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
3740+
const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
3741+
original_size, params.image_size, params.preproc_image_size);
3742+
3743+
llava_uhd::slice_instructions instructions;
3744+
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
3745+
instructions.refined_size = refined_size;
3746+
instructions.grid_size = clip_image_size{
3747+
static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
3748+
static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
3749+
};
3750+
for (int y = 0; y < refined_size.height; y += params.image_size) {
3751+
for (int x = 0; x < refined_size.width; x += params.image_size) {
3752+
instructions.slices.push_back(llava_uhd::slice_coordinates{
3753+
/* x */x,
3754+
/* y */y,
3755+
/* size */clip_image_size{
3756+
std::min(params.image_size, refined_size.width - x),
3757+
std::min(params.image_size, refined_size.height - y)
3758+
}
3759+
});
3760+
}
3761+
}
3762+
auto imgs = llava_uhd::slice_image(img, instructions);
3763+
3764+
// cast and normalize to f32
3765+
for (size_t i = 0; i < imgs.size(); ++i) {
3766+
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3767+
clip_image_f32_ptr res(clip_image_f32_init());
3768+
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3769+
res_imgs->entries.push_back(std::move(res));
3770+
}
3771+
3772+
res_imgs->grid_x = instructions.grid_size.width;
3773+
res_imgs->grid_y = instructions.grid_size.height;
3774+
return true;
3775+
} else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
37313776
|| ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
3732-
|| ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
37333777
|| ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
37343778
) {
37353779
clip_image_u8 resized_image;

0 commit comments

Comments (0)