Skip to content

Commit a8681a0

Browse files
authored
mtmd : DeepSeek-OCR image processing fixes, img_tool::resize padding refactor (ggml-org#23345)
* mtmd : deepseek-ocr fixes, improvements and refactoring
  - image processing changes to achieve full parity with Pillow (reference impl)
  - SAM mask casting only when flash-attn is on
  - SAM refactor (build_sam() extracted so deepseek-ocr-2 can reuse it)
  - llama-chat changes to fix server/WebUI issue (new media_markers_first())
  - adapted test-chat-template and added test cases for deepseek-ocr
  - changed regression test for deepseek-ocr to use CER+chrF scores for ground-truth comparison; removed embedding-model
  - ty.toml ignore unresolved-import for tools/mtmd/tests/**
* image-text reordering fix removed
* refactor bool add_padding + pad_rounding enum into a single pad_style enum
1 parent acd604f commit a8681a0

11 files changed

Lines changed: 432 additions & 471 deletions

tools/mtmd/clip-model.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,16 @@ enum resize_algo {
3535
// RESIZE_ALGO_LANCZOS, // TODO
3636
};
3737

38+
// Padding style for img_tool::resize
39+
// PAD_NONE - no padding; direct resize to target dimensions
40+
// PAD_CEIL - aspect-preserving pad with ceil rounding of the scaled size (default)
41+
// PAD_NEAREST - aspect-preserving pad with nearest-integer rounding (Pillow byte-parity)
42+
enum pad_style {
43+
PAD_NONE,
44+
PAD_CEIL,
45+
PAD_NEAREST,
46+
};
47+
3848
struct clip_hparams {
3949
int32_t image_size = 0;
4050
int32_t patch_size = 0;
@@ -52,7 +62,7 @@ struct clip_hparams {
5262
int32_t image_min_pixels = -1;
5363
int32_t image_max_pixels = -1;
5464
resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
55-
bool image_resize_pad = true; // if false, center-crop will be applied when resizing
65+
pad_style image_resize_pad = PAD_CEIL; // padding style when resizing
5666
std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
5767

5868
// (preprocessor) for llava-uhd style models
@@ -61,8 +71,8 @@ struct clip_hparams {
6171
int32_t preproc_max_tiles = 0;
6272
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
6373
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
64-
bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6)
65-
bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
74+
pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. llava-1.6)
75+
pad_style image_pad_ov = PAD_NONE; // padding style for the overview image (e.g. llava-1.6)
6676
std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
6777
std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
6878

tools/mtmd/clip.cpp

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,20 +1233,20 @@ struct clip_model_loader {
12331233
hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM;
12341234
hparams.image_pad_color = {122, 116, 104};
12351235
if (!hparams.image_res_candidates.empty()) {
1236-
hparams.image_resize_pad = true;
1236+
hparams.image_resize_pad = PAD_CEIL;
12371237
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
12381238
} else {
12391239
// llava-1.6 default params
1240-
hparams.image_pad_ov = false;
1241-
hparams.image_pad_rf = true;
1240+
hparams.image_pad_ov = PAD_NONE;
1241+
hparams.image_pad_rf = PAD_CEIL;
12421242
hparams.image_pad_color_rf = {122, 116, 104};
12431243
hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
12441244
hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
12451245
}
12461246
} break;
12471247
case PROJECTOR_TYPE_GLM_EDGE:
12481248
{
1249-
hparams.image_resize_pad = true;
1249+
hparams.image_resize_pad = PAD_CEIL;
12501250
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
12511251
} break;
12521252
case PROJECTOR_TYPE_MINICPMV:
@@ -1441,7 +1441,7 @@ struct clip_model_loader {
14411441
{
14421442
hparams.n_merge = 2;
14431443
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
1444-
hparams.image_resize_pad = false;
1444+
hparams.image_resize_pad = PAD_NONE;
14451445
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
14461446
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
14471447
std::vector<int> wa_layer_indexes_vec;
@@ -1461,7 +1461,7 @@ struct clip_model_loader {
14611461

14621462
// reka model performs better when using resize_bicubic, which stretches
14631463
// the image to fit fixed square size
1464-
hparams.image_resize_pad = false;
1464+
hparams.image_resize_pad = PAD_NONE;
14651465
} break;
14661466
case PROJECTOR_TYPE_GLM4V:
14671467
{
@@ -1516,9 +1516,7 @@ struct clip_model_loader {
15161516
hparams.image_size = 1024;
15171517
hparams.warmup_image_size = 1024;
15181518
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
1519-
hparams.image_pad_color[0] = hparams.image_mean[0];
1520-
hparams.image_pad_color[1] = hparams.image_mean[1];
1521-
hparams.image_pad_color[2] = hparams.image_mean[2];
1519+
hparams.image_pad_color = {127, 127, 127};
15221520

15231521
get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
15241522
get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
@@ -1537,7 +1535,7 @@ struct clip_model_loader {
15371535
{
15381536
hparams.n_merge = 2;
15391537
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
1540-
hparams.image_resize_pad = false;
1538+
hparams.image_resize_pad = PAD_NONE;
15411539
hparams.ffn_op = FFN_GELU;
15421540
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
15431541
hparams.set_limit_image_tokens(256, 16384);

0 commit comments

Comments
 (0)