diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 90c2b7094c7..9edf1042e95 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7084,8 +7084,11 @@ class Gemma2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GEMMA2
 
     def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
+        # PaliGemmaForConditionalGeneration uses tokenizer.json (no tokenizer.model)
+        if (self.dir_model / "tokenizer.model").is_file():
+            self._set_vocab_sentencepiece()
+        else:
+            self._set_vocab_gpt2()
         self.gguf_writer.add_add_space_prefix(False)
 
     def set_gguf_parameters(self):
@@ -7116,6 +7119,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
             return
 
+        # PaliGemmaForConditionalGeneration wraps Gemma2 under language_model.*;
+        # strip the prefix so the standard tensor map works unchanged.
+        if name.startswith("language_model."):
+            name = name[len("language_model."):]
+        elif not name.startswith("model."):
+            return  # skip vision_tower.*, multi_modal_projector.* tensors
+
         # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
         if name.endswith("norm.weight"):
             data_torch = data_torch + 1
@@ -7309,6 +7319,28 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             return  # skip other tensors
 
 
+@ModelBase.register("PaliGemmaForConditionalGeneration")
+class PaliGemma2VisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        vision_config = self.hparams.get("vision_config", self.hparams)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PALIGEMMA2)
+        self.gguf_writer.add_vision_attention_layernorm_eps(vision_config.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if "input_projection" in name:
+            return gguf.GGMLQuantizationType.F16
+        if ".embeddings." in name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
+            yield from super().modify_tensors(data_torch, name, bid)
+        # skip language_model.* and all other tensors
+
+
 class ConformerAudioModel(MmprojModel):
     _batch_norm_tensors: list[dict[str, Tensor]] | None = None
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 83ae51ce9ce..cb2eadea059 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -4133,6 +4133,7 @@ class VisionProjectorType:
     QWEN2VL = "qwen2vl_merger"
     QWEN25VL = "qwen2.5vl_merger"
     QWEN3VL = "qwen3vl_merger"
+    PALIGEMMA2 = "paligemma2"
     STEP3VL = "step3vl"
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 01a9b236000..b75a6662722 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1678,6 +1678,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MM_INP_PROJ: (
             "multi_modal_projector.mm_input_projection",
+            "multi_modal_projector.linear",  # paligemma2
         ),
 
         MODEL_TENSOR.V_MM_INP_NORM: (
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 7d6484eea85..920f1d3b8c1 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -107,7 +107,8 @@
 #define TN_IMAGE_SEPERATOR "v.view_seperator"
 #define TN_MM_INP_NORM     "mm.input_norm.weight"
 #define TN_MM_INP_NORM_B   "mm.input_norm.bias"
-#define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
+#define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3, paligemma2
+#define TN_MM_INP_PROJ_B   "mm.input_projection.bias"   // paligemma2: projector has bias
 #define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight" // gemma3
 #define TN_MM_PROJECTOR    "mm.model.fc.%s" // idefics3, deepseekocr
 #define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
@@ -268,6 +269,7 @@ enum projector_type {
     PROJECTOR_TYPE_GLM_EDGE,
     PROJECTOR_TYPE_QWEN2VL,
     PROJECTOR_TYPE_QWEN3VL,
+    PROJECTOR_TYPE_PALIGEMMA2,
     PROJECTOR_TYPE_STEP3VL,
     PROJECTOR_TYPE_GEMMA3,
     PROJECTOR_TYPE_GEMMA3NV,
@@ -316,6 +318,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_QWEN2VL,    "qwen2vl_merger"},
     { PROJECTOR_TYPE_QWEN25VL,   "qwen2.5vl_merger"},
     { PROJECTOR_TYPE_QWEN3VL,    "qwen3vl_merger"},
+    { PROJECTOR_TYPE_PALIGEMMA2, "paligemma2"},
     { PROJECTOR_TYPE_STEP3VL,    "step3vl"},
     { PROJECTOR_TYPE_GEMMA3,     "gemma3"},
     { PROJECTOR_TYPE_GEMMA3NV,   "gemma3nv"},
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index bf8031b55b2..11f4bbd71f8 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -403,8 +403,9 @@ struct clip_model {
     ggml_tensor * mm_model_ln_post_w = nullptr;
     ggml_tensor * mm_model_ln_post_b = nullptr;
 
-    // gemma3
+    // gemma3 / paligemma2
     ggml_tensor * mm_input_proj_w = nullptr;
+    ggml_tensor * mm_input_proj_b = nullptr; // paligemma2: linear projector bias
     ggml_tensor * mm_soft_emb_norm_w = nullptr;
 
     // mobilenetv5 for gemma3n
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 45e39898d82..05eb18b5a0a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -832,6 +832,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     std::unique_ptr<clip_graph> builder;
 
     switch (ctx->proj_type()) {
+        case PROJECTOR_TYPE_PALIGEMMA2:
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_LFM2:
@@ -1321,6 +1322,13 @@ struct clip_model_loader {
                         hparams.set_limit_image_tokens(2, 4096);
                     }
                 } break;
+            case PROJECTOR_TYPE_PALIGEMMA2:
+                {
+                    // PaliGemma2: no patch merging (n_merge=1), bilinear resize
+                    // 224px → 16x16 = 256 tokens; 448px → 32x32 = 1024 tokens
+                    hparams.n_merge = 1;
+                    hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
+                } break;
             case PROJECTOR_TYPE_GEMMA3:
                 {
                     // default value (used by all model sizes in gemma 3 family)
@@ -1927,6 +1935,11 @@ struct clip_model_loader {
                     model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
                     model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
                 } break;
+            case PROJECTOR_TYPE_PALIGEMMA2:
+                {
+                    model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
+                    model.mm_input_proj_b = get_tensor(TN_MM_INP_PROJ_B, false);
+                } break;
             case PROJECTOR_TYPE_GEMMA3:
                 {
                     model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
@@ -2977,6 +2990,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             int y_patch = img->ny / (params.patch_size * params.n_merge);
             n_patches = x_patch * y_patch;
         } break;
+        case PROJECTOR_TYPE_PALIGEMMA2:
+            break; // no pooling: all patches pass through (256 for 224px, 1024 for 448px)
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_GEMMA4V:
         case PROJECTOR_TYPE_IDEFICS3:
@@ -3528,6 +3543,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             set_input_i32("rel_pos_indices_local", rel_pos_indices_local);
             set_input_i32("rel_pos_indices_global", rel_pos_indices_global);
         } break;
+        case PROJECTOR_TYPE_PALIGEMMA2:
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_GEMMA3NV:
         case PROJECTOR_TYPE_IDEFICS3:
@@ -3809,6 +3825,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
         case PROJECTOR_TYPE_STEP3VL:
             return ctx->model.mm_model_proj->ne[1];
+        case PROJECTOR_TYPE_PALIGEMMA2:
+            return ctx->model.mm_input_proj_w->ne[1]; // output dim = LM hidden_size
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_GEMMA3NV:
             return ctx->model.mm_input_proj_w->ne[0];
diff --git a/tools/mtmd/models/siglip.cpp b/tools/mtmd/models/siglip.cpp
index 7ef98eed0ec..be80887285d 100644
--- a/tools/mtmd/models/siglip.cpp
+++ b/tools/mtmd/models/siglip.cpp
@@ -83,6 +83,15 @@ ggml_cgraph * clip_graph_siglip::build() {
                 FFN_GELU,
                 -1);
 
+    } else if (proj_type == PROJECTOR_TYPE_PALIGEMMA2) {
+        // PaliGemma2: direct linear projection, no pooling, no norm before projector
+        // weight stored as [in=1152, out=2304] in ggml → mul_mat directly (no transpose)
+        // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py
+        cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
+        if (model.mm_input_proj_b) {
+            cur = ggml_add(ctx0, cur, model.mm_input_proj_b);
+        }
+
     } else {
         GGML_ABORT("SigLIP: Unsupported projector type");
     }
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index be958bd175a..2fea640fb88 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -81,7 +81,8 @@ struct mtmd_cli_context {
     // chat template
     common_chat_templates_ptr tmpls;
     std::vector<common_chat_msg> chat_history;
-    bool use_jinja = false;
+    bool use_jinja  = false;
+    bool raw_prompt = false; // true when model has no chat template (e.g. PaliGemma2 PT)
 
     // TODO: support for --system-prompt with /clear command
     // support for legacy templates (models not having EOT token)
@@ -105,12 +106,12 @@ struct mtmd_cli_context {
             exit(1);
         }
 
-        if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
-            LOG_ERR("Model does not have chat template.\n");
-            LOG_ERR("  For old llava models, you may need to use '--chat-template vicuna'\n");
-            LOG_ERR("  For MobileVLM models, use '--chat-template deepseek'\n");
-            LOG_ERR("  For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
-            exit(1);
+        raw_prompt = !llama_model_chat_template(model, nullptr) && params.chat_template.empty();
+        if (raw_prompt) {
+            LOG_WRN("Model does not have chat template, using raw prompt mode (e.g. PaliGemma2 PT).\n");
+            LOG_WRN("  For old llava models, you may need to use '--chat-template vicuna'\n");
+            LOG_WRN("  For MobileVLM models, use '--chat-template deepseek'\n");
+            LOG_WRN("  For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
         }
 
         tmpls = common_chat_templates_init(model, params.chat_template);
@@ -230,7 +231,15 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
 
 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
     bool add_bos = ctx.chat_history.empty();
-    auto formatted_chat = chat_add_and_format(ctx, msg);
+    std::string formatted_chat;
+    if (ctx.raw_prompt) {
+        // raw mode: pass content directly without chat template wrapping
+        // (used by pre-trained models like PaliGemma2 PT)
+        formatted_chat = msg.content;
+        ctx.chat_history.push_back(msg);
+    } else {
+        formatted_chat = chat_add_and_format(ctx, msg);
+    }
     LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 59907786786..72ca2c340d1 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -339,6 +339,12 @@ struct mtmd_context {
                     // script.
                     image_preproc = std::make_unique(ctx_v);
                 } break;
+            case PROJECTOR_TYPE_PALIGEMMA2:
+                {
+                    // PaliGemma2: image embeddings injected as prefix tokens without wrapper
+                    // 224px → 256 tokens; 448px → 1024 tokens
+                    image_preproc = std::make_unique(ctx_v);
+                } break;
             case PROJECTOR_TYPE_GEMMA3:
             case PROJECTOR_TYPE_GEMMA3NV:
                 {
@@ -1079,6 +1085,7 @@ bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk
         proj_type = ctx->proj_type_a();
     }
     switch (proj_type) {
+        case PROJECTOR_TYPE_PALIGEMMA2:
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_GEMMA4V:
             return true;
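
For reference, an end-to-end check of this patch could look like the sketch below. The checkpoint directory, output GGUF names, and test image are illustrative assumptions, not part of the patch; "caption en" is the standard PaliGemma task prompt for PT checkpoints, which ship no chat template and therefore exercise the raw prompt mode added in mtmd-cli.cpp.

    # export the Gemma2 language model and the SigLIP projector as two separate GGUFs
    python convert_hf_to_gguf.py ./paligemma2-3b-pt-224 --outfile paligemma2-3b.gguf
    python convert_hf_to_gguf.py ./paligemma2-3b-pt-224 --mmproj --outfile mmproj-paligemma2.gguf

    # no chat template in the model, so the prompt is passed through without template wrapping
    llama-mtmd-cli -m paligemma2-3b.gguf --mmproj mmproj-paligemma2.gguf --image test.jpg -p "caption en"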