36 changes: 34 additions & 2 deletions convert_hf_to_gguf.py
@@ -7084,8 +7084,11 @@ class Gemma2Model(TextModel):
model_arch = gguf.MODEL_ARCH.GEMMA2

def set_vocab(self):
self._set_vocab_sentencepiece()

# PaliGemmaForConditionalGeneration uses tokenizer.json (no tokenizer.model)
if (self.dir_model / "tokenizer.model").is_file():
self._set_vocab_sentencepiece()
else:
self._set_vocab_gpt2()
self.gguf_writer.add_add_space_prefix(False)

def set_gguf_parameters(self):
@@ -7116,6 +7119,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
return

# PaliGemmaForConditionalGeneration wraps Gemma2 under language_model.*;
# strip the prefix so the standard tensor map works unchanged.
if name.startswith("language_model."):
name = name[len("language_model."):]
elif not name.startswith("model."):
return # skip vision_tower.*, multi_modal_projector.* tensors

# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
if name.endswith("norm.weight"):
data_torch = data_torch + 1
@@ -7309,6 +7319,28 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
return # skip other tensors


@ModelBase.register("PaliGemmaForConditionalGeneration")
class PaliGemma2VisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
vision_config = self.hparams.get("vision_config", self.hparams)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PALIGEMMA2)
self.gguf_writer.add_vision_attention_layernorm_eps(vision_config.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)

def tensor_force_quant(self, name, new_name, bid, n_dims):
if "input_projection" in name:
return gguf.GGMLQuantizationType.F16
if ".embeddings." in name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
yield from super().modify_tensors(data_torch, name, bid)
# skip language_model.* and all other tensors


class ConformerAudioModel(MmprojModel):
_batch_norm_tensors: list[dict[str, Tensor]] | None = None

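
For readers following the converter change: PaliGemma2 checkpoints nest the Gemma2 weights under a language_model. prefix, while vision_tower.* and multi_modal_projector.* belong to the mmproj converter. A minimal sketch of that routing (a hypothetical standalone helper, not code from this PR):

    # Hypothetical helper mirroring the tensor routing above (not part of the PR).
    def route_text_tensor(name: str) -> str | None:
        if name.startswith("language_model."):
            # "language_model.model.layers.0..." -> "model.layers.0..."
            return name[len("language_model."):]
        if name.startswith("model."):
            return name  # plain Gemma2 checkpoint: keep as-is
        return None      # vision_tower.* / multi_modal_projector.*: handled by the mmproj model

    assert route_text_tensor("language_model.model.embed_tokens.weight") == "model.embed_tokens.weight"
    assert route_text_tensor("vision_tower.vision_model.head.mlp.fc1.weight") is None
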
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
@@ -4133,6 +4133,7 @@ class VisionProjectorType:
QWEN2VL = "qwen2vl_merger"
QWEN25VL = "qwen2.5vl_merger"
QWEN3VL = "qwen3vl_merger"
PALIGEMMA2 = "paligemma2"
STEP3VL = "step3vl"
ULTRAVOX = "ultravox"
INTERNVL = "internvl"
1 change: 1 addition & 0 deletions gguf-py/gguf/tensor_mapping.py
@@ -1678,6 +1678,7 @@ class TensorNameMap:

MODEL_TENSOR.V_MM_INP_PROJ: (
"multi_modal_projector.mm_input_projection",
"multi_modal_projector.linear", # paligemma2
),

MODEL_TENSOR.V_MM_INP_NORM: (
5 changes: 4 additions & 1 deletion tools/mtmd/clip-impl.h
@@ -107,7 +107,8 @@
#define TN_IMAGE_SEPERATOR "v.view_seperator"
#define TN_MM_INP_NORM "mm.input_norm.weight"
#define TN_MM_INP_NORM_B "mm.input_norm.bias"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3, paligemma2
#define TN_MM_INP_PROJ_B "mm.input_projection.bias" // paligemma2: projector has bias
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.%s" // idefics3, deepseekocr
#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
@@ -268,6 +269,7 @@ enum projector_type {
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_QWEN3VL,
PROJECTOR_TYPE_PALIGEMMA2,
PROJECTOR_TYPE_STEP3VL,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_GEMMA3NV,
@@ -316,6 +318,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
{ PROJECTOR_TYPE_PALIGEMMA2, "paligemma2"},
{ PROJECTOR_TYPE_STEP3VL, "step3vl"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
3 changes: 2 additions & 1 deletion tools/mtmd/clip-model.h
@@ -403,8 +403,9 @@ struct clip_model {
ggml_tensor * mm_model_ln_post_w = nullptr;
ggml_tensor * mm_model_ln_post_b = nullptr;

// gemma3
// gemma3 / paligemma2
ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_input_proj_b = nullptr; // paligemma2: linear projector bias
ggml_tensor * mm_soft_emb_norm_w = nullptr;

// mobilenetv5 for gemma3n
18 changes: 18 additions & 0 deletions tools/mtmd/clip.cpp
@@ -832,6 +832,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
std::unique_ptr<clip_graph> builder;

switch (ctx->proj_type()) {
case PROJECTOR_TYPE_PALIGEMMA2:
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_LFM2:
@@ -1321,6 +1322,13 @@ struct clip_model_loader {
hparams.set_limit_image_tokens(2, 4096);
}
} break;
case PROJECTOR_TYPE_PALIGEMMA2:
{
// PaliGemma2: no patch merging (n_merge=1), bilinear resize
// 224px → 16x16 = 256 tokens; 448px → 32x32 = 1024 tokens
hparams.n_merge = 1;
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
} break;
case PROJECTOR_TYPE_GEMMA3:
{
// default value (used by all model sizes in gemma 3 family)
@@ -1927,6 +1935,11 @@ struct clip_model_loader {
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
} break;
case PROJECTOR_TYPE_PALIGEMMA2:
{
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_input_proj_b = get_tensor(TN_MM_INP_PROJ_B, false);
} break;
case PROJECTOR_TYPE_GEMMA3:
{
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
@@ -2977,6 +2990,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
int y_patch = img->ny / (params.patch_size * params.n_merge);
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_PALIGEMMA2:
break; // no pooling: all patches pass through (256 for 224px, 1024 for 448px)
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_IDEFICS3:
@@ -3528,6 +3543,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("rel_pos_indices_local", rel_pos_indices_local);
set_input_i32("rel_pos_indices_global", rel_pos_indices_global);
} break;
case PROJECTOR_TYPE_PALIGEMMA2:
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
case PROJECTOR_TYPE_IDEFICS3:
@@ -3809,6 +3825,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
case PROJECTOR_TYPE_STEP3VL:
return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_PALIGEMMA2:
return ctx->model.mm_input_proj_w->ne[1]; // output dim = LM hidden_size
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0];
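
The token counts quoted in the clip.cpp comments follow directly from the SigLIP patch grid: with no patch merging (n_merge=1), every patch becomes one output token. A quick check, assuming the usual SigLIP patch size of 14 (taken from the checkpoint's vision_config, which is not shown in this diff):

    # Token-count check for the figures in the comments above.
    # Assumes SigLIP patch_size = 14 (from the checkpoint's vision_config).
    def n_output_tokens(image_size: int, patch_size: int = 14, n_merge: int = 1) -> int:
        per_side = image_size // (patch_size * n_merge)
        return per_side * per_side

    assert n_output_tokens(224) == 256
    assert n_output_tokens(448) == 1024
    assert n_output_tokens(896) == 4096  # the 896px PaliGemma2 variant
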
9 changes: 9 additions & 0 deletions tools/mtmd/models/siglip.cpp
@@ -83,6 +83,15 @@ ggml_cgraph * clip_graph_siglip::build() {
FFN_GELU,
-1);

} else if (proj_type == PROJECTOR_TYPE_PALIGEMMA2) {
// PaliGemma2: direct linear projection, no pooling, no norm before projector
// weight stored as [in=1152, out=2304] in ggml → mul_mat directly (no transpose)
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py
cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
if (model.mm_input_proj_b) {
cur = ggml_add(ctx0, cur, model.mm_input_proj_b);
}

} else {
GGML_ABORT("SigLIP: Unsupported projector type");
}
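
To make the projector's shape handling concrete: the new branch in siglip.cpp is a single affine map from the SigLIP embedding width to the language model's hidden size. A NumPy sketch under assumed dimensions (1152 -> 2304, the figures quoted in the comment for the 3B model; other variants differ):

    import numpy as np

    # Shape sketch of the PaliGemma2 projector: one linear layer, no pooling.
    # ggml_mul_mat with the weight stored as [in, out] corresponds to x @ W
    # here, hence "no transpose" in the comment above.
    n_tokens, d_vision, d_model = 256, 1152, 2304
    x = np.random.randn(n_tokens, d_vision).astype(np.float32)  # SigLIP output
    W = np.random.randn(d_vision, d_model).astype(np.float32)   # mm.input_projection.weight
    b = np.zeros(d_model, dtype=np.float32)                     # mm.input_projection.bias (optional)

    y = x @ W + b
    assert y.shape == (256, 2304)  # 256 image tokens in the LM embedding space
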
25 changes: 17 additions & 8 deletions tools/mtmd/mtmd-cli.cpp
@@ -81,7 +81,8 @@ struct mtmd_cli_context {
// chat template
common_chat_templates_ptr tmpls;
std::vector<common_chat_msg> chat_history;
bool use_jinja = false;
bool use_jinja = false;
bool raw_prompt = false; // true when model has no chat template (e.g. PaliGemma2 PT)
// TODO: support for --system-prompt with /clear command

// support for legacy templates (models not having EOT token)
@@ -105,12 +106,12 @@ struct mtmd_cli_context {
exit(1);
}

if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
LOG_ERR("Model does not have chat template.\n");
LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");
LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n");
LOG_ERR(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
exit(1);
raw_prompt = !llama_model_chat_template(model, nullptr) && params.chat_template.empty();
if (raw_prompt) {
LOG_WRN("Model does not have chat template — using raw prompt mode (e.g. PaliGemma2 PT).\n");
LOG_WRN(" For old llava models, you may need to use '--chat-template vicuna'\n");
LOG_WRN(" For MobileVLM models, use '--chat-template deepseek'\n");
LOG_WRN(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
}

tmpls = common_chat_templates_init(model, params.chat_template);
@@ -230,7 +231,15 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & msg) {

static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
bool add_bos = ctx.chat_history.empty();
auto formatted_chat = chat_add_and_format(ctx, msg);
std::string formatted_chat;
if (ctx.raw_prompt) {
// raw mode: pass content directly without chat template wrapping
// (used by pre-trained models like PaliGemma2 PT)
formatted_chat = msg.content;
ctx.chat_history.push_back(msg);
} else {
formatted_chat = chat_add_and_format(ctx, msg);
}
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());

mtmd_input_text text;
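
An illustrative contrast of the two prompt paths above: raw mode forwards the message content untouched, which matches what PaliGemma2 PT checkpoints expect (task-style prompts such as "caption en" or "describe en"), while templated models still get chat-turn wrapping. The Gemma-style template string below is an assumption for illustration, not taken from this PR:

    # Illustrative only: raw mode (what this PR adds) vs. templated mode.
    def format_prompt(content: str, has_template: bool) -> str:
        if not has_template:
            return content  # raw mode: no role markers, no template wrapping
        # assumed Gemma-style chat template, for contrast
        return f"<start_of_turn>user\n{content}<end_of_turn>\n<start_of_turn>model\n"

    print(format_prompt("caption en", has_template=False))  # -> "caption en"
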
7 changes: 7 additions & 0 deletions tools/mtmd/mtmd.cpp
@@ -339,6 +339,12 @@ struct mtmd_context {
// script.
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
} break;
case PROJECTOR_TYPE_PALIGEMMA2:
{
// PaliGemma2: image embeddings injected as prefix tokens without wrapper
// 224px → 256 tokens; 448px → 1024 tokens
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
{
@@ -1079,6 +1085,7 @@ bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk
proj_type = ctx->proj_type_a();
}
switch (proj_type) {
case PROJECTOR_TYPE_PALIGEMMA2:
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA4V:
return true;
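
For reference, the fixed-size preprocessing selected here reduces to a bilinear resize plus normalization before patching. A rough Python equivalent, assuming PIL and the standard SigLIP mean/std of 0.5 (the real logic lives in mtmd_image_preprocessor_fixed_size):

    import numpy as np
    from PIL import Image

    # Rough sketch of the fixed-size preprocessing path, assuming SigLIP
    # normalization (mean = std = 0.5); not the actual mtmd implementation.
    def preprocess(path: str, image_size: int = 224) -> np.ndarray:
        img = Image.open(path).convert("RGB").resize((image_size, image_size), Image.BILINEAR)
        x = np.asarray(img, dtype=np.float32) / 255.0
        return (x - 0.5) / 0.5  # values in [-1, 1], shape (image_size, image_size, 3)

    # 224px -> (224/14)^2 = 256 image tokens injected as a prefix before the text
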