Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 10 additions & 60 deletions conversion/hunyuan.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@ def set_vocab(self):
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)

# HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
# Some HunYuanVL variants (e.g. OCR-style configs) have pad_token_id=-1;
# guard SpecialVocab so it doesn't try to emit an invalid pad id.
token_types = None
if (self.hparams.get("pad_token_id") or 0) < 0:
token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
Expand Down Expand Up @@ -250,7 +251,8 @@ def set_vocab(self):
self._fix_special_tokens()

def set_gguf_parameters(self):
# HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
# Some HunYuanVL variants set num_experts=1 (not real MoE);
# prevent the parent class from emitting expert_count metadata in that case.
saved_num_experts = self.hparams.pop("num_experts", None)
super().set_gguf_parameters()
if saved_num_experts is not None and saved_num_experts > 1:
Expand Down Expand Up @@ -288,51 +290,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanVLVisionModel(MmprojModel):
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
# Each variant maps to a different projector type in clip.cpp so image
# preprocessing follows the correct code path.

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
# HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
# HunyuanVL uses max_image_size instead of image_size
if "image_size" not in self.hparams_vision:
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)

@staticmethod
def is_ocr_variant(hparams: dict) -> bool:
"""Return True for HunyuanOCR, False for HunyuanVL.

The projector's output dim must equal the text model's hidden_size by
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
ViT -> LLM projection dim is a hard architectural signature, not a
magic number.
"""
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
return vision_out == 1024

def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
vcfg = self.hparams_vision

if self.is_ocr_variant(self.global_config):
# --- HunyuanOCR ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
return

# --- HunyuanVL ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))

Expand All @@ -353,48 +325,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

def tensor_force_quant(self, name, new_name, bid, n_dims):
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
# HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)


@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanVLTextModel(HunYuanModel):
# The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
# and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
# while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
# the config and pick the matching GGUF architecture.
model_arch = gguf.MODEL_ARCH.HUNYUAN_VL

@staticmethod
def _is_ocr_config(hparams: dict) -> bool:
# OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
# outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
# HunyuanVLVisionModel.is_ocr_variant.
return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024

def __init__(self, dir_model: Path, *args, **kwargs):
raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
if self._is_ocr_config(raw_hparams):
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
else:
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
super().__init__(dir_model, *args, **kwargs)

def set_gguf_parameters(self):
super().set_gguf_parameters()

# Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
# the HunYuan-Dense arch which already handles standard rope in super().
if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
return

# XD-RoPE metadata for the HunyuanVL;
if self.rope_parameters.get("rope_type") != "xdrope":
return

# defaults for HunyuanVL. The C++ side later computes:
# freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
Expand Down
7 changes: 3 additions & 4 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -747,7 +747,7 @@ class MODEL_TENSOR(IntEnum):
V_LAYER_OUT_SCALE = auto()
V_PRE_NORM = auto()
V_POST_NORM = auto()
V_MM_PRE_NORM = auto() # hunyuanocr
V_MM_PRE_NORM = auto() # hunyuanvl
V_MM_POST_NORM = auto()
V_MM_INP_NORM = auto()
V_MM_INP_PROJ = auto() # gemma3
Expand Down Expand Up @@ -791,8 +791,8 @@ class MODEL_TENSOR(IntEnum):
V_MM_GATE = auto() # cogvlm
V_TOK_BOI = auto() # cogvlm
V_TOK_EOI = auto() # cogvlm
V_TOK_IMG_BEGIN = auto() # hunyuanocr
V_TOK_IMG_END = auto() # hunyuanocr
V_TOK_IMG_BEGIN = auto() # hunyuanvl
V_TOK_IMG_END = auto() # hunyuanvl
V_STD_BIAS = auto() # gemma4
V_STD_SCALE = auto() # gemma4
V_SAM_POS_EMBD = auto() # Deepseek-OCR
Expand Down Expand Up @@ -4273,7 +4273,6 @@ class VisionProjectorType:
GLM4V = "glm4v"
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
HUNYUANVL = "hunyuanvl"
MINICPMV4_6 = "minicpmv4_6"
GRANITE_SPEECH = "granite_speech" # audio
Expand Down
36 changes: 18 additions & 18 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1366,15 +1366,15 @@ class TensorNameMap:
"mlp_AR.linear_{bid}", # PaddleOCR-VL
"merger.mlp.{bid}",
"vision_tower.merger.mlp.{bid}", # dots.ocr
"vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
"vit.perceive.proj.{bid}", # HunyuanVL (proj.0 = conv1, proj.2 = conv2)
),

MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
"model.vision.linear_proj.linear_proj", # cogvlm
"model.projector.layers", # Deepseek-OCR
"visual.merger.proj", # glm4v
"vit.perceive.mlp", # HunyuanOCR
"vit.perceive.mlp", # HunyuanVL
),

MODEL_TENSOR.V_MMPROJ_MLP: (
Expand Down Expand Up @@ -1403,7 +1403,7 @@ class TensorNameMap:
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
"vit.embeddings.patch_embedding", # HunyuanOCR
"vit.embeddings.patch_embedding", # HunyuanVL
"vision_tower.patch_conv", # pixtral-hf
"vision_encoder.patch_conv", # pixtral
"vision_model.patch_embedding.linear", # llama 4
Expand All @@ -1429,7 +1429,7 @@ class TensorNameMap:
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
"vpm.embeddings.position_embedding",
"model.vision_model.embeddings.position_embedding", # SmolVLM
"vit.embeddings.position_embedding", # HunyuanOCR
"vit.embeddings.position_embedding", # HunyuanVL
"vision_model.positional_embedding_vlm", # llama 4
"vision_tower.patch_embed.pos_emb", # kimi-vl
"visual.pos_embed", # qwen3vl
Expand All @@ -1442,12 +1442,12 @@ class TensorNameMap:

MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
"model.image_newline", # Deepseek-OCR
"vit.perceive.image_newline", # HunyuanOCR
"vit.perceive.image_newline", # HunyuanVL
),

MODEL_TENSOR.V_ENC_EMBD_VSEP: (
"model.view_seperator", # Deepseek-OCR
"vit.perceive.image_sep", # HunyuanOCR
"vit.perceive.image_sep", # HunyuanVL
),

MODEL_TENSOR.V_ENC_ATTN_QKV: (
Expand All @@ -1466,7 +1466,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
"vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
"vit.layers.{bid}.self_attn.q_proj", # HunyuanVL
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
Expand All @@ -1490,7 +1490,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
"vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
"vit.layers.{bid}.self_attn.k_proj", # HunyuanVL
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
Expand All @@ -1514,7 +1514,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
"vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
"vit.layers.{bid}.self_attn.v_proj", # HunyuanVL
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
Expand All @@ -1532,7 +1532,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
"vit.layers.{bid}.input_layernorm", # HunyuanOCR
"vit.layers.{bid}.input_layernorm", # HunyuanVL
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
Expand All @@ -1553,7 +1553,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
"vit.layers.{bid}.self_attn.o_proj", # HunyuanVL
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
Expand All @@ -1580,7 +1580,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
"vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
"vit.layers.{bid}.post_attention_layernorm", # HunyuanVL
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
Expand All @@ -1601,7 +1601,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
"vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
"vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanVL
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
Expand Down Expand Up @@ -1630,7 +1630,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
"vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
"vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanVL
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
Expand Down Expand Up @@ -1694,7 +1694,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_POST_NORM: (
"visual.merger.post_projection_norm", # glm4v
"vision_tower.post_trunk_norm", # dots.ocr
"vit.perceive.after_rms", # HunyuanOCR
"vit.perceive.after_rms", # HunyuanVL
),

MODEL_TENSOR.V_MM_INP_PROJ: (
Expand Down Expand Up @@ -1899,15 +1899,15 @@ class TensorNameMap:
),

MODEL_TENSOR.V_MM_PRE_NORM: (
"vit.perceive.before_rms", # HunyuanOCR
"vit.perceive.before_rms", # HunyuanVL
),

MODEL_TENSOR.V_TOK_IMG_BEGIN: (
"vit.perceive.image_begin", # HunyuanOCR
"vit.perceive.image_begin", # HunyuanVL
),

MODEL_TENSOR.V_TOK_IMG_END: (
"vit.perceive.image_end", # HunyuanOCR
"vit.perceive.image_end", # HunyuanVL
),

MODEL_TENSOR.V_STD_BIAS: (
Expand Down
8 changes: 4 additions & 4 deletions src/llama-chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
{ "hunyuan-vl", LLM_CHAT_TEMPLATE_HUNYUAN_VL },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
Expand Down Expand Up @@ -218,7 +218,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
return LLM_CHAT_TEMPLATE_HUNYUAN_VL;
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
Expand Down Expand Up @@ -825,8 +825,8 @@ int32_t llm_chat_apply_template(
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
// tencent/HunyuanOCR
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) {
// tencent/HunyuanOCR & tencent/HunyuanVL
ss << "<|hy_begin▁of▁sentence|>";
for (size_t i = 0; i < chat.size(); i++) {
std::string role(chat[i]->role);
Expand Down
2 changes: 1 addition & 1 deletion src/llama-chat.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
LLM_CHAT_TEMPLATE_HUNYUAN_VL,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,
Expand Down
Loading
Loading