@@ -10611,6 +10611,18 @@ def set_gguf_parameters(self):
         logger.info("gguf: (granite) logits_scale = %s", logits_scale)


+@ModelBase.register("GraniteSpeechForConditionalGeneration", ModelType.TEXT)
+class GraniteSpeechTextModel(GraniteModel):
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith(("encoder.", "projector.")):
+            return
+        if name.startswith("language_model."):
+            name = name[len("language_model."):]
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
 class GraniteMoeModel(GraniteModel):
     """Conversion for IBM's GraniteMoeForCausalLM"""
@@ -10912,14 +10924,14 @@ def set_vocab(self):
             vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
             self.hparams["vocab_size"] = vocab_size

-        assert max(tokenizer.vocab.values()) < vocab_size  # ty: ignore[unresolved-attribute]
+        assert max(tokenizer.vocab.values()) < vocab_size

         tokpre = self.get_vocab_base_pre(tokenizer)

-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}  # ty: ignore[unresolved-attribute]
-        added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()

-        added_tokens_decoder = tokenizer.added_tokens_decoder  # ty: ignore[unresolved-attribute]
+        added_tokens_decoder = tokenizer.added_tokens_decoder

         for i in range(vocab_size):
             if i not in reverse_vocab:
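
Aside: the padding expression -(vocab_size // -pad_vocab) * pad_vocab above rounds the vocabulary size up to the next multiple of pad_vocab; in Python, -(n // -k) is ceiling division by a positive k. A quick standalone check with made-up numbers:

    n, k = 100261, 64
    padded = -(n // -k) * k  # -(n // -k) == ceil(n / k) for positive k
    assert padded == 100288 and padded % k == 0 and 0 <= padded - n < k
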
@@ -10930,7 +10942,7 @@ def set_vocab(self):
                 if token in added_vocab:
                     if not added_tokens_decoder[i].normalized:
                         previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))  # ty: ignore[unresolved-attribute, invalid-assignment]
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
                         if previous_token != token:
                             logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

@@ -12347,6 +12359,154 @@ def modify_tensors(self, data_torch, name, bid):
         yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("GraniteSpeechForConditionalGeneration", ModelType.MMPROJ)
+class GraniteSpeechMmprojModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("encoder_config")
+
+    def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
+        a = self.hparams_audio
+        a["hidden_size"] = a["hidden_dim"]
+        a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"]
+        a["num_attention_heads"] = a["num_heads"]
+        a["num_hidden_layers"] = a["num_layers"]
+
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH)
+        self.gguf_writer.add_audio_num_mel_bins(a["input_dim"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if "encoder" in name or "projector" in name:
+            if ".conv" in name and ".weight" in name:
+                return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("language_model."):
+            return
+        if "attention_dists" in name:
+            return
+        if "num_batches_tracked" in name:
+            return
+
+        # fold running_mean, running_var and eps into weight and bias for batch_norm
+        if "batch_norm" in name and "encoder.layers." in name:
+            if self._batch_norm_tensors is None:
+                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
+            assert bid is not None
+            self._batch_norm_tensors[bid][name] = data_torch
+            if len(self._batch_norm_tensors[bid]) < 4:
+                return
+            prefix = f"encoder.layers.{bid}.conv.batch_norm"
+            weight = self._batch_norm_tensors[bid][f"{prefix}.weight"]
+            bias = self._batch_norm_tensors[bid][f"{prefix}.bias"]
+            running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"]
+            running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"]
+            eps = 1e-5
+            a = weight / torch.sqrt(running_var + eps)
+            b = bias - running_mean * a
+            yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid)
+            yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid)
+            return
+
+        if ".attn.to_kv.weight" in name:
+            k_weight, v_weight = data_torch.chunk(2, dim=0)
+            yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid)
+            yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid)
+            return
+
+        if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"):
+            if data_torch.ndim == 3 and data_torch.shape[2] == 1:
+                data_torch = data_torch.squeeze(2)
+
+        if "depth_conv" in name and name.endswith(".weight"):
+            if data_torch.ndim == 3 and data_torch.shape[1] == 1:
+                data_torch = data_torch.squeeze(1)
+
+        if name.startswith("projector."):
+            gguf_name = self._map_projector_tensor(name)
+            if gguf_name is None:
+                return
+            yield (gguf_name, data_torch)
+            return
+
+        global_map = {
+            "encoder.input_linear.weight": "a.enc_inp_linear.weight",
+            "encoder.input_linear.bias": "a.enc_inp_linear.bias",
+            "encoder.out.weight": "a.enc_ctc_out.weight",
+            "encoder.out.bias": "a.enc_ctc_out.bias",
+            "encoder.out_mid.weight": "a.enc_ctc_out_mid.weight",
+            "encoder.out_mid.bias": "a.enc_ctc_out_mid.bias",
+        }
+        if name in global_map:
+            yield (global_map[name], data_torch)
+            return
+
+        if ".attn.rel_pos_emb.weight" in name:
+            assert bid is not None
+            yield (f"a.blk.{bid}.attn_rel_pos_emb", data_torch)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    @staticmethod
+    def _map_projector_tensor(name: str) -> str | None:
+        static_map = {
+            "projector.query": "a.proj_query",
+            "projector.qformer.layernorm.weight": "a.proj_norm.weight",
+            "projector.qformer.layernorm.bias": "a.proj_norm.bias",
+            "projector.linear.weight": "a.proj_linear.weight",
+            "projector.linear.bias": "a.proj_linear.bias",
+        }
+        if name in static_map:
+            return static_map[name]
+        m = re.match(r"projector\.qformer\.encoder\.layer\.(\d+)\.(.*)", name)
+        if not m:
+            return None
+        lid = m.group(1)
+        rest = m.group(2)
+        layer_map = {
+            "attention.attention.query.weight": "self_attn_q.weight",
+            "attention.attention.query.bias": "self_attn_q.bias",
+            "attention.attention.key.weight": "self_attn_k.weight",
+            "attention.attention.key.bias": "self_attn_k.bias",
+            "attention.attention.value.weight": "self_attn_v.weight",
+            "attention.attention.value.bias": "self_attn_v.bias",
+            "attention.output.dense.weight": "self_attn_out.weight",
+            "attention.output.dense.bias": "self_attn_out.bias",
+            "attention.output.LayerNorm.weight": "self_attn_norm.weight",
+            "attention.output.LayerNorm.bias": "self_attn_norm.bias",
+            "crossattention.attention.query.weight": "cross_attn_q.weight",
+            "crossattention.attention.query.bias": "cross_attn_q.bias",
+            "crossattention.attention.key.weight": "cross_attn_k.weight",
+            "crossattention.attention.key.bias": "cross_attn_k.bias",
+            "crossattention.attention.value.weight": "cross_attn_v.weight",
+            "crossattention.attention.value.bias": "cross_attn_v.bias",
+            "crossattention.output.dense.weight": "cross_attn_out.weight",
+            "crossattention.output.dense.bias": "cross_attn_out.bias",
+            "crossattention.output.LayerNorm.weight": "cross_attn_norm.weight",
+            "crossattention.output.LayerNorm.bias": "cross_attn_norm.bias",
+            "intermediate_query.dense.weight": "ffn_up.weight",
+            "intermediate_query.dense.bias": "ffn_up.bias",
+            "output_query.dense.weight": "ffn_down.weight",
+            "output_query.dense.bias": "ffn_down.bias",
+            "output_query.LayerNorm.weight": "ffn_norm.weight",
+            "output_query.LayerNorm.bias": "ffn_norm.bias",
+        }
+        suffix = layer_map.get(rest)
+        if suffix is None:
+            return None
+        return f"a.proj_blk.{lid}.{suffix}"
+
+
 @ModelBase.register("Lfm25AudioTokenizer")
 class LFM25AudioTokenizer(LFM2Model):
     model_arch = gguf.MODEL_ARCH.LFM2
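
Note on the batch-norm folding above: in eval mode, BatchNorm1d computes y = (x - running_mean) / sqrt(running_var + eps) * weight + bias, which collapses to the affine y = x * a + b with the a and b computed in modify_tensors, so the GGUF only needs a plain scale and shift. A standalone sanity check of that algebra (a sketch, not part of the patch; eps matches the hardcoded 1e-5):

    import torch

    bn = torch.nn.BatchNorm1d(8).eval()   # eval mode uses running stats; bn.eps defaults to 1e-5
    bn.running_mean.uniform_(-1.0, 1.0)   # fake running stats
    bn.running_var.uniform_(0.5, 2.0)
    torch.nn.init.uniform_(bn.weight, 0.5, 1.5)
    torch.nn.init.uniform_(bn.bias, -1.0, 1.0)

    # same folding as the converter
    a = (bn.weight / torch.sqrt(bn.running_var + bn.eps)).detach()
    b = (bn.bias - bn.running_mean * a).detach()

    x = torch.randn(4, 8)
    torch.testing.assert_close(bn(x), x * a + b)
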
@@ -13356,6 +13516,8 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
     # TODO: refactor this later to avoid adding exception here
     if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
         return arch
+    if model_type == ModelType.TEXT and arch == "GraniteSpeechForConditionalGeneration":
+        return arch

     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
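
With these registrations in place, a GraniteSpeech checkpoint should convert in the usual two passes for multimodal models: a plain run of convert_hf_to_gguf.py emits the Granite text model (GraniteSpeechTextModel skips encoder.* and projector.* tensors and strips the language_model. prefix), and a run in the converter's --mmproj mode emits the audio encoder plus Q-Former projector as a separate GGUF, with projector tensors renamed through _map_projector_tensor (for example, projector.qformer.encoder.layer.0.crossattention.attention.key.weight becomes a.proj_blk.0.cross_attn_k.weight).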