Skip to content

Commit 5493edb

Browse files
mtmd: use tensor_mapping.py for all granite_speech tensors
1 parent bbf2625 commit 5493edb

4 files changed

Lines changed: 120 additions & 75 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 0 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -12439,81 +12439,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1243912439
if data_torch.ndim == 3 and data_torch.shape[1] == 1:
1244012440
data_torch = data_torch.squeeze(1)
1244112441

12442-
if name.startswith("projector."):
12443-
gguf_name = self._map_projector_tensor(name)
12444-
if gguf_name is None:
12445-
return
12446-
yield (gguf_name, data_torch)
12447-
return
12448-
12449-
global_map = {
12450-
"encoder.input_linear.weight": "a.enc_inp_linear.weight",
12451-
"encoder.input_linear.bias": "a.enc_inp_linear.bias",
12452-
"encoder.out.weight": "a.enc_ctc_out.weight",
12453-
"encoder.out.bias": "a.enc_ctc_out.bias",
12454-
"encoder.out_mid.weight": "a.enc_ctc_out_mid.weight",
12455-
"encoder.out_mid.bias": "a.enc_ctc_out_mid.bias",
12456-
}
12457-
if name in global_map:
12458-
yield (global_map[name], data_torch)
12459-
return
12460-
12461-
if ".attn.rel_pos_emb.weight" in name:
12462-
assert bid is not None
12463-
yield (f"a.blk.{bid}.attn_rel_pos_emb", data_torch)
12464-
return
12465-
1246612442
yield from super().modify_tensors(data_torch, name, bid)
1246712443

12468-
@staticmethod
12469-
def _map_projector_tensor(name: str) -> str | None:
12470-
static_map = {
12471-
"projector.query": "a.proj_query",
12472-
"projector.qformer.layernorm.weight": "a.proj_norm.weight",
12473-
"projector.qformer.layernorm.bias": "a.proj_norm.bias",
12474-
"projector.linear.weight": "a.proj_linear.weight",
12475-
"projector.linear.bias": "a.proj_linear.bias",
12476-
}
12477-
if name in static_map:
12478-
return static_map[name]
12479-
m = re.match(r"projector\.qformer\.encoder\.layer\.(\d+)\.(.*)", name)
12480-
if not m:
12481-
return None
12482-
lid = m.group(1)
12483-
rest = m.group(2)
12484-
layer_map = {
12485-
"attention.attention.query.weight": "self_attn_q.weight",
12486-
"attention.attention.query.bias": "self_attn_q.bias",
12487-
"attention.attention.key.weight": "self_attn_k.weight",
12488-
"attention.attention.key.bias": "self_attn_k.bias",
12489-
"attention.attention.value.weight": "self_attn_v.weight",
12490-
"attention.attention.value.bias": "self_attn_v.bias",
12491-
"attention.output.dense.weight": "self_attn_out.weight",
12492-
"attention.output.dense.bias": "self_attn_out.bias",
12493-
"attention.output.LayerNorm.weight": "self_attn_norm.weight",
12494-
"attention.output.LayerNorm.bias": "self_attn_norm.bias",
12495-
"crossattention.attention.query.weight": "cross_attn_q.weight",
12496-
"crossattention.attention.query.bias": "cross_attn_q.bias",
12497-
"crossattention.attention.key.weight": "cross_attn_k.weight",
12498-
"crossattention.attention.key.bias": "cross_attn_k.bias",
12499-
"crossattention.attention.value.weight": "cross_attn_v.weight",
12500-
"crossattention.attention.value.bias": "cross_attn_v.bias",
12501-
"crossattention.output.dense.weight": "cross_attn_out.weight",
12502-
"crossattention.output.dense.bias": "cross_attn_out.bias",
12503-
"crossattention.output.LayerNorm.weight": "cross_attn_norm.weight",
12504-
"crossattention.output.LayerNorm.bias": "cross_attn_norm.bias",
12505-
"intermediate_query.dense.weight": "ffn_up.weight",
12506-
"intermediate_query.dense.bias": "ffn_up.bias",
12507-
"output_query.dense.weight": "ffn_down.weight",
12508-
"output_query.dense.bias": "ffn_down.bias",
12509-
"output_query.LayerNorm.weight": "ffn_norm.weight",
12510-
"output_query.LayerNorm.bias": "ffn_norm.bias",
12511-
}
12512-
suffix = layer_map.get(rest)
12513-
if suffix is None:
12514-
return None
12515-
return f"a.proj_blk.{lid}.{suffix}"
12516-
1251712444

1251812445
@ModelBase.register("Lfm25AudioTokenizer")
1251912446
class LFM25AudioTokenizer(LFM2Model):

gguf-py/gguf/constants.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -852,6 +852,26 @@ class MODEL_TENSOR(IntEnum):
852852
A_ENC_CONV_NORM = auto() # SSM conv
853853
A_ENC_CONV_PW1 = auto()
854854
A_ENC_CONV_PW2 = auto()
855+
A_CTC_OUT = auto()
856+
A_CTC_OUT_MID = auto()
857+
A_ENC_ATTN_REL_POS_EMB = auto()
858+
# qformer projector
859+
A_QF_PROJ_QUERY = auto()
860+
A_QF_PROJ_NORM = auto()
861+
A_QF_PROJ_LINEAR = auto()
862+
A_QF_SELF_ATTN_Q = auto()
863+
A_QF_SELF_ATTN_K = auto()
864+
A_QF_SELF_ATTN_V = auto()
865+
A_QF_SELF_ATTN_O = auto()
866+
A_QF_SELF_ATTN_NORM = auto()
867+
A_QF_CROSS_ATTN_Q = auto()
868+
A_QF_CROSS_ATTN_K = auto()
869+
A_QF_CROSS_ATTN_V = auto()
870+
A_QF_CROSS_ATTN_O = auto()
871+
A_QF_CROSS_ATTN_NORM = auto()
872+
A_QF_FFN_UP = auto()
873+
A_QF_FFN_DOWN = auto()
874+
A_QF_FFN_NORM = auto()
855875

856876

857877
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -1330,6 +1350,26 @@ class MODEL_TENSOR(IntEnum):
13301350
MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm",
13311351
MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1",
13321352
MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2",
1353+
MODEL_TENSOR.A_CTC_OUT: "a.enc_ctc_out",
1354+
MODEL_TENSOR.A_CTC_OUT_MID: "a.enc_ctc_out_mid",
1355+
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: "a.blk.{bid}.attn_rel_pos_emb",
1356+
# qformer projector
1357+
MODEL_TENSOR.A_QF_PROJ_QUERY: "a.proj_query",
1358+
MODEL_TENSOR.A_QF_PROJ_NORM: "a.proj_norm",
1359+
MODEL_TENSOR.A_QF_PROJ_LINEAR: "a.proj_linear",
1360+
MODEL_TENSOR.A_QF_SELF_ATTN_Q: "a.proj_blk.{bid}.self_attn_q",
1361+
MODEL_TENSOR.A_QF_SELF_ATTN_K: "a.proj_blk.{bid}.self_attn_k",
1362+
MODEL_TENSOR.A_QF_SELF_ATTN_V: "a.proj_blk.{bid}.self_attn_v",
1363+
MODEL_TENSOR.A_QF_SELF_ATTN_O: "a.proj_blk.{bid}.self_attn_out",
1364+
MODEL_TENSOR.A_QF_SELF_ATTN_NORM: "a.proj_blk.{bid}.self_attn_norm",
1365+
MODEL_TENSOR.A_QF_CROSS_ATTN_Q: "a.proj_blk.{bid}.cross_attn_q",
1366+
MODEL_TENSOR.A_QF_CROSS_ATTN_K: "a.proj_blk.{bid}.cross_attn_k",
1367+
MODEL_TENSOR.A_QF_CROSS_ATTN_V: "a.proj_blk.{bid}.cross_attn_v",
1368+
MODEL_TENSOR.A_QF_CROSS_ATTN_O: "a.proj_blk.{bid}.cross_attn_out",
1369+
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: "a.proj_blk.{bid}.cross_attn_norm",
1370+
MODEL_TENSOR.A_QF_FFN_UP: "a.proj_blk.{bid}.ffn_up",
1371+
MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down",
1372+
MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm",
13331373
# NextN/MTP
13341374
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
13351375
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
@@ -1477,6 +1517,26 @@ class MODEL_TENSOR(IntEnum):
14771517
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
14781518
MODEL_TENSOR.A_PER_DIM_K_SCALE,
14791519
MODEL_TENSOR.A_PER_DIM_SCALE,
1520+
MODEL_TENSOR.A_CTC_OUT,
1521+
MODEL_TENSOR.A_CTC_OUT_MID,
1522+
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB,
1523+
# qformer projector
1524+
MODEL_TENSOR.A_QF_PROJ_QUERY,
1525+
MODEL_TENSOR.A_QF_PROJ_NORM,
1526+
MODEL_TENSOR.A_QF_PROJ_LINEAR,
1527+
MODEL_TENSOR.A_QF_SELF_ATTN_Q,
1528+
MODEL_TENSOR.A_QF_SELF_ATTN_K,
1529+
MODEL_TENSOR.A_QF_SELF_ATTN_V,
1530+
MODEL_TENSOR.A_QF_SELF_ATTN_O,
1531+
MODEL_TENSOR.A_QF_SELF_ATTN_NORM,
1532+
MODEL_TENSOR.A_QF_CROSS_ATTN_Q,
1533+
MODEL_TENSOR.A_QF_CROSS_ATTN_K,
1534+
MODEL_TENSOR.A_QF_CROSS_ATTN_V,
1535+
MODEL_TENSOR.A_QF_CROSS_ATTN_O,
1536+
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM,
1537+
MODEL_TENSOR.A_QF_FFN_UP,
1538+
MODEL_TENSOR.A_QF_FFN_DOWN,
1539+
MODEL_TENSOR.A_QF_FFN_NORM,
14801540
],
14811541
MODEL_ARCH.LLAMA: [
14821542
MODEL_TENSOR.TOKEN_EMBD,

gguf-py/gguf/tensor_mapping.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,21 @@ class TensorNameMap:
158158
MODEL_TENSOR.V_ENC_MSFA_NORM: (
159159
"model.vision_tower.timm_model.msfa.norm", # gemma3n
160160
),
161+
MODEL_TENSOR.A_CTC_OUT: (
162+
"encoder.out",
163+
),
164+
MODEL_TENSOR.A_CTC_OUT_MID: (
165+
"encoder.out_mid",
166+
),
167+
MODEL_TENSOR.A_QF_PROJ_QUERY: (
168+
"projector.query",
169+
),
170+
MODEL_TENSOR.A_QF_PROJ_NORM: (
171+
"projector.qformer.layernorm",
172+
),
173+
MODEL_TENSOR.A_QF_PROJ_LINEAR: (
174+
"projector.linear",
175+
),
161176
}
162177

163178
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -1890,6 +1905,7 @@ class TensorNameMap:
18901905

18911906
MODEL_TENSOR.A_ENC_INP_PROJ: (
18921907
"conformer.subsample_conv_projection.input_proj_linear", # gemma4
1908+
"encoder.input_linear",
18931909
),
18941910

18951911
MODEL_TENSOR.A_ENC_CONV2D: (
@@ -2131,6 +2147,49 @@ class TensorNameMap:
21312147
"model.embed_audio.soft_embedding_norm", # gemma3n
21322148
),
21332149

2150+
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: (
2151+
"encoder.layers.{bid}.attn.rel_pos_emb.weight",
2152+
),
2153+
MODEL_TENSOR.A_QF_SELF_ATTN_Q: (
2154+
"projector.qformer.encoder.layer.{bid}.attention.attention.query",
2155+
),
2156+
MODEL_TENSOR.A_QF_SELF_ATTN_K: (
2157+
"projector.qformer.encoder.layer.{bid}.attention.attention.key",
2158+
),
2159+
MODEL_TENSOR.A_QF_SELF_ATTN_V: (
2160+
"projector.qformer.encoder.layer.{bid}.attention.attention.value",
2161+
),
2162+
MODEL_TENSOR.A_QF_SELF_ATTN_O: (
2163+
"projector.qformer.encoder.layer.{bid}.attention.output.dense",
2164+
),
2165+
MODEL_TENSOR.A_QF_SELF_ATTN_NORM: (
2166+
"projector.qformer.encoder.layer.{bid}.attention.output.LayerNorm",
2167+
),
2168+
MODEL_TENSOR.A_QF_CROSS_ATTN_Q: (
2169+
"projector.qformer.encoder.layer.{bid}.crossattention.attention.query",
2170+
),
2171+
MODEL_TENSOR.A_QF_CROSS_ATTN_K: (
2172+
"projector.qformer.encoder.layer.{bid}.crossattention.attention.key",
2173+
),
2174+
MODEL_TENSOR.A_QF_CROSS_ATTN_V: (
2175+
"projector.qformer.encoder.layer.{bid}.crossattention.attention.value",
2176+
),
2177+
MODEL_TENSOR.A_QF_CROSS_ATTN_O: (
2178+
"projector.qformer.encoder.layer.{bid}.crossattention.output.dense",
2179+
),
2180+
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: (
2181+
"projector.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm",
2182+
),
2183+
MODEL_TENSOR.A_QF_FFN_UP: (
2184+
"projector.qformer.encoder.layer.{bid}.intermediate_query.dense",
2185+
),
2186+
MODEL_TENSOR.A_QF_FFN_DOWN: (
2187+
"projector.qformer.encoder.layer.{bid}.output_query.dense",
2188+
),
2189+
MODEL_TENSOR.A_QF_FFN_NORM: (
2190+
"projector.qformer.encoder.layer.{bid}.output_query.LayerNorm",
2191+
),
2192+
21342193
# NextN/MTP tensors
21352194
MODEL_TENSOR.NEXTN_EH_PROJ: (
21362195
"model.layers.{bid}.eh_proj",

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,8 +182,7 @@
182182
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
183183
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
184184
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
185-
// granite_speech encoder
186-
#define TN_INP_PROJ "a.enc_inp_linear.%s"
185+
#define TN_INP_PROJ "a.input_projection.%s"
187186
#define TN_CTC_OUT "a.enc_ctc_out.%s"
188187
#define TN_CTC_OUT_MID "a.enc_ctc_out_mid.%s"
189188
#define TN_ATTN_REL_POS_EMB "%s.blk.%d.attn_rel_pos_emb"

0 commit comments

Comments (0)