forked from ggml-org/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkimi_linear.py
More file actions
223 lines (190 loc) · 11.3 KB
/
Copy pathkimi_linear.py
File metadata and controls
223 lines (190 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
from __future__ import annotations
from typing import Iterable, TYPE_CHECKING
import torch
if TYPE_CHECKING:
from torch import Tensor
from .base import ModelBase, TextModel, gguf, logger
from .qwen import QwenModel
@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM")
class KimiLinearModel(TextModel):
"""Kimi-Linear model with hybrid MLA+KDA architecture"""
model_arch = gguf.MODEL_ARCH.KIMI_LINEAR
_experts: list[dict[str, Tensor]] | None = None
def set_vocab(self):
try:
self._set_vocab_gpt2()
return
except Exception:
pass
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
tokpre = self.get_vocab_base_pre(tokenizer)
if tokpre == "kimi-k2":
# Build merges list using the approach similar to HunYuanMoE
merges = []
vocab = {}
mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute]
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
if len(merged) == 2:
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# Build token list
vocab_size = self.hparams["vocab_size"]
special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute]
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
tokens: list[str] = []
toktypes: list[int] = []
for i in range(vocab_size):
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.UNUSED)
else:
token = reverse_vocab[i]
tokens.append(token)
if i in special_tokens.values():
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_token_merges(merges)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
special_vocab.add_to_gguf(self.gguf_writer)
# override eos id in config.json with tiktoken eos id
self.gguf_writer.add_eos_token_id(tokenizer.eos_id) # ty: ignore[unresolved-attribute]
else:
raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
def set_gguf_parameters(self):
# note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group)
self.hparams["num_key_value_heads"] = 1
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
# KDA & MLA params
# Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
linear_attn_config = self.hparams["linear_attn_config"]
# n_head == 0 for KDA layers, n_head > 0 for MLA layers
# full_attention_layers list will be used to distinguish layer type
_num_kv_heads = list()
_full_attn_layers = linear_attn_config["full_attn_layers"]
for il in range(self.hparams["num_hidden_layers"]):
if il + 1 in _full_attn_layers:
_num_kv_heads.append(self.hparams["num_key_value_heads"])
else:
_num_kv_heads.append(0)
assert len(_num_kv_heads) == self.hparams["num_hidden_layers"]
self.gguf_writer.add_head_count_kv(_num_kv_heads)
if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None:
self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv)
if (kda_head_dim := linear_attn_config.get("head_dim")) is not None:
self.gguf_writer.add_kda_head_dim(kda_head_dim)
# MLA params - use add_* methods that handle arch substitution
# Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv)
if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=True)) is not None:
self.gguf_writer.add_q_lora_rank(q_lora_rank)
# To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA
kv_lora_rank = self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False)
self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
# MLA head dimensions
# Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim
qk_nope_head_dim = self.hparams.get("qk_nope_head_dim")
# Rotation - use qk_rope_head_dim for Kimi
qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=False)
self.gguf_writer.add_rope_dimension_count(qk_rope_head_dim)
self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim)
v_head_dim = self.hparams.get("v_head_dim")
# Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None:
self.gguf_writer.add_key_length_mla(n_embd_head_k_mla)
elif qk_nope_head_dim is not None:
n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim
self.gguf_writer.add_key_length_mla(n_embd_head_k_mla)
# n_embd_head_v_mla = v_head_dim
if (n_embd_head_v_mla := self.hparams.get("n_embd_head_v_mla")) is not None:
self.gguf_writer.add_value_length_mla(n_embd_head_v_mla)
elif v_head_dim is not None:
self.gguf_writer.add_value_length_mla(v_head_dim)
# moe_intermediate_size (1024 for Kimi)
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
# num_shared_experts (1 for Kimi)
self.gguf_writer.add_expert_shared_count(self.hparams["num_shared_experts"])
# first_k_dense_replace (1 for Kimi - first layer uses dense MLP)
self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
# Routed scaling factor (expert_weights_scale = 2.446 for Kimi)
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}")
# Handle KDA conv1d weights
# HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest
# llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest
# GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1]
# Memory layouts match: both have conv_step (d_conv) changing fastest
if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")):
# HF shape: [d_inner, d_conv] e.g. [4096, 4]
# Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1]
if data_torch.ndim == 2:
d_inner, d_conv = data_torch.shape
# Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest)
data_torch = data_torch.reshape(1, d_inner, 1, d_conv)
logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]")
elif data_torch.ndim == 3:
# Already 3D [d_inner, 1, d_conv] from unsqueeze
d_inner, _, d_conv = data_torch.shape
data_torch = data_torch.reshape(1, d_inner, 1, d_conv)
logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]")
# Handle A_log: iHF stores as [1, 1, num_heads, 1]
# llama.cpp expects ggml ne = [1, num_heads, 1, 1]
# GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1]
if name.endswith(".A_log"):
data_torch = -torch.exp(data_torch)
if name.endswith(".dt_bias"):
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
logger.info("Changed dt_bias to dt_proj.bias")
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
# merge the experts into a single 3d tensor
# w1: gate, w2: down, w3: up
for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP),
("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP),
("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
new_name = self.format_tensor_name(tname, bid)
yield from super().modify_tensors(data_torch, new_name, bid)
return
# note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
if name.endswith("kv_b_proj.weight"):
name_kb = name.replace("kv_b_proj", "k_b_proj")
name_vb = name.replace("kv_b_proj", "v_b_proj")
n_head_kv = self.hparams["num_key_value_heads"]
v_head_dim = self.find_hparam(["n_embd_head_v_mla", "v_head_dim"], optional=False)
qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
logger.info("Split kv_b n_head_kv %d\n" % n_head_kv)
assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
k_b = k_b.transpose(1, 2)
yield from super().modify_tensors(k_b, name_kb, bid)
yield from super().modify_tensors(v_b, name_vb, bid)
return
yield from super().modify_tensors(data_torch, name, bid)