Skip to content

Commit add749e

Browse files
author
lvyichen
committed
fix: convert one layer & mtp quant
1 parent a0c02ae commit add749e

3 files changed

Lines changed: 27 additions & 5 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9147,7 +9147,10 @@ class Step35Model(TextModel):
91479147

91489148
def __init__(self, *args, **kwargs):
91499149
super().__init__(*args, **kwargs)
9150-
nextn = self.hparams.get("num_nextn_predict_layers", 0)
9150+
source_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
9151+
# Step3.5 runtime currently uses only the first MTP layer. Keep the
9152+
# GGUF export aligned with that runtime until multi-layer MTP lands.
9153+
nextn = 1 if source_nextn > 0 else 0
91519154
if nextn > 0:
91529155
self.block_count = self.hparams["num_hidden_layers"] + nextn
91539156
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -9179,7 +9182,8 @@ def set_gguf_parameters(self):
91799182
kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
91809183
swa_pat = [lt == "sliding_attention" for lt in layer_types]
91819184

9182-
nextn = self.hparams.get("num_nextn_predict_layers", 0)
9185+
source_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
9186+
nextn = 1 if source_nextn > 0 else 0
91839187
if nextn > 0:
91849188
self.gguf_writer.add_nextn_predict_layers(nextn)
91859189

@@ -9224,9 +9228,14 @@ def set_gguf_parameters(self):
92249228

92259229
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
92269230
n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
9231+
source_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
9232+
export_nextn = 1 if source_nextn > 0 else 0
9233+
last_export_layer = n_main + export_nextn
92279234

92289235
if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
92299236
il = int(m.group(1))
9237+
if il >= last_export_layer:
9238+
return
92309239
if il >= n_main:
92319240
name = name.replace(f"model.layers.{il}.transformer.", f"model.layers.{il}.")
92329241

src/llama-model.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2586,6 +2586,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
25862586

25872587
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
25882588

2589+
if (params.mtp && hparams.nextn_predict_layers > 1) {
2590+
LLAMA_LOG_WARN("%s: Step35 MTP uses only the first nextn layer\n", __func__);
2591+
}
2592+
25892593
if (params.mtp && hparams.nextn_predict_layers > 0) {
25902594
hparams.n_layer_kv_from_start = hparams.n_layer;
25912595
} else {

src/llama-quant.cpp

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -99,11 +99,17 @@ static std::string remap_imatrix(const std::string & orig_name, const std::map<i
9999

100100
static bool tensor_name_match_token_embd(const char * tensor_name) {
101101
return std::strcmp(tensor_name, "token_embd.weight") == 0 ||
102-
std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0;
102+
std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0 ||
103+
std::strstr(tensor_name, ".nextn.embed_tokens.weight") != nullptr;
103104
}
104105

105106
static bool tensor_name_match_output_weight(const char * tensor_name) {
106-
return std::strcmp(tensor_name, "output.weight") == 0;
107+
return std::strcmp(tensor_name, "output.weight") == 0 ||
108+
std::strstr(tensor_name, ".nextn.shared_head_head.weight") != nullptr;
109+
}
110+
111+
static bool tensor_name_should_ignore_imatrix(const char * tensor_name) {
112+
return std::strstr(tensor_name, ".nextn.") != nullptr;
107113
}
108114

109115
//
@@ -300,7 +306,7 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
300306
// do not quantize norm tensors
301307
quantize &= name.find("_norm.weight") == std::string::npos;
302308

303-
quantize &= params->quantize_output_tensor || name != "output.weight";
309+
quantize &= params->quantize_output_tensor || !tensor_name_match_output_weight(name.c_str());
304310

305311
// do not quantize expert gating tensors
306312
// NOTE: can't use LLM_TN here because the layer number is not known
@@ -1195,6 +1201,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
11951201
}
11961202
}
11971203
}
1204+
if (tensor_name_should_ignore_imatrix(tensor->name)) {
1205+
imatrix = nullptr;
1206+
}
11981207
if (!imatrix && tm.requires_imatrix) {
11991208
LLAMA_LOG_ERROR("\n\n============================================================\n");
12001209
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);

0 commit comments

Comments
 (0)