Skip to content

Commit 286b8a7

Browse files
author
lvyichen
committed
fix: convert one layer & mtp quant
1 parent f6a0283 commit 286b8a7

3 files changed

Lines changed: 27 additions & 5 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8562,7 +8562,10 @@ class Step35Model(TextModel):
85628562

85638563
def __init__(self, *args, **kwargs):
85648564
super().__init__(*args, **kwargs)
8565-
nextn = self.hparams.get("num_nextn_predict_layers", 0)
8565+
source_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
8566+
# Step3.5 runtime currently uses only the first MTP layer. Keep the
8567+
# GGUF export aligned with that runtime until multi-layer MTP lands.
8568+
nextn = 1 if source_nextn > 0 else 0
85668569
if nextn > 0:
85678570
self.block_count = self.hparams["num_hidden_layers"] + nextn
85688571
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -8594,7 +8597,8 @@ def set_gguf_parameters(self):
85948597
kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
85958598
swa_pat = [lt == "sliding_attention" for lt in layer_types]
85968599

8597-
nextn = self.hparams.get("num_nextn_predict_layers", 0)
8600+
source_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
8601+
nextn = 1 if source_nextn > 0 else 0
85988602
if nextn > 0:
85998603
self.gguf_writer.add_nextn_predict_layers(nextn)
86008604

@@ -8639,9 +8643,14 @@ def set_gguf_parameters(self):
86398643

86408644
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
86418645
n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
8646+
source_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
8647+
export_nextn = 1 if source_nextn > 0 else 0
8648+
last_export_layer = n_main + export_nextn
86428649

86438650
if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
86448651
il = int(m.group(1))
8652+
if il >= last_export_layer:
8653+
return
86458654
if il >= n_main:
86468655
name = name.replace(f"model.layers.{il}.transformer.", f"model.layers.{il}.")
86478656

src/llama-model.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2550,6 +2550,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
25502550

25512551
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
25522552

2553+
if (params.mtp && hparams.nextn_predict_layers > 1) {
2554+
LLAMA_LOG_WARN("%s: Step35 MTP uses only the first nextn layer\n", __func__);
2555+
}
2556+
25532557
if (params.mtp && hparams.nextn_predict_layers > 0) {
25542558
hparams.n_layer_kv_from_start = hparams.n_layer;
25552559
} else {

src/llama-quant.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,17 @@ static std::string remap_imatrix(const std::string & orig_name, const std::map<i
100100

101101
static bool tensor_name_match_token_embd(const char * tensor_name) {
102102
return std::strcmp(tensor_name, "token_embd.weight") == 0 ||
103-
std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0;
103+
std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0 ||
104+
std::strstr(tensor_name, ".nextn.embed_tokens.weight") != nullptr;
104105
}
105106

106107
static bool tensor_name_match_output_weight(const char * tensor_name) {
107-
return std::strcmp(tensor_name, "output.weight") == 0;
108+
return std::strcmp(tensor_name, "output.weight") == 0 ||
109+
std::strstr(tensor_name, ".nextn.shared_head_head.weight") != nullptr;
110+
}
111+
112+
static bool tensor_name_should_ignore_imatrix(const char * tensor_name) {
113+
return std::strstr(tensor_name, ".nextn.") != nullptr;
108114
}
109115

110116
//
@@ -301,7 +307,7 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
301307
// do not quantize norm tensors
302308
quantize &= name.find("_norm.weight") == std::string::npos;
303309

304-
quantize &= params->quantize_output_tensor || name != "output.weight";
310+
quantize &= params->quantize_output_tensor || !tensor_name_match_output_weight(name.c_str());
305311

306312
// do not quantize expert gating tensors
307313
// NOTE: can't use LLM_TN here because the layer number is not known
@@ -1183,6 +1189,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
11831189
}
11841190
}
11851191
}
1192+
if (tensor_name_should_ignore_imatrix(tensor->name)) {
1193+
imatrix = nullptr;
1194+
}
11861195
if (!imatrix && tm.requires_imatrix) {
11871196
LLAMA_LOG_ERROR("\n\n============================================================\n");
11881197
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);

0 commit comments

Comments
 (0)