fix: export only the first MTP (nextn) layer when converting Step3.5, and handle nextn tensors during quantization

lvyichen · lvyichen · commit 286b8a7079be · 2026-03-18T19:16:07.000+08:00
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -8562,7 +8562,10 @@ class Step35Model(TextModel):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        nextn = self.hparams.get("num_nextn_predict_layers", 0)
+        source_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
+        # Step3.5 runtime currently uses only the first MTP layer. Keep the
+        # GGUF export aligned with that runtime until multi-layer MTP lands.
+        nextn = 1 if source_nextn > 0 else 0
         if nextn > 0:
             self.block_count = self.hparams["num_hidden_layers"] + nextn
             self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -8594,7 +8597,8 @@ def set_gguf_parameters(self):
         kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
         swa_pat = [lt == "sliding_attention" for lt in layer_types]
 
-        nextn = self.hparams.get("num_nextn_predict_layers", 0)
+        source_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
+        nextn = 1 if source_nextn > 0 else 0
         if nextn > 0:
             self.gguf_writer.add_nextn_predict_layers(nextn)
 
@@ -8639,9 +8643,14 @@ def set_gguf_parameters(self):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
+        source_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
+        export_nextn = 1 if source_nextn > 0 else 0
+        last_export_layer = n_main + export_nextn
 
         if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
             il = int(m.group(1))
+            if il >= last_export_layer:
+                return
             if il >= n_main:
                 name = name.replace(f"model.layers.{il}.transformer.", f"model.layers.{il}.")
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -2550,6 +2550,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                 ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
 
+                if (params.mtp && hparams.nextn_predict_layers > 1) {
+                    LLAMA_LOG_WARN("%s: Step35 MTP uses only the first nextn layer\n", __func__);
+                }
+
                 if (params.mtp && hparams.nextn_predict_layers > 0) {
                     hparams.n_layer_kv_from_start = hparams.n_layer;
                 } else {
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
@@ -100,11 +100,17 @@ static std::string remap_imatrix(const std::string & orig_name, const std::map<i
 
 static bool tensor_name_match_token_embd(const char * tensor_name) {
     return std::strcmp(tensor_name, "token_embd.weight") == 0 ||
-           std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0;
+           std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0 ||
+           std::strstr(tensor_name, ".nextn.embed_tokens.weight") != nullptr;
 }
 
 static bool tensor_name_match_output_weight(const char * tensor_name) {
-    return std::strcmp(tensor_name, "output.weight") == 0;
+    return std::strcmp(tensor_name, "output.weight") == 0 ||
+           std::strstr(tensor_name, ".nextn.shared_head_head.weight") != nullptr;
+}
+
+static bool tensor_name_should_ignore_imatrix(const char * tensor_name) {
+    return std::strstr(tensor_name, ".nextn.") != nullptr;
 }
 
 //
@@ -301,7 +307,7 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
     // do not quantize norm tensors
     quantize &= name.find("_norm.weight") == std::string::npos;
 
-    quantize &= params->quantize_output_tensor || name != "output.weight";
+    quantize &= params->quantize_output_tensor || !tensor_name_match_output_weight(name.c_str());
 
     // do not quantize expert gating tensors
     // NOTE: can't use LLM_TN here because the layer number is not known
@@ -1183,6 +1189,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                         }
                     }
                 }
+                if (tensor_name_should_ignore_imatrix(tensor->name)) {
+                    imatrix = nullptr;
+                }
                 if (!imatrix && tm.requires_imatrix) {
                     LLAMA_LOG_ERROR("\n\n============================================================\n");
                     LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);