ggml-org
diff --git a/‎common/speculative.cpp‎
Lines changed: 13 additions & 0 deletions b/‎common/speculative.cpp‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎conversion/step3.py‎
Lines changed: 99 additions & 10 deletions b/‎conversion/step3.py‎
Lines changed: 99 additions & 10 deletions
diff --git a/‎convert_hf_to_gguf.py‎
Lines changed: 3 additions & 2 deletions b/‎convert_hf_to_gguf.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎gguf-py/gguf/constants.py‎
Lines changed: 7 additions & 0 deletions b/‎gguf-py/gguf/constants.py‎
Lines changed: 7 additions & 0 deletions
@@ -635,6 +635,11 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
             std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
         }
 
+        // First draft step uses the first MTP block (step 0). Archs with a
+        // single MTP block ignore this; multi-block archs (Step-3.5-Flash) use
+        // it to round-robin across their N MTP layers.
+        llama_set_mtp_step(ctx_dft, 0);
+
         int ret = llama_decode(ctx_dft, batch);
         if (ret != 0) {
             LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
@@ -699,6 +704,10 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                 break;
             }
 
+            // Step i+1: feed the i-th sampled draft token into the (i+1)-th
+            // MTP block. Multi-block archs round-robin via mtp_step % N.
+            llama_set_mtp_step(ctx_dft, (uint32_t)(i + 1));
+
             // evaluate the drafted tokens on the draft model
             ret = llama_decode(ctx_dft, batch);
             if (ret != 0) {
@@ -709,6 +718,10 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
             ++i;
         }
 
+        // Reset MTP step so a subsequent non-MTP decode on this context doesn't
+        // inherit a stale offset.
+        llama_set_mtp_step(ctx_dft, 0);
+
         for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
             auto & dp = dparams[seq_id];
             if (!dp.drafting) {
 
@@ -99,6 +99,24 @@ class Step3VLTextModel(Qwen3Model):
 class Step35Model(TextModel):
     model_arch = gguf.MODEL_ARCH.STEP35
 
+    # --mtp / --no-mtp toggles (see convert_hf_to_gguf.py main()).
+    # Unlike Qwen3.5 which stores MTP under a `mtp.*` namespace, Step3.5 just
+    # appends MTP layers at `model.layers.{num_hidden_layers + i}`; these flags
+    # filter by layer index instead of by name prefix.
+    no_mtp: bool = False
+    mtp_only: bool = False
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # NextN/MTP layers are appended past num_hidden_layers; extend the
+        # tensor map to cover them so the MTP block's tensors get correctly
+        # indexed names. When --no-mtp drops the MTP blocks, fall back to the
+        # base num_hidden_layers so we don't reserve unused slots.
+        n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
+        if n_nextn > 0 and not self.no_mtp:
+            self.block_count = int(self.hparams["num_hidden_layers"]) + n_nextn
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
     def set_gguf_parameters(self):
         rope_theta = self.hparams.get("rope_theta")
         if isinstance(rope_theta, list):
@@ -119,8 +137,25 @@ def set_gguf_parameters(self):
         n_head_swa = attn_other.get("num_attention_heads", n_head_base)
         n_kv_swa = attn_other.get("num_attention_groups", n_kv_base)
 
-        layer_types = layer_types[: self.block_count]
-        partial_rotary_factors = partial_rotary_factors[: self.block_count]
+        n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
+
+        # The Step3p5 HF checkpoint stores layer_types/partial_rotary_factors
+        # entries for the MTP blocks past num_hidden_layers; preserve them so
+        # the MTP layer's attention shape, SWA flag, and partial RoPE dim are
+        # set correctly. Pad with full-attention defaults if the checkpoint
+        # truncated them.
+        def _pad(arr, n, default):
+            arr = list(arr)
+            if len(arr) < n:
+                arr = arr + [default] * (n - len(arr))
+            return arr[:n]
+
+        layer_types = _pad(layer_types, self.block_count, "full_attention")
+        partial_rotary_factors = _pad(
+            partial_rotary_factors,
+            self.block_count,
+            0.5,  # full_attention default for Step3p5
+        )
         assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors
         head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types]
         kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
@@ -157,14 +192,25 @@ def set_gguf_parameters(self):
 
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5))
 
-        # Optional per-layer SwiGLU clamps.
+        # Optional per-layer SwiGLU clamps. MTP layers default to no clamping (0.0).
         if (limits := self.hparams.get("swiglu_limits")) is not None:
-            limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]]
+            limits_f = _pad(
+                [0.0 if v is None else float(v) for v in limits],
+                self.block_count,
+                0.0,
+            )
             self.gguf_writer.add_swiglu_clamp_exp(limits_f)
         if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None:
-            limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]]
+            limits_shared_f = _pad(
+                [0.0 if v is None else float(v) for v in limits_shared],
+                self.block_count,
+                0.0,
+            )
             self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f)
 
+        if n_nextn > 0 and not self.no_mtp:
+            self.gguf_writer.add_nextn_predict_layers(n_nextn)
+
     @classmethod
     def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
         name, gen = item
@@ -175,13 +221,41 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
 
         return super().filter_tensors((name, gen))
 
+    def _is_mtp_layer(self, bid: int | None) -> bool:
+        if bid is None:
+            return False
+        n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
+        return bid >= n_main
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        # remove mtp layers
-        if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
-            il = int(m.group(1))
-            n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
-            if il >= n_main:
+        is_mtp = self._is_mtp_layer(bid)
+
+        # --no-mtp: drop the appended MTP block(s) entirely.
+        if is_mtp and self.no_mtp:
+            return
+        # --mtp: keep ONLY MTP-block tensors plus the shared embeddings/norm/lm_head
+        # (so the resulting GGUF carries just the draft head).
+        if self.mtp_only and not is_mtp and bid is not None:
+            return
+        if self.mtp_only and bid is None:
+            # Top-level tensors: keep only shared embeddings/norm/lm_head.
+            keep = name in (
+                "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
+            )
+            if not keep:
                 return
+
+        # The checkpoint nests the per-MTP-layer shared head under
+        # `model.layers.{N+i}.transformer.shared_head.{norm,output}.weight`;
+        # strip the `transformer.` infix and rename `output` → `head` so the
+        # existing NEXTN_SHARED_HEAD_{NORM,HEAD} tensor mapping picks them up.
+        # Mirrors vllm's `_rewrite_spec_layer_name` (step3p5_mtp.py).
+        if is_mtp:
+            if ".transformer." in name:
+                name = name.replace(".transformer.", ".")
+            if "shared_head.output" in name:
+                name = name.replace("shared_head.output", "shared_head.head")
+
         if name.endswith("norm.weight"):
             data_torch += 1.0
 
@@ -190,6 +264,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 
         yield from super().modify_tensors(data_torch, name, bid)
 
+    def prepare_metadata(self, vocab_only: bool):
+        from_dir = self.fname_out.is_dir()
+        super().prepare_metadata(vocab_only=vocab_only)
+
+        # Mirror Qwen3.5's behavior: when emitting a draft-only file into a
+        # directory, prefix with "mtp-" so it doesn't collide with the trunk.
+        if not self.mtp_only or not from_dir:
+            return
+
+        output_type: str = self.ftype.name.partition("_")[2]
+        fname_default: str = gguf.naming_convention(
+            self.metadata.name, self.metadata.basename, self.metadata.finetune,
+            self.metadata.version, size_label=None, output_type=output_type, model_type=None)
+        self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"
+
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3").
         # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS).
 
@@ -247,8 +247,9 @@ def main() -> None:
 
         if args.mtp or args.no_mtp:
             from conversion.qwen import _Qwen35MtpMixin
-            if not issubclass(model_class, _Qwen35MtpMixin):
-                logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 text variants today")
+            from conversion.step3 import Step35Model
+            if not (issubclass(model_class, _Qwen35MtpMixin) or issubclass(model_class, Step35Model)):
+                logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 and Step3.5 text variants today")
                 sys.exit(1)
             if args.no_mtp:
                 model_class.no_mtp = True
 
@@ -3935,6 +3935,13 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE_SHEXP,
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_EXP_PROBS_B,
+        # NextN/MTP tensors (Step3p5 draft head)
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
     ],
     MODEL_ARCH.LLAMA_EMBED: [
         MODEL_TENSOR.TOKEN_EMBD,