From 0b61fd0ea636c55afa8eafa3e3219be5baa87aee Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Sat, 23 May 2026 23:03:05 +0100
Subject: [PATCH 1/2] Changes for c0c7e147e7efa6c5858754b47259ba4880f8a906

---
 LLama.Web/Common/ModelOptions.cs             |  5 ++++
 LLama/Abstractions/IContextParams.cs         | 10 +++++++
 LLama/Common/ModelParams.cs                  |  6 +++++
 LLama/Extensions/IContextParamsExtensions.cs |  2 ++
 LLama/Native/LLamaContextParams.cs           | 10 +++++++
 LLama/Native/LLamaContextType.cs             | 18 +++++++++++++
 LLama/Native/LLamaParamsFitStatus.cs         | 19 -------------
 LLama/Native/NativeApi.Mtmd.cs               | 22 +++++++++++++--
 LLama/Native/NativeApi.cs                    | 28 --------------------
 LLama/Native/SafeLLamaContextHandle.cs       | 16 +++++++++++
 10 files changed, 87 insertions(+), 49 deletions(-)
 create mode 100644 LLama/Native/LLamaContextType.cs
 delete mode 100644 LLama/Native/LLamaParamsFitStatus.cs
diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
index fbe8906a5..a8d4fb4ee 100644
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -20,6 +20,8 @@ public class ModelOptions
         /// <inheritdoc />
         public uint? ContextSize { get; set; }
 
+        LLamaContextType IContextParams.ContextType => LLamaContextType.Default;
+
         /// <inheritdoc />
         public int MainGpu { get; set; } = 0;
 
@@ -35,6 +37,9 @@ public class ModelOptions
         /// <inheritdoc />
         public uint SeqMax { get; set; }
 
+        /// <inheritdoc />
+        public uint RecurrentRollbackSnapshots { get; set; } = 0;
+
         /// <inheritdoc />
         public bool Embeddings { get; set; }
 
diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs
index b7abed5ed..327bc4c26 100644
--- a/LLama/Abstractions/IContextParams.cs
+++ b/LLama/Abstractions/IContextParams.cs
@@ -13,6 +13,11 @@ public interface IContextParams
     /// </summary>
     uint? ContextSize { get; }
 
+    /// <summary>
+    /// The type of context
+    /// </summary>
+    LLamaContextType ContextType { get; }
+
     /// <summary>
     /// maximum batch size that can be submitted at once (must be >=32 to use BLAS) (n_batch)
     /// </summary>
@@ -28,6 +33,11 @@ public interface IContextParams
     /// </summary>
     uint SeqMax { get; }
 
+    /// <summary>
+    /// The number of recurrent-state snapshots per seq for rollback (0 = no rollback) (n_rs_seq) [EXPERIMENTAL]
+    /// </summary>
+    uint RecurrentRollbackSnapshots { get; }
+
     /// <summary>
     /// If true, extract embeddings (together with logits).
     /// </summary>
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index 6766da637..9045221d7 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -16,6 +16,9 @@ public record ModelParams
         /// <inheritdoc />
         public uint? ContextSize { get; set; }
 
+        /// <inheritdoc />
+        public LLamaContextType ContextType { get; set; } = LLamaContextType.Default;
+
         /// <inheritdoc />
         public int MainGpu { get; set; } = 0;
 
@@ -31,6 +34,9 @@ public record ModelParams
         /// <inheritdoc />
         public uint SeqMax { get; set; } = 1;
 
+        /// <inheritdoc />
+        public uint RecurrentRollbackSnapshots { get; set; } = 0;
+
         /// <inheritdoc />
         public bool UseMemorymap { get; set; } = true;
 
diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs
index 469d0517c..be76b3516 100644
--- a/LLama/Extensions/IContextParamsExtensions.cs
+++ b/LLama/Extensions/IContextParamsExtensions.cs
@@ -23,9 +23,11 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
             result = LLamaContextParams.Default();
 
             result.n_ctx = @params.ContextSize ?? 0;
+            result.context_type = @params.ContextType;
             result.n_batch = @params.BatchSize;
             result.n_ubatch = @params.UBatchSize;
             result.n_seq_max = @params.SeqMax;
+            result.n_rs_seq = @params.RecurrentRollbackSnapshots;
             result.embeddings = @params.Embeddings;
             result.rope_freq_base = @params.RopeFrequencyBase ?? 0;
             result.rope_freq_scale = @params.RopeFrequencyScale ?? 0;
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index 1e81e098c..198ee834f 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -40,6 +40,11 @@ public struct LLamaContextParams
         /// </summary>
         public uint n_seq_max;
 
+        /// <summary>
+        /// number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
+        /// </summary>
+        public uint n_rs_seq;
+
         /// <summary>
         /// number of threads to use for generation
         /// </summary>
@@ -50,6 +55,11 @@ public struct LLamaContextParams
         /// </summary>
         public int n_threads_batch;
 
+        /// <summary>
+        /// Set the type of context (e.g. MTP)
+        /// </summary>
+        public LLamaContextType context_type;
+        
         /// <summary>
         /// RoPE scaling type, from `enum llama_rope_scaling_type` 
         /// </summary>
diff --git a/LLama/Native/LLamaContextType.cs b/LLama/Native/LLamaContextType.cs
new file mode 100644
index 000000000..88cc2c353
--- /dev/null
+++ b/LLama/Native/LLamaContextType.cs
@@ -0,0 +1,18 @@
+namespace LLama.Native;
+
+/// <summary>
+/// The type of a llama context, assigned to <see cref="LLamaContextParams.context_type"/>
+/// </summary>
+/// <remarks>llama_context_type</remarks>
+public enum LLamaContextType
+{
+    /// <summary>
+    /// Default context type
+    /// </summary>
+    Default = 0,
+    
+    /// <summary>
+    /// Multi token prediction context
+    /// </summary>
+    Mtp = 1,
+}
\ No newline at end of file
diff --git a/LLama/Native/LLamaParamsFitStatus.cs b/LLama/Native/LLamaParamsFitStatus.cs
deleted file mode 100644
index a73184794..000000000
--- a/LLama/Native/LLamaParamsFitStatus.cs
+++ /dev/null
@@ -1,19 +0,0 @@
-namespace LLama.Native;
-
-public enum LLamaParamsFitStatus
-{
-    /// <summary>
-    /// Found allocations that are projected to fit
-    /// </summary>
-    LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0,
-
-    /// <summary>
-    /// Could not find allocations that are projected to fit
-    /// </summary>
-    LLAMA_PARAMS_FIT_STATUS_FAILURE = 1,
-
-    /// <summary>
-    /// A hard error occurred, e.g. because no model could be found at the specified path
-    /// </summary>
-    LLAMA_PARAMS_FIT_STATUS_ERROR = 2,
-}
\ No newline at end of file
diff --git a/LLama/Native/NativeApi.Mtmd.cs b/LLama/Native/NativeApi.Mtmd.cs
index 1bf2b5f9a..d35a3fb1a 100644
--- a/LLama/Native/NativeApi.Mtmd.cs
+++ b/LLama/Native/NativeApi.Mtmd.cs
@@ -204,16 +204,20 @@ internal struct mtmd_decoder_pos
 
         [FieldOffset(8)]
         uint y;
+
+        [FieldOffset(12)]
+        uint z;
     };
 
     /// <summary>
     /// get position for decoder attention, to be used by M-RoPE models
     /// </summary>
     /// <param name="image_tokens"></param>
+    /// <param name="pos_0">pos_0 is the absolute position of the first token</param>
     /// <param name="i">i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1</param>
     /// <returns>return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)</returns>
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
-    internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, nuint i);
+    internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, LLamaPos pos_0, nuint i);
 
     // tokenize ----------------------------------------------------------
 
@@ -312,7 +316,11 @@ internal static unsafe IntPtr mtmd_helper_bitmap_init_from_file(SafeMtmdModelHan
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_image_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
     // helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
     // out_pos must have length == mtmd_helper_get_n_tokens(image)
-    internal static extern void mtmd_helper_image_get_decoder_pos(IntPtr /* mtmd_image_tokens* */ image, IntPtr /* mtmd_decoder_pos* */ out_pos);
+    internal static extern void mtmd_helper_image_get_decoder_pos(
+        IntPtr /* mtmd_image_tokens* */ image,
+        LLamaPos pos_0,
+        IntPtr /* mtmd_decoder_pos* */ out_pos
+    );
 
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_eval_chunks", CallingConvention = CallingConvention.Cdecl)]
     internal static extern int mtmd_helper_eval_chunks(
@@ -346,4 +354,14 @@ internal static extern int mtmd_helper_decode_image_chunk(
         int seq_id,
         int n_batch,
         ref int new_n_past);
+    
+    /*
+     * // EXPERIMENTAL API to get mmproj's capabilities without initializing the full context
+       // This is only intended to be used by llama-server, breaking changes is expected
+       struct mtmd_caps {
+           bool inp_vision;
+           bool inp_audio;
+       };
+       MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+     */
 }
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index cbdd05a53..2e06f83ee 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -464,34 +464,6 @@ public static string llama_split_prefix(string splitPath, int splitNo, int split
         [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern IntPtr ggml_backend_buft_name(IntPtr buft);
 
-        /// <summary>
-        /// Fits mparams and cparams to free device memory (assumes system memory is unlimited)
-        ///   - returns true if the parameters could be successfully modified to fit device memory
-        ///   - this function is NOT thread safe because it modifies the global llama logger state
-        ///   - only parameters that have the same value as in llama_default_model_params are modified
-        ///     with the exception of the context size which is modified if and only if equal to 0
-        /// </summary>
-        /// <param name="path"></param>
-        /// <param name="mparams"></param>
-        /// <param name="cparams"></param>
-        /// <param name="tensor_split">Writable buffer for tensor split, needs at least llama_max_devices elements</param>
-        /// <param name="tensor_buft_overrides">Writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements</param>
-        /// <param name="margins">Margins of memory to leave per device in bytes</param>
-        /// <param name="n_ctx_min">Minimum context size to set when trying to reduce memory use</param>
-        /// <param name="log_level">Minimum log level to print during fitting, lower levels go to debug log</param>
-        /// <returns></returns>
-        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern unsafe LLamaParamsFitStatus llama_params_fit(
-            string path,
-            ref LLamaModelParams mparams,
-            ref LLamaContextParams cparams,
-            float* tensor_split,
-            LLamaModelTensorBufferOverride* tensor_buft_overrides,
-            nint* margins,
-            uint n_ctx_min,
-            int /* GGML_LOG_LEVEL */ log_level
-        );
-
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern long llama_time_us();
 
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 0041fbf5e..f805addbc 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -39,6 +39,11 @@ public sealed class SafeLLamaContextHandle
         /// </summary>
         public uint MaxSeq => llama_n_seq_max(this);
 
+        /// <summary>
+        /// Get the number of recurrent-state snapshots per seq for rollback
+        /// </summary>
+        public uint RecurrentRollbackSnapshots => llama_n_rs_seq(this);
+
         /// <summary>
         /// Get or set the number of threads used for generation of a single token.
         /// </summary>
@@ -449,6 +454,14 @@ static SafeLLamaContextHandle()
         /// <returns></returns>
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx);
+
+        /// <summary>
+        /// Get the n_rs_seq for this context (number of recurrent-state snapshots per seq for rollback)
+        /// </summary>
+        /// <param name="ctx">The context handle to query</param>
+        /// <returns>The n_rs_seq value reported by the native library for this context</returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern uint llama_n_rs_seq(SafeLLamaContextHandle ctx);
         #endregion
 
         #region Setters
@@ -614,6 +627,7 @@ public Span<float> GetEmbeddingsSeq(LLamaSeqId seq)
         /// <param name="special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.</param>
         /// <returns></returns>
         /// <exception cref="RuntimeError"></exception>
+        // ReSharper disable once InconsistentNaming
         public LLamaToken[] Tokenize(string text, bool add_bos, bool special, Encoding encoding)
         {
             return ThrowIfDisposed().Tokenize(text, add_bos, special, encoding);
@@ -711,9 +725,11 @@ public DecodeResult Decode(LLamaBatch batch)
             var batchSize = checked((int)BatchSize);
 
             // Evaluate the prompt, in chunks smaller than the max batch size
+            // ReSharper disable once InconsistentNaming
             var n_left = tokens.Count;
             for (var i = 0; i < tokens.Count; i += batchSize)
             {
+                // ReSharper disable once InconsistentNaming
                 var n_eval = tokens.Count - i;
                 if (n_eval > batchSize)
                     n_eval = batchSize;

From 40c6ee5dc4e63f477efeb8f006b9c5135049e077 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Sun, 24 May 2026 19:02:28 +0100
Subject: [PATCH 2/2] - Using updated binaries to test new build process  -
 Fixed NativeAbiTests

---
 LLama.Unittest/NativeAbiTests.cs |  12 +++-
 LLama/LLamaSharp.csproj          |   2 +-
 LLama/Native/LLamaModelParams.cs | 114 +++++++++++++++----------------
 llama.cpp                        |   2 +-
 4 files changed, 69 insertions(+), 61 deletions(-)

diff --git a/LLama.Unittest/NativeAbiTests.cs b/LLama.Unittest/NativeAbiTests.cs
index fbbe9d7cc..b5c1667c0 100644
--- a/LLama.Unittest/NativeAbiTests.cs
+++ b/LLama.Unittest/NativeAbiTests.cs
@@ -1,5 +1,6 @@
-using System.Runtime.InteropServices;
 using LLama.Native;
+using System.Net.Mime;
+using System.Runtime.InteropServices;
 
 namespace LLama.Unittest
 {
@@ -42,8 +43,10 @@ public void ContextParamsSizeMatchesNative()
                 (sizeof(uint), 4), // n_batch
                 (sizeof(uint), 4), // n_ubatch
                 (sizeof(uint), 4), // n_seq_max
+                (sizeof(uint), 4), // n_rs_seq
                 (sizeof(int), 4),  // n_threads
                 (sizeof(int), 4),  // n_threads_batch
+                (sizeof(LLamaContextType), 4), // ctx_type
                 (sizeof(int), 4),  // rope_scaling_type
                 (sizeof(int), 4),  // pooling_type
                 (sizeof(int), 4),  // attention_type
@@ -80,9 +83,14 @@ public void ContextParamsSizeMatchesNative()
         public void ModelParamsBoolBlockMatchesNative()
         {
             var pointerSize = IntPtr.Size;
-            var kvOffset = Marshal.OffsetOf<LLamaModelParams>("kv_overrides").ToInt32();
+            
+            // Get the field immediately before the first boolean field
+            var kvOffset = Marshal.OffsetOf<LLamaModelParams>(nameof(LLamaModelParams.kv_overrides)).ToInt32();
+            
+            // Get the first boolean field
             var vocabOffset = Marshal.OffsetOf<LLamaModelParams>("_vocab_only").ToInt32();
 
+            // Check first boolean field is one ptr-size after the other
             Assert.Equal(kvOffset + pointerSize, vocabOffset);
             Assert.Equal(vocabOffset + 1, Marshal.OffsetOf<LLamaModelParams>("_use_mmap").ToInt32());
             Assert.Equal(vocabOffset + 2, Marshal.OffsetOf<LLamaModelParams>("_use_direct_io").ToInt32());
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index 4671475d0..188ae5fee 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -59,7 +59,7 @@
   </ItemGroup>
 
   <PropertyGroup>
-    <BinaryReleaseId>3f7c29d318e317b6</BinaryReleaseId>
+    <BinaryReleaseId>c0c7e147e7efa6c58587_test</BinaryReleaseId>
   </PropertyGroup>
 
   <PropertyGroup>
diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs
index e66248c62..2bdcad7a8 100644
--- a/LLama/Native/LLamaModelParams.cs
+++ b/LLama/Native/LLamaModelParams.cs
@@ -71,35 +71,35 @@ public bool vocab_only
         }
         private sbyte _vocab_only;
 
-        /// <summary>
-        /// use mmap if possible
-        /// </summary>
-        public bool use_mmap
-        {
-            readonly get => Convert.ToBoolean(_use_mmap);
-            set => _use_mmap = Convert.ToSByte(value);
-        }
-        private sbyte _use_mmap;
-
-        /// <summary>
-        /// use direct io, takes precedence over use_mmap when supported
-        /// </summary>
-        public bool use_direct_io
-        {
-            readonly get => Convert.ToBoolean(_use_direct_io);
-            set => _use_direct_io = Convert.ToSByte(value);
-        }
-        private sbyte _use_direct_io;
-
-        /// <summary>
-        /// force system to keep model in RAM
-        /// </summary>
-        public bool use_mlock
-        {
-            readonly get => Convert.ToBoolean(_use_mlock);
-            set => _use_mlock = Convert.ToSByte(value);
-        }
-        private sbyte _use_mlock;
+        /// <summary>
+        /// use mmap if possible
+        /// </summary>
+        public bool use_mmap
+        {
+            readonly get => Convert.ToBoolean(_use_mmap);
+            set => _use_mmap = Convert.ToSByte(value);
+        }
+        private sbyte _use_mmap;
+
+        /// <summary>
+        /// use direct io, takes precedence over use_mmap when supported
+        /// </summary>
+        public bool use_direct_io
+        {
+            readonly get => Convert.ToBoolean(_use_direct_io);
+            set => _use_direct_io = Convert.ToSByte(value);
+        }
+        private sbyte _use_direct_io;
+
+        /// <summary>
+        /// force system to keep model in RAM
+        /// </summary>
+        public bool use_mlock
+        {
+            readonly get => Convert.ToBoolean(_use_mlock);
+            set => _use_mlock = Convert.ToSByte(value);
+        }
+        private sbyte _use_mlock;
 
         /// <summary>
         /// validate model tensor data
@@ -112,34 +112,34 @@ public bool check_tensors
         private sbyte _check_tensors;
         
         /// <summary>
-        /// use extra buffer types (used for weight repacking) 
-        /// </summary>
-        public bool use_extra_bufts
-        {
-            readonly get => Convert.ToBoolean(_use_extra_bufts);
-            set => _use_extra_bufts = Convert.ToSByte(value);
-        }
-        private sbyte _use_extra_bufts;
-
-        /// <summary>
-        /// bypass host buffer allowing extra buffers to be used
-        /// </summary>
-        public bool no_host
-        {
-            readonly get => Convert.ToBoolean(_no_host);
-            set => _no_host = Convert.ToSByte(value);
-        }
-        private sbyte _no_host;
-
-        /// <summary>
-        /// only load metadata and simulate memory allocations
-        /// </summary>
-        public bool no_alloc
-        {
-            readonly get => Convert.ToBoolean(_no_alloc);
-            set => _no_alloc = Convert.ToSByte(value);
-        }
-        private sbyte _no_alloc;
+        /// use extra buffer types (used for weight repacking) 
+        /// </summary>
+        public bool use_extra_bufts
+        {
+            readonly get => Convert.ToBoolean(_use_extra_bufts);
+            set => _use_extra_bufts = Convert.ToSByte(value);
+        }
+        private sbyte _use_extra_bufts;
+
+        /// <summary>
+        /// bypass host buffer allowing extra buffers to be used
+        /// </summary>
+        public bool no_host
+        {
+            readonly get => Convert.ToBoolean(_no_host);
+            set => _no_host = Convert.ToSByte(value);
+        }
+        private sbyte _no_host;
+
+        /// <summary>
+        /// only load metadata and simulate memory allocations
+        /// </summary>
+        public bool no_alloc
+        {
+            readonly get => Convert.ToBoolean(_no_alloc);
+            set => _no_alloc = Convert.ToSByte(value);
+        }
+        private sbyte _no_alloc;
         /// <summary>
         /// Create a LLamaModelParams with default values
         /// </summary>
diff --git a/llama.cpp b/llama.cpp
index 3f7c29d31..c0c7e147e 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 3f7c29d318e317b63f54c558bc69803963d7d88c
+Subproject commit c0c7e147e7efa6c5858754b47259ba4880f8a906