From 0b61fd0ea636c55afa8eafa3e3219be5baa87aee Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Sat, 23 May 2026 23:03:05 +0100 Subject: [PATCH 1/2] Changes for c0c7e147e7efa6c5858754b47259ba4880f8a906 --- LLama.Web/Common/ModelOptions.cs | 5 ++++ LLama/Abstractions/IContextParams.cs | 10 +++++++ LLama/Common/ModelParams.cs | 6 +++++ LLama/Extensions/IContextParamsExtensions.cs | 2 ++ LLama/Native/LLamaContextParams.cs | 10 +++++++ LLama/Native/LLamaContextType.cs | 18 +++++++++++++ LLama/Native/LLamaParamsFitStatus.cs | 19 ------------- LLama/Native/NativeApi.Mtmd.cs | 22 +++++++++++++-- LLama/Native/NativeApi.cs | 28 -------------------- LLama/Native/SafeLLamaContextHandle.cs | 16 +++++++++++ 10 files changed, 87 insertions(+), 49 deletions(-) create mode 100644 LLama/Native/LLamaContextType.cs delete mode 100644 LLama/Native/LLamaParamsFitStatus.cs diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index fbe8906a5..a8d4fb4ee 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -20,6 +20,8 @@ public class ModelOptions /// public uint? ContextSize { get; set; } + LLamaContextType IContextParams.ContextType => LLamaContextType.Default; + /// public int MainGpu { get; set; } = 0; @@ -35,6 +37,9 @@ public class ModelOptions /// public uint SeqMax { get; set; } + /// + public uint RecurrentRollbackSnapshots { get; set; } = 0; + /// public bool Embeddings { get; set; } diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs index b7abed5ed..327bc4c26 100644 --- a/LLama/Abstractions/IContextParams.cs +++ b/LLama/Abstractions/IContextParams.cs @@ -13,6 +13,11 @@ public interface IContextParams /// uint? 
ContextSize { get; } + /// + /// The type of context + /// + LLamaContextType ContextType { get; } + /// /// maximum batch size that can be submitted at once (must be >=32 to use BLAS) (n_batch) /// @@ -28,6 +33,11 @@ public interface IContextParams /// uint SeqMax { get; } + /// + /// The number of recurrent-state snapshots per seq for rollback + /// + uint RecurrentRollbackSnapshots { get; } + /// /// If true, extract embeddings (together with logits). /// diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index 6766da637..9045221d7 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -16,6 +16,9 @@ public record ModelParams /// public uint? ContextSize { get; set; } + /// + public LLamaContextType ContextType { get; set; } = LLamaContextType.Default; + /// public int MainGpu { get; set; } = 0; @@ -31,6 +34,9 @@ public record ModelParams /// public uint SeqMax { get; set; } = 1; + /// + public uint RecurrentRollbackSnapshots { get; set; } = 0; + /// public bool UseMemorymap { get; set; } = true; diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs index 469d0517c..be76b3516 100644 --- a/LLama/Extensions/IContextParamsExtensions.cs +++ b/LLama/Extensions/IContextParamsExtensions.cs @@ -23,9 +23,11 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo result = LLamaContextParams.Default(); result.n_ctx = @params.ContextSize ?? 0; + result.context_type = @params.ContextType; result.n_batch = @params.BatchSize; result.n_ubatch = @params.UBatchSize; result.n_seq_max = @params.SeqMax; + result.n_rs_seq = @params.RecurrentRollbackSnapshots; result.embeddings = @params.Embeddings; result.rope_freq_base = @params.RopeFrequencyBase ?? 0; result.rope_freq_scale = @params.RopeFrequencyScale ?? 
0; diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index 1e81e098c..198ee834f 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -40,6 +40,11 @@ public struct LLamaContextParams /// public uint n_seq_max; + /// + /// number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] + /// + public uint n_rs_seq; + /// /// number of threads to use for generation /// @@ -50,6 +55,11 @@ public struct LLamaContextParams /// public int n_threads_batch; + /// + /// Set the type of context (e.g. MTP) + /// + public LLamaContextType context_type; + /// /// RoPE scaling type, from `enum llama_rope_scaling_type` /// diff --git a/LLama/Native/LLamaContextType.cs b/LLama/Native/LLamaContextType.cs new file mode 100644 index 000000000..88cc2c353 --- /dev/null +++ b/LLama/Native/LLamaContextType.cs @@ -0,0 +1,18 @@ +namespace LLama.Native; + +/// +/// +/// +/// llama_context_type +public enum LLamaContextType +{ + /// + /// Default context type + /// + Default = 0, + + /// + /// Multi token prediction context + /// + Mtp = 1, +} \ No newline at end of file diff --git a/LLama/Native/LLamaParamsFitStatus.cs b/LLama/Native/LLamaParamsFitStatus.cs deleted file mode 100644 index a73184794..000000000 --- a/LLama/Native/LLamaParamsFitStatus.cs +++ /dev/null @@ -1,19 +0,0 @@ -namespace LLama.Native; - -public enum LLamaParamsFitStatus -{ - /// - /// Found allocations that are projected to fit - /// - LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, - - /// - /// Could not find allocations that are projected to fit - /// - LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, - - /// - /// A hard error occurred, e.g. 
because no model could be found at the specified path - /// - LLAMA_PARAMS_FIT_STATUS_ERROR = 2, -} \ No newline at end of file diff --git a/LLama/Native/NativeApi.Mtmd.cs b/LLama/Native/NativeApi.Mtmd.cs index 1bf2b5f9a..d35a3fb1a 100644 --- a/LLama/Native/NativeApi.Mtmd.cs +++ b/LLama/Native/NativeApi.Mtmd.cs @@ -204,16 +204,20 @@ internal struct mtmd_decoder_pos [FieldOffset(8)] uint y; + + [FieldOffset(12)] + uint z; }; /// /// get position for decoder attention, to be used by M-RoPE models /// /// + /// pos_0 is the absolute position of the first token /// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1 /// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position) [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)] - internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, nuint i); + internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, LLamaPos pos_0, nuint i); // tokenize ---------------------------------------------------------- @@ -312,7 +316,11 @@ internal static unsafe IntPtr mtmd_helper_bitmap_init_from_file(SafeMtmdModelHan [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_image_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)] // helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE // out_pos must have length == mtmd_helper_get_n_tokens(image) - internal static extern void mtmd_helper_image_get_decoder_pos(IntPtr /* mtmd_image_tokens* */ image, IntPtr /* mtmd_decoder_pos* */ out_pos); + internal static extern void mtmd_helper_image_get_decoder_pos( + IntPtr /* mtmd_image_tokens* */ image, + LLamaPos pos_0, + IntPtr /* mtmd_decoder_pos* */ out_pos + ); [DllImport(mtmdLibraryName, EntryPoint = 
"mtmd_helper_eval_chunks", CallingConvention = CallingConvention.Cdecl)] internal static extern int mtmd_helper_eval_chunks( @@ -346,4 +354,14 @@ internal static extern int mtmd_helper_decode_image_chunk( int seq_id, int n_batch, ref int new_n_past); + + /* + * // EXPERIMENTAL API to get mmproj's capabilities without initializing the full context + // This is only intended to be used by llama-server, breaking changes is expected + struct mtmd_caps { + bool inp_vision; + bool inp_audio; + }; + MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); + */ } diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index cbdd05a53..2e06f83ee 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -464,34 +464,6 @@ public static string llama_split_prefix(string splitPath, int splitNo, int split [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)] public static extern IntPtr ggml_backend_buft_name(IntPtr buft); - /// - /// Fits mparams and cparams to free device memory (assumes system memory is unlimited) - /// - returns true if the parameters could be successfully modified to fit device memory - /// - this function is NOT thread safe because it modifies the global llama logger state - /// - only parameters that have the same value as in llama_default_model_params are modified - /// with the exception of the context size which is modified if and only if equal to 0 - /// - /// - /// - /// - /// Writable buffer for tensor split, needs at least llama_max_devices elements - /// Writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements - /// Margins of memory to leave per device in bytes - /// Minimum context size to set when trying to reduce memory use - /// Minimum log level to print during fitting, lower levels go to debug log - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern unsafe LLamaParamsFitStatus llama_params_fit( - 
string path, - ref LLamaModelParams mparams, - ref LLamaContextParams cparams, - float* tensor_split, - LLamaModelTensorBufferOverride* tensor_buft_overrides, - nint* margins, - uint n_ctx_min, - int /* GGML_LOG_LEVEL */ log_level - ); - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern long llama_time_us(); diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 0041fbf5e..f805addbc 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -39,6 +39,11 @@ public sealed class SafeLLamaContextHandle /// public uint MaxSeq => llama_n_seq_max(this); + /// + /// Get the number of recurrent-state snapshots per seq for rollback + /// + public uint RecurrentRollbackSnapshots => llama_n_rs_seq(this); + /// /// Get or set the number of threads used for generation of a single token. /// @@ -449,6 +454,14 @@ static SafeLLamaContextHandle() /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx); + + /// + /// Get the n_rs_seq for this context + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern uint llama_n_rs_seq(SafeLLamaContextHandle ctx); #endregion #region Setters @@ -614,6 +627,7 @@ public Span GetEmbeddingsSeq(LLamaSeqId seq) /// Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. 
/// /// + // ReSharper disable once InconsistentNaming public LLamaToken[] Tokenize(string text, bool add_bos, bool special, Encoding encoding) { return ThrowIfDisposed().Tokenize(text, add_bos, special, encoding); @@ -711,9 +725,11 @@ public DecodeResult Decode(LLamaBatch batch) var batchSize = checked((int)BatchSize); // Evaluate the prompt, in chunks smaller than the max batch size + // ReSharper disable once InconsistentNaming var n_left = tokens.Count; for (var i = 0; i < tokens.Count; i += batchSize) { + // ReSharper disable once InconsistentNaming var n_eval = tokens.Count - i; if (n_eval > batchSize) n_eval = batchSize; From 40c6ee5dc4e63f477efeb8f006b9c5135049e077 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Sun, 24 May 2026 19:02:28 +0100 Subject: [PATCH 2/2] - Using updated binaries to test new build process - Fixed NativeAbiTests --- LLama.Unittest/NativeAbiTests.cs | 12 +++- LLama/LLamaSharp.csproj | 2 +- LLama/Native/LLamaModelParams.cs | 114 +++++++++++++++---------------- llama.cpp | 2 +- 4 files changed, 69 insertions(+), 61 deletions(-) diff --git a/LLama.Unittest/NativeAbiTests.cs b/LLama.Unittest/NativeAbiTests.cs index fbbe9d7cc..b5c1667c0 100644 --- a/LLama.Unittest/NativeAbiTests.cs +++ b/LLama.Unittest/NativeAbiTests.cs @@ -1,5 +1,6 @@ -using System.Runtime.InteropServices; using LLama.Native; +using System.Net.Mime; +using System.Runtime.InteropServices; namespace LLama.Unittest { @@ -42,8 +43,10 @@ public void ContextParamsSizeMatchesNative() (sizeof(uint), 4), // n_batch (sizeof(uint), 4), // n_ubatch (sizeof(uint), 4), // n_seq_max + (sizeof(uint), 4), // n_rs_seq (sizeof(int), 4), // n_threads (sizeof(int), 4), // n_threads_batch + (sizeof(LLamaContextType), 4), // ctx_type (sizeof(int), 4), // rope_scaling_type (sizeof(int), 4), // pooling_type (sizeof(int), 4), // attention_type @@ -80,9 +83,14 @@ public void ContextParamsSizeMatchesNative() public void ModelParamsBoolBlockMatchesNative() { var pointerSize = IntPtr.Size; - var 
kvOffset = Marshal.OffsetOf("kv_overrides").ToInt32(); + + // Get the field immediately before the first boolean field + var kvOffset = Marshal.OffsetOf(nameof(LLamaModelParams.kv_overrides)).ToInt32(); + + // Get the first boolean field var vocabOffset = Marshal.OffsetOf("_vocab_only").ToInt32(); + // Check first boolean field is one ptr-size after the other Assert.Equal(kvOffset + pointerSize, vocabOffset); Assert.Equal(vocabOffset + 1, Marshal.OffsetOf("_use_mmap").ToInt32()); Assert.Equal(vocabOffset + 2, Marshal.OffsetOf("_use_direct_io").ToInt32()); diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 4671475d0..188ae5fee 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -59,7 +59,7 @@ - 3f7c29d318e317b6 + c0c7e147e7efa6c58587_test diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs index e66248c62..2bdcad7a8 100644 --- a/LLama/Native/LLamaModelParams.cs +++ b/LLama/Native/LLamaModelParams.cs @@ -71,35 +71,35 @@ public bool vocab_only } private sbyte _vocab_only; - /// - /// use mmap if possible - /// - public bool use_mmap - { - readonly get => Convert.ToBoolean(_use_mmap); - set => _use_mmap = Convert.ToSByte(value); - } - private sbyte _use_mmap; - - /// - /// use direct io, takes precedence over use_mmap when supported - /// - public bool use_direct_io - { - readonly get => Convert.ToBoolean(_use_direct_io); - set => _use_direct_io = Convert.ToSByte(value); - } - private sbyte _use_direct_io; - - /// - /// force system to keep model in RAM - /// - public bool use_mlock - { - readonly get => Convert.ToBoolean(_use_mlock); - set => _use_mlock = Convert.ToSByte(value); - } - private sbyte _use_mlock; + /// + /// use mmap if possible + /// + public bool use_mmap + { + readonly get => Convert.ToBoolean(_use_mmap); + set => _use_mmap = Convert.ToSByte(value); + } + private sbyte _use_mmap; + + /// + /// use direct io, takes precedence over use_mmap when supported + /// + public bool use_direct_io 
+ { + readonly get => Convert.ToBoolean(_use_direct_io); + set => _use_direct_io = Convert.ToSByte(value); + } + private sbyte _use_direct_io; + + /// + /// force system to keep model in RAM + /// + public bool use_mlock + { + readonly get => Convert.ToBoolean(_use_mlock); + set => _use_mlock = Convert.ToSByte(value); + } + private sbyte _use_mlock; /// /// validate model tensor data @@ -112,34 +112,34 @@ public bool check_tensors private sbyte _check_tensors; /// - /// use extra buffer types (used for weight repacking) - /// - public bool use_extra_bufts - { - readonly get => Convert.ToBoolean(_use_extra_bufts); - set => _use_extra_bufts = Convert.ToSByte(value); - } - private sbyte _use_extra_bufts; - - /// - /// bypass host buffer allowing extra buffers to be used - /// - public bool no_host - { - readonly get => Convert.ToBoolean(_no_host); - set => _no_host = Convert.ToSByte(value); - } - private sbyte _no_host; - - /// - /// only load metadata and simulate memory allocations - /// - public bool no_alloc - { - readonly get => Convert.ToBoolean(_no_alloc); - set => _no_alloc = Convert.ToSByte(value); - } - private sbyte _no_alloc; + /// use extra buffer types (used for weight repacking) + /// + public bool use_extra_bufts + { + readonly get => Convert.ToBoolean(_use_extra_bufts); + set => _use_extra_bufts = Convert.ToSByte(value); + } + private sbyte _use_extra_bufts; + + /// + /// bypass host buffer allowing extra buffers to be used + /// + public bool no_host + { + readonly get => Convert.ToBoolean(_no_host); + set => _no_host = Convert.ToSByte(value); + } + private sbyte _no_host; + + /// + /// only load metadata and simulate memory allocations + /// + public bool no_alloc + { + readonly get => Convert.ToBoolean(_no_alloc); + set => _no_alloc = Convert.ToSByte(value); + } + private sbyte _no_alloc; /// /// Create a LLamaModelParams with default values /// diff --git a/llama.cpp b/llama.cpp index 3f7c29d31..c0c7e147e 160000 --- a/llama.cpp +++ b/llama.cpp @@ 
-1 +1 @@ -Subproject commit 3f7c29d318e317b63f54c558bc69803963d7d88c +Subproject commit c0c7e147e7efa6c5858754b47259ba4880f8a906