Skip to content

Commit fae3ed5

Browse files
committed
Fix: Per-call LLama embedder; replace Nomic embedding model with mxbai-embed-large-v1
Switch the LLama embedding implementation to create and dispose contexts per call (aligns with LLamaSharp 0.26.0), removing the long-lived Context field and the isContextDisposed state. Read EmbeddingSize from a temporary context at construction, call llama_set_embeddings on each per-call context, and normalize embeddings as before. Update LLamaSharpTextEmbedding defaults: use the model-default context (ContextSize = 0), enable Embeddings, reduce Batch/UBatch sizes from 1024 to 512, and disable FlashAttention. Replace the Nomic embedding model entries in KnownModels and LocalModels with mxbai-embed-large-v1 (new filename, download URL, and display name) and change the model's size field from 8192 to 512. Remove MemoryService's pre/post-import context management; adjust MemoryFactory to load weights with Embeddings enabled and CLS pooling, return an embedding config that uses the model's native context (ContextSize = 0), and shrink text-partitioning settings (MaxTokensPerParagraph 512 → 400, OverlappingTokens 30 → 20).
1 parent 59ffde4 commit fae3ed5

6 files changed

Lines changed: 62 additions & 90 deletions

File tree

src/MaIN.Domain/Models/Concrete/LocalModels.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -297,11 +297,11 @@ public sealed record Olmo2_7b() : LocalModel(
297297

298298
public sealed record Nomic_Embedding() : LocalModel(
299299
"nomic-embedding",
300-
"nomicv2.gguf",
301-
new Uri("https://huggingface.co/Inza124/Nomic/resolve/main/nomicv2.gguf?download=true"),
302-
"Nomic Embedding",
303-
8192,
304-
"Model used to generate embeddings");
300+
"mxbai-embed-large-v1.Q4_K_M.gguf",
301+
new Uri("https://huggingface.co/ChristianAzinn/mxbai-embed-large-v1-gguf/resolve/main/mxbai-embed-large-v1.Q4_K_M.gguf?download=true"),
302+
"mxbai-embed-large v1",
303+
512,
304+
"Model used to generate embeddings with superior knowledge search recall");
305305

306306
// ===== TTS Model =====
307307

src/MaIN.Domain/Models/SupportedModels.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,9 @@ public static Model GetEmbeddingModel() =>
250250
new()
251251
{
252252
Name = KnownModelNames.Nomic_Embedding,
253-
FileName = "nomicv2.gguf",
254-
Description = "Model used to generate embeddings.",
255-
DownloadUrl = "https://huggingface.co/Inza124/Nomic/resolve/main/nomicv2.gguf?download=true",
253+
FileName = "mxbai-embed-large-v1.Q4_K_M.gguf",
254+
Description = "Model used to generate embeddings with superior knowledge search recall.",
255+
DownloadUrl = "https://huggingface.co/ChristianAzinn/mxbai-embed-large-v1-gguf/resolve/main/mxbai-embed-large-v1.Q4_K_M.gguf?download=true",
256256
};
257257

258258
public static bool IsModelSupported(string name) =>

src/MaIN.Services/Services/LLMService/Memory/Embeddings/LLamaEmbedderMaINClone.cs

Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,7 @@ public sealed class LLamaEmbedderMaINClone
2525
/// <summary>
2626
/// Dimension of embedding vectors
2727
/// </summary>
28-
public int EmbeddingSize => Context.EmbeddingSize;
29-
30-
/// <summary>
31-
/// LLama Context
32-
/// </summary>
33-
public LLamaContext Context { get; set; }
34-
public bool isContextDisposed { get; set; }
28+
public int EmbeddingSize { get; }
3529

3630
/// <summary>
3731
/// Create a new embedder, using the given LLamaWeights
@@ -46,17 +40,21 @@ public LLamaEmbedderMaINClone(LLamaWeights weights, IContextParams @params, ILog
4640
if (weights.NativeHandle is { HasEncoder: true, HasDecoder: true })
4741
throw new NotSupportedException("Computing embeddings in encoder-decoder models is not supported");
4842

49-
Context = weights.CreateContext(@params, logger);
43+
// Create context only to read EmbeddingSize, then dispose immediately
44+
// (matches LLamaSharp 0.26.0 LLamaEmbedder pattern)
45+
using (var tempContext = weights.CreateContext(@params, logger))
46+
{
47+
EmbeddingSize = tempContext.EmbeddingSize;
48+
}
49+
5050
_weights = weights;
5151
_params = @params;
5252
_logger = logger;
53-
NativeApi.llama_set_embeddings(Context.NativeHandle, true);
5453
}
5554

5655
/// <inheritdoc />
5756
public void Dispose()
5857
{
59-
Context.Dispose();
6058
}
6159

6260
/// <summary>
@@ -74,22 +72,20 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
7472

7573
private async Task<(IReadOnlyList<float[]> Embeddings, int Tokens)> GetEmbeddingsWithTokenCount(string input, CancellationToken cancellationToken = default)
7674
{
77-
if (isContextDisposed)
78-
{
79-
Context = _weights.CreateContext(_params, _logger);
80-
NativeApi.llama_set_embeddings(Context.NativeHandle, true);
81-
}
82-
83-
var tokens = Context.Tokenize(input, special: true);
84-
if (tokens.Length > Context.ContextSize)
85-
throw new ArgumentException($"Embedding prompt is longer than the context window ({tokens.Length} > {Context.ContextSize})", nameof(input));
75+
// Create a fresh context for each embedding call (0.26.0 pattern)
76+
using var context = _weights.CreateContext(_params, _logger);
77+
NativeApi.llama_set_embeddings(context.NativeHandle, true);
78+
79+
var tokens = context.Tokenize(input, special: true);
80+
if (tokens.Length > context.ContextSize)
81+
throw new ArgumentException($"Embedding prompt is longer than the context window ({tokens.Length} > {context.ContextSize})", nameof(input));
8682

8783
cancellationToken.ThrowIfCancellationRequested();
8884

8985
// Evaluate prompt in batch-size chunks
9086
var n_past = 0;
9187
var batch = new LLamaBatch();
92-
var batchSize = (int)Context.Params.BatchSize;
88+
var batchSize = (int)context.Params.BatchSize;
9389
for (var i = 0; i < tokens.Length; i += batchSize)
9490
{
9591
var n_eval = tokens.Length - i;
@@ -101,19 +97,19 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
10197
n_past += n_eval;
10298

10399
// Run model
104-
switch (Context.NativeHandle.ModelHandle.HasEncoder, Context.NativeHandle.ModelHandle.HasDecoder)
100+
switch (context.NativeHandle.ModelHandle.HasEncoder, context.NativeHandle.ModelHandle.HasDecoder)
105101
{
106102
case (true, false):
107103
{
108-
var result = await Context.EncodeAsync(batch, cancellationToken);
104+
var result = await context.EncodeAsync(batch, cancellationToken);
109105
if (result != EncodeResult.Ok)
110106
throw new RuntimeError($"Failed to encode: {result}");
111107
break;
112108
}
113109

114110
case (false, true):
115111
{
116-
var result = await Context.DecodeAsync(batch, cancellationToken);
112+
var result = await context.DecodeAsync(batch, cancellationToken);
117113
if (result != DecodeResult.Ok)
118114
throw new RuntimeError($"Failed to decode: {result}");
119115
break;
@@ -125,18 +121,17 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
125121
}
126122

127123
// Extract results
128-
var poolingType = Context.NativeHandle.PoolingType;
124+
var poolingType = context.NativeHandle.PoolingType;
129125
var resultsCount = poolingType == LLamaPoolingType.None ? tokens.Length : 1;
130126
var results = new List<float[]>(resultsCount);
131-
results.Add(Context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero).ToArray());
127+
results.Add(context.NativeHandle.GetEmbeddingsSeq(LLamaSeqId.Zero).ToArray());
132128

133129
// Normalize the embeddings vector
134-
// https://github.com/ggerganov/llama.cpp/blob/2891c8aa9af17f4ff636ff3868bc34ff72b56e25/examples/embedding/embedding.cpp#L92
135130
foreach (var embedding in results)
136131
{
137132
embedding.EuclideanNormalization();
138133
}
139-
134+
140135
return (results, tokens.Length);
141136
}
142-
}
137+
}

src/MaIN.Services/Services/LLMService/Memory/Embeddings/LLamaSharpTextEmbeddingMaINClone.cs

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,18 @@ public sealed class LLamaSharpTextEmbeddingMaINClone
3131
/// <param name="config">The configuration for LLamaSharp.</param>
3232
public LLamaSharpTextEmbeddingMaINClone(LLamaSharpConfig config)
3333
{
34-
MaxTokens = (int?)config.ContextSize ?? 2048;
34+
MaxTokens = (int)(config.ContextSize is > 0 ? config.ContextSize.Value : 2048);
3535

3636
@params = new ModelParams(config.ModelPath)
3737
{
38-
ContextSize = config?.ContextSize ?? 2048,
38+
ContextSize = config?.ContextSize ?? 0, // 0 = use model default
3939
GpuLayerCount = config?.GpuLayerCount ?? 20,
4040
MainGpu = config?.MainGpu ?? 0,
4141
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
42-
BatchSize = 1024,
43-
UBatchSize = 1024,
44-
FlashAttention = true,
42+
BatchSize = 512,
43+
UBatchSize = 512,
44+
Embeddings = true,
45+
FlashAttention = false,
4546
UseMemorymap = true,
4647
PoolingType = LLamaPoolingType.Mean,
4748
};
@@ -59,17 +60,18 @@ public LLamaSharpTextEmbeddingMaINClone(LLamaSharpConfig config)
5960
/// <param name="weights">A LLamaWeights object.</param>
6061
public LLamaSharpTextEmbeddingMaINClone(LLamaSharpConfig config, LLamaWeights weights)
6162
{
62-
MaxTokens = (int?)config.ContextSize ?? 2048;
63+
MaxTokens = (int)(config.ContextSize is > 0 ? config.ContextSize.Value : 2048);
6364

6465
@params = new ModelParams(config.ModelPath)
6566
{
66-
ContextSize = config?.ContextSize ?? 2048,
67+
ContextSize = config?.ContextSize ?? 0, // 0 = use model default
6768
GpuLayerCount = config?.GpuLayerCount ?? 20,
6869
MainGpu = config?.MainGpu ?? 0,
6970
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
70-
BatchSize = 1024,
71-
UBatchSize = 1024,
72-
FlashAttention = true,
71+
BatchSize = 512,
72+
UBatchSize = 512,
73+
Embeddings = true,
74+
FlashAttention = false,
7375
UseMemorymap = true,
7476
PoolingType = LLamaPoolingType.Mean,
7577
};

src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
using System.Diagnostics.CodeAnalysis;
22
using LLama;
33
using LLama.Common;
4+
using LLama.Abstractions;
5+
using LLama.Native;
46
using LLamaSharp.KernelMemory;
57
using MaIN.Domain.Entities;
68
using MaIN.Domain.Exceptions.Models;
@@ -108,11 +110,22 @@ private static LLamaSharpTextEmbeddingMaINClone ConfigureGeneratorOptions(string
108110

109111
var parameters = new ModelParams(config.ModelPath)
110112
{
111-
ContextSize = new uint?(config.ContextSize.GetValueOrDefault(2048U)),
113+
ContextSize = 0, // let the model decide (mxbai-embed-large-v1 = 512)
112114
GpuLayerCount = config.GpuLayerCount.GetValueOrDefault(20),
115+
Embeddings = true,
116+
UseMemorymap = true,
117+
PoolingType = LLamaPoolingType.CLS,
113118
};
114119

115120
var weights = LLamaWeights.LoadFromFile(parameters);
121+
122+
// Override config context size for embedding — use model's native context
123+
config = new LLamaSharpConfig(desiredPath)
124+
{
125+
DefaultInferenceParams = inferenceParams,
126+
GpuLayerCount = memoryParams.GpuLayerCount,
127+
ContextSize = 0,
128+
};
116129
return new LLamaSharpTextEmbeddingMaINClone(config, weights);
117130
}
118131

@@ -131,8 +144,8 @@ private static TextPartitioningOptions ConfigureParsingOptions()
131144
{
132145
return new TextPartitioningOptions
133146
{
134-
MaxTokensPerParagraph = 512,
135-
OverlappingTokens = 30,
147+
MaxTokensPerParagraph = 400,
148+
OverlappingTokens = 20,
136149
};
137150
}
138151

src/MaIN.Services/Services/LLMService/Memory/MemoryService.cs

Lines changed: 3 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
using LLama.Native;
2-
using MaIN.Services.Services.LLMService.Memory.Embeddings;
31
using MaIN.Services.Utils;
42
using Microsoft.KernelMemory;
53
using Microsoft.KernelMemory.AI;
@@ -22,7 +20,7 @@ public async Task ImportDataToMemory((IKernelMemory km, ITextEmbeddingGenerator?
2220
await ImportWebUrls(memory, options.WebUrls, cancellationToken);
2321
await ImportMemoryItems(memory, options.Memory, cancellationToken);
2422
}
25-
23+
2624
public string CleanResponseText(string text)
2725
{
2826
return text
@@ -38,10 +36,8 @@ private async Task ImportTextData((IKernelMemory km, ITextEmbeddingGenerator? ge
3836

3937
foreach (var item in textData)
4038
{
41-
PreImport(memory.generator);
4239
var cleanedValue = JsonCleaner.CleanAndUnescape(item.Value);
4340
await memory.km.ImportTextAsync(cleanedValue!, item.Key, cancellationToken: cancellationToken);
44-
PostImport(memory.generator);
4541
}
4642
}
4743

@@ -51,15 +47,11 @@ private async Task ImportFilesData((IKernelMemory km, ITextEmbeddingGenerator? g
5147
if (fileData?.Any() != true)
5248
return;
5349

54-
5550
foreach (var item in fileData)
5651
{
57-
PreImport(memory.generator);
5852
await memory.km.ImportDocumentAsync(item.Value, item.Key, cancellationToken: cancellationToken);
59-
PostImport(memory.generator);
6053
}
6154
}
62-
6355

6456
private async Task ImportStreamData((IKernelMemory km, ITextEmbeddingGenerator? generator) memory, Dictionary<string, Stream>? streamData,
6557
CancellationToken cancellationToken)
@@ -69,9 +61,7 @@ private async Task ImportStreamData((IKernelMemory km, ITextEmbeddingGenerator?
6961

7062
foreach (var item in streamData)
7163
{
72-
PreImport(memory.generator);
7364
await memory.km.ImportDocumentAsync(item.Value, item.Key, cancellationToken: cancellationToken);
74-
PostImport(memory.generator);
7565
}
7666
}
7767

@@ -82,9 +72,7 @@ private async Task ImportWebUrls((IKernelMemory km, ITextEmbeddingGenerator? gen
8272

8373
foreach (var item in webUrls)
8474
{
85-
PreImport(memory.generator);
8675
await memory.km.ImportWebPageAsync(item, cancellationToken: cancellationToken);
87-
PostImport(memory.generator);
8876
}
8977
}
9078

@@ -97,15 +85,13 @@ private async Task ImportMemoryItems((IKernelMemory km, ITextEmbeddingGenerator?
9785

9886
foreach (var item in memoryItems.Select((value, i) => (value, i)))
9987
{
100-
PreImport(memory.generator);
10188
await memory.km.ImportTextAsync(
10289
item.value,
10390
$"ANSWER_MEMORY_{item.i + 1}-{memoryItems.Count}",
10491
cancellationToken: cancellationToken);
105-
PostImport(memory.generator);
10692
}
10793
}
108-
94+
10995
private static async Task PreprocessAvailableDocuments(ChatMemoryOptions options, CancellationToken cancellationToken)
11096
{
11197
foreach (var file in options.FilesData!)
@@ -123,28 +109,4 @@ private static async Task PreprocessAvailableDocuments(ChatMemoryOptions options
123109
options.StreamData = [];
124110
}
125111
}
126-
127-
private void PostImport(ITextEmbeddingGenerator? memoryGenerator)
128-
{
129-
if (memoryGenerator is LLamaSharpTextEmbeddingMaINClone llamaGenerator)
130-
{
131-
llamaGenerator._embedder.Context.Dispose();
132-
llamaGenerator._embedder.isContextDisposed = true;
133-
}
134-
}
135-
136-
private void PreImport(ITextEmbeddingGenerator? memoryGenerator)
137-
{
138-
if (memoryGenerator is LLamaSharpTextEmbeddingMaINClone { _embedder.isContextDisposed: true } llamaGenerator)
139-
{
140-
llamaGenerator._embedder.Context = llamaGenerator
141-
._embedder
142-
._weights
143-
.CreateContext(llamaGenerator.@params!);
144-
llamaGenerator._embedder.isContextDisposed = false;
145-
NativeApi.llama_set_embeddings(llamaGenerator._embedder.Context.NativeHandle, true);
146-
147-
}
148-
}
149-
150-
}
112+
}

0 commit comments

Comments
 (0)