Skip to content

Commit cec95a6

Browse files
committed
Fix Vertex file sending for PDF files (without OCR)
Expose CreateMemoryWithVertex on IMemoryFactory and implement it in MemoryFactory to build a KernelMemory configured for Vertex (Gemini) text generation and embeddings using a bearer token provider, location and projectId. Add PDF MIME detection in ChatHelper. Extend VertexService.AskMemory to bypass KernelMemory for multimodal requests: collect inline images/PDFs, convert non-native files to text via DocumentProcessor, aggregate text/context/memory, inject optional grammar JSON, and send combined content to Gemini; includes helper methods for file/stream processing and a list of Gemini-native extensions. Also add necessary usings and temporary file handling for stream processing.
1 parent 05f7f5f commit cec95a6

4 files changed

Lines changed: 171 additions & 2 deletions

File tree

src/MaIN.Services/Services/LLMService/Memory/IMemoryFactory.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@ public interface IMemoryFactory
1414
MemoryParams memoryParams);
1515
IKernelMemory CreateMemoryWithOpenAi(string openAiKey, MemoryParams memoryParams);
1616
IKernelMemory CreateMemoryWithGemini(string geminiKey, MemoryParams memoryParams);
17+
IKernelMemory CreateMemoryWithVertex(Func<ValueTask<string>> bearerTokenProvider, string location, string projectId, MemoryParams memoryParams);
1718
}

src/MaIN.Services/Services/LLMService/Memory/MemoryFactory.cs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,29 @@ public IKernelMemory CreateMemoryWithGemini(string geminiKey, MemoryParams memor
7474
.WithSemanticKernelTextEmbeddingGenerationService(
7575
new GoogleAITextEmbeddingGenerationService("gemini-embedding-001", geminiKey), new SemanticKernelConfig())
7676
#pragma warning restore SKEXP0070
77+
.WithCustomImageOcr(new OcrWrapper())
78+
.WithSimpleVectorDb()
79+
.Build();
80+
81+
return kernelMemory;
82+
}
83+
84+
public IKernelMemory CreateMemoryWithVertex(Func<ValueTask<string>> bearerTokenProvider, string location, string projectId, MemoryParams memoryParams)
85+
{
86+
var searchOptions = ConfigureSearchOptions(memoryParams);
87+
88+
var kernelMemory = new KernelMemoryBuilder()
89+
.WithSearchClientConfig(searchOptions)
90+
#pragma warning disable SKEXP0070
91+
.WithSemanticKernelTextGenerationService(
92+
new GeminiTextGeneratorAdapter(
93+
new VertexAIGeminiChatCompletionService("gemini-2.5-flash", bearerTokenProvider, location, projectId)),
94+
new SemanticKernelConfig())
95+
.WithSemanticKernelTextEmbeddingGenerationService(
96+
new VertexAITextEmbeddingGenerationService("text-embedding-005", bearerTokenProvider, location, projectId),
97+
new SemanticKernelConfig())
98+
#pragma warning restore SKEXP0070
99+
.WithCustomImageOcr(new OcrWrapper())
77100
.WithSimpleVectorDb()
78101
.Build();
79102

src/MaIN.Services/Services/LLMService/Utils/ChatHelper.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,11 @@ private static string DetectImageMimeType(byte[] imageBytes)
259259
if (imageBytes.Length < 4)
260260
return "image/jpeg";
261261

262+
// PDF: %PDF (0x25 0x50 0x44 0x46)
263+
if (imageBytes[0] == 0x25 && imageBytes[1] == 0x50 &&
264+
imageBytes[2] == 0x44 && imageBytes[3] == 0x46)
265+
return "application/pdf";
266+
262267
if (imageBytes[0] == 0xFF && imageBytes[1] == 0xD8)
263268
return "image/jpeg";
264269

src/MaIN.Services/Services/LLMService/VertexService.cs

Lines changed: 142 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
1+
using System.Text;
12
using MaIN.Domain.Configuration;
23
using MaIN.Domain.Configuration.BackendInferenceParams;
34
using MaIN.Domain.Entities;
5+
using MaIN.Domain.Models;
46
using MaIN.Domain.Models.Concrete;
57
using MaIN.Services.Constants;
68
using MaIN.Services.Services.Abstract;
79
using MaIN.Services.Services.LLMService.Auth;
810
using MaIN.Services.Services.LLMService.Memory;
911
using MaIN.Services.Services.Models;
12+
using MaIN.Services.Utils;
1013
using Microsoft.Extensions.Logging;
1114

1215
namespace MaIN.Services.Services.LLMService;
@@ -97,16 +100,153 @@ protected override void ApplyBackendParams(Dictionary<string, object> requestBod
97100
return await base.Send(chat, options, cancellationToken);
98101
}
99102

100-
public new async Task<ChatResult?> AskMemory(
103+
/// <summary>
/// Answers a memory-backed question without going through KernelMemory: everything listed in
/// <paramref name="memoryOptions"/> is folded into the last chat message and sent via <c>Send</c>.
/// </summary>
/// <remarks>
/// Files and streams with a Gemini-native extension (PDFs, images) are attached as inline bytes;
/// every other source is converted to text and placed in front of the user's question.
/// The last message is modified only for the duration of the call and is restored afterwards,
/// even when sending fails.
/// </remarks>
/// <param name="chat">Conversation whose last message carries the question.</param>
/// <param name="memoryOptions">Text, file, stream and memory sources to include.</param>
/// <param name="requestOptions">Options forwarded unchanged to <c>Send</c>.</param>
/// <param name="cancellationToken">Token flowed into file/stream reads and the send call.</param>
/// <returns>The result of <c>Send</c>, or <c>null</c> when the chat contains no messages.</returns>
public override async Task<ChatResult?> AskMemory(
    Chat chat,
    ChatMemoryOptions memoryOptions,
    ChatRequestOptions requestOptions,
    CancellationToken cancellationToken = default)
{
    ExtractLocation(chat);

    if (!chat.Messages.Any())
    {
        return null;
    }

    var target = chat.Messages.Last();

    // Snapshot the fields that are temporarily overwritten below.
    var savedContent = target.Content;
    var savedFiles = target.Files;
    var savedImages = target.Images;

    try
    {
        var attachments = new List<byte[]>();
        var context = new StringBuilder();

        CollectTextData(memoryOptions, context);
        await CollectFilesData(memoryOptions, attachments, context, cancellationToken);
        await CollectStreamData(memoryOptions, attachments, context, cancellationToken);
        CollectMemoryItems(memoryOptions, context);

        var prompt = new StringBuilder();
        if (context.Length > 0)
        {
            prompt
                .AppendLine("Use the following document content to answer the question:\n")
                .Append(context)
                .AppendLine();
        }

        prompt.Append(savedContent);

        if (chat.MemoryParams.Grammar is not null)
        {
            var grammarJson = new GrammarToJsonConverter().ConvertToJson(chat.MemoryParams.Grammar);
            prompt.Append(
                $" | For your next response only, please respond using exactly the following JSON format: \n{grammarJson}\n. Do not include any explanations, code blocks, or additional content. After this single JSON response, resume your normal conversational style.");
        }

        target.Content = prompt.ToString();
        target.Files = null;

        // Images already on the message come first, followed by the inline file bytes
        // gathered above (PDFs travel as native multimodal content).
        var combined = new List<byte[]>(savedImages ?? []);
        combined.AddRange(attachments);
        target.Images = combined.Count > 0 ? combined : null;

        return await Send(chat, requestOptions, cancellationToken);
    }
    finally
    {
        // Put the caller's message back exactly as it was.
        target.Content = savedContent;
        target.Files = savedFiles;
        target.Images = savedImages;
    }
}
167+
168+
#region Multimodal File Processing
169+
170+
/// <summary>
/// File extensions (leading dot included) that are attached to the request as inline bytes
/// instead of being converted to text first. Membership checks ignore case.
/// </summary>
private static readonly HashSet<string> GeminiNativeExtensions = new(StringComparer.OrdinalIgnoreCase)
{
    ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif", ".heic", ".heif", ".avif"
};

/// <summary>
/// Tells whether <paramref name="fileName"/> has one of the <see cref="GeminiNativeExtensions"/>.
/// </summary>
/// <param name="fileName">File name or path; only its extension is inspected.</param>
/// <returns><c>true</c> when the extension is in the native set, regardless of letter case.</returns>
private static bool IsGeminiNativeFile(string fileName)
    // The set's comparer handles casing, so no lower-cased copy of the extension is allocated.
    => GeminiNativeExtensions.Contains(Path.GetExtension(fileName));
175+
176+
/// <summary>
/// Appends every entry of <c>options.TextData</c> to <paramref name="textContext"/> as a
/// "[Document: name]" header line, the entry's content, and a blank separator line.
/// </summary>
/// <param name="options">Memory options whose text entries are read.</param>
/// <param name="textContext">Builder that accumulates the textual context.</param>
private static void CollectTextData(ChatMemoryOptions options, StringBuilder textContext)
{
    foreach (var (title, body) in options.TextData)
    {
        textContext
            .AppendLine($"[Document: {title}]")
            .AppendLine(body)
            .AppendLine();
    }
}
109185

186+
/// <summary>
/// Walks <c>options.FilesData</c> and routes each file by the extension of its name:
/// Gemini-native files are read from disk and added to <paramref name="inlineBytes"/>;
/// all others are run through <c>DocumentProcessor.ProcessDocument</c> and their text is
/// appended to <paramref name="textContext"/> under a "[Document: name]" header.
/// </summary>
/// <param name="options">Memory options whose file entries (name, path) are read.</param>
/// <param name="inlineBytes">Receives the raw bytes of native files.</param>
/// <param name="textContext">Receives the extracted text of non-native files.</param>
/// <param name="cancellationToken">Token passed to the asynchronous file read.</param>
private static async Task CollectFilesData(
    ChatMemoryOptions options, List<byte[]> inlineBytes, StringBuilder textContext,
    CancellationToken cancellationToken)
{
    foreach (var (fileName, filePath) in options.FilesData)
    {
        if (!IsGeminiNativeFile(fileName))
        {
            textContext.AppendLine($"[Document: {fileName}]");
            textContext.AppendLine(DocumentProcessor.ProcessDocument(filePath));
            textContext.AppendLine();
            continue;
        }

        var payload = await File.ReadAllBytesAsync(filePath, cancellationToken);
        inlineBytes.Add(payload);
    }
}
204+
205+
/// <summary>
/// Walks <c>options.StreamData</c> and routes each stream by the extension of its name:
/// Gemini-native content is buffered and added to <paramref name="inlineBytes"/>; anything else
/// is spooled to a temporary file, run through <c>DocumentProcessor.ProcessDocument</c>, and its
/// text appended to <paramref name="textContext"/> under a "[Document: name]" header.
/// </summary>
/// <remarks>
/// Non-native streams are copied directly to the temporary file rather than being fully
/// buffered in memory first. The temporary file is removed even if processing throws.
/// </remarks>
/// <param name="options">Memory options whose stream entries (name, stream) are read.</param>
/// <param name="inlineBytes">Receives the raw bytes of native streams.</param>
/// <param name="textContext">Receives the extracted text of non-native streams.</param>
/// <param name="cancellationToken">Token passed to the asynchronous copy operations.</param>
private static async Task CollectStreamData(
    ChatMemoryOptions options, List<byte[]> inlineBytes, StringBuilder textContext,
    CancellationToken cancellationToken)
{
    foreach (var (name, stream) in options.StreamData)
    {
        // Rewind when the stream allows it so previously-read content is not silently dropped.
        if (stream.CanSeek) stream.Position = 0;

        if (IsGeminiNativeFile(name))
        {
            // Inline attachments must be materialized as a byte array.
            using var buffer = new MemoryStream();
            await stream.CopyToAsync(buffer, cancellationToken);
            inlineBytes.Add(buffer.ToArray());
            continue;
        }

        // The processor is handed a file path, so spool the stream to a uniquely named temp file
        // that keeps the original extension.
        var tempPath = Path.Combine(Path.GetTempPath(), $"vertex_tmp_{Guid.NewGuid()}{Path.GetExtension(name)}");
        try
        {
            // Scoped so the file handle is closed before the processor opens the file.
            await using (var tempFile = new FileStream(
                             tempPath, FileMode.Create, FileAccess.Write, FileShare.None,
                             bufferSize: 81920, useAsync: true))
            {
                await stream.CopyToAsync(tempFile, cancellationToken);
            }

            textContext.AppendLine($"[Document: {name}]");
            textContext.AppendLine(DocumentProcessor.ProcessDocument(tempPath));
            textContext.AppendLine();
        }
        finally
        {
            if (File.Exists(tempPath)) File.Delete(tempPath);
        }
    }
}
237+
238+
/// <summary>
/// Appends each entry of <c>options.Memory</c> to <paramref name="textContext"/>, followed by a
/// blank line. Does nothing when the collection is null or empty.
/// </summary>
/// <param name="options">Memory options whose memory entries are read.</param>
/// <param name="textContext">Builder that accumulates the textual context.</param>
private static void CollectMemoryItems(ChatMemoryOptions options, StringBuilder textContext)
{
    if (options.Memory is { Count: > 0 })
    {
        foreach (var entry in options.Memory)
        {
            textContext.AppendLine(entry).AppendLine();
        }
    }
}
247+
248+
#endregion
249+
110250
private void ExtractLocation(Chat chat)
111251
{
112252
if (chat.BackendParams is VertexInferenceParams vp)

0 commit comments

Comments
 (0)