Skip to content

Commit 938785b

Browse files
Merge pull request #89 from wisedev-code/feat/vl-models--cloud-providers
Feat/vl models cloud providers
2 parents 8f71eed + c2a4eb5 commit 938785b

File tree

6 files changed

+172
-68
lines changed

6 files changed

+172
-68
lines changed

Examples/Examples/Chat/ChatExampleGemini.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ public async Task Start()
1111
Console.WriteLine("(Gemini) ChatExample is running!");
1212

1313
await AIHub.Chat()
14-
.WithModel("gemini-2.0-flash")
14+
.WithModel("gemini-2.5-flash")
1515
.WithMessage("Is the killer whale the smartest animal?")
1616
.CompleteAsync(interactive: true);
1717
}

Examples/Examples/Chat/ChatExampleOpenAi.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using Examples.Utils;
22
using MaIN.Core.Hub;
3+
using MaIN.Domain.Configuration;
34

45
namespace Examples;
56

@@ -12,7 +13,7 @@ public async Task Start()
1213
Console.WriteLine("(OpenAi) ChatExample is running!");
1314

1415
await AIHub.Chat()
15-
.WithModel("gpt-4o-mini")
16+
.WithModel("gpt-5-nano")
1617
.WithMessage("What do you consider to be the greatest invention in history?")
1718
.CompleteAsync(interactive: true);
1819
}

Releases/0.6.2.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# 0.6.2 release
2+
3+
- Allow vision message processing for cloud LLM providers

src/MaIN.Core/.nuspec

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<package>
33
<metadata>
44
<id>MaIN.NET</id>
5-
<version>0.6.1</version>
5+
<version>0.6.2</version>
66
<authors>Wisedev</authors>
77
<owners>Wisedev</owners>
88
<icon>favicon.png</icon>

src/MaIN.Services/Services/LLMService/AnthropicService.cs

Lines changed: 5 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -151,23 +151,10 @@ private List<ChatMessage> GetOrCreateConversation(Chat chat, bool createSession)
151151
conversation = new List<ChatMessage>();
152152
}
153153

154-
MergeMessages(conversation, chat.Messages);
154+
OpenAiCompatibleService.MergeMessages(conversation, chat.Messages);
155155
return conversation;
156156
}
157-
158-
private void MergeMessages(List<ChatMessage> conversation, List<Message> messages)
159-
{
160-
var existing = new HashSet<(string Role, string Content)>(conversation.Select(m => (m.Role, m.Content)));
161-
foreach (var msg in messages)
162-
{
163-
var role = msg.Role.ToLowerInvariant();
164-
if (!existing.Contains((role, msg.Content)))
165-
{
166-
conversation.Add(new ChatMessage(role, msg.Content));
167-
existing.Add((role, msg.Content));
168-
}
169-
}
170-
}
157+
171158

172159
private void UpdateSessionCache(string chatId, string assistantResponse, bool createSession)
173160
{
@@ -200,16 +187,7 @@ private async Task ProcessStreamingChatAsync(
200187
max_tokens = chat.InterferenceParams.MaxTokens < 0 ? 4096 : chat.InterferenceParams.MaxTokens,
201188
stream = true,
202189
system = chat.InterferenceParams.Grammar is not null ? $"Respond only using the following grammar format: \n{chat.InterferenceParams.Grammar}\n. Do not add explanations, code tags, or any extra content." : "",
203-
messages = conversation.Select(m => new
204-
{
205-
role = m.Role switch
206-
{
207-
"user" => "user",
208-
"assistant" => "assistant",
209-
_ => "user"
210-
},
211-
content = m.Content
212-
}).ToArray()
190+
messages = await OpenAiCompatibleService.BuildMessagesArray(conversation, chat, ImageType.AsBase64)
213191
//todo: Add thinking support
214192
};
215193

@@ -276,6 +254,7 @@ await notificationService.DispatchNotification(
276254
}
277255
}
278256

257+
279258
private LLMTokenValue? ProcessAnthropicStreamChunk(string data)
280259
{
281260
var chunk = JsonSerializer.Deserialize<AnthropicStreamChunk>(data, new JsonSerializerOptions { PropertyNameCaseInsensitive = true });
@@ -305,16 +284,7 @@ private async Task ProcessNonStreamingChatAsync(
305284
max_tokens = chat.InterferenceParams.MaxTokens < 0 ? 4096 : chat.InterferenceParams.MaxTokens,
306285
stream = false,
307286
system = chat.InterferenceParams.Grammar is not null ? $"Respond only using the following grammar format: \n{chat.InterferenceParams.Grammar}\n. Do not add explanations, code tags, or any extra content." : "",
308-
messages = conversation.Select(m => new
309-
{
310-
role = m.Role switch
311-
{
312-
"user" => "user",
313-
"assistant" => "assistant",
314-
_ => "user"
315-
},
316-
content = m.Content
317-
}).ToArray()
287+
messages = await OpenAiCompatibleService.BuildMessagesArray(conversation, chat, ImageType.AsBase64)
318288
};
319289

320290
var requestJson = JsonSerializer.Serialize(requestBody);

src/MaIN.Services/Services/LLMService/OpenAiCompatibleService.cs

Lines changed: 160 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,9 @@ await _notificationService.DispatchNotification(
121121
{
122122
var jsonGrammarConverter = new GBNFToJsonConverter();
123123
var jsonGrammar = jsonGrammarConverter.ConvertToJson(chat.MemoryParams.Grammar);
124-
userQuery =
125-
$"{userQuery} | For your next response only, please respond using exactly the following JSON format: \n{jsonGrammar}\n. Do not include any explanations, code blocks, or additional content. After this single JSON response, resume your normal conversational style.";
124+
userQuery = $"{userQuery} | Respond only using the following JSON format: \n{jsonGrammar}\n. Do not add explanations, code tags, or any extra content.";
126125
}
127-
126+
128127
var retrievedContext = await kernel.AskAsync(userQuery, cancellationToken: cancellationToken);
129128

130129
await kernel.DeleteIndexAsync(cancellationToken: cancellationToken);
@@ -144,13 +143,13 @@ public virtual async Task<string[]> GetCurrentModels()
144143
response.EnsureSuccessStatusCode();
145144

146145
var responseJson = await response.Content.ReadAsStringAsync();
147-
var modelsResponse = JsonSerializer.Deserialize<OpenAiModelsResponse>(responseJson,
146+
var modelsResponse = JsonSerializer.Deserialize<OpenAiModelsResponse>(responseJson,
148147
new JsonSerializerOptions { PropertyNameCaseInsensitive = true });
149148

150149
return (modelsResponse?.Data?
151150
.Select(m => m.Id)
152151
.Where(id => id != null)
153-
.ToArray()
152+
.ToArray()
154153
?? [])!;
155154
}
156155

@@ -205,17 +204,10 @@ private async Task ProcessStreamingChatAsync(
205204
var requestBody = new
206205
{
207206
model = chat.Model,
208-
messages = conversation.Select(m => new
209-
{
210-
role = m.Role,
211-
content = chat.InterferenceParams.Grammar != null
212-
//I know that this is a bit ugly, but hey, it works
213-
? $"{m.Content} | Respond only using the following JSON format: \n{new GBNFToJsonConverter().ConvertToJson(chat.InterferenceParams.Grammar)}\n. Do not add explanations, code tags, or any extra content."
214-
: m.Content
215-
}).ToArray(),
207+
messages = await BuildMessagesArray(conversation, chat, ImageType.AsUrl),
216208
stream = true
217209
};
218-
210+
219211
var requestJson = JsonSerializer.Serialize(requestBody);
220212
var content = new StringContent(requestJson, Encoding.UTF8, MediaTypeNames.Application.Json);
221213

@@ -301,13 +293,7 @@ private async Task ProcessNonStreamingChatAsync(
301293
var requestBody = new
302294
{
303295
model = chat.Model,
304-
messages = conversation.Select(m => new
305-
{
306-
role = m.Role, content = chat.InterferenceParams.Grammar != null
307-
//I know that this is a bit ugly, but hey, it works
308-
? $"{m.Content} | Respond only using the following JSON format: \n{new GBNFToJsonConverter().ConvertToJson(chat.InterferenceParams.Grammar)}\n. Do not add explanations, code tags, or any extra content."
309-
: m.Content
310-
}).ToArray(),
296+
messages = await BuildMessagesArray(conversation, chat, ImageType.AsUrl),
311297
stream = false
312298
};
313299

@@ -328,16 +314,31 @@ private async Task ProcessNonStreamingChatAsync(
328314
}
329315
}
330316

331-
private void MergeMessages(List<ChatMessage> conversation, List<Message> messages)
317+
/// <summary>
/// Appends to <paramref name="conversation"/> every entry of <paramref name="messages"/> that is not
/// already represented there. Two entries are considered the same when their lower-cased role and
/// their de-duplication key match.
/// </summary>
/// <remarks>
/// A message that carries an image is keyed as <c>"&lt;content&gt; [Contains image]"</c>. The same key
/// is computed for entries that are already in the conversation; seeding the set with the raw
/// content instead would make an image message look new on every call (so it would be appended
/// again whenever the conversation list is reused) and would make a text-only message with the
/// same text look like a duplicate of it.
/// </remarks>
/// <param name="conversation">Target list; new entries are appended in the order of <paramref name="messages"/>.</param>
/// <param name="messages">Source messages to merge in.</param>
internal static void MergeMessages(List<ChatMessage> conversation, List<Message> messages)
{
    var existing = new HashSet<(string, object)>(
        conversation.Select(m => (m.Role, GetMergeKey(m.Content, m.OriginalMessage))));

    foreach (var msg in messages)
    {
        var role = msg.Role.ToLowerInvariant();

        // HashSet.Add returns false when the key is already present, so this both checks and records.
        if (!existing.Add((role, GetMergeKey(msg.Content, msg))))
        {
            continue;
        }

        var chatMessage = new ChatMessage(role, msg.Content);
        if (HasImages(msg))
        {
            // Keep the source message so the image can be re-encoded when the request is built.
            chatMessage.OriginalMessage = msg;
        }

        conversation.Add(chatMessage);
    }
}

/// <summary>
/// Returns the value used to recognise a message during merging: the content itself, or the
/// content suffixed with <c>" [Contains image]"</c> when <paramref name="source"/> carries an image.
/// </summary>
private static object GetMergeKey(object content, Message? source)
{
    if (source is not null && HasImages(source))
    {
        return $"{content} [Contains image]";
    }

    return content;
}
@@ -359,27 +360,156 @@ protected static ChatResult CreateChatResult(Chat chat, string content, List<LLM
359360
};
360361
}
361362

363+
/// <summary>
/// Converts the conversation into the <c>{ role, content }</c> objects that are serialized as the
/// request's <c>messages</c> array.
/// </summary>
/// <param name="conversation">Conversation entries, converted in order.</param>
/// <param name="chat">
/// Chat whose <c>InterferenceParams.Grammar</c>, when not null, causes a JSON-format instruction to
/// be appended to the content of every entry whose role is <c>"user"</c>.
/// </param>
/// <param name="imageType">Passed to <c>BuildMessageContent</c> for entries that keep their original message.</param>
/// <returns>A completed task holding one anonymous object per conversation entry.</returns>
/// <remarks>
/// The method performs no asynchronous work, so it returns an already-completed task instead of
/// being declared <c>async</c>. The grammar instruction is identical for every user message and is
/// therefore built at most once, on first use.
/// </remarks>
internal static Task<object[]> BuildMessagesArray(List<ChatMessage> conversation, Chat chat, ImageType imageType)
{
    var messages = new List<object>(conversation.Count);
    string? grammarInstruction = null;

    foreach (var msg in conversation)
    {
        var content = msg.OriginalMessage != null
            ? BuildMessageContent(msg.OriginalMessage, imageType)
            : msg.Content;

        if (chat.InterferenceParams.Grammar != null && msg.Role == "user")
        {
            if (grammarInstruction is null)
            {
                var jsonGrammar = new GBNFToJsonConverter().ConvertToJson(chat.InterferenceParams.Grammar);
                grammarInstruction = $" | Respond only using the following JSON format: \n{jsonGrammar}\n. Do not add explanations, code tags, or any extra content.";
            }

            if (content is string textContent)
            {
                content = textContent + grammarInstruction;
            }
            else if (content is List<object> contentParts)
            {
                // Copy before appending so the list returned by BuildMessageContent is left untouched.
                var modifiedParts = new List<object>(contentParts)
                {
                    new { type = "text", text = grammarInstruction }
                };
                content = modifiedParts;
            }
        }

        messages.Add(new
        {
            role = msg.Role,
            content = content
        });
    }

    return Task.FromResult(messages.ToArray());
}
397+
362398
/// <summary>
/// Awaits <paramref name="callback"/> with <paramref name="token"/>; does nothing when no callback was supplied.
/// </summary>
private static async Task InvokeTokenCallbackAsync(Func<LLMTokenValue, Task>? callback, LLMTokenValue token)
{
    if (callback is null)
    {
        return;
    }

    await callback(token);
}
405+
406+
/// <summary>
/// True when <paramref name="message"/> has a non-null, non-empty <c>Image</c>.
/// </summary>
private static bool HasImages(Message message) => message.Image is { Length: > 0 };
410+
411+
/// <summary>
/// Produces the <c>content</c> value for one message: the plain text when the message carries no
/// image, otherwise a list of content parts (a text part when the text is non-empty, followed by
/// the image part).
/// </summary>
/// <param name="message">Message whose text and image bytes are read.</param>
/// <param name="imageType">
/// <c>AsUrl</c> emits an <c>image_url</c> part holding a <c>data:</c> URL; <c>AsBase64</c> emits an
/// <c>image</c> part holding a base64 <c>source</c> object.
/// </param>
/// <returns><c>message.Content</c> itself, or a <c>List&lt;object&gt;</c> of anonymous part objects.</returns>
private static object BuildMessageContent(Message message, ImageType imageType)
{
    if (!HasImages(message))
    {
        return message.Content;
    }

    var parts = new List<object>();

    var hasText = !string.IsNullOrEmpty(message.Content);
    if (hasText)
    {
        parts.Add(new { type = "text", text = message.Content });
    }

    if (message.Image is { Length: > 0 } imageBytes)
    {
        var encoded = Convert.ToBase64String(imageBytes);
        var mediaType = DetectImageMimeType(imageBytes);

        if (imageType == ImageType.AsUrl)
        {
            parts.Add(new
            {
                type = "image_url",
                image_url = new
                {
                    url = $"data:{mediaType};base64,{encoded}",
                    detail = "auto"
                }
            });
        }
        else if (imageType == ImageType.AsBase64)
        {
            parts.Add(new
            {
                type = "image",
                source = new
                {
                    data = encoded,
                    media_type = mediaType,
                    type = "base64"
                }
            });
        }
        // Any other ImageType value adds no image part.
    }

    return parts;
}
464+
465+
/// <summary>
/// Guesses the MIME type of an image from its leading bytes.
/// </summary>
/// <param name="imageBytes">Raw image data.</param>
/// <returns>
/// <c>image/jpeg</c>, <c>image/png</c>, <c>image/gif</c> or <c>image/webp</c>. Inputs shorter than
/// four bytes and unrecognised headers are reported as <c>image/jpeg</c>.
/// </returns>
private static string DetectImageMimeType(byte[] imageBytes)
{
    if (imageBytes.Length < 4)
    {
        return "image/jpeg";
    }

    // Each pattern lists the signature bytes; the discards pad the pattern to the minimum
    // length required for that format (8 for PNG, 6 for GIF, 12 for WebP).
    return imageBytes switch
    {
        [0xFF, 0xD8, ..] => "image/jpeg",
        [0x89, 0x50, 0x4E, 0x47, _, _, _, _, ..] => "image/png",
        [0x47, 0x49, 0x46, 0x38, _, _, ..] => "image/gif",
        [0x52, 0x49, 0x46, 0x46, _, _, _, _, 0x57, 0x45, 0x42, 0x50, ..] => "image/webp",
        _ => "image/jpeg",
    };
}
369492
}
370493

371494
/// <summary>
/// Per-request switches and token callback for a chat call.
/// </summary>
public class ChatRequestOptions
{
    /// <summary>NOTE(review): presumably enables interactive updates while a response is produced — confirm against the callers.</summary>
    public bool InteractiveUpdates { get; set; }

    /// <summary>NOTE(review): presumably asks the service to keep a session for the chat — confirm against the callers.</summary>
    public bool CreateSession { get; set; }

    /// <summary>Defaults to <c>true</c>. NOTE(review): presumably controls whether the conversation is saved — confirm against the callers.</summary>
    public bool SaveConv { get; set; } = true;

    /// <summary>Optional asynchronous callback that is handed an <see cref="LLMTokenValue"/>.</summary>
    public Func<LLMTokenValue, Task>? TokenCallback { get; set; }
}
378501

379-
internal class ChatMessage(string role, string content)
502+
/// <summary>
/// One entry of the in-memory conversation from which the request's messages array is built.
/// </summary>
/// <param name="role">Initial value of <see cref="Role"/>.</param>
/// <param name="content">Initial value of <see cref="Content"/>.</param>
internal class ChatMessage(string role, object content)
{
    /// <summary>Role of the message author.</summary>
    public string Role { get; set; } = role;

    /// <summary>Message content; used as-is when <see cref="OriginalMessage"/> is null.</summary>
    public object Content { get; set; } = content;

    /// <summary>
    /// Source message, set by <c>MergeMessages</c> for messages that carry an image. When present,
    /// <c>BuildMessagesArray</c> rebuilds the content from it instead of using <see cref="Content"/>.
    /// </summary>
    public Message? OriginalMessage { get; set; }
}
508+
509+
/// <summary>
/// Selects how <c>BuildMessageContent</c> encodes an attached image.
/// </summary>
internal enum ImageType
{
    /// <summary>An <c>image_url</c> part whose <c>url</c> is a base64 <c>data:</c> URL.</summary>
    AsUrl,

    /// <summary>An <c>image</c> part whose <c>source</c> holds the base64 data and media type.</summary>
    AsBase64
}
384514

385515
file class ChatCompletionResponse
@@ -413,7 +543,7 @@ file class Delta
413543
}
414544

415545
file class OpenAiModelsResponse
416-
{
546+
{
417547
public List<OpenAiModel>? Data { get; set; }
418548
}
419549

0 commit comments

Comments
 (0)