Merge pull request #89 from wisedev-code/feat/vl-models--cloud-providers

wisedev-pstach · web-flow · commit 938785ba60df · 2025-09-22T17:43:31.000+02:00
Feat/vl models  cloud providers
diff --git a/Examples/Examples/Chat/ChatExampleGemini.cs b/Examples/Examples/Chat/ChatExampleGemini.cs
@@ -11,7 +11,7 @@ public async Task Start()
         Console.WriteLine("(Gemini) ChatExample is running!");
 
         await AIHub.Chat()
-            .WithModel("gemini-2.0-flash")
+            .WithModel("gemini-2.5-flash")
             .WithMessage("Is the killer whale the smartest animal?")
             .CompleteAsync(interactive: true);
     }
diff --git a/Examples/Examples/Chat/ChatExampleOpenAi.cs b/Examples/Examples/Chat/ChatExampleOpenAi.cs
@@ -1,5 +1,6 @@
 using Examples.Utils;
 using MaIN.Core.Hub;
+using MaIN.Domain.Configuration;
 
 namespace Examples;
 
@@ -12,7 +13,7 @@ public async Task Start()
         Console.WriteLine("(OpenAi) ChatExample is running!"); 
         
         await AIHub.Chat()
-            .WithModel("gpt-4o-mini")
+            .WithModel("gpt-5-nano")
             .WithMessage("What do you consider to be the greatest invention in history?")
             .CompleteAsync(interactive: true);
     }
diff --git a/Releases/0.6.2.md b/Releases/0.6.2.md
@@ -0,0 +1,3 @@
+# 0.6.2 release
+
+- Allow vision message processing for cloud LLM providers
diff --git a/src/MaIN.Core/.nuspec b/src/MaIN.Core/.nuspec
@@ -2,7 +2,7 @@
 <package>
   <metadata>
     <id>MaIN.NET</id>
-    <version>0.6.1</version>
+    <version>0.6.2</version>
     <authors>Wisedev</authors>
     <owners>Wisedev</owners>
     <icon>favicon.png</icon>
diff --git a/src/MaIN.Services/Services/LLMService/AnthropicService.cs b/src/MaIN.Services/Services/LLMService/AnthropicService.cs
@@ -151,23 +151,10 @@ private List<ChatMessage> GetOrCreateConversation(Chat chat, bool createSession)
             conversation = new List<ChatMessage>();
         }
 
-        MergeMessages(conversation, chat.Messages);
+        OpenAiCompatibleService.MergeMessages(conversation, chat.Messages);
         return conversation;
     }
-
-    private void MergeMessages(List<ChatMessage> conversation, List<Message> messages)
-    {
-        var existing = new HashSet<(string Role, string Content)>(conversation.Select(m => (m.Role, m.Content)));
-        foreach (var msg in messages)
-        {
-            var role = msg.Role.ToLowerInvariant();
-            if (!existing.Contains((role, msg.Content)))
-            {
-                conversation.Add(new ChatMessage(role, msg.Content));
-                existing.Add((role, msg.Content));
-            }
-        }
-    }
+    
 
     private void UpdateSessionCache(string chatId, string assistantResponse, bool createSession)
     {
@@ -200,16 +187,7 @@ private async Task ProcessStreamingChatAsync(
             max_tokens = chat.InterferenceParams.MaxTokens < 0 ? 4096 : chat.InterferenceParams.MaxTokens,
             stream = true,
             system = chat.InterferenceParams.Grammar is not null ? $"Respond only using the following grammar format: \n{chat.InterferenceParams.Grammar}\n. Do not add explanations, code tags, or any extra content." : "",
-            messages = conversation.Select(m => new
-            {
-                role = m.Role switch
-                {
-                    "user" => "user",
-                    "assistant" => "assistant",
-                    _ => "user"
-                },
-                content = m.Content
-            }).ToArray()
+            messages = await OpenAiCompatibleService.BuildMessagesArray(conversation, chat, ImageType.AsBase64)
             //todo: Add thinking support
         };
 
@@ -276,6 +254,7 @@ await notificationService.DispatchNotification(
         }
     }
 
+    
     private LLMTokenValue? ProcessAnthropicStreamChunk(string data)
     {
         var chunk = JsonSerializer.Deserialize<AnthropicStreamChunk>(data, new JsonSerializerOptions { PropertyNameCaseInsensitive = true });
@@ -305,16 +284,7 @@ private async Task ProcessNonStreamingChatAsync(
             max_tokens = chat.InterferenceParams.MaxTokens < 0 ? 4096 : chat.InterferenceParams.MaxTokens,
             stream = false,
             system = chat.InterferenceParams.Grammar is not null ? $"Respond only using the following grammar format: \n{chat.InterferenceParams.Grammar}\n. Do not add explanations, code tags, or any extra content." : "",
-            messages = conversation.Select(m => new
-            {
-                role = m.Role switch
-                {
-                    "user" => "user",
-                    "assistant" => "assistant",
-                    _ => "user"
-                },
-                content = m.Content
-            }).ToArray()
+            messages = await OpenAiCompatibleService.BuildMessagesArray(conversation, chat, ImageType.AsBase64)
         };
 
         var requestJson = JsonSerializer.Serialize(requestBody);
diff --git a/src/MaIN.Services/Services/LLMService/OpenAiCompatibleService.cs b/src/MaIN.Services/Services/LLMService/OpenAiCompatibleService.cs
@@ -121,10 +121,9 @@ await _notificationService.DispatchNotification(
         {
             var jsonGrammarConverter = new GBNFToJsonConverter();
             var jsonGrammar = jsonGrammarConverter.ConvertToJson(chat.MemoryParams.Grammar);
-            userQuery =
-                $"{userQuery} | For your next response only, please respond using exactly the following JSON format: \n{jsonGrammar}\n. Do not include any explanations, code blocks, or additional content. After this single JSON response, resume your normal conversational style.";
+            userQuery = $"{userQuery} | Respond only using the following JSON format: \n{jsonGrammar}\n. Do not add explanations, code tags, or any extra content.";
         }
-
+        
         var retrievedContext = await kernel.AskAsync(userQuery, cancellationToken: cancellationToken);
 
         await kernel.DeleteIndexAsync(cancellationToken: cancellationToken);
@@ -144,13 +143,13 @@ public virtual async Task<string[]> GetCurrentModels()
         response.EnsureSuccessStatusCode();
 
         var responseJson = await response.Content.ReadAsStringAsync();
-        var modelsResponse = JsonSerializer.Deserialize<OpenAiModelsResponse>(responseJson,
+        var modelsResponse = JsonSerializer.Deserialize<OpenAiModelsResponse>(responseJson, 
             new JsonSerializerOptions { PropertyNameCaseInsensitive = true });
 
         return (modelsResponse?.Data?
                     .Select(m => m.Id)
                     .Where(id => id != null)
-                    .ToArray()
+                    .ToArray() 
                 ?? [])!;
     }
 
@@ -205,17 +204,10 @@ private async Task ProcessStreamingChatAsync(
         var requestBody = new
         {
             model = chat.Model,
-            messages = conversation.Select(m => new
-            {
-                role = m.Role,
-                content = chat.InterferenceParams.Grammar != null
-                    //I know that this is a bit ugly, but hey, it works
-                    ? $"{m.Content} | Respond only using the following JSON format: \n{new GBNFToJsonConverter().ConvertToJson(chat.InterferenceParams.Grammar)}\n. Do not add explanations, code tags, or any extra content."
-                    : m.Content
-            }).ToArray(),
+            messages = await BuildMessagesArray(conversation, chat, ImageType.AsUrl),
             stream = true
         };
-
+        
         var requestJson = JsonSerializer.Serialize(requestBody);
         var content = new StringContent(requestJson, Encoding.UTF8, MediaTypeNames.Application.Json);
 
@@ -301,13 +293,7 @@ private async Task ProcessNonStreamingChatAsync(
         var requestBody = new
         {
             model = chat.Model,
-            messages = conversation.Select(m => new
-            {
-                role = m.Role, content = chat.InterferenceParams.Grammar != null
-                    //I know that this is a bit ugly, but hey, it works
-                    ? $"{m.Content} | Respond only using the following JSON format: \n{new GBNFToJsonConverter().ConvertToJson(chat.InterferenceParams.Grammar)}\n. Do not add explanations, code tags, or any extra content."
-                    : m.Content
-            }).ToArray(),
+            messages = await BuildMessagesArray(conversation, chat, ImageType.AsUrl),
             stream = false
         };
 
@@ -328,16 +314,31 @@ private async Task ProcessNonStreamingChatAsync(
         }
     }
 
-    private void MergeMessages(List<ChatMessage> conversation, List<Message> messages)
+    /// <summary>
+    /// Appends to <paramref name="conversation"/> every entry of <paramref name="messages"/> that is not
+    /// already present, comparing on the pair (lower-cased role, content).
+    /// </summary>
+    /// <param name="conversation">Conversation list that is extended in place.</param>
+    /// <param name="messages">
+    /// Chat messages to merge. A message that carries an image is stored with its text content and a
+    /// reference to the source message in <see cref="ChatMessage.OriginalMessage"/>.
+    /// </param>
+    /// <remarks>
+    /// Image-bearing messages are keyed as their content followed by " [Contains image]" so they do not
+    /// collide with a text-only message of the same content. The image bytes are not part of the key, so
+    /// two image messages with the same role and text count as one entry.
+    /// </remarks>
+    internal static void MergeMessages(List<ChatMessage> conversation, List<Message> messages)
     {
-        var existing = new HashSet<(string, string)>(conversation.Select(m => (m.Role, m.Content)));
+        // Seed the set with the same key shape the loop checks. Entries stored with an OriginalMessage
+        // are image messages; without the marker they would not be recognised when the conversation
+        // list is reused across calls (e.g. a cached session) and would be appended again.
+        var existing = new HashSet<(string, object)>(conversation.Select(m =>
+            (m.Role, m.OriginalMessage is not null ? (object)$"{m.Content} [Contains image]" : m.Content)));
         foreach (var msg in messages)
         {
             var role = msg.Role.ToLowerInvariant();
-            if (!existing.Contains((role, msg.Content)))
+
+            if (HasImages(msg))
+            {
+                var simplifiedContent = $"{msg.Content} [Contains image]";
+                if (!existing.Contains((role, simplifiedContent)))
+                {
+                    // Keep the source message so the request builder can re-create the image part.
+                    var chatMessage = new ChatMessage(role, msg.Content)
+                    {
+                        OriginalMessage = msg
+                    };
+                    conversation.Add(chatMessage);
+                    existing.Add((role, simplifiedContent));
+                }
+            }
+            else
             {
-                conversation.Add(new ChatMessage(role, msg.Content));
-                existing.Add((role, msg.Content));
+                if (!existing.Contains((role, msg.Content)))
+                {
+                    conversation.Add(new ChatMessage(role, msg.Content));
+                    existing.Add((role, msg.Content));
+                }
             }
         }
     }
@@ -359,27 +360,156 @@ protected static ChatResult CreateChatResult(Chat chat, string content, List<LLM
         };
     }
 
+    /// <summary>
+    /// Projects the conversation into the anonymous <c>{ role, content }</c> objects used as the
+    /// <c>messages</c> value of a request body.
+    /// </summary>
+    /// <param name="conversation">
+    /// Conversation entries, emitted in order. An entry with <see cref="ChatMessage.OriginalMessage"/> set
+    /// has its content rebuilt from that message via <c>BuildMessageContent</c>; otherwise
+    /// <see cref="ChatMessage.Content"/> is used as-is.
+    /// </param>
+    /// <param name="chat">
+    /// Chat whose <c>InterferenceParams.Grammar</c>, when not null, is converted to JSON and appended as a
+    /// format instruction to every entry whose role is <c>"user"</c>.
+    /// </param>
+    /// <param name="imageType">Image part shape, forwarded to <c>BuildMessageContent</c>.</param>
+    /// <returns>An already-completed task holding one object per conversation entry.</returns>
+    /// <remarks>
+    /// The method does no asynchronous work; the <see cref="Task{TResult}"/> return type is kept so
+    /// existing callers that <c>await</c> it keep compiling.
+    /// </remarks>
+    internal static Task<object[]> BuildMessagesArray(List<ChatMessage> conversation, Chat chat, ImageType imageType)
+    {
+        var messages = new List<object>(conversation.Count);
+        var grammar = chat.InterferenceParams.Grammar;
+
+        // The instruction depends only on the chat, so it is built at most once, and only if a user
+        // message actually needs it.
+        string? grammarInstruction = null;
+
+        foreach (var msg in conversation)
+        {
+            object content = msg.OriginalMessage != null
+                ? BuildMessageContent(msg.OriginalMessage, imageType)
+                : msg.Content;
+
+            if (grammar != null && msg.Role == "user")
+            {
+                grammarInstruction ??=
+                    $" | Respond only using the following JSON format: \n{new GBNFToJsonConverter().ConvertToJson(grammar)}\n. Do not add explanations, code tags, or any extra content.";
+
+                if (content is string textContent)
+                {
+                    content = textContent + grammarInstruction;
+                }
+                else if (content is List<object> contentParts)
+                {
+                    // Multi-part (image) content: add the instruction as a trailing text part.
+                    var modifiedParts = contentParts.ToList();
+                    modifiedParts.Add(new { type = "text", text = grammarInstruction });
+                    content = modifiedParts;
+                }
+            }
+
+            messages.Add(new
+            {
+                role = msg.Role,
+                content
+            });
+        }
+
+        return Task.FromResult(messages.ToArray());
+    }
+    
+    
     private static async Task InvokeTokenCallbackAsync(Func<LLMTokenValue, Task>? callback, LLMTokenValue token)
     {
         if (callback != null)
         {
             await callback.Invoke(token);
         }
     }
+    
+    /// <summary>
+    /// Tells whether <paramref name="message"/> has an image attached, i.e. its <c>Image</c> is neither
+    /// null nor empty.
+    /// </summary>
+    private static bool HasImages(Message message) => message.Image is { Length: > 0 };
+
+    /// <summary>
+    /// Builds the request content for a single message.
+    /// </summary>
+    /// <param name="message">Source message; its <c>Content</c> is the text and <c>Image</c> the optional image bytes.</param>
+    /// <param name="imageType">
+    /// <see cref="ImageType.AsUrl"/> emits an <c>image_url</c> part holding a base64 data URL;
+    /// <see cref="ImageType.AsBase64"/> emits an <c>image</c> part holding a base64 <c>source</c> object.
+    /// Any other value adds no image part.
+    /// </param>
+    /// <returns>
+    /// The message text itself when there is no image; otherwise a <see cref="List{T}"/> of parts made of
+    /// an optional text part (skipped when the text is null or empty) followed by the image part.
+    /// </returns>
+    private static object BuildMessageContent(Message message, ImageType imageType)
+    {
+        if (message.Image is not { Length: > 0 } imageBytes)
+        {
+            return message.Content;
+        }
+
+        var parts = new List<object>();
+
+        if (!string.IsNullOrEmpty(message.Content))
+        {
+            parts.Add(new { type = "text", text = message.Content });
+        }
+
+        var encoded = Convert.ToBase64String(imageBytes);
+        var mediaType = DetectImageMimeType(imageBytes);
+
+        object? imagePart = imageType switch
+        {
+            ImageType.AsUrl => new
+            {
+                type = "image_url",
+                image_url = new
+                {
+                    url = $"data:{mediaType};base64,{encoded}",
+                    detail = "auto"
+                }
+            },
+            ImageType.AsBase64 => new
+            {
+                type = "image",
+                source = new
+                {
+                    data = encoded,
+                    media_type = mediaType,
+                    type = "base64"
+                }
+            },
+            _ => null
+        };
+
+        if (imagePart is not null)
+        {
+            parts.Add(imagePart);
+        }
+
+        return parts;
+    }
+
+    /// <summary>
+    /// Guesses an image MIME type from the leading bytes of <paramref name="imageBytes"/>.
+    /// </summary>
+    /// <param name="imageBytes">Raw image data.</param>
+    /// <returns>
+    /// <c>image/png</c>, <c>image/gif</c> or <c>image/webp</c> when the matching leading bytes are found
+    /// and the buffer has at least 8, 6 or 12 bytes respectively; <c>image/jpeg</c> in every other case,
+    /// including buffers shorter than 4 bytes and unrecognised data.
+    /// </returns>
+    private static string DetectImageMimeType(byte[] imageBytes)
+    {
+        const string Jpeg = "image/jpeg";
+
+        if (imageBytes.Length < 4)
+        {
+            return Jpeg;
+        }
+
+        return imageBytes switch
+        {
+            // FF D8
+            [0xFF, 0xD8, ..] => Jpeg,
+            // 89 'P' 'N' 'G', buffer of at least 8 bytes
+            [0x89, 0x50, 0x4E, 0x47, _, _, _, _, ..] => "image/png",
+            // 'G' 'I' 'F' '8', buffer of at least 6 bytes
+            [0x47, 0x49, 0x46, 0x38, _, _, ..] => "image/gif",
+            // 'R' 'I' 'F' 'F' at 0..3 and 'W' 'E' 'B' 'P' at 8..11
+            [0x52, 0x49, 0x46, 0x46, _, _, _, _, 0x57, 0x45, 0x42, 0x50, ..] => "image/webp",
+            _ => Jpeg
+        };
+    }
 }
 
 public class ChatRequestOptions
 {
     public bool InteractiveUpdates { get; set; }
     public bool CreateSession { get; set; }
-    public bool SaveConv { get; set; } = true;
+    public bool SaveConv  {get; set; } = true;
     public Func<LLMTokenValue, Task>? TokenCallback { get; set; }
 }
 
-internal class ChatMessage(string role, string content)
+internal class ChatMessage(string role, object content)
 {
     public string Role { get; set; } = role;
-    public string Content { get; set; } = content;
+    public object Content { get; set; } = content;
+    public Message? OriginalMessage { get; set; }
+}
+
+/// <summary>
+/// Selects the shape of the image part that <c>BuildMessageContent</c> emits for a message with an image.
+/// </summary>
+internal enum ImageType
+{
+    /// <summary>
+    /// An <c>image_url</c> part whose URL is a base64 data URL; passed by the OpenAI-compatible requests.
+    /// </summary>
+    AsUrl = 0,
+
+    /// <summary>
+    /// An <c>image</c> part with a base64 <c>source</c> object; passed by the Anthropic requests.
+    /// </summary>
+    AsBase64 = 1
 }
 
 file class ChatCompletionResponse
@@ -413,7 +543,7 @@ file class Delta
 }
 
 file class OpenAiModelsResponse
-{
+{ 
     public List<OpenAiModel>? Data { get; set; }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@ public async Task Start()`
`11`	`11`	`Console.WriteLine("(Gemini) ChatExample is running!");`
`12`	`12`
`13`	`13`	`await AIHub.Chat()`
`14`		`- .WithModel("gemini-2.0-flash")`
	`14`	`+ .WithModel("gemini-2.5-flash")`
`15`	`15`	`.WithMessage("Is the killer whale the smartest animal?")`
`16`	`16`	`.CompleteAsync(interactive: true);`
`17`	`17`	`}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# 0.6.2 release`
	`2`	`+`
	`3`	`+- Allow vision message processing for cloud LLM providers`