@@ -121,10 +121,9 @@ await _notificationService.DispatchNotification(
121121 {
122122 var jsonGrammarConverter = new GBNFToJsonConverter ( ) ;
123123 var jsonGrammar = jsonGrammarConverter . ConvertToJson ( chat . MemoryParams . Grammar ) ;
124- userQuery =
125- $ "{ userQuery } | For your next response only, please respond using exactly the following JSON format: \n { jsonGrammar } \n . Do not include any explanations, code blocks, or additional content. After this single JSON response, resume your normal conversational style.";
124+ userQuery = $ "{ userQuery } | Respond only using the following JSON format: \n { jsonGrammar } \n . Do not add explanations, code tags, or any extra content.";
126125 }
127-
126+
128127 var retrievedContext = await kernel . AskAsync ( userQuery , cancellationToken : cancellationToken ) ;
129128
130129 await kernel . DeleteIndexAsync ( cancellationToken : cancellationToken ) ;
@@ -144,13 +143,13 @@ public virtual async Task<string[]> GetCurrentModels()
144143 response . EnsureSuccessStatusCode ( ) ;
145144
146145 var responseJson = await response . Content . ReadAsStringAsync ( ) ;
147- var modelsResponse = JsonSerializer . Deserialize < OpenAiModelsResponse > ( responseJson ,
146+ var modelsResponse = JsonSerializer . Deserialize < OpenAiModelsResponse > ( responseJson ,
148147 new JsonSerializerOptions { PropertyNameCaseInsensitive = true } ) ;
149148
150149 return ( modelsResponse ? . Data ?
151150 . Select ( m => m . Id )
152151 . Where ( id => id != null )
153- . ToArray ( )
152+ . ToArray ( )
154153 ?? [ ] ) ! ;
155154 }
156155
@@ -205,17 +204,10 @@ private async Task ProcessStreamingChatAsync(
205204 var requestBody = new
206205 {
207206 model = chat . Model ,
208- messages = conversation . Select ( m => new
209- {
210- role = m . Role ,
211- content = chat . InterferenceParams . Grammar != null
212- //I know that this is a bit ugly, but hey, it works
213- ? $ "{ m . Content } | Respond only using the following JSON format: \n { new GBNFToJsonConverter ( ) . ConvertToJson ( chat . InterferenceParams . Grammar ) } \n . Do not add explanations, code tags, or any extra content."
214- : m . Content
215- } ) . ToArray ( ) ,
207+ messages = await BuildMessagesArray ( conversation , chat , ImageType . AsUrl ) ,
216208 stream = true
217209 } ;
218-
210+
219211 var requestJson = JsonSerializer . Serialize ( requestBody ) ;
220212 var content = new StringContent ( requestJson , Encoding . UTF8 , MediaTypeNames . Application . Json ) ;
221213
@@ -301,13 +293,7 @@ private async Task ProcessNonStreamingChatAsync(
301293 var requestBody = new
302294 {
303295 model = chat . Model ,
304- messages = conversation . Select ( m => new
305- {
306- role = m . Role , content = chat . InterferenceParams . Grammar != null
307- //I know that this is a bit ugly, but hey, it works
308- ? $ "{ m . Content } | Respond only using the following JSON format: \n { new GBNFToJsonConverter ( ) . ConvertToJson ( chat . InterferenceParams . Grammar ) } \n . Do not add explanations, code tags, or any extra content."
309- : m . Content
310- } ) . ToArray ( ) ,
296+ messages = await BuildMessagesArray ( conversation , chat , ImageType . AsUrl ) ,
311297 stream = false
312298 } ;
313299
@@ -328,16 +314,31 @@ private async Task ProcessNonStreamingChatAsync(
328314 }
329315 }
330316
/// <summary>
/// Appends to <paramref name="conversation"/> every entry of <paramref name="messages"/> that is
/// not already represented there. Entries are compared by lower-cased role plus content.
/// </summary>
/// <remarks>
/// Messages carrying image bytes are tracked under a dedicated key (their text followed by
/// " [Contains image]") and keep a reference to the source message in
/// <c>OriginalMessage</c>. The lookup set is seeded with that same key for conversation entries
/// that already hold an image-bearing <c>OriginalMessage</c>; without this, an image message that
/// is already part of the conversation would be appended again on every merge.
/// </remarks>
/// <param name="conversation">Conversation that is extended in place.</param>
/// <param name="messages">Candidate messages to merge into the conversation.</param>
internal static void MergeMessages(List<ChatMessage> conversation, List<Message> messages)
{
    static string ImageKey(object? text) => $"{text} [Contains image]";

    var existing = new HashSet<(string, object)>(conversation.Select(m => (m.Role, m.Content)));

    // Register the image key for image messages merged earlier, so they are recognised below.
    foreach (var known in conversation)
    {
        if (known.OriginalMessage is { } original && HasImages(original))
        {
            existing.Add((known.Role, ImageKey(original.Content)));
        }
    }

    foreach (var msg in messages)
    {
        var role = msg.Role.ToLowerInvariant();

        if (HasImages(msg))
        {
            // HashSet.Add returns false when the key is already present.
            if (existing.Add((role, ImageKey(msg.Content))))
            {
                conversation.Add(new ChatMessage(role, msg.Content) { OriginalMessage = msg });
            }
        }
        else if (existing.Add((role, msg.Content)))
        {
            conversation.Add(new ChatMessage(role, msg.Content));
        }
    }
}
@@ -359,27 +360,156 @@ protected static ChatResult CreateChatResult(Chat chat, string content, List<LLM
359360 } ;
360361 }
361362
/// <summary>
/// Converts the conversation into the array of <c>{ role, content }</c> objects that is
/// serialized as the <c>messages</c> field of the request body.
/// </summary>
/// <remarks>
/// When <c>chat.InterferenceParams.Grammar</c> is set, a JSON-format instruction derived from the
/// grammar is appended to every message whose role is "user": concatenated for plain-text
/// content, added as an extra text part for multi-part content. The grammar is converted at most
/// once per call, and only if a user message is actually encountered.
/// The method performs no asynchronous work; it returns an already completed task so that
/// existing <c>await</c> call sites keep working.
/// </remarks>
/// <param name="conversation">Messages to convert, in order.</param>
/// <param name="chat">Chat whose grammar setting controls the appended instruction.</param>
/// <param name="imageType">Wire format used for image parts of messages that carry an image.</param>
/// <returns>A completed task holding one anonymous <c>{ role, content }</c> object per message.</returns>
internal static Task<object[]> BuildMessagesArray(List<ChatMessage> conversation, Chat chat, ImageType imageType)
{
    var messages = new List<object>(conversation.Count);
    var grammar = chat.InterferenceParams.Grammar;
    string? grammarInstruction = null;

    foreach (var msg in conversation)
    {
        var content = msg.OriginalMessage != null
            ? BuildMessageContent(msg.OriginalMessage, imageType)
            : msg.Content;

        if (grammar != null && msg.Role == "user")
        {
            // Built lazily so the conversion never runs when there is no user message.
            grammarInstruction ??=
                $" | Respond only using the following JSON format: \n{new GBNFToJsonConverter().ConvertToJson(grammar)}\n. Do not add explanations, code tags, or any extra content.";

            if (content is string textContent)
            {
                content = textContent + grammarInstruction;
            }
            else if (content is List<object> contentParts)
            {
                // Copy before appending so a list stored on the chat message is never mutated.
                var modifiedParts = contentParts.ToList();
                modifiedParts.Add(new { type = "text", text = grammarInstruction });
                content = modifiedParts;
            }
        }

        messages.Add(new
        {
            role = msg.Role,
            content = content
        });
    }

    return Task.FromResult(messages.ToArray());
}
397+
/// <summary>
/// Awaits <paramref name="callback"/> with <paramref name="token"/>; does nothing when no
/// callback was supplied.
/// </summary>
/// <param name="callback">Optional token handler.</param>
/// <param name="token">Token value passed to the handler.</param>
private static async Task InvokeTokenCallbackAsync(Func<LLMTokenValue, Task>? callback, LLMTokenValue token)
{
    if (callback is null)
    {
        return;
    }

    await callback(token);
}
405+
/// <summary>
/// Tells whether <paramref name="message"/> carries a non-empty image payload.
/// </summary>
/// <param name="message">Message to inspect.</param>
/// <returns><c>true</c> when <c>message.Image</c> is non-null and has at least one byte.</returns>
private static bool HasImages(Message message) => message.Image is { Length: > 0 };
410+
/// <summary>
/// Produces the <c>content</c> value for a single message.
/// </summary>
/// <param name="message">Source message; may carry image bytes next to its text.</param>
/// <param name="imageType">Selects the shape of the image part that is emitted.</param>
/// <returns>
/// The message text itself when there is no image; otherwise a <c>List&lt;object&gt;</c> holding
/// an optional text part (skipped for null or empty text) followed by the image part.
/// </returns>
private static object BuildMessageContent(Message message, ImageType imageType)
{
    var image = message.Image;
    if (image is not { Length: > 0 })
    {
        return message.Content;
    }

    var parts = new List<object>();

    if (!string.IsNullOrEmpty(message.Content))
    {
        parts.Add(new { type = "text", text = message.Content });
    }

    var base64Data = Convert.ToBase64String(image);
    var mimeType = DetectImageMimeType(image);

    // An image type outside the two known values yields no image part at all.
    object? imagePart = imageType switch
    {
        ImageType.AsUrl => new
        {
            type = "image_url",
            image_url = new
            {
                url = $"data:{mimeType};base64,{base64Data}",
                detail = "auto"
            }
        },
        ImageType.AsBase64 => new
        {
            type = "image",
            source = new
            {
                data = base64Data,
                media_type = mimeType,
                type = "base64"
            }
        },
        _ => null
    };

    if (imagePart is not null)
    {
        parts.Add(imagePart);
    }

    return parts;
}
464+
/// <summary>
/// Guesses the MIME type of an image from its leading signature bytes.
/// </summary>
/// <param name="imageBytes">Raw image data.</param>
/// <returns>
/// "image/jpeg", "image/png", "image/gif" or "image/webp". Inputs shorter than four bytes and
/// unrecognised signatures are reported as "image/jpeg".
/// </returns>
private static string DetectImageMimeType(byte[] imageBytes)
{
    const string fallback = "image/jpeg";

    if (imageBytes.Length < 4)
    {
        return fallback;
    }

    return imageBytes switch
    {
        // FF D8: JPEG start-of-image marker.
        [0xFF, 0xD8, ..] => "image/jpeg",
        // 89 'P' 'N' 'G', with at least eight bytes available.
        [0x89, 0x50, 0x4E, 0x47, _, _, _, _, ..] => "image/png",
        // 'G' 'I' 'F' '8', with at least six bytes available.
        [0x47, 0x49, 0x46, 0x38, _, _, ..] => "image/gif",
        // 'R' 'I' 'F' 'F', four bytes skipped, then 'W' 'E' 'B' 'P'.
        [0x52, 0x49, 0x46, 0x46, _, _, _, _, 0x57, 0x45, 0x42, 0x50, ..] => "image/webp",
        _ => fallback
    };
}
369492}
370493
/// <summary>
/// Per-request switches for a chat call.
/// </summary>
public class ChatRequestOptions
{
    /// <summary>Interactive-updates flag. NOTE(review): consumers are outside this view — confirm semantics.</summary>
    public bool InteractiveUpdates { get; set; }

    /// <summary>Session-creation flag. NOTE(review): consumers are outside this view — confirm semantics.</summary>
    public bool CreateSession { get; set; }

    /// <summary>Conversation-saving flag; <c>true</c> unless changed by the caller.</summary>
    public bool SaveConv { get; set; } = true;

    /// <summary>Optional asynchronous handler taking an <c>LLMTokenValue</c>; <c>null</c> when unset.</summary>
    public Func<LLMTokenValue, Task>? TokenCallback { get; set; }
}
378501
/// <summary>
/// A single conversation entry in the form sent to the chat endpoint.
/// </summary>
internal class ChatMessage
{
    /// <summary>Creates an entry with the given role and content.</summary>
    /// <param name="role">Role of the message author.</param>
    /// <param name="content">Message content; typed as <c>object</c> so it can hold more than plain text.</param>
    public ChatMessage(string role, object content)
    {
        Role = role;
        Content = content;
    }

    /// <summary>Role of the message author.</summary>
    public string Role { get; set; }

    /// <summary>Message content.</summary>
    public object Content { get; set; }

    /// <summary>Source message this entry was created from, when one was recorded; otherwise <c>null</c>.</summary>
    public Message? OriginalMessage { get; set; }
}
508+
/// <summary>
/// Selects how image bytes are represented inside a message's content parts.
/// </summary>
internal enum ImageType
{
    /// <summary>Emit an "image_url" part whose URL is a base64 <c>data:</c> URI.</summary>
    AsUrl = 0,

    /// <summary>Emit an "image" part whose source carries the base64 data and media type.</summary>
    AsBase64 = 1
}
384514
385515file class ChatCompletionResponse
@@ -413,7 +543,7 @@ file class Delta
413543}
414544
/// <summary>
/// Deserialization target for the models-listing response body.
/// </summary>
file class OpenAiModelsResponse
{
    /// <summary>Models contained in the response; <c>null</c> when the property is absent.</summary>
    public List<OpenAiModel>? Data { get; set; }
}
419549
0 commit comments