@@ -22,7 +22,6 @@ public sealed class Conversation
2222 /// Indicates if this conversation has been "forked" and may share logits with another conversation.
2323 /// </summary>
2424 private bool _forked ;
25- private readonly List < SafeMtmdEmbed > _mtmdEmbeds = new ( ) ;
2625 private MtmdChunkSequence ? _pendingMtmdSequence ;
2726 private readonly List < LLamaToken > _embed_inps = new ( ) ;
2827 private readonly List < LLamaToken > _session_tokens = new ( ) ;
@@ -133,8 +132,6 @@ public void Dispose()
133132 _pendingMtmdSequence ? . Dispose ( ) ;
134133 _pendingMtmdSequence = null ;
135134
136- DisposeQueuedMedia ( ) ;
137-
138135 // Remove this conversation from the KV cache
139136 Executor . Context . NativeHandle . MemorySequenceRemove ( ConversationId , - 1 , - 1 ) ;
140137
@@ -251,37 +248,29 @@ private void AssertCanBePrompted()
251248 throw new AlreadyPromptedConversationException ( ) ;
252249 }
253250
254- public void QueueMedia ( string path )
/// <summary>
/// Prompt this conversation with plain text. The text is tokenized through the
/// executor's context and the resulting tokens are handed to the token-based
/// <c>Prompt</c> overload.
/// </summary>
/// <param name="promptText">Text to tokenize and submit.</param>
/// <param name="addBos">Whether to add the BOS token; forwarded to the tokenizer.</param>
/// <param name="special">Forwarded unchanged to the tokenizer's <c>special</c> argument.</param>
public void Prompt(string promptText, bool addBos = true, bool special = true)
    => Prompt(Executor.Context.Tokenize(promptText, addBos, special));
264256
265- public void QueueMedia ( SafeMtmdEmbed embed )
/// <summary>
/// Prompt this conversation with text plus explicit multimodal embeddings.
/// Ownership of <paramref name="embeds"/> stays with the caller.
/// </summary>
/// <param name="promptText">Prompt text for the model.</param>
/// <param name="embeds">Media embeddings to include in the multimodal prompt. Must not be empty.</param>
/// <param name="addBos">Whether to add the BOS token.</param>
/// <exception cref="InvalidOperationException">The executor has no clip model configured.</exception>
/// <exception cref="ArgumentException"><paramref name="embeds"/> is empty.</exception>
public void Prompt(string promptText, ReadOnlySpan<SafeMtmdEmbed> embeds, bool addBos = true)
{
    // State check comes first so that a conversation which cannot currently be
    // prompted is rejected before any argument validation happens.
    AssertCanBePrompted();

    if (Executor.ClipModel is null)
    {
        throw new InvalidOperationException("This conversation is not configured for multimodal prompts.");
    }

    if (embeds.IsEmpty)
    {
        throw new ArgumentException("Embeds cannot be empty for multimodal prompts.", nameof(embeds));
    }

    PromptMultimodal(promptText, addBos, embeds);
}
286275
287276 /// <summary>
@@ -367,36 +356,21 @@ public void Prompt(ReadOnlySpan<LLamaToken> tokens, bool allLogits = false)
367356 _forked = false ;
368357 }
369358
370- private void PromptMultimodal ( string text , bool addBos )
359+ private void PromptMultimodal ( string text , bool addBos , ReadOnlySpan < SafeMtmdEmbed > embeds )
371360 {
372361 AssertCanBePrompted ( ) ;
373362
374363 if ( Executor . ClipModel is null )
375364 throw new InvalidOperationException ( "This conversation is not configured for multimodal prompts." ) ;
376- if ( _mtmdEmbeds . Count == 0 )
377- throw new InvalidOperationException ( "Queue media before prompting with multimodal input." ) ;
378365
379- var marker = Executor . GetMtmdMarker ( ) ;
380- var prompt = text ;
381-
382- if ( prompt . Contains ( "<image>" ) )
383- prompt = prompt . Replace ( "<image>" , marker ) ;
384-
385- if ( ! prompt . Contains ( marker ) )
386- {
387- var suffix = string . Concat ( Enumerable . Repeat ( marker , _mtmdEmbeds . Count ) ) ;
388- prompt = string . Concat ( prompt , suffix ) ;
389- }
366+ var prompt = BuildMtmdPrompt ( text , embeds . Length ) ;
390367
391368 SafeMtmdInputChunks ? chunks = null ;
392369 try
393370 {
394- var status = Executor . ClipModel . Tokenize ( prompt , addBos , parseSpecial : true , out chunks ) ;
371+ var status = Executor . ClipModel . Tokenize ( prompt , addBos , parseSpecial : true , embeds , out chunks ) ;
395372 if ( status != 0 || chunks is null )
396- {
397- Executor . ClipModel . ClearMedia ( ) ;
398373 throw new RuntimeError ( $ "Failed to tokenize multimodal prompt. Status: { status } .") ;
399- }
400374
401375 var sequence = MtmdChunkSequence . Create ( chunks , Executor . ClipModel ) ;
402376 _pendingMtmdSequence = sequence ;
@@ -413,11 +387,27 @@ private void PromptMultimodal(string text, bool addBos)
413387 }
414388 finally
415389 {
416- DisposeQueuedMedia ( ) ;
417390 chunks ? . Dispose ( ) ;
418391 }
419392 }
420393
/// <summary>
/// Produces the prompt text that is passed to the multimodal tokenizer.
/// Every literal <c>&lt;image&gt;</c> placeholder is rewritten to the executor's media
/// marker. If the resulting text contains no marker at all, the marker is appended
/// <paramref name="embedCount"/> times.
/// </summary>
/// <param name="text">Prompt text supplied by the caller.</param>
/// <param name="embedCount">Number of markers to append when the text contains none.</param>
/// <returns>The prompt text with media markers in place.</returns>
private string BuildMtmdPrompt(string text, int embedCount)
{
    var marker = Executor.GetMtmdMarker();

    // Replace returns the input unchanged when "<image>" is absent, so no pre-check is needed.
    var withMarkers = text.Replace("<image>", marker);

    // The text already positions its media explicitly: leave it alone.
    if (withMarkers.Contains(marker))
        return withMarkers;

    // No marker anywhere: append one marker per embed at the end of the prompt.
    var appended = string.Concat(Enumerable.Repeat(marker, embedCount));
    return string.Concat(withMarkers, appended);
}
410+
421411 /// <summary>
422412 /// Add a single token to this conversation
423413 /// </summary>
@@ -525,7 +515,6 @@ internal void OnMtmdEvaluationFailed(int status)
525515 _pendingMtmdSequence ? . Dispose ( ) ;
526516 _pendingMtmdSequence = null ;
527517 _requiredEpoch = Executor . Epoch ;
528- DisposeQueuedMedia ( ) ;
529518 }
530519
531520 private LLamaToken GetFillerToken ( string marker )
@@ -541,18 +530,6 @@ private LLamaToken GetFillerToken(string marker)
541530 return default ;
542531 }
543532
544- private void DisposeQueuedMedia ( )
545- {
546- if ( _mtmdEmbeds . Count == 0 )
547- return ;
548-
549- foreach ( var embed in _mtmdEmbeds )
550- embed . Dispose ( ) ;
551-
552- _mtmdEmbeds . Clear ( ) ;
553- Executor . ClipModel ? . ClearMedia ( ) ;
554- }
555-
556533 /// <summary>
557534 /// Provides direct access to the KV cache of a <see cref="Conversation"/>.
558535 /// See <see cref="Modify"/> for how to use this.
0 commit comments