Skip to content

Commit 855b8c5

Browse files
committed
Remove media queue
1 parent 8168309 commit 855b8c5

File tree

5 files changed: +89 −63 lines changed

LLama.Examples/Examples/BatchedExecutorMtmd.cs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ public class BatchedExecutorMtmd
1919
/// <summary>
2020
/// Number of completion tokens to generate after sending the image prompt.
2121
/// </summary>
22-
public const int TokenCount = 10000;
22+
public const int TokenCount = 100;
2323

2424
public static async Task Run()
2525
{
@@ -60,10 +60,11 @@ public static async Task Run()
6060
{
6161
// Each conversation tracks its own KV cache sequence IDs.
6262
var conversation = executor.Create();
63-
// enqueue the image so MtmdHelper sees it
64-
conversation.QueueMedia(imagePath);
65-
// schedule multimodal prompt
66-
conversation.Prompt(promptText, addBos: true, special: true);
63+
// Load the media embed explicitly so ownership is clear.
64+
using var embed = mtmd.LoadMedia( imagePath)
65+
?? throw new RuntimeError($"Failed to load media '{imagePath}'.");
66+
// Schedule the multimodal prompt with explicit embeds.
67+
conversation.Prompt(promptText, new[] { embed }, addBos: true);
6768

6869
Console.ForegroundColor = ConsoleColor.Yellow;
6970
Console.WriteLine("Prompt queued with multimodal chunks. Generating response...\n");

LLama.Examples/Examples/MtmdInteractiveModeExecute.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ public static async Task Run()
1919
{
2020
string multiModalProj = UserSettings.GetMMProjPath();
2121
string modelPath = UserSettings.GetModelPath();
22-
const int maxTokens = 8192;
22+
const int maxTokens = 4096;
2323

2424
string? prompt = await File.ReadAllTextAsync("Assets/chat-with-bob.json");
2525

LLama/Batched/Conversation.cs

Lines changed: 34 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ public sealed class Conversation
2222
/// Indicates if this conversation has been "forked" and may share logits with another conversation.
2323
/// </summary>
2424
private bool _forked;
25-
private readonly List<SafeMtmdEmbed> _mtmdEmbeds = new();
2625
private MtmdChunkSequence? _pendingMtmdSequence;
2726
private readonly List<LLamaToken> _embed_inps = new();
2827
private readonly List<LLamaToken> _session_tokens = new();
@@ -133,8 +132,6 @@ public void Dispose()
133132
_pendingMtmdSequence?.Dispose();
134133
_pendingMtmdSequence = null;
135134

136-
DisposeQueuedMedia();
137-
138135
// Remove this conversation from the KV cache
139136
Executor.Context.NativeHandle.MemorySequenceRemove(ConversationId, -1, -1);
140137

@@ -251,37 +248,29 @@ private void AssertCanBePrompted()
251248
throw new AlreadyPromptedConversationException();
252249
}
253250

254-
public void QueueMedia(string path)
251+
/// <summary>
/// Prompt this conversation with plain text. The text is tokenized with the
/// executor's context and forwarded to the token-based overload.
/// </summary>
/// <param name="promptText">Prompt text for the model.</param>
/// <param name="addBos">Whether to add the BOS token.</param>
/// <param name="special">Whether special tokens are parsed from the text.</param>
public void Prompt(string promptText, bool addBos = true, bool special = true)
{
    // Delegate all scheduling/validation to the token overload.
    Prompt(Executor.Context.Tokenize(promptText, addBos, special));
}
264256

265-
public void QueueMedia(SafeMtmdEmbed embed)
257+
/// <summary>
/// Prompt this conversation with explicit multimodal embeddings.
/// The caller retains ownership of <paramref name="embeds"/>.
/// </summary>
/// <param name="promptText">Prompt text for the model.</param>
/// <param name="embeds">Media embeddings to include in the multimodal prompt.</param>
/// <param name="addBos">Whether to add the BOS token.</param>
/// <exception cref="InvalidOperationException">This conversation has no clip model configured.</exception>
/// <exception cref="ArgumentException"><paramref name="embeds"/> is empty.</exception>
public void Prompt(string promptText, ReadOnlySpan<SafeMtmdEmbed> embeds, bool addBos = true)
{
    AssertCanBePrompted();

    // Multimodal prompting requires a clip model; check that first so the
    // caller gets a configuration error rather than an argument error.
    if (Executor.ClipModel is null)
    {
        throw new InvalidOperationException("This conversation is not configured for multimodal prompts.");
    }

    if (embeds.IsEmpty)
    {
        throw new ArgumentException("Embeds cannot be empty for multimodal prompts.", nameof(embeds));
    }

    PromptMultimodal(promptText, addBos, embeds);
}
286275

287276
/// <summary>
@@ -367,36 +356,21 @@ public void Prompt(ReadOnlySpan<LLamaToken> tokens, bool allLogits = false)
367356
_forked = false;
368357
}
369358

370-
private void PromptMultimodal(string text, bool addBos)
359+
private void PromptMultimodal(string text, bool addBos, ReadOnlySpan<SafeMtmdEmbed> embeds)
371360
{
372361
AssertCanBePrompted();
373362

374363
if (Executor.ClipModel is null)
375364
throw new InvalidOperationException("This conversation is not configured for multimodal prompts.");
376-
if (_mtmdEmbeds.Count == 0)
377-
throw new InvalidOperationException("Queue media before prompting with multimodal input.");
378365

379-
var marker = Executor.GetMtmdMarker();
380-
var prompt = text;
381-
382-
if (prompt.Contains("<image>"))
383-
prompt = prompt.Replace("<image>", marker);
384-
385-
if (!prompt.Contains(marker))
386-
{
387-
var suffix = string.Concat(Enumerable.Repeat(marker, _mtmdEmbeds.Count));
388-
prompt = string.Concat(prompt, suffix);
389-
}
366+
var prompt = BuildMtmdPrompt(text, embeds.Length);
390367

391368
SafeMtmdInputChunks? chunks = null;
392369
try
393370
{
394-
var status = Executor.ClipModel.Tokenize(prompt, addBos, parseSpecial: true, out chunks);
371+
var status = Executor.ClipModel.Tokenize(prompt, addBos, parseSpecial: true, embeds, out chunks);
395372
if (status != 0 || chunks is null)
396-
{
397-
Executor.ClipModel.ClearMedia();
398373
throw new RuntimeError($"Failed to tokenize multimodal prompt. Status: {status}.");
399-
}
400374

401375
var sequence = MtmdChunkSequence.Create(chunks, Executor.ClipModel);
402376
_pendingMtmdSequence = sequence;
@@ -413,11 +387,27 @@ private void PromptMultimodal(string text, bool addBos)
413387
}
414388
finally
415389
{
416-
DisposeQueuedMedia();
417390
chunks?.Dispose();
418391
}
419392
}
420393

394+
/// <summary>
/// Rewrite a user prompt so it carries the native mtmd media marker:
/// "&lt;image&gt;" placeholders are mapped to the marker, and if no marker is
/// present at all, one marker per embed is appended to the end.
/// </summary>
/// <param name="text">The user-supplied prompt text.</param>
/// <param name="embedCount">Number of media embeddings accompanying the prompt.</param>
/// <returns>The prompt text containing at least one mtmd marker.</returns>
private string BuildMtmdPrompt(string text, int embedCount)
{
    var marker = Executor.GetMtmdMarker();

    // Translate the user-facing "<image>" placeholder into the native marker.
    var rewritten = text.Contains("<image>")
        ? text.Replace("<image>", marker)
        : text;

    if (rewritten.Contains(marker))
        return rewritten;

    // No marker anywhere in the prompt: append one marker per embed.
    return rewritten + string.Concat(Enumerable.Repeat(marker, embedCount));
}
410+
421411
/// <summary>
422412
/// Add a single token to this conversation
423413
/// </summary>
@@ -525,7 +515,6 @@ internal void OnMtmdEvaluationFailed(int status)
525515
_pendingMtmdSequence?.Dispose();
526516
_pendingMtmdSequence = null;
527517
_requiredEpoch = Executor.Epoch;
528-
DisposeQueuedMedia();
529518
}
530519

531520
private LLamaToken GetFillerToken(string marker)
@@ -541,18 +530,6 @@ private LLamaToken GetFillerToken(string marker)
541530
return default;
542531
}
543532

544-
private void DisposeQueuedMedia()
545-
{
546-
if (_mtmdEmbeds.Count == 0)
547-
return;
548-
549-
foreach (var embed in _mtmdEmbeds)
550-
embed.Dispose();
551-
552-
_mtmdEmbeds.Clear();
553-
Executor.ClipModel?.ClearMedia();
554-
}
555-
556533
/// <summary>
557534
/// Provides direct access to the KV cache of a <see cref="Conversation"/>.
558535
/// See <see cref="Modify"/> for how to use this.

LLama/MtmdWeights.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ public static Task<MtmdWeights> LoadFromFileAsync(string mmProject, LLamaWeights
5454
public int Tokenize(string text, bool addSpecial, bool parseSpecial, out SafeMtmdInputChunks? chunks)
5555
=> NativeHandle.Tokenize(text, addSpecial, parseSpecial, out chunks);
5656

57+
/// <summary>
/// Tokenize text (with optional special tokens) against explicit media embeddings.
/// The caller retains ownership of <paramref name="embeds"/>.
/// </summary>
/// <param name="text">Prompt text to tokenize.</param>
/// <param name="addSpecial">Whether to append special tokens automatically.</param>
/// <param name="parseSpecial">Whether special tokens should be treated as user-provided text.</param>
/// <param name="embeds">Media embeddings to include in the multimodal prompt.</param>
/// <param name="chunks">Receives the native chunk collection when tokenization succeeds.</param>
/// <returns>Zero on success; otherwise the native mtmd tokenize error code.</returns>
public int Tokenize(string text, bool addSpecial, bool parseSpecial, ReadOnlySpan<SafeMtmdEmbed> embeds, out SafeMtmdInputChunks? chunks)
{
    // Thin pass-through to the native model handle.
    return NativeHandle.Tokenize(text, addSpecial, parseSpecial, embeds, out chunks);
}
63+
5764
/// <summary>
5865
/// Evaluate a chunk batch using the helper that performs mtmd encode + llama decode.
5966
/// </summary>

LLama/Native/SafeMtmdModelHandle.cs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,47 @@ public int Tokenize(string text, bool addSpecial, bool parseSpecial, out SafeMtm
149149
return result;
150150
}
151151

152+
/// <summary>
/// Tokenize a prompt alongside the provided media embeddings.
/// The caller retains ownership of <paramref name="embeds"/>.
/// </summary>
/// <param name="text">Prompt text to tokenize.</param>
/// <param name="addSpecial">Whether to append special tokens automatically.</param>
/// <param name="parseSpecial">Whether special tokens should be treated as user-provided text.</param>
/// <param name="embeds">Media embeddings to include in the multimodal prompt.</param>
/// <param name="chunks">Receives the native chunk collection when tokenization succeeds.</param>
/// <returns>Zero on success; otherwise the native mtmd tokenize error code.</returns>
/// <exception cref="ObjectDisposedException">The model handle has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="embeds"/> contains a null entry.</exception>
/// <exception cref="RuntimeError">The native tokenizer failed to allocate output chunks.</exception>
public int Tokenize(string text, bool addSpecial, bool parseSpecial, ReadOnlySpan<SafeMtmdEmbed> embeds, out SafeMtmdInputChunks? chunks)
{
    EnsureNotDisposed();

    chunks = null;

    // Validate the embeds and capture their native pointers BEFORE allocating
    // the native output collection. Previously the allocation happened first,
    // so a null entry in `embeds` threw ArgumentNullException and leaked the
    // mtmd_input_chunks allocation.
    var bitmapHandles = new IntPtr[embeds.Length];
    for (var i = 0; i < embeds.Length; i++)
    {
        var embed = embeds[i] ?? throw new ArgumentNullException(nameof(embeds), "Embeds cannot contain null.");
        bitmapHandles[i] = embed.NativePtr;
    }

    var output = NativeApi.mtmd_input_chunks_init();
    if (output == IntPtr.Zero)
        throw new RuntimeError("Failed to allocate mtmd_input_chunks.");

    var result = NativeApi.mtmd_tokenize(DangerousGetHandle(), output, text, addSpecial, parseSpecial, bitmapHandles, (UIntPtr)bitmapHandles.Length);
    if (result == 0)
    {
        // Success: hand ownership of the native chunks to the safe wrapper.
        chunks = new SafeMtmdInputChunks(output);
    }
    else
    {
        // Failure: the wrapper was never created, so free the allocation here.
        NativeApi.mtmd_input_chunks_free(output);
    }

    return result;
}
192+
152193
/// <summary>
153194
/// Evaluate a batch of chunks using the helper (mirrors mtmd-helper eval logic).
154195
/// </summary>

0 commit comments

Comments
 (0)