Commit 7809c5f
fix(vision): propagate mtmd media marker from backend via ModelMetadata (#9412)
Upstream llama.cpp (PR #21962) switched the server-side mtmd media marker to a random per-server string and removed the legacy "<__media__>" backward-compat replacement in mtmd_tokenizer. The Go layer still emitted the hardcoded "<__media__>", so on the non-tokenizer-template path the prompt arrived with a marker mtmd did not recognize and tokenization failed with "number of bitmaps (1) does not match number of markers (0)".

The fix: report the active media marker via ModelMetadataResponse.media_marker and substitute the sentinel "<__media__>" with it right before the gRPC call, after the backend has been loaded and probed. Also skip the Go-side multimodal templating entirely when UseTokenizerTemplate is true, since llama.cpp's oaicompat_chat_params_parse already injects its own marker and StringContent is unused in that path.

Backends that do not expose the field keep the legacy "<__media__>" behavior.
Parent: ad74273
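The core contract of the fix: the Go layer keeps rendering the historical "<__media__>" string purely as a sentinel, and rewrites it once the backend has reported its actual marker. A minimal self-contained sketch of that substitution (the function and marker values here are illustrative, not the actual LocalAI API):

```go
package main

import (
	"fmt"
	"strings"
)

// defaultMarker is the sentinel LocalAI renders into prompts before the
// backend is probed; it matches llama.cpp's historical mtmd_default_marker().
const defaultMarker = "<__media__>"

// applyBackendMarker swaps the sentinel for the marker the backend reported
// via ModelMetadataResponse.media_marker. When the backend reports nothing,
// or reports the sentinel itself, the prompt passes through untouched, which
// is exactly how older backends keep the legacy behavior.
func applyBackendMarker(prompt, backendMarker string) string {
	if backendMarker == "" || backendMarker == defaultMarker {
		return prompt
	}
	return strings.ReplaceAll(prompt, defaultMarker, backendMarker)
}

func main() {
	rendered := "<__media__>Describe this image."
	// Pretend the backend reported a randomized per-server marker.
	fmt.Println(applyBackendMarker(rendered, "<__media_4f2a__>"))
	// Output: <__media_4f2a__>Describe this image.
}
```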

9 files changed: 96 additions, 20 deletions

backend/backend.proto

Lines changed: 1 addition & 0 deletions
```diff
@@ -557,6 +557,7 @@ message ModelMetadataResponse {
   bool supports_thinking = 1;
   string rendered_template = 2; // The rendered chat template with enable_thinking=true (empty if not applicable)
   ToolFormatMarkers tool_format = 3; // Auto-detected tool format markers from differential template analysis
+  string media_marker = 4; // Marker the backend expects in the prompt for each multimodal input (images/audio/video). Empty when the backend does not use a marker.
 }

 // Fine-tuning messages
```
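Because media_marker is a plain proto3 string, its empty-string default doubles as the "backend never set it" signal. A hedged sketch of the fallback rule, using a stand-in struct rather than the generated pb.ModelMetadataResponse type:

```go
package main

import "fmt"

// ModelMetadataResponse stands in for the generated proto type; only the
// field relevant to this commit is included.
type ModelMetadataResponse struct {
	MediaMarker string
}

// resolveMarker applies the fallback described in the commit message: prefer
// the backend-reported marker, otherwise keep the legacy sentinel.
func resolveMarker(md *ModelMetadataResponse) string {
	if md != nil && md.MediaMarker != "" {
		return md.MediaMarker
	}
	return "<__media__>" // templates.DefaultMultiMediaMarker
}

func main() {
	fmt.Println(resolveMarker(&ModelMetadataResponse{MediaMarker: "<__media_9c1b__>"}))
	fmt.Println(resolveMarker(&ModelMetadataResponse{})) // prints the legacy sentinel
}
```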

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 7 additions & 0 deletions
```diff
@@ -2814,6 +2814,13 @@ class BackendServiceImpl final : public backend::Backend::Service {
       return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
     }

+    // Report the active multimodal media marker so the Go layer can emit the
+    // same string when rendering prompts outside the tokenizer-template path.
+    // Only meaningful when an mtmd context was initialized (vision/audio models).
+    if (ctx_server.impl->mctx != nullptr) {
+      response->set_media_marker(get_media_marker());
+    }
+
     // Check if chat templates are initialized
     if (ctx_server.impl->chat_params.tmpls == nullptr) {
       // If templates are not initialized, we can't detect thinking support
```

core/backend/llm.go

Lines changed: 25 additions & 4 deletions
```diff
@@ -15,6 +15,7 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/galleryop"
+	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/core/trace"

 	"github.com/mudler/LocalAI/core/gallery"
@@ -94,15 +95,25 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 		return nil, err
 	}

-	// Detect thinking support after model load (only if not already detected)
-	// This needs to happen after LoadModel succeeds so the backend can render templates
-	if (c.ReasoningConfig.DisableReasoning == nil && c.ReasoningConfig.DisableReasoningTagPrefill == nil) && c.TemplateConfig.UseTokenizerTemplate {
+	// Probe the backend for model-scoped metadata after LoadModel succeeds.
+	// Two signals are captured: thinking-mode detection (only meaningful when the
+	// tokenizer template path is active) and the multimodal media marker (needed
+	// by custom chat templates so markers line up with what mtmd expects).
+	// We probe whenever any of those slots is still empty.
+	needsThinkingProbe := c.TemplateConfig.UseTokenizerTemplate &&
+		c.ReasoningConfig.DisableReasoning == nil &&
+		c.ReasoningConfig.DisableReasoningTagPrefill == nil
+	needsMarkerProbe := c.MediaMarker == ""
+	if needsThinkingProbe || needsMarkerProbe {
 		modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath)
 		config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
 		// Update the config in the loader so it persists for future requests
 		cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) {
 			cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
 			cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
+			if c.MediaMarker != "" {
+				cfg.MediaMarker = c.MediaMarker
+			}
 		})
 	}

@@ -121,7 +132,17 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 	for k, v := range metadata {
 		opts.Metadata[k] = v
 	}
-	opts.Prompt = s
+	// The prompt was rendered with the sentinel "<__media__>" marker because
+	// middleware templating runs before the backend is loaded and probed.
+	// Once we know the backend's actual media marker, substitute so marker
+	// count matches the bitmap count passed through opts.Images/Videos/Audios.
+	// No-op when MediaMarker is unset, matches the sentinel, or the prompt has
+	// no media placeholders.
+	prompt := s
+	if c.MediaMarker != "" && c.MediaMarker != templates.DefaultMultiMediaMarker {
+		prompt = strings.ReplaceAll(prompt, templates.DefaultMultiMediaMarker, c.MediaMarker)
+	}
+	opts.Prompt = prompt
 	opts.Messages = protoMessages
 	opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
 	opts.Images = images
```

core/config/gguf.go

Lines changed: 20 additions & 3 deletions
```diff
@@ -84,6 +84,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 // if the model supports thinking mode and if the template ends with a thinking start token.
 // This should be called after the model is loaded.
 // The results are stored in cfg.SupportsThinking and cfg.ThinkingForcedOpen.
+// The backend-reported multimodal marker is also captured into cfg.MediaMarker.
 func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, backendClient grpc.Backend, modelOptions *pb.ModelOptions) {
 	if backendClient == nil {
 		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: backend client is nil, skipping detection")
@@ -95,9 +96,10 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac
 		return
 	}

-	// Only detect for llama-cpp backend when using tokenizer templates
-	if cfg.Backend != "llama-cpp" || !cfg.TemplateConfig.UseTokenizerTemplate {
-		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend, "useTokenizerTemplate", cfg.TemplateConfig.UseTokenizerTemplate)
+	// Only llama-cpp exposes ModelMetadata today. Other backends will either error
+	// or return an empty response — both are fine, we just bail before calling.
+	if cfg.Backend != "llama-cpp" {
+		xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend)
 		return
 	}

@@ -108,6 +110,21 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac
 	}

 	if metadata != nil {
+		// The multimodal media marker is backend-controlled (llama.cpp may pick a
+		// random per-server string). Empty means "no mtmd context" — Go falls back
+		// to templates.DefaultMultiMediaMarker at render time.
+		if metadata.MediaMarker != "" {
+			cfg.MediaMarker = metadata.MediaMarker
+			xlog.Debug("[gguf] DetectThinkingSupportFromBackend: media marker captured", "marker", metadata.MediaMarker)
+		}
+
+		// Thinking / tool-format detection only applies when we rely on the
+		// backend-side tokenizer template — otherwise the rendered-template based
+		// heuristics below aren't meaningful.
+		if !cfg.TemplateConfig.UseTokenizerTemplate {
+			return
+		}
+
 		cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking)

 		// Use the rendered template to detect if thinking token is at the end
```

core/config/model_config.go

Lines changed: 6 additions & 0 deletions
```diff
@@ -52,6 +52,12 @@ type ModelConfig struct {
 	ResponseFormat    string         `yaml:"-" json:"-"`
 	ResponseFormatMap map[string]any `yaml:"-" json:"-"`

+	// MediaMarker is the runtime-discovered multimodal marker the backend expects
+	// in the prompt (e.g. "<__media__>" or a random "<__media_<rand>__>" picked by
+	// llama.cpp). Populated on first successful ModelMetadata call. Empty until
+	// then — callers must fall back to templates.DefaultMultiMediaMarker.
+	MediaMarker string `yaml:"-" json:"-"`
+
 	FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"`
 	ReasoningConfig reasoning.Config          `yaml:"reasoning,omitempty" json:"reasoning,omitempty"`
```

core/http/endpoints/openai/realtime.go

Lines changed: 1 addition & 1 deletion
```diff
@@ -1179,7 +1179,7 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation,
 				nrOfImgsInMessage++
 			}
 		}
-		if nrOfImgsInMessage > 0 {
+		if nrOfImgsInMessage > 0 && !config.TemplateConfig.UseTokenizerTemplate {
 			templated, err := templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
 				TotalImages:     imgIndex,
 				ImagesInMessage: nrOfImgsInMessage,
```

core/http/endpoints/openresponses/responses.go

Lines changed: 4 additions & 2 deletions
```diff
@@ -709,8 +709,10 @@ func convertORMessageItem(itemMap map[string]any, cfg *config.ModelConfig) (sche
 	msg.StringVideos = stringVideos
 	msg.StringAudios = stringAudios

-	// Template multimodal content
-	if len(stringImages) > 0 || len(stringVideos) > 0 || len(stringAudios) > 0 {
+	// Template multimodal content. Skipped when the backend handles templating
+	// itself (UseTokenizerTemplate) — it also injects markers server-side and
+	// StringContent is not consumed by the evaluator in that path.
+	if (len(stringImages) > 0 || len(stringVideos) > 0 || len(stringAudios) > 0) && !cfg.TemplateConfig.UseTokenizerTemplate {
 		msg.StringContent, _ = templates.TemplateMultiModal(cfg.TemplateConfig.Multimodal, templates.MultiModalOptions{
 			TotalImages: len(stringImages),
 			TotalVideos: len(stringVideos),
```

core/http/middleware/request.go

Lines changed: 17 additions & 8 deletions
```diff
@@ -398,14 +398,23 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema.
 				}
 			}

-			input.Messages[i].StringContent, _ = templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
-				TotalImages:     imgIndex,
-				TotalVideos:     vidIndex,
-				TotalAudios:     audioIndex,
-				ImagesInMessage: nrOfImgsInMessage,
-				VideosInMessage: nrOfVideosInMessage,
-				AudiosInMessage: nrOfAudiosInMessage,
-			}, textContent)
+			// When the backend handles templating itself (UseTokenizerTemplate),
+			// it also injects media markers server-side (see
+			// oaicompat_chat_params_parse in llama.cpp). Emitting our own markers
+			// here would double-mark them and downstream consumers ignore
+			// StringContent in that path anyway, so just pass through plain text.
+			if config.TemplateConfig.UseTokenizerTemplate {
+				input.Messages[i].StringContent = textContent
+			} else {
+				input.Messages[i].StringContent, _ = templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
+					TotalImages:     imgIndex,
+					TotalVideos:     vidIndex,
+					TotalAudios:     audioIndex,
+					ImagesInMessage: nrOfImgsInMessage,
+					VideosInMessage: nrOfVideosInMessage,
+					AudiosInMessage: nrOfAudiosInMessage,
+				}, textContent)
+			}
 		}
 	}
```

core/templates/multimodal.go

Lines changed: 15 additions & 2 deletions
```diff
@@ -21,8 +21,21 @@ type MultimodalContent struct {
 	ID int
 }

-// https://github.com/ggml-org/llama.cpp/blob/be1d4a13db26750fac702ceb3af88ae4f39dc9f4/tools/mtmd/mtmd.h#L42
-// from <__image__> to <__media__> https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83
+// DefaultMultiMediaMarker is the sentinel marker LocalAI emits in the rendered
+// prompt for each image/audio item. It matches llama.cpp's historical
+// mtmd_default_marker() ("<__media__>"). llama.cpp's server now picks a random
+// per-server marker (see PR #21962) and reports it via ModelMetadataResponse.media_marker;
+// callers substitute this sentinel with the backend-reported marker right before
+// the gRPC call (core/backend/llm.go).
+const DefaultMultiMediaMarker = "<__media__>"
+
+// DefaultMultiModalTemplate renders a per-message media-marker prefix followed
+// by the text content. The sentinel marker is substituted late, so this
+// template does not need to know the backend-specific marker.
+//
+// References:
+//   - https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83
+//   - https://github.com/ggml-org/llama.cpp/pull/21962
 const DefaultMultiModalTemplate = "{{ range .Audio }}<__media__>{{end}}{{ range .Images }}<__media__>{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"

 func TemplateMultiModal(templateString string, opts MultiModalOptions, text string) (string, error) {
```
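To see what the default template actually emits, here is a self-contained approximation for a message carrying one audio clip and one image. It feeds a trimmed-down context struct straight into text/template; the real TemplateMultiModal builds that context from MultiModalOptions, so the field names below are inferred from the template string itself:

```go
package main

import (
	"os"
	"text/template"
)

// MultimodalContent mirrors the struct shown in the diff (ID only).
type MultimodalContent struct {
	ID int
}

const defaultMultiModalTemplate = "{{ range .Audio }}<__media__>{{end}}{{ range .Images }}<__media__>{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"

func main() {
	tmpl := template.Must(template.New("mm").Parse(defaultMultiModalTemplate))
	data := struct {
		Audio, Images, Video []MultimodalContent
		Text                 string
	}{
		Audio:  []MultimodalContent{{ID: 0}},
		Images: []MultimodalContent{{ID: 0}},
		Text:   "What is shown and said here?",
	}
	// Prints: <__media__><__media__>What is shown and said here?
	// Each sentinel is later swapped for the backend-reported marker, so the
	// marker count matches the number of bitmaps handed to mtmd.
	_ = tmpl.Execute(os.Stdout, data)
}
```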
