diff --git a/backend/backend.proto b/backend/backend.proto index d10e63e8faef..fb8849d9acd4 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -557,6 +557,7 @@ message ModelMetadataResponse { bool supports_thinking = 1; string rendered_template = 2; // The rendered chat template with enable_thinking=true (empty if not applicable) ToolFormatMarkers tool_format = 3; // Auto-detected tool format markers from differential template analysis + string media_marker = 4; // Marker the backend expects in the prompt for each multimodal input (images/audio/video). Empty when the backend does not use a marker. } // Fine-tuning messages diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index fe7350528348..a0ef198e0fdb 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -2814,6 +2814,13 @@ class BackendServiceImpl final : public backend::Backend::Service { return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded"); } + // Report the active multimodal media marker so the Go layer can emit the + // same string when rendering prompts outside the tokenizer-template path. + // Only meaningful when an mtmd context was initialized (vision/audio models). 
+ if (ctx_server.impl->mctx != nullptr) { + response->set_media_marker(get_media_marker()); + } + // Check if chat templates are initialized if (ctx_server.impl->chat_params.tmpls == nullptr) { // If templates are not initialized, we can't detect thinking support diff --git a/core/backend/llm.go b/core/backend/llm.go index d4894e70e95d..4c6c1874d106 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -15,6 +15,7 @@ import ( "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/services/galleryop" + "github.com/mudler/LocalAI/core/templates" "github.com/mudler/LocalAI/core/trace" "github.com/mudler/LocalAI/core/gallery" @@ -94,15 +95,25 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima return nil, err } - // Detect thinking support after model load (only if not already detected) - // This needs to happen after LoadModel succeeds so the backend can render templates - if (c.ReasoningConfig.DisableReasoning == nil && c.ReasoningConfig.DisableReasoningTagPrefill == nil) && c.TemplateConfig.UseTokenizerTemplate { + // Probe the backend for model-scoped metadata after LoadModel succeeds. + // Two signals are captured: thinking-mode detection (only meaningful when the + // tokenizer template path is active) and the multimodal media marker (needed + // by custom chat templates so markers line up with what mtmd expects). + // We probe whenever any of those slots is still empty. 
+ needsThinkingProbe := c.TemplateConfig.UseTokenizerTemplate && + c.ReasoningConfig.DisableReasoning == nil && + c.ReasoningConfig.DisableReasoningTagPrefill == nil + needsMarkerProbe := c.MediaMarker == "" + if needsThinkingProbe || needsMarkerProbe { modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath) config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts) // Update the config in the loader so it persists for future requests cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) { cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill + if c.MediaMarker != "" { + cfg.MediaMarker = c.MediaMarker + } }) } @@ -121,7 +132,17 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima for k, v := range metadata { opts.Metadata[k] = v } - opts.Prompt = s + // The prompt was rendered with the sentinel "<__media__>" marker because + // middleware templating runs before the backend is loaded and probed. + // Once we know the backend's actual media marker, substitute so marker + // count matches the bitmap count passed through opts.Images/Videos/Audios. + // No-op when MediaMarker is unset, matches the sentinel, or the prompt has + // no media placeholders. 
+ prompt := s + if c.MediaMarker != "" && c.MediaMarker != templates.DefaultMultiMediaMarker { + prompt = strings.ReplaceAll(prompt, templates.DefaultMultiMediaMarker, c.MediaMarker) + } + opts.Prompt = prompt opts.Messages = protoMessages opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate opts.Images = images diff --git a/core/config/gguf.go b/core/config/gguf.go index 0c8255478303..14d95d4ce335 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -84,6 +84,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { // if the model supports thinking mode and if the template ends with a thinking start token. // This should be called after the model is loaded. // The results are stored in cfg.SupportsThinking and cfg.ThinkingForcedOpen. +// The backend-reported multimodal marker is also captured into cfg.MediaMarker. func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, backendClient grpc.Backend, modelOptions *pb.ModelOptions) { if backendClient == nil { xlog.Debug("[gguf] DetectThinkingSupportFromBackend: backend client is nil, skipping detection") @@ -95,9 +96,10 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac return } - // Only detect for llama-cpp backend when using tokenizer templates - if cfg.Backend != "llama-cpp" || !cfg.TemplateConfig.UseTokenizerTemplate { - xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend, "useTokenizerTemplate", cfg.TemplateConfig.UseTokenizerTemplate) + // Only llama-cpp exposes ModelMetadata today. Other backends will either error + // or return an empty response — both are fine, we just bail before calling. 
+ if cfg.Backend != "llama-cpp" { + xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend) return } @@ -108,6 +110,21 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac } if metadata != nil { + // The multimodal media marker is backend-controlled (llama.cpp may pick a + // random per-server string). Empty means "no mtmd context" — Go falls back + // to templates.DefaultMultiMediaMarker at render time. + if metadata.MediaMarker != "" { + cfg.MediaMarker = metadata.MediaMarker + xlog.Debug("[gguf] DetectThinkingSupportFromBackend: media marker captured", "marker", metadata.MediaMarker) + } + + // Thinking / tool-format detection only applies when we rely on the + // backend-side tokenizer template — otherwise the rendered-template based + // heuristics below aren't meaningful. + if !cfg.TemplateConfig.UseTokenizerTemplate { + return + } + cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking) // Use the rendered template to detect if thinking token is at the end diff --git a/core/config/model_config.go b/core/config/model_config.go index 4185d4f3ff9c..7a82b4f053b0 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -52,6 +52,12 @@ type ModelConfig struct { ResponseFormat string `yaml:"-" json:"-"` ResponseFormatMap map[string]any `yaml:"-" json:"-"` + // MediaMarker is the runtime-discovered multimodal marker the backend expects + // in the prompt (e.g. "<__media__>" or a random per-server string picked by + // llama.cpp). Populated on first successful ModelMetadata call. Empty until + // then — callers must fall back to templates.DefaultMultiMediaMarker. 
+ MediaMarker string `yaml:"-" json:"-"` + FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"` ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"` diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index e5610d343da6..9867233c4bab 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -1179,7 +1179,7 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation, nrOfImgsInMessage++ } } - if nrOfImgsInMessage > 0 { + if nrOfImgsInMessage > 0 && !config.TemplateConfig.UseTokenizerTemplate { templated, err := templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{ TotalImages: imgIndex, ImagesInMessage: nrOfImgsInMessage, diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go index 764156d4da1b..dea6ffa21649 100644 --- a/core/http/endpoints/openresponses/responses.go +++ b/core/http/endpoints/openresponses/responses.go @@ -709,8 +709,10 @@ func convertORMessageItem(itemMap map[string]any, cfg *config.ModelConfig) (sche msg.StringVideos = stringVideos msg.StringAudios = stringAudios - // Template multimodal content - if len(stringImages) > 0 || len(stringVideos) > 0 || len(stringAudios) > 0 { + // Template multimodal content. Skipped when the backend handles templating + // itself (UseTokenizerTemplate) — it also injects markers server-side and + // StringContent is not consumed by the evaluator in that path. 
+ if (len(stringImages) > 0 || len(stringVideos) > 0 || len(stringAudios) > 0) && !cfg.TemplateConfig.UseTokenizerTemplate { msg.StringContent, _ = templates.TemplateMultiModal(cfg.TemplateConfig.Multimodal, templates.MultiModalOptions{ TotalImages: len(stringImages), TotalVideos: len(stringVideos), diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go index 8e03ad956c32..a6765fff5cca 100644 --- a/core/http/middleware/request.go +++ b/core/http/middleware/request.go @@ -398,14 +398,23 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema. } } - input.Messages[i].StringContent, _ = templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{ - TotalImages: imgIndex, - TotalVideos: vidIndex, - TotalAudios: audioIndex, - ImagesInMessage: nrOfImgsInMessage, - VideosInMessage: nrOfVideosInMessage, - AudiosInMessage: nrOfAudiosInMessage, - }, textContent) + // When the backend handles templating itself (UseTokenizerTemplate), + // it also injects media markers server-side (see + // oaicompat_chat_params_parse in llama.cpp). Emitting our own markers + // here would double-mark them and downstream consumers ignore + // StringContent in that path anyway, so just pass through plain text. 
+ if config.TemplateConfig.UseTokenizerTemplate { + input.Messages[i].StringContent = textContent + } else { + input.Messages[i].StringContent, _ = templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{ + TotalImages: imgIndex, + TotalVideos: vidIndex, + TotalAudios: audioIndex, + ImagesInMessage: nrOfImgsInMessage, + VideosInMessage: nrOfVideosInMessage, + AudiosInMessage: nrOfAudiosInMessage, + }, textContent) + } } } diff --git a/core/templates/multimodal.go b/core/templates/multimodal.go index 8301e55c7a2a..53793216639c 100644 --- a/core/templates/multimodal.go +++ b/core/templates/multimodal.go @@ -21,8 +21,21 @@ type MultimodalContent struct { ID int } -// https://github.com/ggml-org/llama.cpp/blob/be1d4a13db26750fac702ceb3af88ae4f39dc9f4/tools/mtmd/mtmd.h#L42 -// from <__image__> to <__media__> https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83 +// DefaultMultiMediaMarker is the sentinel marker LocalAI emits in the rendered +// prompt for each image/audio item. It matches llama.cpp's historical +// mtmd_default_marker() ("<__media__>"). llama.cpp's server now picks a random +// per-server marker (see PR #21962) and reports it via ModelMetadataResponse.media_marker; +// callers substitute this sentinel with the backend-reported marker right before +// the gRPC call (core/backend/llm.go). +const DefaultMultiMediaMarker = "<__media__>" + +// DefaultMultiModalTemplate renders a per-message media-marker prefix followed +// by the text content. The sentinel marker is substituted late, so this +// template does not need to know the backend-specific marker. 
+// +// References: +// - https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83 +// - https://github.com/ggml-org/llama.cpp/pull/21962 const DefaultMultiModalTemplate = "{{ range .Audio }}<__media__>{{end}}{{ range .Images }}<__media__>{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}" func TemplateMultiModal(templateString string, opts MultiModalOptions, text string) (string, error) {