Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ message ModelMetadataResponse {
bool supports_thinking = 1;
string rendered_template = 2; // The rendered chat template with enable_thinking=true (empty if not applicable)
ToolFormatMarkers tool_format = 3; // Auto-detected tool format markers from differential template analysis
string media_marker = 4; // Marker the backend expects in the prompt for each multimodal input (images/audio/video). Empty when the backend does not use a marker.
}

// Fine-tuning messages
Expand Down
7 changes: 7 additions & 0 deletions backend/cpp/llama-cpp/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2814,6 +2814,13 @@ class BackendServiceImpl final : public backend::Backend::Service {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}

// Report the active multimodal media marker so the Go layer can emit the
// same string when rendering prompts outside the tokenizer-template path.
// Only meaningful when an mtmd context was initialized (vision/audio models).
if (ctx_server.impl->mctx != nullptr) {
response->set_media_marker(get_media_marker());
}

// Check if chat templates are initialized
if (ctx_server.impl->chat_params.tmpls == nullptr) {
// If templates are not initialized, we can't detect thinking support
Expand Down
29 changes: 25 additions & 4 deletions core/backend/llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/core/services/galleryop"
"github.com/mudler/LocalAI/core/templates"
"github.com/mudler/LocalAI/core/trace"

"github.com/mudler/LocalAI/core/gallery"
Expand Down Expand Up @@ -94,15 +95,25 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
return nil, err
}

// Detect thinking support after model load (only if not already detected)
// This needs to happen after LoadModel succeeds so the backend can render templates
if (c.ReasoningConfig.DisableReasoning == nil && c.ReasoningConfig.DisableReasoningTagPrefill == nil) && c.TemplateConfig.UseTokenizerTemplate {
// Probe the backend for model-scoped metadata after LoadModel succeeds.
// Two signals are captured: thinking-mode detection (only meaningful when the
// tokenizer template path is active) and the multimodal media marker (needed
// by custom chat templates so markers line up with what mtmd expects).
// We probe whenever any of those slots is still empty.
needsThinkingProbe := c.TemplateConfig.UseTokenizerTemplate &&
c.ReasoningConfig.DisableReasoning == nil &&
c.ReasoningConfig.DisableReasoningTagPrefill == nil
needsMarkerProbe := c.MediaMarker == ""
if needsThinkingProbe || needsMarkerProbe {
modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath)
config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
// Update the config in the loader so it persists for future requests
cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) {
cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
if c.MediaMarker != "" {
cfg.MediaMarker = c.MediaMarker
}
})
}

Expand All @@ -121,7 +132,17 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
for k, v := range metadata {
opts.Metadata[k] = v
}
opts.Prompt = s
// The prompt was rendered with the sentinel "<__media__>" marker because
// middleware templating runs before the backend is loaded and probed.
// Once we know the backend's actual media marker, substitute so marker
// count matches the bitmap count passed through opts.Images/Videos/Audios.
// No-op when MediaMarker is unset, matches the sentinel, or the prompt has
// no media placeholders.
prompt := s
if c.MediaMarker != "" && c.MediaMarker != templates.DefaultMultiMediaMarker {
prompt = strings.ReplaceAll(prompt, templates.DefaultMultiMediaMarker, c.MediaMarker)
}
opts.Prompt = prompt
opts.Messages = protoMessages
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
opts.Images = images
Expand Down
23 changes: 20 additions & 3 deletions core/config/gguf.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
// if the model supports thinking mode and if the template ends with a thinking start token.
// This should be called after the model is loaded.
// The results are stored in cfg.ReasoningConfig.DisableReasoning and cfg.ReasoningConfig.DisableReasoningTagPrefill.
// The backend-reported multimodal marker is also captured into cfg.MediaMarker.
func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, backendClient grpc.Backend, modelOptions *pb.ModelOptions) {
if backendClient == nil {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: backend client is nil, skipping detection")
Expand All @@ -95,9 +96,10 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac
return
}

// Only detect for llama-cpp backend when using tokenizer templates
if cfg.Backend != "llama-cpp" || !cfg.TemplateConfig.UseTokenizerTemplate {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend, "useTokenizerTemplate", cfg.TemplateConfig.UseTokenizerTemplate)
// Only llama-cpp exposes ModelMetadata today. Other backends would either return
// an error or an empty response, so skip the call entirely for them.
if cfg.Backend != "llama-cpp" {
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: skipping detection", "backend", cfg.Backend)
return
}

Expand All @@ -108,6 +110,21 @@ func DetectThinkingSupportFromBackend(ctx context.Context, cfg *ModelConfig, bac
}

if metadata != nil {
// The multimodal media marker is backend-controlled (llama.cpp may pick a
// random per-server string). Empty means "no mtmd context" — Go falls back
// to templates.DefaultMultiMediaMarker at render time.
if metadata.MediaMarker != "" {
cfg.MediaMarker = metadata.MediaMarker
xlog.Debug("[gguf] DetectThinkingSupportFromBackend: media marker captured", "marker", metadata.MediaMarker)
}

// Thinking / tool-format detection only applies when we rely on the
// backend-side tokenizer template — otherwise the rendered-template based
// heuristics below aren't meaningful.
if !cfg.TemplateConfig.UseTokenizerTemplate {
return
}

cfg.ReasoningConfig.DisableReasoning = ptr.To(!metadata.SupportsThinking)

// Use the rendered template to detect if thinking token is at the end
Expand Down
6 changes: 6 additions & 0 deletions core/config/model_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ type ModelConfig struct {
ResponseFormat string `yaml:"-" json:"-"`
ResponseFormatMap map[string]any `yaml:"-" json:"-"`

// MediaMarker is the runtime-discovered multimodal marker the backend expects
// in the prompt (e.g. "<__media__>" or a random "<__media_<rand>__>" picked by
// llama.cpp). Populated on first successful ModelMetadata call. Empty until
// then — callers must fall back to templates.DefaultMultiMediaMarker.
MediaMarker string `yaml:"-" json:"-"`

FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"`
ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"`

Expand Down
2 changes: 1 addition & 1 deletion core/http/endpoints/openai/realtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -1179,7 +1179,7 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation,
nrOfImgsInMessage++
}
}
if nrOfImgsInMessage > 0 {
if nrOfImgsInMessage > 0 && !config.TemplateConfig.UseTokenizerTemplate {
templated, err := templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
TotalImages: imgIndex,
ImagesInMessage: nrOfImgsInMessage,
Expand Down
6 changes: 4 additions & 2 deletions core/http/endpoints/openresponses/responses.go
Original file line number Diff line number Diff line change
Expand Up @@ -709,8 +709,10 @@ func convertORMessageItem(itemMap map[string]any, cfg *config.ModelConfig) (sche
msg.StringVideos = stringVideos
msg.StringAudios = stringAudios

// Template multimodal content
if len(stringImages) > 0 || len(stringVideos) > 0 || len(stringAudios) > 0 {
// Template multimodal content. Skipped when the backend handles templating
// itself (UseTokenizerTemplate) — it also injects markers server-side and
// StringContent is not consumed by the evaluator in that path.
if (len(stringImages) > 0 || len(stringVideos) > 0 || len(stringAudios) > 0) && !cfg.TemplateConfig.UseTokenizerTemplate {
msg.StringContent, _ = templates.TemplateMultiModal(cfg.TemplateConfig.Multimodal, templates.MultiModalOptions{
TotalImages: len(stringImages),
TotalVideos: len(stringVideos),
Expand Down
25 changes: 17 additions & 8 deletions core/http/middleware/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,14 +398,23 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema.
}
}

input.Messages[i].StringContent, _ = templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
TotalImages: imgIndex,
TotalVideos: vidIndex,
TotalAudios: audioIndex,
ImagesInMessage: nrOfImgsInMessage,
VideosInMessage: nrOfVideosInMessage,
AudiosInMessage: nrOfAudiosInMessage,
}, textContent)
// When the backend handles templating itself (UseTokenizerTemplate),
// it also injects media markers server-side (see
// oaicompat_chat_params_parse in llama.cpp). Emitting our own markers
// here would double-mark them and downstream consumers ignore
// StringContent in that path anyway, so just pass through plain text.
if config.TemplateConfig.UseTokenizerTemplate {
input.Messages[i].StringContent = textContent
} else {
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
TotalImages: imgIndex,
TotalVideos: vidIndex,
TotalAudios: audioIndex,
ImagesInMessage: nrOfImgsInMessage,
VideosInMessage: nrOfVideosInMessage,
AudiosInMessage: nrOfAudiosInMessage,
}, textContent)
}
}
}

Expand Down
17 changes: 15 additions & 2 deletions core/templates/multimodal.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,21 @@ type MultimodalContent struct {
ID int
}

// https://github.com/ggml-org/llama.cpp/blob/be1d4a13db26750fac702ceb3af88ae4f39dc9f4/tools/mtmd/mtmd.h#L42
// from <__image__> to <__media__> https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83
const (
	// DefaultMultiMediaMarker is the sentinel marker LocalAI emits in the rendered
	// prompt for each image/audio item. It matches llama.cpp's historical
	// mtmd_default_marker() ("<__media__>"). llama.cpp's server now picks a random
	// per-server marker (see PR #21962) and reports it via ModelMetadataResponse.media_marker;
	// callers substitute this sentinel with the backend-reported marker right before
	// the gRPC call (core/backend/llm.go).
	DefaultMultiMediaMarker = "<__media__>"

	// DefaultMultiModalTemplate renders a per-message media-marker prefix followed
	// by the text content: one DefaultMultiMediaMarker per audio item, one per
	// image, then a "[vid-<ID>]" tag per video, then the text. The sentinel marker
	// is substituted late, so this template does not need to know the
	// backend-specific marker.
	//
	// The template is assembled from DefaultMultiMediaMarker rather than repeating
	// the literal, so the marker emitted here can never drift from the sentinel
	// that the late substitution searches for.
	//
	// References:
	// - https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83
	// - https://github.com/ggml-org/llama.cpp/pull/21962
	DefaultMultiModalTemplate = "{{ range .Audio }}" + DefaultMultiMediaMarker + "{{end}}" +
		"{{ range .Images }}" + DefaultMultiMediaMarker + "{{end}}" +
		"{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"
)

func TemplateMultiModal(templateString string, opts MultiModalOptions, text string) (string, error) {
Expand Down
Loading