mtmd, server, common: expose modalities to /v1/models (#22952)

ngxson · web-flow · commit 7bfe120c210c · 2026-05-12T19:08:07.000+02:00
* mtmd, server, common: expose modalities to /v1/models

* fix build

* rename to mtmd_caps
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -435,6 +435,25 @@ static bool parse_bool_value(const std::string & value) {
 // CLI argument parsing functions
 //
 
+void common_params_handle_models(common_params & params, llama_example curr_ex) {
+    auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
+    if (params.no_mmproj) {
+        params.mmproj = {};
+    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+        // optionally, handle mmproj model when -hf is specified
+        params.mmproj = res.mmproj;
+    }
+    // only download mmproj if the current example is using it
+    for (const auto & ex : mmproj_examples) {
+        if (curr_ex == ex) {
+            common_params_handle_model(params.mmproj,    params.hf_token, params.offline);
+            break;
+        }
+    }
+    common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
+    common_params_handle_model(params.vocoder.model,             params.hf_token, params.offline);
+}
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     common_params & params = ctx_arg.params;
 
@@ -588,22 +607,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     if (!skip_model_download) {
-        auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
-        if (params.no_mmproj) {
-            params.mmproj = {};
-        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-            // optionally, handle mmproj model when -hf is specified
-            params.mmproj = res.mmproj;
-        }
-        // only download mmproj if the current example is using it
-        for (const auto & ex : mmproj_examples) {
-            if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj,    params.hf_token, params.offline);
-                break;
-            }
-        }
-        common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
-        common_params_handle_model(params.vocoder.model,             params.hf_token, params.offline);
+        common_params_handle_models(params, ctx_arg.ex);
     }
 
     // model is required (except for server)
diff --git a/common/arg.h b/common/arg.h
@@ -129,5 +129,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);
 
+// Populate model paths (main model, mmproj, etc) from -hf if necessary
+void common_params_handle_models(common_params & params, llama_example curr_ex);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
diff --git a/common/preset.cpp b/common/preset.cpp
@@ -163,8 +163,13 @@ void common_preset::merge(const common_preset & other) {
     }
 }
 
-void common_preset::apply_to_params(common_params & params) const {
+void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {
     for (const auto & [opt, val] : options) {
+        if (!handled_keys.empty()) {
+            if (!opt.env || handled_keys.find(opt.env) == handled_keys.end()) {
+                continue;
+            }
+        }
         // apply each option to params
         if (opt.handler_string) {
             opt.handler_string(params, val);
diff --git a/common/preset.h b/common/preset.h
@@ -43,7 +43,8 @@ struct common_preset {
     void merge(const common_preset & other);
 
     // apply preset options to common_params
-    void apply_to_params(common_params & params) const;
+    // optionally specify handled_keys to apply only a subset of options (identified by their env name); if handled_keys is empty, all options are applied
+    void apply_to_params(common_params & params, const std::set<std::string> & handled_keys = std::set<std::string>()) const;
 };
 
 // interface for multiple presets in one file
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -994,7 +994,7 @@ struct clip_model_loader {
     bool has_audio  = false;
 
     // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
-    clip_model_loader(const char * fname) : fname(fname) {
+    clip_model_loader(const char * fname, bool skip_tensors = false) : fname(fname) {
         struct ggml_context * meta = nullptr;
 
         struct gguf_init_params params = {
@@ -1040,7 +1040,7 @@ struct clip_model_loader {
         }
 
         // tensors
-        {
+        if (!skip_tensors) {
             for (int i = 0; i < n_tensors; ++i) {
                 const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
                 const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
@@ -2927,6 +2927,14 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
     return {ctx_vision, ctx_audio};
 }
 
+struct clip_cap clip_get_cap(const char * fname) {
+    clip_cap res;
+    clip_model_loader loader(fname, /* skip_tensors= */ true);
+    res.has_vision = loader.has_vision;
+    res.has_audio  = loader.has_audio;
+    return res;
+}
+
 struct clip_image_size * clip_image_size_init() {
     struct clip_image_size * load_image_size = new struct clip_image_size();
     load_image_size->width = 448;
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
@@ -116,3 +116,9 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
+
+struct clip_cap {
+    bool has_vision;
+    bool has_audio;
+};
+struct clip_cap clip_get_cap(const char * fname);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -1423,6 +1423,19 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback_user_data = user_data;
 }
 
+struct mtmd_caps mtmd_get_cap_from_file(const char * fname) {
+    try {
+        auto tmp = clip_get_cap(fname);
+        mtmd_caps cap;
+        cap.inp_audio  = tmp.has_audio;
+        cap.inp_vision = tmp.has_vision;
+        return cap;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to get capabilities from file '%s': %s\n", __func__, fname, e.what());
+        return mtmd_caps{ false, false };
+    }
+}
+
 //
 // Debugging API (NOT intended for public use)
 //
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
@@ -244,6 +244,14 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 // If this is not called, or NULL is supplied, everything is output on stderr.
 MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
 
+// EXPERIMENTAL API to get mmproj's capabilities without initializing the full context
+// This is only intended to be used by llama-server; breaking changes are expected
+struct mtmd_caps {
+    bool inp_vision;
+    bool inp_audio;
+};
+MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+
 /////////////////////////////////////////
 
 // test function, to be used in test-mtmd-c-api.c
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
@@ -161,6 +161,30 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
     args = preset.to_args(bin_path);
 }
 
+void server_model_meta::update_caps() {
+    try {
+        common_params params;
+        preset.apply_to_params(params, {
+            "LLAMA_ARG_MODEL",
+            "LLAMA_ARG_MODEL_URL",
+            "LLAMA_ARG_MMPROJ",
+            "LLAMA_ARG_MMPROJ_URL",
+            "LLAMA_ARG_HF_REPO",
+            "LLAMA_ARG_HF_REPO_FILE",
+        });
+        params.offline = true; // avoid any unwanted network call during capability detection
+        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
+        if (params.mmproj.path.empty()) {
+            multimodal = { false, false };
+        } else {
+            multimodal = mtmd_get_cap_from_file(params.mmproj.path.c_str());
+        }
+    } catch (const std::exception & e) {
+        LOG_WRN("failed to initialize common_params for multimodal capability detection: %s\n", e.what());
+        multimodal = { false, false };
+    }
+}
+
 //
 // server_models
 //
@@ -236,6 +260,7 @@ void server_models::add_model(server_model_meta && meta) {
     }
 
     meta.update_args(ctx_preset, bin_path); // render args
+    meta.update_caps();
     std::string name = meta.name;
     mapping[name] = instance_t{
         /* subproc */ std::make_shared<subprocess_s>(),
@@ -346,8 +371,10 @@ void server_models::load_models() {
                 /* status       */ SERVER_MODEL_STATUS_UNLOADED,
                 /* last_used    */ 0,
                 /* args         */ std::vector<std::string>(),
+                /* loaded_info  */ {},
                 /* exit_code    */ 0,
                 /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+                /* multimodal   */ mtmd_caps{false, false},
             };
             add_model(std::move(meta));
         }
@@ -481,6 +508,7 @@ void server_models::load_models() {
 
             inst.meta.exit_code = 0; // clear failed state so the model can be reloaded
             inst.meta.update_args(ctx_preset, bin_path);
+            inst.meta.update_caps();
         }
 
         // add models that are new in this reload
@@ -496,8 +524,10 @@ void server_models::load_models() {
                     /* status       */ SERVER_MODEL_STATUS_UNLOADED,
                     /* last_used    */ 0,
                     /* args         */ std::vector<std::string>(),
+                    /* loaded_info  */ {},
                     /* exit_code    */ 0,
                     /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+                    /* multimodal   */ mtmd_caps{false, false},
                 };
                 add_model(std::move(meta));
                 newly_added.push_back(name);
@@ -1206,14 +1236,28 @@ void server_models_routes::init_routes() {
                 status["failed"]    = true;
             }
 
+            // pi coding agent multimodal compatibility
+            json input_modalities = json::array({"text"});
+            if (meta.multimodal.inp_vision) {
+                input_modalities.push_back("image");
+            }
+            if (meta.multimodal.inp_audio) {
+                input_modalities.push_back("audio");
+            }
+            json architecture {
+                {"input_modalities",  input_modalities},
+                {"output_modalities", json::array({"text"})},
+            };
+
             json model_info = json {
-                {"id",       meta.name},
-                {"aliases",  meta.aliases},
-                {"tags",     meta.tags},
-                {"object",   "model"},    // for OAI-compat
-                {"owned_by", "llamacpp"}, // for OAI-compat
-                {"created",  t},          // for OAI-compat
-                {"status",   status},
+                {"id",           meta.name},
+                {"aliases",      meta.aliases},
+                {"tags",         meta.tags},
+                {"object",       "model"},    // for OAI-compat
+                {"owned_by",     "llamacpp"}, // for OAI-compat
+                {"created",      t},          // for OAI-compat
+                {"status",       status},
+                {"architecture", architecture},
                 // TODO: add other fields, may require reading GGUF metadata
             };
 
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
@@ -66,6 +66,7 @@ struct server_model_meta {
     json loaded_info; // info to be reflected via /v1/models endpoint
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
+    mtmd_caps multimodal; // multimodal capabilities
 
     bool is_ready() const {
         return status == SERVER_MODEL_STATUS_LOADED;
@@ -80,6 +81,7 @@ struct server_model_meta {
     }
 
     void update_args(common_preset_context & ctx_presets, std::string bin_path);
+    void update_caps();
 };
 
 struct subprocess_s;

Original file line number	Diff line number	Diff line change
`@@ -163,8 +163,13 @@ void common_preset::merge(const common_preset & other) {`
`163`	`163`	`}`
`164`	`164`	`}`
`165`	`165`
`166`		`-void common_preset::apply_to_params(common_params & params) const {`
	`166`	`+void common_preset::apply_to_params(common_params & params, const std::set<std::string> & handled_keys) const {`
`167`	`167`	`for (const auto & [opt, val] : options) {`
	`168`	`+ if (!handled_keys.empty()) {`
	`169`	`+ if (!opt.env \|\| handled_keys.find(opt.env) == handled_keys.end()) {`
	`170`	`+ continue;`
	`171`	`+ }`
	`172`	`+ }`
`168`	`173`	`// apply each option to params`
`169`	`174`	`if (opt.handler_string) {`
`170`	`175`	`opt.handler_string(params, val);`