Skip to content

Commit 408225b

Browse files
authored
server: use random media marker (ggml-org#21962)
* server: use random media marker
* nits
* remove legacy <__image__> token
* revert special char in random
1 parent b3d7587 commit 408225b

5 files changed

Lines changed: 17 additions & 11 deletions

File tree

tools/mtmd/mtmd.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ mtmd_context_params mtmd_context_params_default() {
109109
/* use_gpu */ true,
110110
/* print_timings */ true,
111111
/* n_threads */ 4,
112-
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
112+
/* image_marker */ nullptr,
113113
/* media_marker */ mtmd_default_marker(),
114114
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
115115
/* warmup */ true,
@@ -169,7 +169,7 @@ struct mtmd_context {
169169
media_marker (ctx_params.media_marker),
170170
n_embd_text (llama_model_n_embd_inp(text_model))
171171
{
172-
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
172+
if (ctx_params.image_marker != nullptr) {
173173
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
174174
}
175175

@@ -584,9 +584,6 @@ struct mtmd_tokenizer {
584584
parse_special = text->parse_special;
585585
input_text = text->text;
586586
vocab = llama_model_get_vocab(ctx->text_model);
587-
588-
// for compatibility, we convert image marker to media marker
589-
string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
590587
}
591588

592589
int32_t tokenize(mtmd_input_chunks * output) {

tools/mtmd/mtmd.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,6 @@
4646
# define MTMD_API
4747
#endif
4848

49-
// deprecated marker, use mtmd_default_marker() instead
50-
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
51-
5249
#ifdef __cplusplus
5350
extern "C" {
5451
#endif

tools/server/server-common.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,14 @@ std::string gen_tool_call_id() {
8484
return random_string();
8585
}
8686

87+
static std::string media_marker = "";
88+
const char * get_media_marker() {
89+
if (media_marker.empty()) {
90+
media_marker = "<__media_" + random_string() + "__>";
91+
}
92+
return media_marker.c_str();
93+
}
94+
8795
//
8896
// lora utils
8997
//
@@ -975,7 +983,7 @@ json oaicompat_chat_params_parse(
975983
handle_media(out_files, image_url, opt.media_path);
976984

977985
p["type"] = "media_marker";
978-
p["text"] = mtmd_default_marker();
986+
p["text"] = get_media_marker();
979987
p.erase("image_url");
980988

981989
} else if (type == "input_audio") {
@@ -996,7 +1004,7 @@ json oaicompat_chat_params_parse(
9961004
// TODO: add audio_url support by reusing handle_media()
9971005

9981006
p["type"] = "media_marker";
999-
p["text"] = mtmd_default_marker();
1007+
p["text"] = get_media_marker();
10001008
p.erase("input_audio");
10011009

10021010
} else if (type != "text") {
@@ -1460,7 +1468,7 @@ json convert_transcriptions_to_chatcmpl(
14601468
if (!language.empty()) {
14611469
prompt += string_format(" (language: %s)", language.c_str());
14621470
}
1463-
prompt += mtmd_default_marker();
1471+
prompt += get_media_marker();
14641472

14651473
json chatcmpl_body = inp_body; // copy all fields
14661474
chatcmpl_body["messages"] = json::array({

tools/server/server-common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ std::string random_string();
9292
std::string gen_chatcmplid();
9393
std::string gen_tool_call_id();
9494

95+
// get a random marker; note: each time the server restarts, the marker will be different
96+
const char * get_media_marker();
97+
9598
//
9699
// lora utils
97100
//

tools/server/server-context.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,6 +708,7 @@ struct server_context_impl {
708708
mparams.warmup = params_base.warmup;
709709
mparams.image_min_tokens = params_base.image_min_tokens;
710710
mparams.image_max_tokens = params_base.image_max_tokens;
711+
mparams.media_marker = get_media_marker();
711712

712713
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
713714
if (mctx == nullptr) {

0 commit comments

Comments
 (0)