Skip to content

Commit 5577927

Browse files
Merge pull request #569 from janhq/update-dev-from-master-2026-06-24-01-08
Sync master with upstream release b9775
2 parents 72f26d9 + be4a6a6 commit 5577927

54 files changed

Lines changed: 2267 additions & 1155 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# ggml-org/ggml-rpc : rgerganov
1111
# ggml-org/ggml-sycl : arthw
1212
# ggml-org/ggml-vulkan : 0cc4m, jeffbolznv
13-
# ggml-org/ggml-webgpu : reeselevine
13+
# ggml-org/ggml-webgpu : reeselevine, yomaytk
1414
# ggml-org/ggml-zdnn : taronaeo
1515
# ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
1616
# ggml-org/llama-mtmd : ngxson

common/arg.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,8 @@ static handle_model_result common_params_handle_model(struct common_params_model
301301
const common_download_opts & opts) {
302302
handle_model_result result;
303303

304+
// TODO @ngxson : refactor this into a new common_model_download_context
305+
304306
if (!model.docker_repo.empty()) {
305307
model.path = common_docker_resolve_model(model.docker_repo);
306308
} else if (!model.hf_repo.empty()) {
@@ -396,7 +398,7 @@ static bool parse_bool_value(const std::string & value) {
396398
// CLI argument parsing functions
397399
//
398400

399-
bool common_params_handle_models(common_params & params, llama_example curr_ex, common_download_callback * callback) {
401+
bool common_params_handle_models(common_params & params, llama_example curr_ex, const common_params_handle_models_params & handle_params) {
400402
const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
401403
params.speculative.types.end(),
402404
COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
@@ -407,9 +409,10 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex,
407409
opts.skip_download = params.skip_download;
408410
opts.download_mtp = spec_type_draft_mtp;
409411
opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
412+
opts.preset_only = handle_params.preset_only;
410413

411-
if (callback) {
412-
opts.callback = callback;
414+
if (handle_params.callback) {
415+
opts.callback = handle_params.callback;
413416
}
414417

415418
// sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
@@ -596,7 +599,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
596599

597600
if (!skip_model_download) {
598601
// handle model and download
599-
common_params_handle_models(params, ctx_arg.ex);
602+
common_params_handle_models(params, ctx_arg.ex, {});
600603

601604
// model is required (except for server)
602605
// TODO @ngxson : maybe show a list of available models in CLI in this case

common/arg.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,14 +130,19 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
130130
// see: https://github.com/ggml-org/llama.cpp/issues/18163
131131
void common_params_add_preset_options(std::vector<common_arg> & args);
132132

133+
struct common_params_handle_models_params {
134+
common_download_callback * callback = nullptr;
135+
bool preset_only = false; // if true, only check & download remote preset (for router mode)
136+
};
137+
133138
// populate model paths (main model, mmproj, etc) from -hf if necessary
134139
// return true if the model is ready to use
135140
// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
136141
// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed)
137142
bool common_params_handle_models(
138143
common_params & params,
139144
llama_example curr_ex,
140-
common_download_callback * callback = nullptr);
145+
const common_params_handle_models_params & handle_params);
141146

142147
// initialize argument parser context - used by test-arg-parser and preset
143148
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

common/chat.cpp

Lines changed: 99 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -90,41 +90,93 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
9090
return text;
9191
}
9292

93-
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
94-
if (delims.empty() || prompt.empty()) {
95-
return {};
93+
common_chat_role common_chat_role_from_string(const std::string & role) {
94+
if (role == "system") { return COMMON_CHAT_ROLE_SYSTEM; }
95+
if (role == "assistant") { return COMMON_CHAT_ROLE_ASSISTANT; }
96+
if (role == "user") { return COMMON_CHAT_ROLE_USER; }
97+
if (role == "tool") { return COMMON_CHAT_ROLE_TOOL; }
98+
return COMMON_CHAT_ROLE_UNKNOWN;
99+
}
100+
101+
const char * common_chat_role_to_string(common_chat_role role) {
102+
switch (role) {
103+
case COMMON_CHAT_ROLE_SYSTEM: return "system";
104+
case COMMON_CHAT_ROLE_ASSISTANT: return "assistant";
105+
case COMMON_CHAT_ROLE_USER: return "user";
106+
case COMMON_CHAT_ROLE_TOOL: return "tool";
107+
case COMMON_CHAT_ROLE_UNKNOWN: return "";
96108
}
109+
return "";
110+
}
97111

98-
auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
99-
std::vector<std::string> all_delims;
100-
std::vector<common_peg_parser> tagged_messages;
112+
json common_chat_msg_delimiters::to_json() const {
113+
json result = json::array();
114+
for (const auto & d : delimiters) {
115+
result.push_back({
116+
{ "role", common_chat_role_to_string(d.role) },
117+
{ "delimiter", d.delimiter },
118+
});
119+
}
120+
return result;
121+
}
101122

102-
all_delims.reserve(delims.size());
103-
tagged_messages.reserve(delims.size());
104-
for (const auto & d : delims) {
105-
all_delims.push_back(d.delimiter);
106-
}
123+
common_chat_msg_delimiters common_chat_msg_delimiters_parse(const json & delimiters) {
124+
common_chat_msg_delimiters result;
107125

108-
auto any_delim = p.until_one_of(all_delims);
109-
for (const auto & d : delims) {
110-
tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
126+
if (!delimiters.is_array()) {
127+
return result;
128+
}
129+
130+
result.delimiters.reserve(delimiters.size());
131+
for (const auto & d : delimiters) {
132+
if (!d.is_object()) {
133+
continue;
111134
}
135+
result.delimiters.push_back({
136+
common_chat_role_from_string(d.value("role", std::string())),
137+
d.value("delimiter", std::string()),
138+
});
139+
}
112140

113-
return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
114-
});
141+
return result;
142+
}
115143

116-
common_peg_parse_context ctx(prompt);
117-
const auto result = parser.parse(ctx);
118-
if (!result.success()) {
119-
return {};
144+
void common_chat_msg_delimiters::tokenize(const llama_vocab * vocab) {
145+
for (auto & d : delimiters) {
146+
d.tokens = common_tokenize(vocab, d.delimiter, false, true);
120147
}
148+
}
121149

122-
std::vector<common_chat_msg_span> spans;
123-
ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
124-
if (!node.tag.empty()) {
125-
spans.push_back({ node.tag, node.start, node.end - node.start });
150+
common_chat_msg_spans common_chat_msg_delimiters::split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips) const {
151+
std::vector<std::pair<common_chat_role, size_t>> matches;
152+
153+
auto skip = skips.begin();
154+
for (size_t i = 0; i < tokens.size();) {
155+
if (skip != skips.end() && i == skip->first) {
156+
i += skip->second;
157+
++skip;
158+
continue;
126159
}
127-
});
160+
for (const auto & d : delimiters) {
161+
if (i + d.tokens.size() > tokens.size()) {
162+
continue;
163+
}
164+
if (std::equal(d.tokens.begin(), d.tokens.end(), tokens.begin() + i)) {
165+
matches.emplace_back(d.role, i);
166+
break;
167+
}
168+
}
169+
i++;
170+
}
171+
172+
matches.emplace_back(COMMON_CHAT_ROLE_UNKNOWN, tokens.size());
173+
174+
common_chat_msg_spans spans;
175+
for (size_t i = 0; i + 1 < matches.size(); i++) {
176+
const auto & curr = matches[i];
177+
const auto & next = matches[i + 1];
178+
spans.add(curr.first, curr.second, next.second - curr.second);
179+
}
128180

129181
return spans;
130182
}
@@ -1081,13 +1133,13 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
10811133

10821134
data.prompt = prompt;
10831135
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
1084-
data.message_spans = common_chat_split_by_role(prompt, {
1085-
{ "assistant", "<|start|>assistant" },
1086-
{ "user", "<|start|>user" },
1087-
{ "system", "<|start|>developer" },
1088-
{ "system", "<|start|>system" },
1089-
{ "tool", "<|start|>functions" },
1090-
});
1136+
data.message_delimiters = {
1137+
{ COMMON_CHAT_ROLE_ASSISTANT, "<|start|>assistant" },
1138+
{ COMMON_CHAT_ROLE_USER, "<|start|>user" },
1139+
{ COMMON_CHAT_ROLE_SYSTEM, "<|start|>developer" },
1140+
{ COMMON_CHAT_ROLE_SYSTEM, "<|start|>system" },
1141+
{ COMMON_CHAT_ROLE_TOOL, "<|start|>functions" },
1142+
};
10911143

10921144
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
10931145
data.supports_thinking = true;
@@ -1228,10 +1280,10 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
12281280
data.prompt += data.generation_prompt;
12291281
}
12301282

1231-
data.message_spans = common_chat_split_by_role(data.prompt, {
1232-
{ "user", "<|turn>user\n" },
1233-
{ "assistant", "<|turn>model\n" },
1234-
});
1283+
data.message_delimiters = {
1284+
{ COMMON_CHAT_ROLE_USER, "<|turn>user" },
1285+
{ COMMON_CHAT_ROLE_ASSISTANT, "<|turn>model" },
1286+
};
12351287

12361288
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
12371289
data.supports_thinking = true;
@@ -2030,15 +2082,15 @@ static common_chat_params common_chat_params_init_cohere2moe(const common_chat_t
20302082
RESULT_START, RESULT_END,
20312083
};
20322084

2033-
// Split the rendered prompt into per-role message spans. Tool results are rendered with the
2085+
// Declare per-role message delimiters. Tool results are rendered with the
20342086
// system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before
20352087
// the plain "system" one (it is a strict superset, and the role split tries delimiters in order).
2036-
data.message_spans = common_chat_split_by_role(data.prompt, {
2037-
{ "assistant", GEN_PREFIX },
2038-
{ "user", TURN_START + USER },
2039-
{ "tool", TURN_START + SYSTEM + RESULT_START },
2040-
{ "system", TURN_START + SYSTEM },
2041-
});
2088+
data.message_delimiters = {
2089+
{ COMMON_CHAT_ROLE_ASSISTANT, GEN_PREFIX },
2090+
{ COMMON_CHAT_ROLE_USER, TURN_START + USER },
2091+
{ COMMON_CHAT_ROLE_TOOL, TURN_START + SYSTEM + RESULT_START },
2092+
{ COMMON_CHAT_ROLE_SYSTEM, TURN_START + SYSTEM },
2093+
};
20422094

20432095
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
20442096
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
@@ -2526,17 +2578,15 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
25262578
autoparser.analyze_template(tmpl);
25272579
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
25282580

2529-
std::vector<common_chat_msg_delimiter> delimiters;
2581+
common_chat_msg_delimiters delimiters;
25302582
if (!autoparser.assistant_start.empty()) {
2531-
delimiters.push_back({ "assistant", autoparser.assistant_start });
2583+
delimiters.add(COMMON_CHAT_ROLE_ASSISTANT, autoparser.assistant_start);
25322584
}
25332585
if (!autoparser.user_start.empty()) {
2534-
delimiters.push_back({ "user", autoparser.user_start });
2586+
delimiters.add(COMMON_CHAT_ROLE_USER, autoparser.user_start);
25352587
}
25362588

2537-
if (!delimiters.empty()) {
2538-
auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
2539-
}
2589+
auto_params.message_delimiters = std::move(delimiters);
25402590

25412591
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
25422592
if (auto_params.supports_thinking) {

common/chat.h

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -143,15 +143,75 @@ struct common_chat_msg_diff {
143143
}
144144
};
145145

146+
enum common_chat_role {
147+
COMMON_CHAT_ROLE_UNKNOWN,
148+
COMMON_CHAT_ROLE_SYSTEM,
149+
COMMON_CHAT_ROLE_ASSISTANT,
150+
COMMON_CHAT_ROLE_USER,
151+
COMMON_CHAT_ROLE_TOOL
152+
};
153+
154+
common_chat_role common_chat_role_from_string(const std::string & role);
155+
const char * common_chat_role_to_string(common_chat_role role);
156+
146157
struct common_chat_msg_span {
147-
std::string role;
158+
common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
148159
std::size_t pos = 0;
149160
std::size_t len = 0;
161+
162+
bool valid() const {
163+
return role != COMMON_CHAT_ROLE_UNKNOWN;
164+
}
165+
};
166+
167+
struct common_chat_msg_spans {
168+
std::vector<common_chat_msg_span> spans;
169+
170+
void add(common_chat_role role, size_t pos, size_t len) {
171+
spans.push_back({ role, pos, len });
172+
}
173+
174+
bool is_user_start(int32_t pos) const {
175+
for (auto it = spans.begin(); it != spans.end(); ++it) {
176+
if (it->role == COMMON_CHAT_ROLE_USER && pos == (int32_t) it->pos) {
177+
return true;
178+
}
179+
}
180+
return false;
181+
}
182+
183+
int32_t last_user_message_pos() const {
184+
for (auto it = spans.rbegin(); it != spans.rend(); ++it) {
185+
if (it->role == COMMON_CHAT_ROLE_USER) {
186+
return (int32_t) it->pos;
187+
}
188+
}
189+
return -1;
190+
}
150191
};
151192

152193
struct common_chat_msg_delimiter {
153-
std::string role;
154-
std::string delimiter;
194+
common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
195+
std::string delimiter;
196+
llama_tokens tokens = {};
197+
};
198+
199+
struct common_chat_msg_delimiters {
200+
std::vector<common_chat_msg_delimiter> delimiters;
201+
202+
common_chat_msg_delimiters() = default;
203+
common_chat_msg_delimiters(std::initializer_list<common_chat_msg_delimiter> delims) : delimiters(delims) {}
204+
205+
void add(common_chat_role role, const std::string & delimiter) {
206+
delimiters.push_back({ role, delimiter });
207+
}
208+
209+
void tokenize(const llama_vocab * vocab);
210+
211+
// split tokens into message spans. skips maps a start index to a length of a region to jump over without matching
212+
common_chat_msg_spans split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips = {}) const;
213+
214+
nlohmann::ordered_json to_json() const;
155215
};
156216

157217
struct common_chat_tool {
@@ -219,7 +279,7 @@ struct common_chat_params {
219279
std::vector<std::string> preserved_tokens;
220280
std::vector<std::string> additional_stops;
221281
std::string parser;
222-
std::vector<common_chat_msg_span> message_spans;
282+
common_chat_msg_delimiters message_delimiters;
223283
};
224284

225285
// per-message parsing syntax
@@ -325,5 +385,4 @@ struct common_chat_prompt_preset {
325385

326386
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
327387

328-
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
329-
388+
common_chat_msg_delimiters common_chat_msg_delimiters_parse(const nlohmann::ordered_json & delimiters);

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,7 @@ struct common_params {
609609
bool cache_prompt = true; // whether to enable prompt caching
610610
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
611611
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
612-
int32_t checkpoint_min_step = 256; // minimum spacing between context checkpoints
612+
int32_t checkpoint_min_step = 8192; // minimum spacing between context checkpoints
613613
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
614614

615615
std::string hostname = "127.0.0.1";

common/download.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -799,14 +799,16 @@ common_download_model_result common_download_model(const common_params_model &
799799

800800
bool download_mmproj = opts.download_mmproj;
801801
bool download_mtp = opts.download_mtp;
802+
bool preset_only = opts.preset_only;
802803
bool is_hf = !model.hf_repo.empty();
803804

804805
if (is_hf) {
805806
hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
806807
if (!hf.preset.path.empty()) {
807808
// if preset.ini exists, only download that file alone
808809
tasks.push_back({hf.preset.url, hf.preset.local_path});
809-
} else {
810+
} else if (!preset_only) {
811+
// only add other files if we're NOT in preset-only mode (normal run, non-router)
810812
for (const auto & f : hf.model_files) {
811813
tasks.push_back({f.url, f.local_path});
812814
}

common/download.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ struct common_download_opts {
5555
bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
5656
bool download_mmproj = false;
5757
bool download_mtp = false;
58+
bool preset_only = false; // if true, only check & download remote preset (for router mode)
5859
common_download_callback * callback = nullptr;
5960
};
6061

0 commit comments

Comments
 (0)