Skip to content

Commit f33fc77

Browse files
Merge pull request #487 from janhq/update-dev-from-master-2026-04-16-00-59
Sync master with upstream release b8808
2 parents d9959e8 + 408225b commit f33fc77

58 files changed

Lines changed: 2261 additions & 641 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/build-vulkan.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,4 +93,5 @@ jobs:
9393
export GGML_VK_DISABLE_F16=1
9494
export GGML_VK_DISABLE_COOPMAT=1
9595
# This is using llvmpipe and runs slower than other backends
96-
ctest -L main --verbose --timeout 4800
96+
# test-backend-ops is too slow on llvmpipe, skip it
97+
ctest -L main -E test-backend-ops --verbose --timeout 900

common/chat-auto-parser-generator.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,19 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
198198
args_field = format.function_field + "." + args_field;
199199
}
200200

201-
auto tools_parser = p.standard_json_tools(
202-
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
203-
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
204-
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
201+
auto tools_parser = p.eps();
202+
if (format.section_start.empty() && !format.per_call_start.empty()) {
203+
auto single_tool_parser = p.standard_json_tools(
204+
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
205+
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
206+
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
207+
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
208+
} else {
209+
tools_parser = p.standard_json_tools(
210+
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
211+
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
212+
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
213+
}
205214

206215
// Handle content wrappers if present
207216
if (ctx.content && ctx.content->is_always_wrapped()) {

common/chat-auto-parser.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,19 +308,23 @@ struct analyze_tools : analyze_base {
308308

309309
private:
310310
// Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
311-
void analyze_tool_calls(const analyze_reasoning & reasoning);
311+
void analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls);
312312

313313
// Analyze format based on position of function and argument name in needle
314314
void analyze_tool_call_format(const std::string & haystack,
315315
const std::string & fun_name_needle,
316316
const std::string & arg_name_needle,
317-
const analyze_reasoning & reasoning);
317+
const analyze_reasoning & reasoning,
318+
bool supports_parallel_tool_calls);
318319

319320
// Analyze specifics of JSON native format (entire tool call is a JSON object)
320321
void analyze_tool_call_format_json_native(const std::string & clean_haystack,
321322
const std::string & fun_name_needle,
322323
const std::string & arg_name_needle);
323324

325+
// Check whether parallel calls in JSON native format are array-wrapped or tag-wrapped
326+
void analyze_json_native_parallel_calls();
327+
324328
// Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
325329
void analyze_tool_call_format_non_json(const std::string & clean_haystack,
326330
const std::string & fun_name_needle);

common/chat-diff-analyzer.cpp

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
558558
: analyze_base(tmpl) {
559559
LOG_DBG(ANSI_ORANGE "Phase 3: Tool call analysis\n" ANSI_RESET);
560560

561-
analyze_tool_calls(reasoning);
561+
analyze_tool_calls(reasoning, caps.supports_parallel_tool_calls);
562562

563563
if (format.mode != tool_format::NONE && format.mode != tool_format::JSON_NATIVE) {
564564
if (caps.supports_parallel_tool_calls) {
@@ -577,7 +577,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
577577
}
578578
}
579579

580-
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
580+
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls) {
581581
json assistant_no_tools = json{
582582
{ "role", "assistant" },
583583
{ "content", ASSISTANT_MSG }
@@ -611,13 +611,14 @@ void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
611611
return;
612612
}
613613

614-
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning);
614+
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning, supports_parallel_tool_calls);
615615
}
616616

617617
void analyze_tools::analyze_tool_call_format(const std::string & haystack,
618618
const std::string & fun_name_needle,
619619
const std::string & arg_name_needle,
620-
const analyze_reasoning & reasoning) {
620+
const analyze_reasoning & reasoning,
621+
bool supports_parallel_tool_calls) {
621622
if (fun_name_needle.empty() || arg_name_needle.empty() || haystack.empty()) {
622623
return;
623624
}
@@ -660,6 +661,9 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
660661

661662
if (format.mode == tool_format::JSON_NATIVE) {
662663
analyze_tool_call_format_json_native(clean_haystack, fun_name_needle, arg_name_needle);
664+
if (supports_parallel_tool_calls) {
665+
analyze_json_native_parallel_calls();
666+
}
663667
} else {
664668
analyze_tool_call_format_non_json(clean_haystack, fun_name_needle);
665669
}
@@ -668,6 +672,42 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
668672
format.per_call_end = trim_whitespace(format.per_call_end);
669673
}
670674

675+
void analyze_tools::analyze_json_native_parallel_calls() {
676+
json assistant_one_tool = json{
677+
{ "role", "assistant" },
678+
{ "content", "" },
679+
{ "tool_calls", json::array({ first_tool_call }) }
680+
};
681+
682+
json assistant_two_tools = json{
683+
{ "role", "assistant" },
684+
{ "content", "" },
685+
{ "tool_calls", json::array({ first_tool_call, second_tool_call }) }
686+
};
687+
688+
template_params params;
689+
params.messages = json::array({ user_msg, assistant_one_tool });
690+
params.tools = tools;
691+
params.add_generation_prompt = false;
692+
params.enable_thinking = true;
693+
694+
auto comparison = compare_variants(
695+
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_two_tools }); });
696+
697+
if (!comparison) {
698+
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
699+
return;
700+
}
701+
702+
std::string & second_call = comparison->diff.right;
703+
if (!format.section_start.empty() && second_call.find(format.section_start) != std::string::npos) {
704+
format.per_call_start = format.section_start;
705+
format.per_call_end = format.section_end;
706+
format.section_start.clear();
707+
format.section_end.clear();
708+
}
709+
}
710+
671711
void analyze_tools::analyze_tool_call_format_json_native(const std::string & clean_haystack,
672712
const std::string & fun_name_needle,
673713
const std::string & arg_name_needle) {

common/chat-peg-parser.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -676,7 +676,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
676676
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
677677

678678
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
679-
literal("\"") + tool_name(literal(name)) + literal("\"");
679+
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
680680
auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
681681
tool_args(schema(json(), "tool-" + name + "-schema", params));
682682

@@ -744,7 +744,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
744744
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
745745

746746
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
747-
literal("\"") + tool_name(literal(name)) + literal("\"");
747+
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
748748
auto tool_args_ = args_key_parser + space() + literal(":") + space() +
749749
tool_args(schema(json(), "tool-" + name + "-schema", params));
750750

docs/build.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,12 @@ Use `GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` environment variable to force use FP16
281281

282282
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
283283

284+
### Peer Access
285+
286+
The environment variable `GGML_CUDA_P2P` can be set to enable peer-to-peer access between multiple GPUs, allowing them to transfer data directly rather than going through system memory.
287+
Requires driver support (usually restricted to workstation/datacenter GPUs).
288+
May cause crashes or corrupted outputs for some motherboards and BIOS settings (e.g. IOMMU).
289+
284290
### Performance Tuning
285291

286292
The following compilation options are also available to tweak performance:

docs/development/HOWTO-add-model.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,23 @@ Note:
130130
- Adding a model-specific API or CLI is an anti-pattern in `libmtmd`. The goal of `libmtmd` is to provide an easy-to-use, model-agnostic library for multimodal pipeline.
131131
- In most cases, `llama-mtmd-cli` should not be modified. If a model requires a specific prompt, either let the user provide it or bake it into the Jinja chat template.
132132

133+
## Tips and tricks
134+
135+
### Working with ggml_rope_ext
136+
137+
PyTorch implementations usually prefer explicitly calculating `freq_cis`/`sin`/`cos` components. However, in llama.cpp, most RoPE operations can be handled via `ggml_rope_ext`, which does not require a sin/cos matrix. This saves memory while allowing the GGML RoPE kernel to be fused with other ops.
138+
139+
However, since `ggml_rope_ext` only provides a subset of the RoPE implementations that models use, converting models from PyTorch to llama.cpp may require some creative adaptations.
140+
141+
For more information about `ggml_rope_ext`, please refer to the in-code documentation in `ggml.h`.
142+
143+
Examples:
144+
- `libmtmd` implements 2D RoPE with `GGML_ROPE_TYPE_NORMAL` ordering by splitting the input tensor in half, applying `ggml_rope_ext` separately to each half, then joining them back together using `ggml_concat`.
145+
- The [Kimi-K2.5](https://github.com/ggml-org/llama.cpp/pull/19170) vision encoder uses vision RoPE with interleaved frequencies. The weights must be permuted during conversion in order to reuse the `build_rope_2d()` function.
146+
- [Gemma 4](https://github.com/ggml-org/llama.cpp/pull/21309) uses "proportional" RoPE. We employ a trick where `rope_freqs` is set to a very large value in the last dimensions to prevent those dimensions from being rotated. See the `Gemma4Model` class in `convert_hf_to_gguf.py`.
147+
- Some models require scaling the input position. For example, `[0, 1, 2, ...]` becomes `[0, 0.5, 1, ...]`. In this case, you can provide the scaling via `freq_scale = 0.5f`.
148+
- Some models use learned RoPE frequencies instead of relying on `powf(freq_base, -2.0 * i / n_dims)`. In this case, you can provide the learned frequencies via the `rope_freqs` tensor (corresponding to the `c` argument in `ggml_rope_ext`), then set `freq_base = 1.0f`. An important note is that `rope_freqs` in GGML is the **inverse** (`theta = pos[i] / rope_freqs`), so you may need to invert `rope_freqs` during conversion.
149+
133150
## GGUF specification
134151

135152
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md

examples/diffusion/diffusion-cli.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
602602

603603
int n_input = input_tokens.size();
604604

605-
if (n_input >= params.n_ctx) {
606-
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
605+
if (static_cast<uint32_t>(n_input) >= llama_n_ctx(ctx)) {
606+
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, llama_n_ctx(ctx));
607607
llama_free(ctx);
608608
llama_model_free(model);
609609
return 1;

ggml/include/ggml-backend.h

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,11 @@ extern "C" {
202202

203203
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
204204

205-
// AllReduce operation for tensor parallelism (meta backend)
206-
typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
205+
// Context management and operations for faster communication between backends, used for tensor parallelism (meta backend)
206+
typedef void * (*ggml_backend_comm_init_t)(ggml_backend_t * backends, size_t n_backends);
207+
typedef void (*ggml_backend_comm_free_t)(void * comm_ctx);
208+
typedef bool (*ggml_backend_comm_allreduce_tensor_t)(void * comm_ctx, struct ggml_tensor ** tensors);
209+
207210
// Split buffer type for tensor parallelism (old)
208211
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
209212
// Set the number of threads for the backend
@@ -348,6 +351,53 @@ extern "C" {
348351
// Set a callback to be called for each resulting node during graph compute
349352
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
350353

354+
//
355+
// Meta backend
356+
//
357+
358+
#define GGML_BACKEND_META_MAX_DEVICES 16
359+
360+
enum ggml_backend_meta_split_axis {
361+
// tensor split by tensor dimensions:
362+
GGML_BACKEND_SPLIT_AXIS_0 = 0,
363+
GGML_BACKEND_SPLIT_AXIS_1 = 1,
364+
GGML_BACKEND_SPLIT_AXIS_2 = 2,
365+
GGML_BACKEND_SPLIT_AXIS_3 = 3,
366+
367+
GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
368+
GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
369+
370+
// for internal bookkeeping only:
371+
GGML_BACKEND_SPLIT_AXIS_NONE = 98,
372+
GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
373+
};
374+
GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
375+
376+
struct ggml_backend_meta_split_state {
377+
enum ggml_backend_meta_split_axis axis;
378+
379+
// for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
380+
// - each device has a slice of the tensor along the split axis
381+
// - most tensors have n_segments == 1 and a contiguous slice of the tensor data
382+
// - some tensors have an inhomogeneous data layout along the split axis,
383+
// those tensors are divided into segments which are each individually split across devices
384+
// - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
385+
// the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
386+
// - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
387+
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V
388+
int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
389+
uint32_t n_segments;
390+
};
391+
392+
// function to assign split states for statically allocated tensors; split states for compute tensors are then assigned to be compatible:
393+
typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
394+
395+
// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
396+
// TODO: this looks a bit strange - a backend API creates a device. I think we should try
397+
// to express this as backend registry functionality instead
398+
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
399+
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
400+
351401
//
352402
// Utils
353403
//

ggml/include/ggml-rpc.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
extern "C" {
77
#endif
88

9-
#define RPC_PROTO_MAJOR_VERSION 3
10-
#define RPC_PROTO_MINOR_VERSION 6
11-
#define RPC_PROTO_PATCH_VERSION 1
9+
#define RPC_PROTO_MAJOR_VERSION 4
10+
#define RPC_PROTO_MINOR_VERSION 0
11+
#define RPC_PROTO_PATCH_VERSION 0
1212

1313
#ifdef __cplusplus
1414
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");

0 commit comments

Comments
 (0)