janhq
diff --git a/‎.github/workflows/build-vulkan.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/build-vulkan.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎common/chat-auto-parser-generator.cpp‎
Lines changed: 13 additions & 4 deletions b/‎common/chat-auto-parser-generator.cpp‎
Lines changed: 13 additions & 4 deletions
diff --git a/‎common/chat-auto-parser.h‎
Lines changed: 6 additions & 2 deletions b/‎common/chat-auto-parser.h‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎common/chat-diff-analyzer.cpp‎
Lines changed: 44 additions & 4 deletions b/‎common/chat-diff-analyzer.cpp‎
Lines changed: 44 additions & 4 deletions
diff --git a/‎common/chat-peg-parser.cpp‎
Lines changed: 2 additions & 2 deletions b/‎common/chat-peg-parser.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/build.md‎
Lines changed: 6 additions & 0 deletions b/‎docs/build.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎docs/development/HOWTO-add-model.md‎
Lines changed: 17 additions & 0 deletions b/‎docs/development/HOWTO-add-model.md‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎examples/diffusion/diffusion-cli.cpp‎
Lines changed: 2 additions & 2 deletions b/‎examples/diffusion/diffusion-cli.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎ggml/include/ggml-backend.h‎
Lines changed: 52 additions & 2 deletions b/‎ggml/include/ggml-backend.h‎
Lines changed: 52 additions & 2 deletions
diff --git a/‎ggml/include/ggml-rpc.h‎
Lines changed: 3 additions & 3 deletions b/‎ggml/include/ggml-rpc.h‎
Lines changed: 3 additions & 3 deletions
@@ -93,4 +93,5 @@ jobs:
           export GGML_VK_DISABLE_F16=1
           export GGML_VK_DISABLE_COOPMAT=1
           # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 4800
+          # test-backend-ops is too slow on llvmpipe, skip it
+          ctest -L main -E test-backend-ops --verbose --timeout 900
@@ -198,10 +198,19 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
         args_field = format.function_field + "." + args_field;
     }
 
-    auto tools_parser = p.standard_json_tools(
-        format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
-        inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-        format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+    auto tools_parser = p.eps();
+    if (format.section_start.empty() && !format.per_call_start.empty()) {
+        auto single_tool_parser = p.standard_json_tools(
+            format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
+            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+        tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
+    } else {
+        tools_parser = p.standard_json_tools(
+            format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
+            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
+    }
 
     // Handle content wrappers if present
     if (ctx.content && ctx.content->is_always_wrapped()) {
 
@@ -308,19 +308,23 @@ struct analyze_tools : analyze_base {
 
   private:
     // Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
-    void analyze_tool_calls(const analyze_reasoning & reasoning);
+    void analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls);
 
     // Analyze format based on position of function and argument name in needle
     void analyze_tool_call_format(const std::string &       haystack,
                                   const std::string &       fun_name_needle,
                                   const std::string &       arg_name_needle,
-                                  const analyze_reasoning & reasoning);
+                                  const analyze_reasoning & reasoning,
+                                  bool                      supports_parallel_tool_calls);
 
     // Analyze specifics of JSON native format (entire tool call is a JSON object)
     void analyze_tool_call_format_json_native(const std::string & clean_haystack,
                                               const std::string & fun_name_needle,
                                               const std::string & arg_name_needle);
 
+    // Check whether parallel calls in JSON native format are array-wrapped or tag-wrapped
+    void analyze_json_native_parallel_calls();
+
     // Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
     void analyze_tool_call_format_non_json(const std::string & clean_haystack,
                                            const std::string & fun_name_needle);
 
@@ -558,7 +558,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
     : analyze_base(tmpl) {
     LOG_DBG(ANSI_ORANGE "Phase 3: Tool call analysis\n" ANSI_RESET);
 
-    analyze_tool_calls(reasoning);
+    analyze_tool_calls(reasoning, caps.supports_parallel_tool_calls);
 
     if (format.mode != tool_format::NONE && format.mode != tool_format::JSON_NATIVE) {
         if (caps.supports_parallel_tool_calls) {
@@ -577,7 +577,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
     }
 }
 
-void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
+void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls) {
     json assistant_no_tools = json{
         { "role",    "assistant"   },
         { "content", ASSISTANT_MSG }
@@ -611,13 +611,14 @@ void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
         return;
     }
 
-    analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning);
+    analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning, supports_parallel_tool_calls);
 }
 
 void analyze_tools::analyze_tool_call_format(const std::string &       haystack,
                                              const std::string &       fun_name_needle,
                                              const std::string &       arg_name_needle,
-                                             const analyze_reasoning & reasoning) {
+                                             const analyze_reasoning & reasoning,
+                                             bool                      supports_parallel_tool_calls) {
     if (fun_name_needle.empty() || arg_name_needle.empty() || haystack.empty()) {
         return;
     }
@@ -660,6 +661,9 @@ void analyze_tools::analyze_tool_call_format(const std::string &       haystack,
 
     if (format.mode == tool_format::JSON_NATIVE) {
         analyze_tool_call_format_json_native(clean_haystack, fun_name_needle, arg_name_needle);
+        if (supports_parallel_tool_calls) {
+            analyze_json_native_parallel_calls();
+        }
     } else {
         analyze_tool_call_format_non_json(clean_haystack, fun_name_needle);
     }
@@ -668,6 +672,42 @@ void analyze_tools::analyze_tool_call_format(const std::string &       haystack,
     format.per_call_end = trim_whitespace(format.per_call_end);
 }
 
+void analyze_tools::analyze_json_native_parallel_calls() { // Checks whether a second tool call repeats the section start marker
+    json assistant_one_tool = json{ // assistant message carrying a single tool call
+        { "role",       "assistant" },
+        { "content",    ""          },
+        { "tool_calls", json::array({ first_tool_call }) }
+    };
+
+    json assistant_two_tools = json{ // same message, but with two tool calls
+        { "role",       "assistant" },
+        { "content",    ""          },
+        { "tool_calls", json::array({ first_tool_call, second_tool_call }) }
+    };
+
+    template_params params; // baseline variant: user message followed by the one-call assistant message
+    params.messages              = json::array({ user_msg, assistant_one_tool });
+    params.tools                 = tools;
+    params.add_generation_prompt = false;
+    params.enable_thinking       = true;
+
+    auto comparison = compare_variants( // second variant only swaps in the two-call assistant message
+        *tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_two_tools }); });
+
+    if (!comparison) { // no comparison result: leave the format markers untouched
+        LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
+        return;
+    }
+
+    std::string & second_call = comparison->diff.right; // NOTE(review): presumably the text only present in the two-call rendering - confirm
+    if (!format.section_start.empty() && second_call.find(format.section_start) != std::string::npos) { // start marker shows up again in the diff
+        format.per_call_start = format.section_start; // move the section markers over to the per-call markers
+        format.per_call_end = format.section_end;
+        format.section_start.clear(); // an empty section_start together with a non-empty per_call_start marks this case
+        format.section_end.clear();
+    }
+}
+
 void analyze_tools::analyze_tool_call_format_json_native(const std::string & clean_haystack,
                                                          const std::string & fun_name_needle,
                                                          const std::string & arg_name_needle) {
 
@@ -676,7 +676,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
         ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
 
         auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
-                          literal("\"") + tool_name(literal(name)) + literal("\"");
+                          atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
         auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
                           tool_args(schema(json(), "tool-" + name + "-schema", params));
 
@@ -744,7 +744,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
         ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
 
         auto tool_name_ = name_key_parser + space() + literal(":") + space() +
-                         literal("\"") + tool_name(literal(name)) + literal("\"");
+                         atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
         auto tool_args_ = args_key_parser + space() + literal(":") + space() +
                          tool_args(schema(json(), "tool-" + name + "-schema", params));
 
 
@@ -281,6 +281,12 @@ Use `GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` environment variable to force use FP16
 
 The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
 
+### Peer Access
+
+The environment variable `GGML_CUDA_P2P` can be set to enable peer-to-peer access between multiple GPUs, allowing them to transfer data directly rather than going through system memory.
+Requires driver support (usually restricted to workstation/datacenter GPUs).
+May cause crashes or corrupted outputs for some motherboards and BIOS settings (e.g. IOMMU).
+
 ### Performance Tuning
 
 The following compilation options are also available to tweak performance:
 
@@ -130,6 +130,23 @@ Note:
 - Adding a model-specific API or CLI is an anti-pattern in `libmtmd`. The goal of `libmtmd` is to provide an easy-to-use, model-agnostic library for multimodal pipeline.
 - In most cases, `llama-mtmd-cli` should not be modified. If a model requires a specific prompt, either let the user provide it or bake it into the Jinja chat template.
 
+## Tips and tricks
+
+### Working with ggml_rope_ext
+
+PyTorch implementations usually prefer explicitly calculating `freq_cis`/`sin`/`cos` components. However, in llama.cpp, most RoPE operations can be handled via `ggml_rope_ext`, which does not require a sin/cos matrix. This saves memory while allowing the GGML RoPE kernel to be fused with other ops.
+
+However, since `ggml_rope_ext` only provides a subset of the RoPE implementations that models use, converting models from PyTorch to llama.cpp may require some creative adaptations.
+
+For more information about `ggml_rope_ext`, please refer to the in-code documentation in `ggml.h`.
+
+Examples:
+- `libmtmd` implements 2D RoPE with `GGML_ROPE_TYPE_NORMAL` ordering by splitting the input tensor in half, applying `ggml_rope_ext` separately to each half, then joining them back together using `ggml_concat`.
+- The [Kimi-K2.5](https://github.com/ggml-org/llama.cpp/pull/19170) vision encoder uses vision RoPE with interleaved frequencies. The weights must be permuted during conversion in order to reuse the `build_rope_2d()` function.
+- [Gemma 4](https://github.com/ggml-org/llama.cpp/pull/21309) uses "proportional" RoPE. We employ a trick where `rope_freqs` is set to a very large value in the last dimensions to prevent those dimensions from being rotated. See the `Gemma4Model` class in `convert_hf_to_gguf.py`.
+- Some models require scaling the input position. For example, `[0, 1, 2, ...]` becomes `[0, 0.5, 1, ...]`. In this case, you can provide the scaling via `freq_scale = 0.5f`.
+- Some models use learned RoPE frequencies instead of relying on `powf(freq_base, -2.0 * i / n_dims)`. In this case, you can provide the learned frequencies via the `rope_freqs` tensor (corresponding to the `c` argument in `ggml_rope_ext`), then set `freq_base = 1.0f`. An important note is that `rope_freqs` in GGML is the **inverse** (`theta = pos[i] / rope_freqs`), so you may need to invert `rope_freqs` during conversion.
+
 ## GGUF specification
 
 https://github.com/ggml-org/ggml/blob/master/docs/gguf.md
 
@@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
 
     int n_input = input_tokens.size();
 
-    if (n_input >= params.n_ctx) {
-        LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
+    if (static_cast<uint32_t>(n_input) >= llama_n_ctx(ctx)) {
+        LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, llama_n_ctx(ctx));
         llama_free(ctx);
         llama_model_free(model);
         return 1;
 
@@ -202,8 +202,11 @@ extern "C" {
 
     // Common functions that may be obtained using ggml_backend_reg_get_proc_address
 
-    // AllReduce operation for tensor parallelism (meta backend)
-    typedef bool                         (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
+    // Context management and operations for faster communication between backends, used for tensor parallelism (meta backend)
+    typedef void * (*ggml_backend_comm_init_t)(ggml_backend_t * backends, size_t n_backends);
+    typedef void   (*ggml_backend_comm_free_t)(void * comm_ctx);
+    typedef bool   (*ggml_backend_comm_allreduce_tensor_t)(void * comm_ctx, struct ggml_tensor ** tensors);
+
     // Split buffer type for tensor parallelism (old)
     typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
     // Set the number of threads for the backend
@@ -348,6 +351,53 @@ extern "C" {
     // Set a callback to be called for each resulting node during graph compute
     GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
 
+    //
+    // Meta backend
+    //
+
+#define GGML_BACKEND_META_MAX_DEVICES 16
+
+    enum ggml_backend_meta_split_axis {
+        // tensor split by tensor dimensions:
+        GGML_BACKEND_SPLIT_AXIS_0 = 0,
+        GGML_BACKEND_SPLIT_AXIS_1 = 1,
+        GGML_BACKEND_SPLIT_AXIS_2 = 2,
+        GGML_BACKEND_SPLIT_AXIS_3 = 3,
+
+        GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
+        GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
+
+        // for internal bookkeeping only:
+        GGML_BACKEND_SPLIT_AXIS_NONE    = 98,
+        GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
+    };
+    GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
+
+    struct ggml_backend_meta_split_state {
+        enum ggml_backend_meta_split_axis axis;
+
+        // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
+        //   - each device has a slice of the tensor along the split axis
+        //   - most tensors have n_segments == 1 and a contiguous slice of the tensor data
+        //   - some tensors have an inhomogeneous data layout along the split axis,
+        //     those tensors are divided into segments which are each individually split across devices
+        //   - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
+        //     the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
+        //   - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
+        //     that each need to be split individually across devices so that each device gets a slice of Q, K, and V
+        int64_t  ne[16*GGML_BACKEND_META_MAX_DEVICES];
+        uint32_t n_segments;
+    };
+
+    // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
+    typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
+
+    // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
+    // TODO: this looks a bit strange - a backend API creates a device. I think we should try to
+    //       express this as a backend registry functionality instead
+    GGML_API ggml_backend_dev_t ggml_backend_meta_device(
+        ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
+
     //
     // Utils
     //
 
@@ -6,9 +6,9 @@
 extern "C" {
 #endif
 
-#define RPC_PROTO_MAJOR_VERSION    3
-#define RPC_PROTO_MINOR_VERSION    6
-#define RPC_PROTO_PATCH_VERSION    1
+#define RPC_PROTO_MAJOR_VERSION    4
+#define RPC_PROTO_MINOR_VERSION    0
+#define RPC_PROTO_PATCH_VERSION    0
 
 #ifdef  __cplusplus
 static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");