From c3e9ade6dd3ff2a1ceafd2d59062634715b472c4 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Tue, 19 May 2026 09:42:36 +0300 Subject: [PATCH 01/12] rpc : keep last_graph_uid in the device context (#23273) With the introduction of MTP we can have multiple compute contexts for the same RPC device. In this case last_graph_uid is not updated properly when contexts are being switched. This patch fixes this by moving last_graph_uid to the device context, making sure it is always updated. closes: #23242 --- ggml/src/ggml-rpc/ggml-rpc.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 1cb8f563d85..d3805772183 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -199,6 +199,14 @@ static ggml_guid_t ggml_backend_rpc_guid() { return &guid; } +struct ggml_backend_rpc_device_context { + std::string endpoint; + uint32_t device; + std::string name; + std::string description; + uint64_t last_graph_uid; +}; + struct ggml_backend_rpc_buffer_type_context { std::string endpoint; uint32_t device; @@ -211,7 +219,6 @@ struct ggml_backend_rpc_context { std::string endpoint; uint32_t device; std::string name; - uint64_t last_graph_uid; }; struct ggml_backend_rpc_buffer_context { @@ -691,9 +698,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + ggml_backend_dev_t rpc_dev = ggml_backend_get_device(backend); + ggml_backend_rpc_device_context * rpc_dev_ctx = (ggml_backend_rpc_device_context *)rpc_dev->context; GGML_ASSERT(cgraph->n_nodes > 0); - bool reuse = cgraph->uid != 0 && rpc_ctx->last_graph_uid == cgraph->uid; + bool reuse = cgraph->uid != 0 && rpc_dev_ctx->last_graph_uid == cgraph->uid; if (reuse) { 
rpc_msg_graph_recompute_req request; request.device = rpc_ctx->device; @@ -701,7 +710,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request)); RPC_STATUS_ASSERT(status); } else { - rpc_ctx->last_graph_uid = cgraph->uid; + rpc_dev_ctx->last_graph_uid = cgraph->uid; std::vector input; serialize_graph(rpc_ctx->device, cgraph, input); auto sock = get_socket(rpc_ctx->endpoint); @@ -770,7 +779,6 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) { /* .endpoint = */ endpoint, /* .device = */ device, /* .name = */ dev_name, - /* .last_graph_uid = */ 0, }; auto reg = ggml_backend_rpc_add_server(endpoint); ggml_backend_t backend = new ggml_backend { @@ -1757,15 +1765,6 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir } } -// device interface - -struct ggml_backend_rpc_device_context { - std::string endpoint; - uint32_t device; - std::string name; - std::string description; -}; - static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) { ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; @@ -1947,10 +1946,11 @@ ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) { std::string dev_name = "RPC" + std::to_string(dev_id); std::string dev_desc = std::string(endpoint); ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context { - /* .endpoint = */ endpoint, - /* .device = */ ind, - /* .name = */ dev_name, - /* .description = */ dev_desc + /* .endpoint = */ endpoint, + /* .device = */ ind, + /* .name = */ dev_name, + /* .description = */ dev_desc, + /* .last_graph_uid = */ 0, }; ggml_backend_dev_t dev = new ggml_backend_device { From 439f1b193d2d7d8db4d2b70cbf63e3afcbb38df8 Mon Sep 17 00:00:00 2001 From: Intel AI Get-to Market Customer Success and Solutions Date: Mon, 18 May 2026 23:44:02 -0700 Subject: [PATCH 
02/12] sycl: add GGML_SYCL_USE_ASYNC_MEM_OP env toggle (#22153) * sycl: add GGML_SYCL_USE_ASYNC_MEM_OP env toggle Signed-off-by: Chun Tao * Use async mem ops for correctness when SYCL graphs are explicitly on. Signed-off-by: Tao, Chun --------- Signed-off-by: Chun Tao Signed-off-by: Tao, Chun Co-authored-by: Chun Tao --- ggml/src/ggml-sycl/ggml-sycl.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ebe7c5b351c..2ea47f7153a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -72,6 +72,7 @@ int g_ggml_sycl_disable_graph = 0; int g_ggml_sycl_disable_dnn = 0; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; +int g_ggml_sycl_use_async_mem_op_requested = 1; int g_ggml_sycl_enable_level_zero = 0; int g_ggml_sycl_enable_flash_attention = 1; @@ -304,6 +305,8 @@ static void ggml_check_sycl() try { GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n"); #endif GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv); + g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1); + GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested); #ifdef SYCL_FLASH_ATTN GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention); @@ -319,11 +322,11 @@ static void ggml_check_sycl() try { fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); #endif */ - // Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be - // properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in - // other places. + // Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder + // staging path while preserving queue ordering semantics. 
Graph support still depends on the extension being + // available, but it no longer needs to control the non-graph fast path. #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC - g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph; + g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph; if (g_ggml_sycl_use_async_mem_op) { for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) { if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) { From f1c1c5c057f047562b637db0ac7eac11485307bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 08:44:25 +0200 Subject: [PATCH 03/12] convert : filter lora tensor names (#23077) --- convert_lora_to_gguf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 1b7334617d1..81658ba03d8 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -445,6 +445,11 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) + # filter base name, ignore tensor transformations for now + data_gen = lambda g=tensor: g # noqa: E731 + if (titem := self.filter_tensors((base_name, data_gen))) is None: + continue + base_name, _ = titem # note: mergekit-extract-lora also adds token embeddings to the adapter is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name From aabee047d8ebf7abe2750585a347aa19feced3b5 Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Tue, 19 May 2026 14:44:51 +0800 Subject: [PATCH 04/12] [SYCL] add chapter for performance reference in SYCL.md (#23315) * add chapter for performance reference * rm unsupported GPU --- README.md | 2 +- docs/backend/SYCL.md | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff
--git a/README.md b/README.md index a0c14b9d7f0..71327e51453 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [Metal](docs/build.md#metal-build) | Apple Silicon | | [BLAS](docs/build.md#blas-build) | All | | [BLIS](docs/backend/BLIS.md) | All | -| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [SYCL](docs/backend/SYCL.md) | Intel GPU | | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs | | [MUSA](docs/build.md#musa) | Moore Threads GPU | | [CUDA](docs/build.md#cuda) | Nvidia GPU | diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 155f933b805..0c4660b541c 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -5,6 +5,7 @@ - [News](#news) - [OS](#os) - [Hardware](#hardware) +- [Performance Reference](#performance-reference) - [Docker](#docker) - [Linux](#linux) - [Windows](#windows) @@ -51,9 +52,8 @@ The packages for FP32 and FP16 would have different accuracy and performance on ## News -- 2026.04 - - - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0. +- 2026.04-05 + - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q6_K, Q8_0. - Fused MoE. - Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package. @@ -150,6 +150,13 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the NA +## Performance Reference + + +For the list of supported LLMs and GPUs, along with reference performance numbers, see [Performance of llama.cpp on Intel GPU with SYCL backend](https://github.com/ggml-org/llama.cpp/discussions/23313). + +You can add your own test results to that discussion directly. + ## Docker The docker build option is currently limited to *Intel GPU* targets.
From c85a242ed021ab6732e2973764437c3c5655102b Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 18 May 2026 23:45:41 -0700 Subject: [PATCH 05/12] ggml-webgpu : extend GDN for K>1 (#23299) --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 2 ++ .../wgsl-shaders/gated_delta_net.wgsl | 24 +++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 78cb02be06d..921c12b41ac 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1234,6 +1234,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, const uint32_t h = (uint32_t) src2->ne[1]; const uint32_t n_tokens = (uint32_t) src2->ne[2]; const uint32_t n_seqs = (uint32_t) src2->ne[3]; + const uint32_t K = (uint32_t) src5->ne[1]; const float scale = 1.0f / sqrtf((float) s_v); uint32_t scale_u32; memcpy(&scale_u32, &scale, sizeof(scale_u32)); @@ -1258,6 +1259,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, (uint32_t) src0->ne[1], (uint32_t) (src2->ne[3] / src0->ne[3]), + K, scale_u32, }; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl index f9d98fda40b..d68520f8282 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl @@ -39,6 +39,7 @@ struct Params { neq1: u32, rq3: u32, + K: u32, scale: f32, }; @@ -62,11 +63,14 @@ fn main( let iq3 = seq_id / params.rq3; let state_size = S_V * S_V; - let state_base = (seq_id * params.h + head_id) * state_size; + let state_in_base = (seq_id * params.K * params.h + head_id) * state_size; + let state_out_base = (seq_id * params.h + head_id) * state_size; + let state_size_per_snap = state_size * params.h * params.n_seqs; + let shift = i32(params.n_tokens) - i32(params.K); var state: array; for (var i = 0u; i < S_V; i++) { - state[i] = 
src_state[state_base + col * S_V + i]; + state[i] = src_state[state_in_base + col * S_V + i]; } var attn_off = (seq_id * params.n_tokens * params.h + head_id) * S_V; @@ -123,10 +127,22 @@ fn main( dst[attn_off + col] = attn_col * params.scale; attn_off += S_V * params.h; + if (params.K > 1u) { + let target_slot = i32(t) - shift; + if (target_slot >= 0 && target_slot < i32(params.K)) { + let slot_base = params.s_off + u32(target_slot) * state_size_per_snap + state_out_base; + for (var i = 0u; i < S_V; i++) { + dst[slot_base + col * S_V + i] = state[i]; + } + } + } + workgroupBarrier(); } - for (var i = 0u; i < S_V; i++) { - dst[params.s_off + state_base + col * S_V + i] = state[i]; + if (params.K == 1u) { + for (var i = 0u; i < S_V; i++) { + dst[params.s_off + state_out_base + col * S_V + i] = state[i]; + } } } From d2e179a477fc1d1935b68422c1181ef2d62ed2ef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 09:46:05 +0300 Subject: [PATCH 06/12] llama-eval : add per-task summary stats (#23151) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * llama-eval : add per-problem summary table to HTML reports - Add chunk_idx and problem_idx to TaskState and saved case dicts - Group completed cases by problem_idx in dump_html() - Render per-problem summary table before individual task table - Columns: Problem (zero-padded), Runs, Correct (n/r), Tokens (min/avg/max), T/s (min/avg/max), Gen s (min/avg/max) - Sorted by problem index, monospace font, right-aligned numbers - Colspan headers for grouped stats, auto width - Simulator: add /v1/models endpoint, timings in response, template-aware question matching, --dataset arg (aime/aime2025) Assisted-by: llama.cpp:local pi * llama-eval : add tabs for Detailed and Summary tables, apply monospace font globally - Wrap Detailed and Summary tables in switchable tabs (Detailed active by default) - Remove summary-section wrapper, use tab labels instead - Apply monospace font to 
all tables and the top bar Assisted-by: llama.cpp:local pi * llama-eval : redesign top bar as CSS grid label/value pairs - Replace flat span list with 4-column grid layout (2 pairs per row) - Labels in muted color (#888), values in dark (#222) - Bold dataset name and model name - Removed media query, always uses 4 columns Assisted-by: llama.cpp:local pi * llama-eval : use realistic token counts and throughput in simulator - comp_tokens: [30, 80] → [10000, 60000] - tps_gen: derived → uniform [90.0, 110.0] - t_gen_ms: now computed from tokens/tps Assisted-by: llama.cpp:local pi * llama-eval : color Answer column green/red based on correctness Use the same .correct/.incorrect CSS classes on the Answer column to make correct answers green and incorrect answers red. Assisted-by: llama.cpp:local pi * llama-eval : fix pyright errors from max(..., key=len) type inference Use key=lambda x: len(x) instead of key=len so the type checker infers the return type as str instead of Sized, fixing: - unresolved-attribute: Object of type Sized has no attribute lower - not-subscriptable: Cannot subscript object of type Sized Assisted-by: llama.cpp:local pi --- examples/llama-eval/llama-eval.py | 189 ++++++++++++++---- examples/llama-eval/llama-server-simulator.py | 99 +++++++-- 2 files changed, 233 insertions(+), 55 deletions(-) diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index e833070eee9..4bdd239c007 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -149,6 +149,8 @@ class TaskState: t_gen_ms: Optional[float] = None reasoning_content: Optional[str] = None server_name: Optional[str] = None + chunk_idx: int = 0 + problem_idx: int = 0 class EvalState: @@ -233,7 +235,9 @@ def add_result( tps_gen: Optional[float] = None, t_gen_ms: Optional[float] = None, reasoning_content: Optional[str] = None, - server_name: Optional[str] = None + server_name: Optional[str] = None, + chunk_idx: int = 0, + problem_idx: int = 0, 
): with self._lock: if "cases" not in self.task_states: @@ -252,7 +256,9 @@ def add_result( "tps_gen": tps_gen, "t_gen_ms": t_gen_ms, "reasoning_content": reasoning_content, - "server_name": server_name + "server_name": server_name, + "chunk_idx": chunk_idx, + "problem_idx": problem_idx, } self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False)) @@ -289,6 +295,9 @@ def dump(self): all_cases = {} for i, task_id in tasks_to_save: question_text, prompt, expected = self.get_case(i) + # Extract chunk_idx from task_id for pending cases + _parts = task_id.rsplit("_", 2) + _chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0 if task_id in self.task_states.get("cases", {}): all_cases[task_id] = self.task_states["cases"][task_id] else: @@ -306,7 +315,9 @@ def dump(self): "tps_gen": None, "t_gen_ms": None, "reasoning_content": None, - "server_name": None + "server_name": None, + "chunk_idx": _chunk_idx, + "problem_idx": i, } ci_lower, ci_upper = self.accuracy_ci() @@ -382,11 +393,12 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A grader_log_str = self._escape_html(json.dumps(grader_log, indent=2)) escaped_server = self._escape_html(server_name) + answer_class = status_class if status == "ok" else "" rows.append(f""" {task_id} {status_text} {self._escape_html(expected)} - {self._escape_html(answer)} + {self._escape_html(answer)} {tokens_str} {tps_str} {t_gen_str} @@ -405,6 +417,53 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A rows_html = "\n".join(rows) + # ---- per-problem summary table ---- + problem_groups: Dict[int, List[Dict[str, Any]]] = {} + for _tid, _case in cases.items(): + if _case.get("status") != "ok": + continue + _pidx = _case.get("problem_idx") + if _pidx is None: + _p_parts = _tid.rsplit("_", 2) + _pidx = int(_p_parts[-1]) if len(_p_parts) >= 3 else 0 + problem_groups.setdefault(_pidx, []).append(_case) + + summary_rows_html = "" + if 
problem_groups: + def _stat(v, fmt=".1f", avg_fmt=None): + if not v: + return ("–", "–", "–") + af = fmt if avg_fmt is None else avg_fmt + return (f"{min(v):{fmt}}", f"{sum(v)/len(v):{af}}", f"{max(v):{fmt}}") + + summary_data = [] + for pidx, g in problem_groups.items(): + runs = len(g) + n_ok = sum(1 for c in g if c.get("correct", False)) + toks = [c["tokens"] for c in g if c.get("tokens") is not None] + tps = [c["tps_gen"] for c in g if c.get("tps_gen") is not None] + tg = [c["t_gen_ms"] / 1000 for c in g if c.get("t_gen_ms") is not None] + summary_data.append(( + pidx, runs, n_ok, + _stat(toks, "d", ".0f"), + _stat(tps), + _stat(tg), + )) + + summary_data.sort(key=lambda r: r[0]) # sort by problem index ascending + + summary_rows_html = "\n".join( + f""" + {p:03d} + {r} + {n}/{r} + {tk[0]}{tk[1]}{tk[2]} + {tp[0]}{tp[1]}{tp[2]} + {tg[0]}{tg[1]}{tg[2]} + """ + for p, r, n, tk, tp, tg in summary_data + ) + html_content = f""" @@ -412,10 +471,10 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A {self.dataset_type.upper()} Eval
- {self.dataset_type.upper()} - Model: {self.model_name or 'N/A'} - Accuracy: {accuracy:.1f}% [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%] - Correct: {n_correct} / {len(completed)} - Pending: {n_pending} - Time: {self.total_time:.1f}s - Sampling: {sampling_str} +
Dataset
{self.dataset_type.upper()}
+
Model
{self.model_name or 'N/A'}
+
Accuracy
{accuracy:.1f}% [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]
+
Correct
{n_correct} / {len(completed)}
+
Pending
{n_pending}
+
Time
{self.total_time:.1f}s
+
Sampling
{sampling_str}
+
+
+ + +
+
+ + + + + + + + + + + + + + + {rows_html} + +
IDGoldAnswerTokensT/sGen sServer
+
+
+ + + + + + + + + + + + + + + + + + + + + {summary_rows_html} + +
ProblemRunsCorrectTokensT/sGen s
minavgmaxminavgmaxminavgmax
- - - - - - - - - - - - - - - {rows_html} - -
IDGoldAnswerTokensT/sGen sServer
""" @@ -1062,12 +1172,19 @@ def _process_single_case( ) -> TaskState: question_text, prompt, expected = eval_state.get_case(i) + # Extract chunk_idx from task_id: "{dataset_type}_{chunk_idx:03d}_{index:03d}" + _parts = task_id.rsplit("_", 2) + chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0 + problem_idx = i + task_state = TaskState( task_id=task_id, prompt=prompt, expected=expected, question_text=question_text, - server_name=server_config.name + server_name=server_config.name, + chunk_idx=chunk_idx, + problem_idx=problem_idx, ) try: @@ -1085,7 +1202,8 @@ def _process_single_case( eval_state.add_result( task_id, prompt, expected, result, None, {"finish_reason": finish_reason}, False, task_state.status, - tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name + tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name, + chunk_idx, problem_idx, ) eval_state.dump() return task_state @@ -1108,7 +1226,8 @@ def _process_single_case( eval_state.add_result( task_id, prompt, expected, result, answer, grader_log, is_correct, "ok", - tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name + tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name, + chunk_idx, problem_idx, ) eval_state.dump() diff --git a/examples/llama-eval/llama-server-simulator.py b/examples/llama-eval/llama-server-simulator.py index 2f9cdc5450d..e64ba89335d 100755 --- a/examples/llama-eval/llama-server-simulator.py +++ b/examples/llama-eval/llama-server-simulator.py @@ -65,34 +65,70 @@ def normalize_number(s: str) -> Optional[int]: return int(match.group(0)) class AimeDataset: - def __init__(self, split: str = "train"): + def __init__(self, split: str = "train", dataset_type: str = "aime"): self.split = split + self.dataset_type = dataset_type self.questions: List[Dict] = [] self._load_dataset() - def _load_dataset(self): - print(f"Loading AIME dataset (split: {self.split})...") + def _get_question_text(self, question: Dict) -> str: + """Get question text, 
handling different dataset field names.""" + return question.get("problem", question.get("question", "")) - cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" - if cache_path.exists(): - print(f"Using cached dataset from {cache_path}") - ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + def _load_dataset(self): + if self.dataset_type == "aime": + print(f"Loading AIME dataset (split: {self.split})...") + cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + else: + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split) + elif self.dataset_type == "aime2025": + print(f"Loading AIME2025 dataset...") + ds_list = [] + for config_name in ["AIME2025-I", "AIME2025-II"]: + cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "opencompass___AIME2025" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_path)) + else: + ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test") + ds_list.extend(ds) + ds = ds_list else: - ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split) + raise ValueError(f"Unknown dataset type: {self.dataset_type}") self.questions = list(ds) - print(f"AIME dataset loaded: {len(self.questions)} questions") + print(f"{self.dataset_type} dataset loaded: {len(self.questions)} questions") def find_question(self, request_text: str) -> Optional[Dict]: + # Strip common template prefixes to get the actual question text + # Templates include things like "Solve the following math 
problem step by step..." + # The actual question usually follows a blank line or after the template instruction + cleaned = request_text + # Split on double newline and take the part that looks like the problem + parts = cleaned.split('\n\n') + if len(parts) > 1: + # Find the part that's longest (likely the actual problem text) + problem_parts = [p for p in parts if len(p.strip()) > 100] + if problem_parts: + cleaned = max(problem_parts, key=lambda x: len(x)) + best_match = None best_distance = -1 best_index = -1 for i, question in enumerate(self.questions): - question_text = question["problem"] - request_lower = request_text.lower() + question_text = self._get_question_text(question) + request_lower = cleaned.lower() question_lower = question_text.lower() + # Check if question text is contained in the cleaned request + if question_lower in request_lower or request_lower in question_lower: + debug_log(f"DEBUG: Found substring match at index {i}") + return question + # Exact match if question_lower == request_lower: debug_log(f"DEBUG: Found exact match at index {i}") @@ -118,7 +154,7 @@ def find_question(self, request_text: str) -> Optional[Dict]: debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}") return best_match - debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...") + debug_log(f"DEBUG: No matching question found for cleaned: {cleaned[:100]}...") return None def get_answer(self, question: Dict) -> str: @@ -134,15 +170,16 @@ def __init__( port: int = 8033, host: str = "localhost", success_rate: float = 0.8, - dataset_split: str = "train" + dataset_split: str = "train", + dataset_type: str = "aime" ): self.port = port self.host = host self.success_rate = success_rate - self.dataset = AimeDataset(dataset_split) + self.dataset = AimeDataset(dataset_split, dataset_type) self.eval_state = EvalState( - id="aime-2025", - tasks=["aime"], + id=dataset_type, + tasks=[dataset_type], task_states={}, 
sampling_config={"temperature": 0, "max_tokens": 2048} ) @@ -159,6 +196,10 @@ def _generate_response( else: response_text = self._generate_wrong_answer(question) + comp_tokens = random.randint(10000, 60000) + tps_gen = random.uniform(90.0, 110.0) + t_gen_ms = comp_tokens / tps_gen * 1000 + return { "id": f"chatcmpl-{int(time.time())}", "object": "chat.completion", @@ -176,8 +217,12 @@ def _generate_response( ], "usage": { "prompt_tokens": 100, - "completion_tokens": 50, - "total_tokens": 150 + "completion_tokens": comp_tokens, + "total_tokens": 100 + comp_tokens + }, + "timings": { + "predicted_ms": t_gen_ms, + "predicted_per_second": tps_gen } } @@ -218,6 +263,12 @@ def _process_request(self, request_data: Dict) -> Dict: return response class RequestHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == "/v1/models": + self._send_json({"data": [{"id": "llama", "object": "model"}]}, 200) + return + self._send_json({"error": "Not found"}, 404) + def do_POST(self): if self.path != "/v1/chat/completions": self._send_json({"error": "Not found"}, 404) @@ -280,6 +331,13 @@ def main(): default=0.8, help="Success rate 0-1 (default: 0.8)" ) + parser.add_argument( + "--dataset", + type=str, + default="aime", + choices=["aime", "aime2025"], + help="Dataset type (default: aime)" + ) parser.add_argument( "--dataset-split", type=str, @@ -294,7 +352,8 @@ def main(): port=args.port, host=args.host, success_rate=args.success_rate, - dataset_split=args.dataset_split + dataset_split=args.dataset_split, + dataset_type=args.dataset ) server = HTTPServer((args.host, args.port), RequestHandler) @@ -304,7 +363,7 @@ def main(): print("\n=== llama-server-simulator ===") print(f"Server running on http://{args.host}:{args.port}") print(f"Success rate: {args.success_rate}") - print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions") + print(f"{args.dataset} dataset loaded: {len(simulator.dataset.questions)} questions") print("\nPress Ctrl+C to stop\n") try: 
From cd963fee6a86387d598ebe3888017376d6e9e8f6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 09:46:34 +0300 Subject: [PATCH 07/12] save-load-state : refactor tests and improve readability (#23196) * save-load-state : refactor into separate phase functions - Split monolithic main() into 4 self-contained phase functions, each managing its own context/sampler/batch lifecycle - Each function tokenizes internally using its local ctx instance - main() is now a clean orchestrator: init -> run phases -> assert results - Proper resource cleanup on every exit path (return {} on error) Assisted-by: llama.cpp:local pi * save-load-state : use params.out_file instead of separate state_file - Remove state_file parameter from all phase functions - Each function accesses params.out_file directly - Initialize params.out_file in main alongside params.prompt Assisted-by: llama.cpp:local pi * save-load-state : use smart pointers for ctx and smpl - Replace raw llama_context* with llama_context_ptr - Replace raw llama_sampler* with llama_sampler_ptr - Remove all manual llama_free() and llama_sampler_free() calls - Keep llama_batch as raw (managed manually with llama_batch_free) Assisted-by: llama.cpp:local pi * save-load-state : add local llama_batch_ptr RAII wrapper - Add llama_batch_ptr struct holding llama_batch by value - Calls llama_batch_free() in destructor - Eliminates all manual llama_batch_free() calls Assisted-by: llama.cpp:local pi * save-load-state : replace printf/fprintf with logging macros - Add log.h include - Replace fprintf(stderr, ...) errors with LOG_ERR - Replace fprintf(stderr, ...) info with LOG_TRC - Replace printf output with LOG Assisted-by: llama.cpp:local pi * save-load-state : refactor tests to check results inline Each follow-up phase now accepts an expected result and performs the comparison internally instead of collecting results in main(). 
Assisted-by: llama.cpp:local pi * save-load-state : improve test output readability Add phase labels, remove redundant run prefixes, and show PASS after each test. Assisted-by: llama.cpp:local pi * pi : add rule about git signing * save-load-state : simplify llama_batch_ptr Change get() to return a reference and remove operator*(). Use batch.get() throughout for consistency. Assisted-by: llama.cpp:local pi * save-load-state : extract generate_tokens helper Factor out the repeated token generation loop into a shared helper function used by all phases. Assisted-by: llama.cpp:local pi * save-load-state : update comments to use test terminology Replace "Phase" with "Test" and list each test's steps as bullet points. Assisted-by: llama.cpp:local pi * save-load-state : rename test functions Rename to test_baseline, test_state_load, test_seq_cp_host, test_seq_cp_device. Update comments and logs accordingly. Assisted-by: llama.cpp:local pi * pi : add rule to never git push without confirmation Assisted-by: llama.cpp:local pi * common : add model_only option to common_init_from_params Add bool model_only parameter to skip context creation, sampler init, and context-dependent setup. Use in save-load-state to initialize only the model, with each test creating its own context. 
Assisted-by: llama.cpp:local pi --------- Co-authored-by: ggerganov --- .pi/gg/SYSTEM.md | 2 + common/common.cpp | 14 +- common/common.h | 4 +- examples/save-load-state/save-load-state.cpp | 453 ++++++++++--------- 4 files changed, 254 insertions(+), 219 deletions(-) diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md index 727a850b183..b7597a4c3ae 100644 --- a/.pi/gg/SYSTEM.md +++ b/.pi/gg/SYSTEM.md @@ -22,6 +22,8 @@ Pull requests (PRs): Commits: - On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag - Do not explicitly set the git author in commits - rely on the default git config +- Always use `--no-gpg-sign` when committing +- Never `git push` without explicit confirmation from the user Resources (read on demand): - [CONTRIBUTING.md](CONTRIBUTING.md) diff --git a/common/common.cpp b/common/common.cpp index 9cf11ea9f5f..aef06263e3f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1160,7 +1160,7 @@ struct common_init_result::impl { std::vector samplers_seq_config; }; -common_init_result::common_init_result(common_params & params) : +common_init_result::common_init_result(common_params & params, bool model_only) : pimpl(new impl{}) { auto mparams = common_model_params_to_llama(params); auto cparams = common_context_params_to_llama(params); @@ -1183,6 +1183,10 @@ common_init_result::common_init_result(common_params & params) : pimpl->model.reset(model); + if (model_only) { + return; + } + const llama_vocab * vocab = llama_model_get_vocab(model); // load and optionally apply lora adapters @@ -1309,8 +1313,8 @@ std::vector & common_init_result::lora() { return pimpl->lora; } -common_init_result_ptr common_init_from_params(common_params & params) { - common_init_result_ptr res(new common_init_result(params)); +common_init_result_ptr common_init_from_params(common_params & params, bool model_only) { + common_init_result_ptr res(new common_init_result(params, model_only)); llama_model * model = res->model(); if (model == NULL) { @@ 
-1318,6 +1322,10 @@ common_init_result_ptr common_init_from_params(common_params & params) { return res; } + if (model_only) { + return res; + } + llama_context * lctx = res->context(); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); diff --git a/common/common.h b/common/common.h index 1d3d788b2de..e03f7037454 100644 --- a/common/common.h +++ b/common/common.h @@ -857,7 +857,7 @@ struct common_sampler; // note: defines the model, context, samplers, ets. lifetimes struct common_init_result { - common_init_result(common_params & params); + common_init_result(common_params & params, bool model_only = false); ~common_init_result(); llama_model * model(); @@ -875,7 +875,7 @@ struct common_init_result { using common_init_result_ptr = std::unique_ptr; -common_init_result_ptr common_init_from_params(common_params & params); +common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index e6f5e9802ab..97ab7c6de3b 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,320 +1,345 @@ #include "arg.h" #include "common.h" -#include "llama.h" +#include "log.h" +#include "llama-cpp.h" #include #include -#include +struct llama_batch_ptr { + llama_batch batch; -int main(int argc, char ** argv) { - std::setlocale(LC_NUMERIC, "C"); + llama_batch_ptr(int32_t n_tokens, int32_t embd, int32_t n_seq_max) + : batch{llama_batch_init(n_tokens, embd, n_seq_max)} {} - common_params params; + ~llama_batch_ptr() { llama_batch_free(batch); } - params.prompt = "The quick brown fox"; - params.sampling.seed = 1234; + llama_batch_ptr(const llama_batch_ptr &) = 
delete; + llama_batch_ptr & operator=(const llama_batch_ptr &) = delete; + llama_batch_ptr(llama_batch_ptr &&) = default; + llama_batch_ptr & operator=(llama_batch_ptr &&) = default; - const std::string_view state_file = "dump_state.bin"; + llama_batch & get() { return batch; } + const llama_batch & get() const { return batch; } +}; - common_init(); +static std::string generate_tokens(llama_context * ctx, llama_sampler * smpl, int & n_past, int32_t n_predict, llama_seq_id seq_id) { + std::string result; + llama_batch_ptr batch(1, 0, 1); - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { - return 1; - } + for (int i = 0; i < n_predict; i++) { + auto next_token = llama_sampler_sample(smpl, ctx, -1); + auto next_token_str = common_token_to_piece(ctx, next_token); - if (params.n_parallel == 1) { - // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache - printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__); - params.kv_unified = true; - } + LOG("%s", next_token_str.c_str()); + result += next_token_str; - if (params.n_predict < 0) { - params.n_predict = 16; + common_batch_clear(batch.get()); + common_batch_add(batch.get(), next_token, n_past, {seq_id}, true); + + if (llama_decode(ctx, batch.get())) { + LOG_ERR("\n%s: failed to evaluate\n", __func__); + return {}; + } + n_past++; } - auto n_past = 0; + return result; +} - std::string result0; - std::string result1; - std::string result2; - std::string result3; +// Test 1: baseline +// - tokenize the prompt +// - decode all but the last token +// - save state to disk +// - decode the last token +// - generate n_predict tokens +static std::string test_baseline(struct llama_model * model, const struct common_params & params) { + auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))}; - // init + auto sparams = llama_sampler_chain_default_params(); + auto smpl = 
llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); - ggml_backend_load_all(); + auto tokens = common_tokenize(ctx.get(), params.prompt, true); - auto llama_init = common_init_from_params(params); + auto n_past = 0; + if (!common_prompt_batch_decode(ctx.get(), tokens, n_past, params.n_batch, params.out_file, true)) { + LOG_ERR("%s: failed to decode prompt\n", __func__); + return {}; + } - auto * model = llama_init->model(); - auto * ctx = llama_init->context(); + LOG("\n=== Test 1: baseline ===\n"); + LOG("%s", params.prompt.c_str()); - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0); + if (result.empty()) { + return {}; } - auto sparams = llama_sampler_chain_default_params(); + LOG("\n"); - llama_sampler * smpl = llama_sampler_chain_init(sparams); + return result; +} - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); - // tokenize prompt - auto tokens = common_tokenize(ctx, params.prompt, true); +// Test 2: state load +// - create a new context +// - load state from file +// - replay the last prompt token +// - generate n_predict tokens and compare against expected result +static bool test_state_load(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))}; - const bool save_state = true; - if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) { - return 1; - } + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); - // first run - printf("\nfirst run: %s", 
params.prompt.c_str()); + auto tokens = common_tokenize(ctx.get(), params.prompt, true); - llama_batch batch = llama_batch_init(1, 0, 1); + LOG("\n=== Test 2: state load ===\n"); + LOG("%s", params.prompt.c_str()); - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = common_token_to_piece(ctx, next_token); + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; - printf("%s", next_token_str.c_str()); - result0 += next_token_str; + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; + } - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); - if (llama_decode(ctx, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; } + n_past++; + + // Generate tokens + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0); + if (result.empty()) { + return false; + } + + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; + } + + LOG("\nPASS\n"); + return true; +} - printf("\n\n"); - // make new context - llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params)); +// Test 3: seq copy (host) +// - create a multi-seq context +// - load state from file +// - replay the last prompt token +// - migrate KV cache from seq 0 to seq 1 via the CPU path +// - generate n_predict tokens on seq 1 and compare against expected result +static bool test_seq_cp_host(struct llama_model * 
model, const struct common_params & params, const std::string & expected_result) { + auto params_ctx = common_context_params_to_llama(params); + params_ctx.n_seq_max = 2; + auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)}; - llama_sampler * smpl2 = llama_sampler_chain_init(sparams); + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); - llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed)); + auto tokens = common_tokenize(ctx.get(), params.prompt, true); - printf("\nsecond run: %s", params.prompt.c_str()); + LOG("\n=== Test 3: seq copy (host) ===\n"); + LOG("%s", params.prompt.c_str()); - // load state from file - std::vector unused_sts(tokens.size()); // unused session tokens. + // Load state from file + std::vector unused_sts(tokens.size()); size_t n_token_count_out = 0; - if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); - return 1; + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; } - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx2, tokens.back(), n_past)) { - return 1; + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; } - ++n_past; + n_past++; - // second run - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl2, ctx2, -1); - auto next_token_str = 
common_token_to_piece(ctx2, next_token); - - printf("%s", next_token_str.c_str()); - result1 += next_token_str; + // Migrate KV cache from seq 0 to seq 1 (CPU path) + { + std::vector seq_store(llama_state_seq_get_size(ctx.get(), 0)); + const size_t ncopy = llama_state_seq_get_data(ctx.get(), seq_store.data(), seq_store.size(), 0); + if (ncopy != seq_store.size()) { + LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy); - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); + llama_memory_clear(llama_get_memory(ctx.get()), true); + LOG_TRC("%s: kv cache cleared\n", __func__); - if (llama_decode(ctx2, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; + const size_t nset = llama_state_seq_set_data(ctx.get(), seq_store.data(), seq_store.size(), 1); + if (nset != seq_store.size()) { + LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + return false; } - n_past += 1; + LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset); } - printf("\n\n"); + // Generate tokens on seq 1 + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1); + if (result.empty()) { + return false; + } - if (result0 != result1) { - fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); - return 1; + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; } - // make new context - auto params_ctx3 = common_context_params_to_llama(params); - params_ctx3.n_seq_max = 2; - llama_context * ctx3 = llama_init_from_model(model, params_ctx3); + LOG("\nPASS\n"); + return true; +} + - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); +// Test 4: seq copy (device) +// - create a multi-seq 
context +// - load state from file +// - replay the last prompt token +// - migrate KV cache from seq 0 to seq 1 via the on-device path +// - generate n_predict tokens on seq 1 and compare against expected result +static bool test_seq_cp_device(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto params_ctx = common_context_params_to_llama(params); + params_ctx.n_seq_max = 2; + auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)}; - llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); - printf("\nsingle seq run: %s", params.prompt.c_str()); + auto tokens = common_tokenize(ctx.get(), params.prompt, true); - // load state (rng, logits, embedding and kv_cache) from file - n_token_count_out = 0; + LOG("\n=== Test 4: seq copy (device) ===\n"); + LOG("%s", params.prompt.c_str()); - if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); - return 1; + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; + + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; } - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx3, tokens.back(), n_past)) { - return 1; + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), 
tokens.back(), n_past)) { + return false; } - ++n_past; + n_past++; - // save seq 0 and load into seq 1 + // Migrate KV cache from seq 0 to seq 1 (on-device path) { - // save kv of seq 0 - std::vector seq_store(llama_state_seq_get_size(ctx3, 0)); - const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0); + std::vector seq_store(llama_state_seq_get_size_ext(ctx.get(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE)); + const size_t ncopy = llama_state_seq_get_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); if (ncopy != seq_store.size()) { - fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - return 1; + LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + return false; } - fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); + LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy); - // erase whole kv - llama_memory_clear(llama_get_memory(ctx3), true); - fprintf(stderr, "%s : kv cache cleared\n", __func__); + llama_memory_clear(llama_get_memory(ctx.get()), true); + LOG_TRC("%s: kv cache cleared\n", __func__); - // restore kv into seq 1 - const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1); + const size_t nset = llama_state_seq_set_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); if (nset != seq_store.size()) { - fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - return 1; + LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + return false; } - fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); + LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset); } - // third run with seq 1 instead of 0 - for (auto i = 0; i 
< params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl3, ctx3, -1); - auto next_token_str = common_token_to_piece(ctx3, next_token); - - printf("%s", next_token_str.c_str()); - result2 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {1}, true); + // Generate tokens on seq 1 + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1); + if (result.empty()) { + return false; + } - if (llama_decode(ctx3, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; } - // test on-device state save/load - auto params_ctx4 = common_context_params_to_llama(params); - params_ctx4.n_seq_max = 2; - llama_context * ctx4 = llama_init_from_model(model, params_ctx4); + LOG("\nPASS\n"); + return true; +} - llama_sampler * smpl4 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed)); +int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); - printf("\nsingle seq run: %s", params.prompt.c_str()); + common_params params; + params.prompt = "The quick brown fox"; + params.out_file = "dump_state.bin"; + params.sampling.seed = 1234; - // load state (rng, logits, embedding and kv_cache) from file - n_token_count_out = 0; + common_init(); - if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); - - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx4, tokens.back(), n_past)) { - return 1; + if 
(params.n_parallel == 1) { + LOG_TRC("%s: n_parallel == 1, enabling unified kv cache\n", __func__); + params.kv_unified = true; } - ++n_past; - - // save seq 0 and load into seq 1 - { - // save kv of seq 0 - std::vector seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE)); - const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - if (ncopy != seq_store.size()) { - fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); - - // erase whole kv - llama_memory_clear(llama_get_memory(ctx4), true); - fprintf(stderr, "%s : kv cache cleared\n", __func__); - // restore kv into seq 0 - const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - if (nset != seq_store.size()) { - fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); + if (params.n_predict < 0) { + params.n_predict = 16; } - // forth run - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl4, ctx4, -1); - auto next_token_str = common_token_to_piece(ctx4, next_token); - - printf("%s", next_token_str.c_str()); - result3 += next_token_str; + ggml_backend_load_all(); - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {1}, true); + auto llama_init = common_init_from_params(params, true); + auto * model = llama_init->model(); - if (llama_decode(ctx4, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; + if (model == nullptr) { + LOG_ERR("%s: failed to init\n", __func__); + return 1; } - printf("\n"); - - 
llama_sampler_free(smpl); - llama_sampler_free(smpl2); - llama_sampler_free(smpl3); - llama_sampler_free(smpl4); + GGML_ASSERT(llama_init->context() == nullptr); - llama_batch_free(batch); - - // this one is managed by common_init_result - //llama_free(ctx); + // Test 1: baseline (saves state to disk) + auto result_baseline = test_baseline(model, params); + if (result_baseline.empty()) { + return 1; + } - llama_free(ctx2); - llama_free(ctx3); - llama_free(ctx4); + // Test 2: state load + if (!test_state_load(model, params, result_baseline)) { + return 1; + } - if (result0 != result2) { - fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); + // Test 3: seq copy (host) + if (!test_seq_cp_host(model, params, result_baseline)) { return 1; } - if (result0 != result3) { - fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); + // Test 4: seq copy (device) + if (!test_seq_cp_device(model, params, result_baseline)) { return 1; } - fprintf(stderr, "\n%s : success\n", __func__); + LOG("\nAll tests passed.\n"); return 0; } From 3c81c8deeabba01fa40869325ea80d07eef75fc6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 09:46:58 +0300 Subject: [PATCH 08/12] server : print graphs reused in slot timings (#23279) Add graphs reused counter to the per-slot timing output, printed via llama_perf_context(). 
Assisted-by: llama.cpp:local pi Co-authored-by: ggerganov --- tools/server/server-context.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6b16c6b4962..88b207ad556 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -467,20 +467,26 @@ struct server_slot { const double n_gen_second = 1e3 / t_token_generation * n_decoded; SLT_INF(*this, - "\n" - "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second); + + SLT_INF(*this, + " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + t_token_generation, n_decoded, t_gen, n_gen_second); + + SLT_INF(*this, " total time = %10.2f ms / %5d tokens\n", - t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, - t_token_generation, n_decoded, t_gen, n_gen_second, t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); + SLT_INF(*this, + " graphs reused = %10d\n", + llama_perf_context(ctx_tgt).n_reused); + if (n_draft_total > 0) { const float draft_ratio = (float) n_draft_accepted / n_draft_total; - SLT_CNT(*this, - "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n", - draft_ratio, n_draft_accepted, n_draft_total - ); + SLT_INF(*this, + "draft acceptance = %0.5f (%5d accepted / %5d generated)\n", + draft_ratio, n_draft_accepted, n_draft_total); } common_speculative_print_stats(spec); From ccee42642677005555b28c6ef93760e2604348e8 Mon Sep 17 00:00:00 2001 From: Pascal Date: Tue, 19 May 2026 08:49:01 +0200 Subject: [PATCH 09/12] server-context: guarantee there is at least 1 token to decode (#23280) --- 
tools/server/server-context.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 88b207ad556..dc3189e1705 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2589,9 +2589,9 @@ struct server_context_impl { llama_pos pos_next = slot.prompt.tokens.pos_next(n_past); // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, pos_next - n_swa); + const auto pos_min_thold = std::max(0, pos_next - n_swa - 1); - if (n_past > 0 && n_past < slot.prompt.n_tokens()) { + if (n_past > 0 && n_past <= slot.prompt.n_tokens()) { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); if (pos_min == -1) { SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min); From 00c461ce1a9deb238eed40a8f869a72729fa3d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 09:06:56 +0200 Subject: [PATCH 10/12] ci : install server kleidiai runner dependencies (#23259) --- .github/workflows/server-self-hosted.yml | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index d06ad3d24c5..3522681d9d1 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -152,6 +152,32 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + - name: Dependencies + id: depends + run: | + set -euxo pipefail + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ + apt-get install -y \ + build-essential \ + python3-venv \ + gpg \ + wget \ + time \ + git-lfs + + git lfs install + + # install the latest cmake + sudo install -d 
/usr/share/keyrings + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor \ + | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null + echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \ + | sudo tee /etc/apt/sources.list.d/kitware.list + sudo apt-get update + sudo apt-get install -y cmake + - name: Build id: cmake_build run: | From 4b262ab662d46fd9dd1d53671b82c09d8b0af024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 10:11:04 +0200 Subject: [PATCH 11/12] ci : install libssl-dev (#23325) --- .github/workflows/server-self-hosted.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index 3522681d9d1..857c72a4619 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -160,6 +160,7 @@ jobs: sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ apt-get install -y \ build-essential \ + libssl-dev \ python3-venv \ gpg \ wget \ From 6db130445d29b243ee2171efb8cd61b84a1c5322 Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Tue, 19 May 2026 10:16:04 +0200 Subject: [PATCH 12/12] ui: Bump packages + address build warnings (#23300) * chore: Update vulnerable packages * chore: Formatting * refactor: Update Tailwind CSS imports * ci: Use `ubuntu-latest` for Unit/E2E UI tests * chore: Bump package * fix: Add missing tag * refactor: Enums files naming --- .github/workflows/ui-ci.yml | 4 +- tools/ui/.gitignore | 2 +- tools/ui/eslint.config.js | 5 +- tools/ui/package-lock.json | 54 ++++---- tools/ui/src/app.css | 5 +- ...tAttachmentsPreviewCurrentItemVideo.svelte | 1 + .../MarkdownContent/MarkdownContent.svelte | 2 +- .../settings/SettingsChat/SettingsChat.svelte | 2 +- .../SettingsChat/SettingsChatFields.svelte | 2 +- tools/ui/src/lib/constants/mcp.ts | 2 +- 
.../ui/src/lib/constants/settings-registry.ts | 4 +- .../src/lib/constants/supported-file-types.ts | 2 +- tools/ui/src/lib/constants/tools.ts | 2 +- .../enums/{agentic.ts => agentic.enums.ts} | 0 .../{attachment.ts => attachment.enums.ts} | 0 .../src/lib/enums/{chat.ts => chat.enums.ts} | 0 .../lib/enums/{files.ts => files.enums.ts} | 0 tools/ui/src/lib/enums/index.ts | 22 ++-- .../enums/{keyboard.ts => keyboard.enums.ts} | 0 .../ui/src/lib/enums/{mcp.ts => mcp.enums.ts} | 0 .../lib/enums/{model.ts => model.enums.ts} | 0 .../lib/enums/{server.ts => server.enums.ts} | 0 .../enums/{settings.ts => settings.enums.ts} | 0 .../lib/enums/{tools.ts => tools.enums.ts} | 0 tools/ui/src/lib/enums/{ui.ts => ui.enums.ts} | 0 tools/ui/src/lib/types/mcp.d.ts | 4 +- tools/ui/src/routes/+layout.svelte | 29 ++-- tools/ui/vitest-setup-client.ts | 124 +++++++++--------- tools/ui/vitest.shims.d.ts | 1 + 29 files changed, 138 insertions(+), 129 deletions(-) rename tools/ui/src/lib/enums/{agentic.ts => agentic.enums.ts} (100%) rename tools/ui/src/lib/enums/{attachment.ts => attachment.enums.ts} (100%) rename tools/ui/src/lib/enums/{chat.ts => chat.enums.ts} (100%) rename tools/ui/src/lib/enums/{files.ts => files.enums.ts} (100%) rename tools/ui/src/lib/enums/{keyboard.ts => keyboard.enums.ts} (100%) rename tools/ui/src/lib/enums/{mcp.ts => mcp.enums.ts} (100%) rename tools/ui/src/lib/enums/{model.ts => model.enums.ts} (100%) rename tools/ui/src/lib/enums/{server.ts => server.enums.ts} (100%) rename tools/ui/src/lib/enums/{settings.ts => settings.enums.ts} (100%) rename tools/ui/src/lib/enums/{tools.ts => tools.enums.ts} (100%) rename tools/ui/src/lib/enums/{ui.ts => ui.enums.ts} (100%) create mode 100644 tools/ui/vitest.shims.d.ts diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui-ci.yml index 7f6f467ddaa..761a9319414 100644 --- a/.github/workflows/ui-ci.yml +++ b/.github/workflows/ui-ci.yml @@ -41,7 +41,7 @@ jobs: ui-checks: name: UI Checks needs: ui-build - runs-on: 
ubuntu-slim + runs-on: ubuntu-latest continue-on-error: true steps: - name: Checkout code @@ -93,7 +93,7 @@ jobs: e2e-tests: name: E2E Tests needs: ui-build - runs-on: ubuntu-slim + runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v6 diff --git a/tools/ui/.gitignore b/tools/ui/.gitignore index 051d884b08e..22ed6125f4c 100644 --- a/tools/ui/.gitignore +++ b/tools/ui/.gitignore @@ -25,4 +25,4 @@ vite.config.ts.timestamp-* *storybook.log storybook-static -*.code-workspace \ No newline at end of file +*.code-workspace diff --git a/tools/ui/eslint.config.js b/tools/ui/eslint.config.js index 185da1dabbe..4ed9dd7ca3a 100644 --- a/tools/ui/eslint.config.js +++ b/tools/ui/eslint.config.js @@ -20,9 +20,7 @@ export default ts.config( prettier, ...svelte.configs.prettier, { - languageOptions: { - globals: { ...globals.browser, ...globals.node } - }, + languageOptions: { globals: { ...globals.browser, ...globals.node } }, rules: { // typescript-eslint strongly recommend that you do not use the no-undef lint rule on TypeScript projects. 
// see: https://typescript-eslint.io/troubleshooting/faqs/eslint/#i-get-errors-from-the-no-undef-rule-about-global-variables-not-being-defined-even-though-there-are-no-typescript-errors @@ -30,6 +28,7 @@ export default ts.config( 'svelte/no-at-html-tags': 'off', // This app uses hash-based routing (#/) where resolve() from $app/paths does not apply 'svelte/no-navigation-without-resolve': 'off', + // Enforce empty line at end of file 'eol-last': 'error' } diff --git a/tools/ui/package-lock.json b/tools/ui/package-lock.json index 3686eb3261e..4d012c81990 100644 --- a/tools/ui/package-lock.json +++ b/tools/ui/package-lock.json @@ -2307,9 +2307,9 @@ } }, "node_modules/@sveltejs/kit": { - "version": "2.59.1", - "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz", - "integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==", + "version": "2.60.1", + "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.60.1.tgz", + "integrity": "sha512-mQjlkNo+rJvpln7V2IGY2j99BqhcFbS4UN0AQNKNYfhBAFZTuCDAdW3a1sgf330mvtNvsBXn3HpAhcmvdJTcIQ==", "dev": true, "license": "MIT", "dependencies": { @@ -2318,7 +2318,7 @@ "@types/cookie": "^0.6.0", "acorn": "^8.14.1", "cookie": "^0.6.0", - "devalue": "^5.6.4", + "devalue": "^5.8.1", "esm-env": "^1.2.2", "kleur": "^4.1.5", "magic-string": "^0.30.5", @@ -4296,9 +4296,9 @@ } }, "node_modules/devalue": { - "version": "5.6.4", - "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.6.4.tgz", - "integrity": "sha512-Gp6rDldRsFh/7XuouDbxMH3Mx8GMCcgzIb1pDTvNyn8pZGQ22u+Wa+lGV9dQCltFQ7uVw0MhRyb8XDskNFOReA==", + "version": "5.8.1", + "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.8.1.tgz", + "integrity": "sha512-4CXDYRBGqN+57wVJkuXBYmpAVUSg3L6JAQa/DFqm238G73E1wuyc/JhGQJzN7vUf/CMphYau2zXbfWzDR5aTEw==", "license": "MIT" }, "node_modules/devlop": { @@ -4856,12 +4856,12 @@ } }, "node_modules/express-rate-limit": { - "version": "8.5.0", - "resolved": 
"https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.0.tgz", - "integrity": "sha512-XKhFohWaSBdVJNTi5TaHziqnPkv04I9UQV6q1Wy7Ui6GGQZVW12ojDFwqer14EvCXxjvPG0CyWXx7cAXpALB4Q==", + "version": "8.5.2", + "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.2.tgz", + "integrity": "sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==", "license": "MIT", "dependencies": { - "ip-address": "10.1.0" + "ip-address": "^10.2.0" }, "engines": { "node": ">= 16" @@ -4909,9 +4909,9 @@ "license": "MIT" }, "node_modules/fast-uri": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz", - "integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz", + "integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==", "funding": [ { "type": "github", @@ -5541,9 +5541,9 @@ } }, "node_modules/hono": { - "version": "4.12.14", - "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.14.tgz", - "integrity": "sha512-am5zfg3yu6sqn5yjKBNqhnTX7Cv+m00ox+7jbaKkrLMRJ4rAdldd1xPd/JzbBWspqaQv6RSTrgFN95EsfhC+7w==", + "version": "4.12.19", + "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.19.tgz", + "integrity": "sha512-xa3eYXYXx68XTT4hZ7dRzsXBhaq85ToSrlUJNoR0gwz/1Ap/CNwX47wfvV7pc/xWhjKVVkLT7zBJy8chhNguqQ==", "license": "MIT", "engines": { "node": ">=16.9.0" @@ -5722,9 +5722,9 @@ "license": "MIT" }, "node_modules/ip-address": { - "version": "10.1.0", - "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz", - "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==", + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz", + "integrity": 
"sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==", "license": "MIT", "engines": { "node": ">= 12" @@ -9245,9 +9245,9 @@ } }, "node_modules/svelte": { - "version": "5.55.1", - "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.1.tgz", - "integrity": "sha512-QjvU7EFemf6mRzdMGlAFttMWtAAVXrax61SZYHdkD6yoVGQ89VeyKfZD4H1JrV1WLmJBxWhFch9H6ig/87VGjw==", + "version": "5.55.7", + "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.7.tgz", + "integrity": "sha512-ymI5ykLPwIHW839E053FQbI1G+jnRFJEw3Kv5Y4njixVWywQBx+NUFpkkKyk5LIb36Fg9DVXSYpqiGekLD0hyw==", "license": "MIT", "dependencies": { "@jridgewell/remapping": "^2.3.4", @@ -9259,7 +9259,7 @@ "aria-query": "5.3.1", "axobject-query": "^4.1.0", "clsx": "^2.1.1", - "devalue": "^5.6.4", + "devalue": "^5.8.1", "esm-env": "^1.2.1", "esrap": "^2.2.4", "is-reference": "^3.0.3", @@ -10606,9 +10606,9 @@ "license": "ISC" }, "node_modules/ws": { - "version": "8.18.3", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz", - "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==", + "version": "8.20.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz", + "integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==", "dev": true, "license": "MIT", "engines": { diff --git a/tools/ui/src/app.css b/tools/ui/src/app.css index d6dc6670c0c..29b1d3c640b 100644 --- a/tools/ui/src/app.css +++ b/tools/ui/src/app.css @@ -1,6 +1,7 @@ @import 'tailwindcss'; -@source "."; - +@source '.'; +@plugin '@tailwindcss/forms'; +@plugin '@tailwindcss/typography'; @import 'tw-animate-css'; @custom-variant dark (&:is(.dark *)); diff --git a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte 
b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte index 4ebbd592280..62040b36f9d 100644 --- a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte @@ -15,6 +15,7 @@ {#if videoSrc} {:else} diff --git a/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte b/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte index 3a11854b6e4..0412414ae39 100644 --- a/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte +++ b/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte @@ -28,7 +28,7 @@ SETTINGS_KEYS } from '$lib/constants'; import { ColorMode, UrlProtocol } from '$lib/enums'; - import { FileTypeText } from '$lib/enums/files'; + import { FileTypeText } from '$lib/enums/files.enums'; import { highlightCode, detectIncompleteCodeBlock, type IncompleteCodeBlock } from '$lib/utils'; import '$styles/katex-custom.scss'; import githubDarkCss from 'highlight.js/styles/github-dark.css?inline'; diff --git a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte index 109c8ff9dac..d017fe20469 100644 --- a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte +++ b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte @@ -17,7 +17,7 @@ } from '$lib/constants'; import { RouterService } from '$lib/services/router.service'; import { setMode } from 'mode-watcher'; - import { ColorMode } from '$lib/enums/ui'; + import { ColorMode } from '$lib/enums/ui.enums'; import { fade } from 
'svelte/transition'; import { goto } from '$app/navigation'; import { page } from '$app/state'; diff --git a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte index 069855eebef..7c1c5c89776 100644 --- a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte +++ b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte @@ -6,7 +6,7 @@ import * as Select from '$lib/components/ui/select'; import { Textarea } from '$lib/components/ui/textarea'; import { SETTING_CONFIG_INFO, SETTINGS_KEYS } from '$lib/constants'; - import { SettingsFieldType } from '$lib/enums/settings'; + import { SettingsFieldType } from '$lib/enums/settings.enums'; import { settingsStore } from '$lib/stores/settings.svelte'; import { serverStore } from '$lib/stores/server.svelte'; import { modelsStore, selectedModelName, propsCacheVersion } from '$lib/stores/models.svelte'; diff --git a/tools/ui/src/lib/constants/mcp.ts b/tools/ui/src/lib/constants/mcp.ts index 19bdd92ea75..918eb9f94b7 100644 --- a/tools/ui/src/lib/constants/mcp.ts +++ b/tools/ui/src/lib/constants/mcp.ts @@ -2,7 +2,7 @@ import { Zap, Globe, Radio } from '@lucide/svelte'; import { MCPTransportType } from '$lib/enums'; import type { ClientCapabilities, Implementation } from '$lib/types'; import type { Component } from 'svelte'; -import { MimeTypeImage } from '$lib/enums/files'; +import { MimeTypeImage } from '$lib/enums/files.enums'; export const DEFAULT_CLIENT_VERSION = '1.0.0'; export const MCP_CLIENT_NAME = 'llama-ui-mcp'; diff --git a/tools/ui/src/lib/constants/settings-registry.ts b/tools/ui/src/lib/constants/settings-registry.ts index bdbb17d962c..93b3cd5edb5 100644 --- a/tools/ui/src/lib/constants/settings-registry.ts +++ b/tools/ui/src/lib/constants/settings-registry.ts @@ -1,5 +1,5 @@ -import { ColorMode } from '$lib/enums/ui'; -import { SettingsFieldType } from 
'$lib/enums/settings'; +import { ColorMode } from '$lib/enums/ui.enums'; +import { SettingsFieldType } from '$lib/enums/settings.enums'; import { SyncableParameterType } from '$lib/enums'; import { Funnel, diff --git a/tools/ui/src/lib/constants/supported-file-types.ts b/tools/ui/src/lib/constants/supported-file-types.ts index 34505438916..4141161548d 100644 --- a/tools/ui/src/lib/constants/supported-file-types.ts +++ b/tools/ui/src/lib/constants/supported-file-types.ts @@ -18,7 +18,7 @@ import { MimeTypeApplication, MimeTypeText } from '$lib/enums'; -import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files'; +import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files.enums'; // File type configuration using enums export const AUDIO_FILE_TYPES = { diff --git a/tools/ui/src/lib/constants/tools.ts b/tools/ui/src/lib/constants/tools.ts index 22b22309c39..efc3476cd79 100644 --- a/tools/ui/src/lib/constants/tools.ts +++ b/tools/ui/src/lib/constants/tools.ts @@ -1,4 +1,4 @@ -import { ToolSource } from '$lib/enums/tools'; +import { ToolSource } from '$lib/enums/tools.enums'; export const TOOL_GROUP_LABELS = { [ToolSource.BUILTIN]: 'Built-in', diff --git a/tools/ui/src/lib/enums/agentic.ts b/tools/ui/src/lib/enums/agentic.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/agentic.ts rename to tools/ui/src/lib/enums/agentic.enums.ts diff --git a/tools/ui/src/lib/enums/attachment.ts b/tools/ui/src/lib/enums/attachment.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/attachment.ts rename to tools/ui/src/lib/enums/attachment.enums.ts diff --git a/tools/ui/src/lib/enums/chat.ts b/tools/ui/src/lib/enums/chat.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/chat.ts rename to tools/ui/src/lib/enums/chat.enums.ts diff --git a/tools/ui/src/lib/enums/files.ts b/tools/ui/src/lib/enums/files.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/files.ts rename to tools/ui/src/lib/enums/files.enums.ts diff 
--git a/tools/ui/src/lib/enums/index.ts b/tools/ui/src/lib/enums/index.ts index 3cf81286bc1..a17cca1d8e1 100644 --- a/tools/ui/src/lib/enums/index.ts +++ b/tools/ui/src/lib/enums/index.ts @@ -4,9 +4,9 @@ export { AttachmentItemEnabledWhen, AttachmentAction, AttachmentItemVisibleWhen -} from './attachment'; +} from './attachment.enums'; -export { AgenticSectionType, ToolCallType } from './agentic'; +export { AgenticSectionType, ToolCallType } from './agentic.enums'; export { ChatMessageStatsView, @@ -17,7 +17,7 @@ export { MessageType, PdfViewMode, ReasoningFormat -} from './chat'; +} from './chat.enums'; export { FileTypeCategory, @@ -38,7 +38,7 @@ export { MimeTypeImage, MimeTypeText, SpecialFileType -} from './files'; +} from './files.enums'; export { MCPConnectionPhase, @@ -48,16 +48,16 @@ export { MCPContentType, MCPRefType, JsonSchemaType -} from './mcp'; +} from './mcp.enums'; -export { ModelModality } from './model'; +export { ModelModality } from './model.enums'; -export { ServerRole, ServerModelStatus } from './server'; +export { ServerRole, ServerModelStatus } from './server.enums'; -export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings'; +export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings.enums'; -export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui'; +export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui.enums'; -export { KeyboardKey } from './keyboard'; +export { KeyboardKey } from './keyboard.enums'; -export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools'; +export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools.enums'; diff --git a/tools/ui/src/lib/enums/keyboard.ts b/tools/ui/src/lib/enums/keyboard.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/keyboard.ts rename to tools/ui/src/lib/enums/keyboard.enums.ts diff --git 
a/tools/ui/src/lib/enums/mcp.ts b/tools/ui/src/lib/enums/mcp.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/mcp.ts rename to tools/ui/src/lib/enums/mcp.enums.ts diff --git a/tools/ui/src/lib/enums/model.ts b/tools/ui/src/lib/enums/model.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/model.ts rename to tools/ui/src/lib/enums/model.enums.ts diff --git a/tools/ui/src/lib/enums/server.ts b/tools/ui/src/lib/enums/server.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/server.ts rename to tools/ui/src/lib/enums/server.enums.ts diff --git a/tools/ui/src/lib/enums/settings.ts b/tools/ui/src/lib/enums/settings.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/settings.ts rename to tools/ui/src/lib/enums/settings.enums.ts diff --git a/tools/ui/src/lib/enums/tools.ts b/tools/ui/src/lib/enums/tools.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/tools.ts rename to tools/ui/src/lib/enums/tools.enums.ts diff --git a/tools/ui/src/lib/enums/ui.ts b/tools/ui/src/lib/enums/ui.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/ui.ts rename to tools/ui/src/lib/enums/ui.enums.ts diff --git a/tools/ui/src/lib/types/mcp.d.ts b/tools/ui/src/lib/types/mcp.d.ts index 7aa050cdfa7..2a292614203 100644 --- a/tools/ui/src/lib/types/mcp.d.ts +++ b/tools/ui/src/lib/types/mcp.d.ts @@ -1,5 +1,5 @@ -import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp'; -import type { ToolSource } from '$lib/enums/tools'; +import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp.enums'; +import type { ToolSource } from '$lib/enums/tools.enums'; import type { Client, ClientCapabilities as SDKClientCapabilities, diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte index b35d20a5cd5..78227df3ce7 100644 --- a/tools/ui/src/routes/+layout.svelte +++ b/tools/ui/src/routes/+layout.svelte @@ -7,11 +7,13 @@ import { untrack } 
from 'svelte'; import { onMount } from 'svelte'; import { fade } from 'svelte/transition'; + import { DesktopIconStrip, DialogConversationTitleUpdate, SidebarNavigation } from '$lib/components/app'; + import { conversationsStore } from '$lib/stores/conversations.svelte'; import * as Sidebar from '$lib/components/ui/sidebar/index.js'; import * as Tooltip from '$lib/components/ui/tooltip'; @@ -30,26 +32,29 @@ import { conversations } from '$lib/stores/conversations.svelte'; let { children } = $props(); - let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop); let isMobile = new IsMobile(); let isDesktop = $derived(!isMobile.current); let sidebarOpen = $state(false); let mounted = $state(false); let innerHeight = $state(); + let chatSidebar: - | { activateSearchMode?: () => void; editActiveConversation?: () => void } + | { + activateSearchMode?: () => void; + editActiveConversation?: () => void; + } | undefined = $state(); let titleUpdateDialogOpen = $state(false); let titleUpdateCurrentTitle = $state(''); let titleUpdateNewTitle = $state(''); let titleUpdateResolve: ((value: boolean) => void) | null = null; - const panelNav = useSettingsNavigation(); function navigateToConversation(direction: -1 | 1) { const allConvs = conversations(); + if (allConvs.length === 0) return; const currentId = page.params.id; @@ -61,6 +66,7 @@ } const idx = allConvs.findIndex((c) => c.id === currentId); + if (idx === -1) return; const targetIdx = idx + direction; @@ -75,9 +81,7 @@ // Global keyboard shortcuts const { handleKeydown } = useKeyboardShortcuts({ editActiveConversation: () => chatSidebar?.editActiveConversation?.(), - navigateToPrevConversation: () => navigateToConversation(-1), - navigateToNextConversation: () => navigateToConversation(1) }); @@ -139,6 +143,7 @@ $effect(() => { if (alwaysShowSidebarOnDesktop && isDesktop) { sidebarOpen = true; + return; } }); @@ -175,6 +180,7 @@ // Only fetch router models once when we have models loaded and in router 
mode if (isRouter && modelsCount > 0 && !routerModelsFetched) { routerModelsFetched = true; + untrack(() => { modelsStore.fetchRouterModels(); }); @@ -223,7 +229,6 @@ -
- - - + {#if !(alwaysShowSidebarOnDesktop && isDesktop) && !(panelNav.isSettingsRoute && !isDesktop)} {#if mounted} @@ -266,9 +271,9 @@ /> {/if} - - {@render children?.()} - + {@render children?.()}
diff --git a/tools/ui/vitest-setup-client.ts b/tools/ui/vitest-setup-client.ts index 0b753db02b1..90994442eb2 100644 --- a/tools/ui/vitest-setup-client.ts +++ b/tools/ui/vitest-setup-client.ts @@ -9,70 +9,72 @@ import { beforeEach, vi } from 'vitest'; beforeEach(() => { const originalFetch = globalThis.fetch; - vi.spyOn(globalThis, 'fetch').mockImplementation(async (input: RequestInfo | URL, init?: RequestInit) => { - const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; + vi.spyOn(globalThis, 'fetch').mockImplementation( + async (input: RequestInfo | URL, init?: RequestInit) => { + const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; - // Mock server props endpoint - if (url.includes('/server')) { - return new Response( - JSON.stringify({ - mode: 'router', - version: 'test', - git_commit: 'test', - git_branch: 'test' - }), - { status: 200, headers: { 'Content-Type': 'application/json' } } - ); - } + // Mock server props endpoint + if (url.includes('/server')) { + return new Response( + JSON.stringify({ + mode: 'router', + version: 'test', + git_commit: 'test', + git_branch: 'test' + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } - // Mock models list endpoint - if (/\/v1\/models|\/models\b/.test(url)) { - return new Response( - JSON.stringify({ - object: 'list', - data: [ - { - id: 'test-model.gguf', - object: 'model', - owned_by: 'llamacpp', - created: 0, - in_cache: false, - path: 'models/test-model.gguf', - status: { value: 'unloaded' }, - meta: {} - } - ], - models: [ - { - model: 'test-model.gguf', - name: 'Test Model', - details: {} - } - ] - }), - { status: 200, headers: { 'Content-Type': 'application/json' } } - ); - } + // Mock models list endpoint + if (/\/v1\/models|\/models\b/.test(url)) { + return new Response( + JSON.stringify({ + object: 'list', + data: [ + { + id: 'test-model.gguf', + object: 'model', + owned_by: 'llamacpp', + 
created: 0, + in_cache: false, + path: 'models/test-model.gguf', + status: { value: 'unloaded' }, + meta: {} + } + ], + models: [ + { + model: 'test-model.gguf', + name: 'Test Model', + details: {} + } + ] + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } - // Mock /props endpoint (used for modalities) - if (url.includes('/props')) { - return new Response( - JSON.stringify({ - default_generation_settings: { n_ctx: 2048 } - }), - { status: 200, headers: { 'Content-Type': 'application/json' } } - ); - } + // Mock /props endpoint (used for modalities) + if (url.includes('/props')) { + return new Response( + JSON.stringify({ + default_generation_settings: { n_ctx: 2048 } + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } - // Mock /tools endpoint (used for built-in tools list) - if (url.includes('/tools')) { - return new Response(JSON.stringify([]), { - status: 200, - headers: { 'Content-Type': 'application/json' } - }); - } + // Mock /tools endpoint (used for built-in tools list) + if (url.includes('/tools')) { + return new Response(JSON.stringify([]), { + status: 200, + headers: { 'Content-Type': 'application/json' } + }); + } - // Default: use real fetch - return originalFetch(input, init); - }); + // Default: use real fetch + return originalFetch(input, init); + } + ); }); diff --git a/tools/ui/vitest.shims.d.ts b/tools/ui/vitest.shims.d.ts new file mode 100644 index 00000000000..03b1801a60c --- /dev/null +++ b/tools/ui/vitest.shims.d.ts @@ -0,0 +1 @@ +///