From c3e9ade6dd3ff2a1ceafd2d59062634715b472c4 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Tue, 19 May 2026 09:42:36 +0300
Subject: [PATCH 01/12] rpc : keep last_graph_uid in the device context
 (#23273)

With the introduction of MTP we can have multiple compute contexts for
the same RPC device. In this case last_graph_uid is not updated properly
when contexts are being switched. This patch fixes this by moving
last_graph_uid to the device context, making sure it is always updated.

closes: #23242
---
 ggml/src/ggml-rpc/ggml-rpc.cpp | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 1cb8f563d85..d3805772183 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -199,6 +199,14 @@ static ggml_guid_t ggml_backend_rpc_guid() {
     return &guid;
 }
 
+struct ggml_backend_rpc_device_context {
+    std::string endpoint;       // endpoint string given to ggml_backend_rpc_add_server
+    uint32_t    device;         // device number (set from `ind` in ggml_backend_rpc_add_server)
+    std::string name;           // device name, built as "RPC" + device id
+    std::string description;    // device description (a copy of the endpoint string)
+    uint64_t    last_graph_uid; // uid of the last graph sent in full (0 initially); a matching cgraph->uid sends a recompute request instead
+};
+
 struct ggml_backend_rpc_buffer_type_context {
     std::string endpoint;
     uint32_t    device;
@@ -211,7 +219,6 @@ struct ggml_backend_rpc_context {
     std::string endpoint;
     uint32_t    device;
     std::string name;
-    uint64_t    last_graph_uid;
 };
 
 struct ggml_backend_rpc_buffer_context {
@@ -691,9 +698,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve
 
 static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+    ggml_backend_dev_t rpc_dev = ggml_backend_get_device(backend);
+    ggml_backend_rpc_device_context * rpc_dev_ctx = (ggml_backend_rpc_device_context *)rpc_dev->context;
 
     GGML_ASSERT(cgraph->n_nodes > 0);
-    bool reuse = cgraph->uid != 0 && rpc_ctx->last_graph_uid == cgraph->uid;
+    bool reuse = cgraph->uid != 0 && rpc_dev_ctx->last_graph_uid == cgraph->uid;
     if (reuse) {
         rpc_msg_graph_recompute_req request;
         request.device = rpc_ctx->device;
@@ -701,7 +710,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
         bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request));
         RPC_STATUS_ASSERT(status);
     } else {
-        rpc_ctx->last_graph_uid = cgraph->uid;
+        rpc_dev_ctx->last_graph_uid = cgraph->uid;
         std::vector<uint8_t> input;
         serialize_graph(rpc_ctx->device, cgraph, input);
         auto sock = get_socket(rpc_ctx->endpoint);
@@ -770,7 +779,6 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
         /* .endpoint       = */ endpoint,
         /* .device         = */ device,
         /* .name           = */ dev_name,
-        /* .last_graph_uid = */ 0,
     };
     auto reg = ggml_backend_rpc_add_server(endpoint);
     ggml_backend_t backend = new ggml_backend {
@@ -1757,15 +1765,6 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir
     }
 }
 
-// device interface
-
-struct ggml_backend_rpc_device_context {
-    std::string endpoint;
-    uint32_t    device;
-    std::string name;
-    std::string description;
-};
-
 static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
     ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
 
@@ -1947,10 +1946,11 @@ ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) {
         std::string dev_name = "RPC" + std::to_string(dev_id);
         std::string dev_desc = std::string(endpoint);
         ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context {
-            /* .endpoint    = */ endpoint,
-            /* .device      = */ ind,
-            /* .name        = */ dev_name,
-            /* .description = */ dev_desc
+            /* .endpoint    = */    endpoint,
+            /* .device      = */    ind,
+            /* .name        = */    dev_name,
+            /* .description = */    dev_desc,
+            /* .last_graph_uid = */ 0,
         };
 
         ggml_backend_dev_t dev = new ggml_backend_device {

From 439f1b193d2d7d8db4d2b70cbf63e3afcbb38df8 Mon Sep 17 00:00:00 2001
From: Intel AI Get-to Market Customer Success and Solutions
 <ai.gtm.css@gmail.com>
Date: Mon, 18 May 2026 23:44:02 -0700
Subject: [PATCH 02/12] sycl: add GGML_SYCL_USE_ASYNC_MEM_OP env toggle
 (#22153)

* sycl: add GGML_SYCL_USE_ASYNC_MEM_OP env toggle

Signed-off-by: Chun Tao <chun.tao@intel.com>

* Use async mem ops for correctness when SYCL graphs are explicitly on.

Signed-off-by: Tao, Chun <chun.tao@intel.com>

---------

Signed-off-by: Chun Tao <chun.tao@intel.com>
Signed-off-by: Tao, Chun <chun.tao@intel.com>
Co-authored-by: Chun Tao <chun.tao@intel.com>
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index ebe7c5b351c..2ea47f7153a 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -72,6 +72,7 @@ int g_ggml_sycl_disable_graph = 0;
 int g_ggml_sycl_disable_dnn = 0;
 int g_ggml_sycl_prioritize_dmmv = 0;
 int g_ggml_sycl_use_async_mem_op = 0;
+int g_ggml_sycl_use_async_mem_op_requested = 1;
 int g_ggml_sycl_enable_level_zero = 0;
 int g_ggml_sycl_enable_flash_attention = 1;
 
@@ -304,6 +305,8 @@ static void ggml_check_sycl() try {
         GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
 #endif
         GGML_LOG_INFO("  GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
+        g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
+        GGML_LOG_INFO("  GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);
 
 #ifdef SYCL_FLASH_ATTN
         GGML_LOG_INFO("  GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention);
@@ -319,11 +322,11 @@ static void ggml_check_sycl() try {
         fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 */
-        // Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be
-        // properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in
-        // other places.
+        // Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder
+        // staging path while preserving queue ordering semantics. It is requested unless GGML_SYCL_USE_ASYNC_MEM_OP=0
+        // is set with graphs disabled, but is still only compiled in when GGML_SYCL_GRAPH and the extension exist.
 #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
-        g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
+        g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph;
         if (g_ggml_sycl_use_async_mem_op) {
             for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
                 if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {

From f1c1c5c057f047562b637db0ac7eac11485307bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Tue, 19 May 2026 08:44:25 +0200
Subject: [PATCH 03/12] convert : filter lora tensor names (#23077)

---
 convert_lora_to_gguf.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index 1b7334617d1..81658ba03d8 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -445,6 +445,11 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                     if self.lazy:
                         tensor = LazyTorchTensor.from_eager(tensor)
                     base_name = get_base_tensor_name(name)
+                    # filter base name, ignore tensor transformations for now
+                    data_gen = lambda g=tensor: g  # noqa: E731
+                    if (titem := self.filter_tensors((base_name, data_gen))) is None:
+                        continue
+                    base_name, _ = titem
                     # note: mergekit-extract-lora also adds token embeddings to the adapter
                     is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
                     is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name

From aabee047d8ebf7abe2750585a347aa19feced3b5 Mon Sep 17 00:00:00 2001
From: Neo Zhang <zhang.jianyu@outlook.com>
Date: Tue, 19 May 2026 14:44:51 +0800
Subject: [PATCH 04/12] [SYCL] add chapter for performance reference in SYCL.md
 (#23315)

* add chapter for performance reference

* rm unsupported GPU
---
 README.md            |  2 +-
 docs/backend/SYCL.md | 13 ++++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index a0c14b9d7f0..71327e51453 100644
--- a/README.md
+++ b/README.md
@@ -280,7 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Metal](docs/build.md#metal-build) | Apple Silicon |
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
-| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [SYCL](docs/backend/SYCL.md) | Intel GPU |
 | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 155f933b805..0c4660b541c 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -5,6 +5,7 @@
 - [News](#news)
 - [OS](#os)
 - [Hardware](#hardware)
+- [Performance Reference](#performance-reference)
 - [Docker](#docker)
 - [Linux](#linux)
 - [Windows](#windows)
@@ -51,9 +52,8 @@ The packages for FP32 and FP16 would have different accuracy and performance on
 
 ## News
 
-- 2026.04
-
-  - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0.
+- 2026.04-05
+  - Optimize mul_mat with the reorder feature for data types: Q4_K, Q5_K, Q6_K, Q8_0.
   - Fused MoE.
   - Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package.
 
@@ -150,6 +150,13 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
 
 NA
 
+## Performance Reference
+
+
+For the list of supported LLMs and GPUs, along with reference performance numbers, see [Performance of llama.cpp on Intel GPU with SYCL backend](https://github.com/ggml-org/llama.cpp/discussions/23313).
+
+You can add your own test results to that discussion directly.
+
 ## Docker
 
 The docker build option is currently limited to *Intel GPU* targets.

From c85a242ed021ab6732e2973764437c3c5655102b Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Mon, 18 May 2026 23:45:41 -0700
Subject: [PATCH 05/12] ggml-webgpu : extend GDN for K>1 (#23299)

---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          |  2 ++
 .../wgsl-shaders/gated_delta_net.wgsl         | 24 +++++++++++++++----
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 78cb02be06d..921c12b41ac 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1234,6 +1234,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx,
     const uint32_t h        = (uint32_t) src2->ne[1];
     const uint32_t n_tokens = (uint32_t) src2->ne[2];
     const uint32_t n_seqs   = (uint32_t) src2->ne[3];
+    const uint32_t K        = (uint32_t) src5->ne[1];
     const float    scale    = 1.0f / sqrtf((float) s_v);
     uint32_t       scale_u32;
     memcpy(&scale_u32, &scale, sizeof(scale_u32));
@@ -1258,6 +1259,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx,
 
         (uint32_t) src0->ne[1],
         (uint32_t) (src2->ne[3] / src0->ne[3]),
+        K,
         scale_u32,
     };
 
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl
index f9d98fda40b..d68520f8282 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl
@@ -39,6 +39,7 @@ struct Params {
 
     neq1: u32,
     rq3: u32,
+    K: u32,
     scale: f32,
 };
 
@@ -62,11 +63,14 @@ fn main(
     let iq3 = seq_id / params.rq3;
 
     let state_size = S_V * S_V;
-    let state_base = (seq_id * params.h + head_id) * state_size;
+    let state_in_base = (seq_id * params.K * params.h + head_id) * state_size;
+    let state_out_base = (seq_id * params.h + head_id) * state_size;
+    let state_size_per_snap = state_size * params.h * params.n_seqs;
+    let shift = i32(params.n_tokens) - i32(params.K);
 
     var state: array<f32, S_V>;
     for (var i = 0u; i < S_V; i++) {
-        state[i] = src_state[state_base + col * S_V + i];
+        state[i] = src_state[state_in_base + col * S_V + i];
     }
 
     var attn_off = (seq_id * params.n_tokens * params.h + head_id) * S_V;
@@ -123,10 +127,22 @@ fn main(
         dst[attn_off + col] = attn_col * params.scale;
         attn_off += S_V * params.h;
 
+        if (params.K > 1u) {
+            let target_slot = i32(t) - shift;
+            if (target_slot >= 0 && target_slot < i32(params.K)) {
+                let slot_base = params.s_off + u32(target_slot) * state_size_per_snap + state_out_base;
+                for (var i = 0u; i < S_V; i++) {
+                    dst[slot_base + col * S_V + i] = state[i];
+                }
+            }
+        }
+
         workgroupBarrier();
     }
 
-    for (var i = 0u; i < S_V; i++) {
-        dst[params.s_off + state_base + col * S_V + i] = state[i];
+    if (params.K == 1u) {
+        for (var i = 0u; i < S_V; i++) {
+            dst[params.s_off + state_out_base + col * S_V + i] = state[i];
+        }
     }
 }

From d2e179a477fc1d1935b68422c1181ef2d62ed2ef Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 19 May 2026 09:46:05 +0300
Subject: [PATCH 06/12] llama-eval : add per-task summary stats (#23151)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* llama-eval : add per-problem summary table to HTML reports

- Add chunk_idx and problem_idx to TaskState and saved case dicts
- Group completed cases by problem_idx in dump_html()
- Render per-problem summary table before individual task table
  - Columns: Problem (zero-padded), Runs, Correct (n/r),
    Tokens (min/avg/max), T/s (min/avg/max), Gen s (min/avg/max)
  - Sorted by problem index, monospace font, right-aligned numbers
  - Colspan headers for grouped stats, auto width
- Simulator: add /v1/models endpoint, timings in response,
  template-aware question matching, --dataset arg (aime/aime2025)

Assisted-by: llama.cpp:local pi

* llama-eval : add tabs for Detailed and Summary tables, apply monospace font globally

- Wrap Detailed and Summary tables in switchable tabs (Detailed active by default)
- Remove summary-section wrapper, use tab labels instead
- Apply monospace font to all tables and the top bar

Assisted-by: llama.cpp:local pi

* llama-eval : redesign top bar as CSS grid label/value pairs

- Replace flat span list with 4-column grid layout (2 pairs per row)
- Labels in muted color (#888), values in dark (#222)
- Bold dataset name and model name
- Removed media query, always uses 4 columns

Assisted-by: llama.cpp:local pi

* llama-eval : use realistic token counts and throughput in simulator

- comp_tokens: [30, 80] → [10000, 60000]
- tps_gen: derived → uniform [90.0, 110.0]
- t_gen_ms: now computed from tokens/tps

Assisted-by: llama.cpp:local pi

* llama-eval : color Answer column green/red based on correctness

Use the same .correct/.incorrect CSS classes on the Answer column
to make correct answers green and incorrect answers red.

Assisted-by: llama.cpp:local pi

* llama-eval : fix pyright errors from max(..., key=len) type inference

Use key=lambda x: len(x) instead of key=len so the type checker
infers the return type as str instead of Sized, fixing:
  - unresolved-attribute: Object of type Sized has no attribute lower
  - not-subscriptable: Cannot subscript object of type Sized

Assisted-by: llama.cpp:local pi
---
 examples/llama-eval/llama-eval.py             | 189 ++++++++++++++----
 examples/llama-eval/llama-server-simulator.py |  99 +++++++--
 2 files changed, 233 insertions(+), 55 deletions(-)

diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py
index e833070eee9..4bdd239c007 100755
--- a/examples/llama-eval/llama-eval.py
+++ b/examples/llama-eval/llama-eval.py
@@ -149,6 +149,8 @@ class TaskState:
     t_gen_ms: Optional[float] = None
     reasoning_content: Optional[str] = None
     server_name: Optional[str] = None
+    chunk_idx: int = 0
+    problem_idx: int = 0
 
 
 class EvalState:
@@ -233,7 +235,9 @@ def add_result(
         tps_gen: Optional[float] = None,
         t_gen_ms: Optional[float] = None,
         reasoning_content: Optional[str] = None,
-        server_name: Optional[str] = None
+        server_name: Optional[str] = None,
+        chunk_idx: int = 0,
+        problem_idx: int = 0,
     ):
         with self._lock:
             if "cases" not in self.task_states:
@@ -252,7 +256,9 @@ def add_result(
                 "tps_gen": tps_gen,
                 "t_gen_ms": t_gen_ms,
                 "reasoning_content": reasoning_content,
-                "server_name": server_name
+                "server_name": server_name,
+                "chunk_idx": chunk_idx,
+                "problem_idx": problem_idx,
             }
 
             self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False))
@@ -289,6 +295,9 @@ def dump(self):
             all_cases = {}
             for i, task_id in tasks_to_save:
                 question_text, prompt, expected = self.get_case(i)
+                # Extract chunk_idx from task_id for pending cases
+                _parts = task_id.rsplit("_", 2)
+                _chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0
                 if task_id in self.task_states.get("cases", {}):
                     all_cases[task_id] = self.task_states["cases"][task_id]
                 else:
@@ -306,7 +315,9 @@ def dump(self):
                         "tps_gen": None,
                         "t_gen_ms": None,
                         "reasoning_content": None,
-                        "server_name": None
+                        "server_name": None,
+                        "chunk_idx": _chunk_idx,
+                        "problem_idx": i,
                     }
 
             ci_lower, ci_upper = self.accuracy_ci()
@@ -382,11 +393,12 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A
             grader_log_str = self._escape_html(json.dumps(grader_log, indent=2))
             escaped_server = self._escape_html(server_name)
 
+            answer_class = status_class if status == "ok" else ""
             rows.append(f"""<tr class="task-row" onclick="toggleDetails('{task_id}')">
                 <td>{task_id}</td>
                 <td class="{status_class}">{status_text}</td>
                 <td>{self._escape_html(expected)}</td>
-                <td>{self._escape_html(answer)}</td>
+                <td class="{answer_class}">{self._escape_html(answer)}</td>
                 <td>{tokens_str}</td>
                 <td>{tps_str}</td>
                 <td>{t_gen_str}</td>
@@ -405,6 +417,53 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A
 
         rows_html = "\n".join(rows)
 
+        # ---- per-problem summary table ----
+        problem_groups: Dict[int, List[Dict[str, Any]]] = {}
+        for _tid, _case in cases.items():
+            if _case.get("status") != "ok":
+                continue
+            _pidx = _case.get("problem_idx")
+            if _pidx is None:
+                _p_parts = _tid.rsplit("_", 2)
+                _pidx = int(_p_parts[-1]) if len(_p_parts) >= 3 else 0
+            problem_groups.setdefault(_pidx, []).append(_case)
+
+        summary_rows_html = ""
+        if problem_groups:
+            def _stat(v, fmt=".1f", avg_fmt=None):
+                if not v:
+                    return ("–", "–", "–")
+                af = fmt if avg_fmt is None else avg_fmt
+                return (f"{min(v):{fmt}}", f"{sum(v)/len(v):{af}}", f"{max(v):{fmt}}")
+
+            summary_data = []
+            for pidx, g in problem_groups.items():
+                runs = len(g)
+                n_ok = sum(1 for c in g if c.get("correct", False))
+                toks = [c["tokens"] for c in g if c.get("tokens") is not None]
+                tps = [c["tps_gen"] for c in g if c.get("tps_gen") is not None]
+                tg = [c["t_gen_ms"] / 1000 for c in g if c.get("t_gen_ms") is not None]
+                summary_data.append((
+                    pidx, runs, n_ok,
+                    _stat(toks, "d", ".0f"),
+                    _stat(tps),
+                    _stat(tg),
+                ))
+
+            summary_data.sort(key=lambda r: r[0])  # sort by problem index ascending
+
+            summary_rows_html = "\n".join(
+                f"""<tr class="summary-row">
+                    <td>{p:03d}</td>
+                    <td>{r}</td>
+                    <td>{n}/{r}</td>
+                    <td>{tk[0]}</td><td>{tk[1]}</td><td>{tk[2]}</td>
+                    <td>{tp[0]}</td><td>{tp[1]}</td><td>{tp[2]}</td>
+                    <td>{tg[0]}</td><td>{tg[1]}</td><td>{tg[2]}</td>
+                </tr>"""
+                for p, r, n, tk, tp, tg in summary_data
+            )
+
         html_content = f"""<!DOCTYPE html>
 <html>
 <head>
@@ -412,10 +471,10 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A
 <title>{self.dataset_type.upper()} Eval</title>
 <style>
         body {{ font-family: system-ui, sans-serif; margin: 0; padding: 16px; background: #fff; color: #222; }}
-        .bar {{ padding: 8px 0; font-size: 14px; color: #555; }}
-        .bar span {{ margin-right: 20px; }}
-        .bar b {{ color: #222; }}
-        table {{ width: 100%; border-collapse: collapse; font-size: 13px; }}
+        .bar {{ padding: 8px 0; font-size: 13px; color: #555; font-family: 'SF Mono', 'Menlo', 'Consolas', monospace; display: grid; grid-template-columns: auto 1fr auto 1fr; gap: 2px 12px; align-items: baseline; }}
+        .bar .label {{ color: #888; }}
+        .bar .value {{ color: #222; }}
+        table {{ width: 100%; border-collapse: collapse; font-size: 13px; font-family: 'SF Mono', 'Menlo', 'Consolas', monospace; }}
         th {{ text-align: left; padding: 6px 8px; border-bottom: 2px solid #ccc; font-weight: 600; }}
         td {{ padding: 4px 8px; border-bottom: 1px solid #eee; vertical-align: top; }}
         .task-row {{ cursor: pointer; }}
@@ -429,37 +488,88 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A
         .details-content {{ padding: 8px 16px; background: #f6f8fa; font-size: 12px; }}
         .details-content b {{ color: #555; }}
         .details-content pre {{ background: #fff; border: 1px solid #e1e4e8; padding: 8px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word; margin: 4px 0 8px; }}
+        .summary-table {{ margin-bottom: 16px; font-size: 13px; width: 100%; }}
+        .summary-row {{ background: #fafbfc; }}
+        .summary-row:hover {{ background: #f5f5f5; }}
+        .summary-table th {{ text-align: right; font-weight: 600; }}
+        .summary-table th:first-child {{ text-align: left; }}
+        .summary-table th[colspan] {{ text-align: center; }}
+        .summary-table td {{ text-align: right; }}
+        .summary-table td:first-child {{ text-align: left; }}
+        .tabs {{ display: flex; border-bottom: 2px solid #ddd; margin: 12px 0 0; }}
+        .tab-btn {{ padding: 6px 16px; border: none; background: none; font-size: 13px; cursor: pointer; color: #555; border-bottom: 2px solid transparent; margin-bottom: -2px; font-weight: 500; }}
+        .tab-btn:hover {{ color: #222; }}
+        .tab-btn.active {{ color: #222; border-bottom-color: #222; font-weight: 600; }}
+        .tab-content {{ display: none; }}
+        .tab-content.active {{ display: block; }}
 </style>
 </head>
 <body>
     <div class="bar">
-        <span><b>{self.dataset_type.upper()}</b></span>
-        <span>Model: {self.model_name or 'N/A'}</span>
-        <span>Accuracy: <b>{accuracy:.1f}%</b> [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]</span>
-        <span>Correct: <span class="correct">{n_correct}</span> / {len(completed)}</span>
-        <span>Pending: {n_pending}</span>
-        <span>Time: {self.total_time:.1f}s</span>
-        <span>Sampling: {sampling_str}</span>
+        <div class="label">Dataset</div><div class="value"><b>{self.dataset_type.upper()}</b></div>
+        <div class="label">Model</div><div class="value"><b>{self.model_name or 'N/A'}</b></div>
+        <div class="label">Accuracy</div><div class="value"><b>{accuracy:.1f}%</b> [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]</div>
+        <div class="label">Correct</div><div class="value"><span class="correct">{n_correct}</span> / {len(completed)}</div>
+        <div class="label">Pending</div><div class="value">{n_pending}</div>
+        <div class="label">Time</div><div class="value">{self.total_time:.1f}s</div>
+        <div class="label">Sampling</div><div class="value">{sampling_str}</div>
+    </div>
+    <div class="tabs">
+        <button class="tab-btn active" data-tab="detailed" onclick="switchTab(this)">Detailed</button>
+        <button class="tab-btn" data-tab="summary" onclick="switchTab(this)">Summary</button>
+    </div>
+    <div id="tab-detailed" class="tab-content active">
+        <table>
+            <thead>
+                <tr>
+                    <th>ID</th>
+                    <th></th>
+                    <th>Gold</th>
+                    <th>Answer</th>
+                    <th>Tokens</th>
+                    <th>T/s</th>
+                    <th>Gen s</th>
+                    <th>Server</th>
+                </tr>
+            </thead>
+            <tbody>
+                {rows_html}
+            </tbody>
+        </table>
+    </div>
+    <div id="tab-summary" class="tab-content">
+        <table class="summary-table">
+            <thead>
+                <tr>
+                    <th>Problem</th>
+                    <th>Runs</th>
+                    <th>Correct</th>
+                    <th colspan="3">Tokens</th>
+                    <th colspan="3">T/s</th>
+                    <th colspan="3">Gen s</th>
+                </tr>
+                <tr>
+                    <th></th>
+                    <th></th>
+                    <th></th>
+                    <th>min</th><th>avg</th><th>max</th>
+                    <th>min</th><th>avg</th><th>max</th>
+                    <th>min</th><th>avg</th><th>max</th>
+                </tr>
+            </thead>
+            <tbody>
+                {summary_rows_html}
+            </tbody>
+        </table>
     </div>
-    <table>
-        <thead>
-            <tr>
-                <th>ID</th>
-                <th></th>
-                <th>Gold</th>
-                <th>Answer</th>
-                <th>Tokens</th>
-                <th>T/s</th>
-                <th>Gen s</th>
-                <th>Server</th>
-            </tr>
-        </thead>
-        <tbody>
-            {rows_html}
-        </tbody>
-    </table>
     <script>
         function toggleDetails(id) {{ document.getElementById('details-'+id).classList.toggle('open'); }}
+        function switchTab(btn) {{
+            document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
+            document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
+            btn.classList.add('active');
+            document.getElementById('tab-'+btn.dataset.tab).classList.add('active');
+        }}
     </script>
 </body>
 </html>"""
@@ -1062,12 +1172,19 @@ def _process_single_case(
     ) -> TaskState:
         question_text, prompt, expected = eval_state.get_case(i)
 
+        # Extract chunk_idx from task_id: "{dataset_type}_{chunk_idx:03d}_{index:03d}"
+        _parts = task_id.rsplit("_", 2)
+        chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0
+        problem_idx = i
+
         task_state = TaskState(
             task_id=task_id,
             prompt=prompt,
             expected=expected,
             question_text=question_text,
-            server_name=server_config.name
+            server_name=server_config.name,
+            chunk_idx=chunk_idx,
+            problem_idx=problem_idx,
         )
 
         try:
@@ -1085,7 +1202,8 @@ def _process_single_case(
                 eval_state.add_result(
                     task_id, prompt, expected, result, None,
                     {"finish_reason": finish_reason}, False, task_state.status,
-                    tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name
+                    tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name,
+                    chunk_idx, problem_idx,
                 )
                 eval_state.dump()
                 return task_state
@@ -1108,7 +1226,8 @@ def _process_single_case(
             eval_state.add_result(
                 task_id, prompt, expected, result, answer,
                 grader_log, is_correct, "ok",
-                tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name
+                tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name,
+                chunk_idx, problem_idx,
             )
 
             eval_state.dump()
diff --git a/examples/llama-eval/llama-server-simulator.py b/examples/llama-eval/llama-server-simulator.py
index 2f9cdc5450d..e64ba89335d 100755
--- a/examples/llama-eval/llama-server-simulator.py
+++ b/examples/llama-eval/llama-server-simulator.py
@@ -65,34 +65,70 @@ def normalize_number(s: str) -> Optional[int]:
     return int(match.group(0))
 
 class AimeDataset:
-    def __init__(self, split: str = "train"):
+    def __init__(self, split: str = "train", dataset_type: str = "aime"):
         self.split = split
+        self.dataset_type = dataset_type
         self.questions: List[Dict] = []
         self._load_dataset()
 
-    def _load_dataset(self):
-        print(f"Loading AIME dataset (split: {self.split})...")
+    def _get_question_text(self, question: Dict) -> str:
+        """Get question text, handling different dataset field names."""
+        return question.get("problem", question.get("question", ""))
 
-        cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
-        if cache_path.exists():
-            print(f"Using cached dataset from {cache_path}")
-            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
+    def _load_dataset(self):
+        if self.dataset_type == "aime":
+            print(f"Loading AIME dataset (split: {self.split})...")
+            cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
+            if cache_path.exists():
+                print(f"Using cached dataset from {cache_path}")
+                ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
+            else:
+                ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
+        elif self.dataset_type == "aime2025":
+            print(f"Loading AIME2025 dataset...")
+            ds_list = []
+            for config_name in ["AIME2025-I", "AIME2025-II"]:
+                cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "opencompass___AIME2025" / "default" / "0.0.0"
+                if cache_path.exists():
+                    print(f"Using cached dataset from {cache_path}")
+                    ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_path))
+                else:
+                    ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test")
+                ds_list.extend(ds)
+            ds = ds_list
         else:
-            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
+            raise ValueError(f"Unknown dataset type: {self.dataset_type}")
 
         self.questions = list(ds)
-        print(f"AIME dataset loaded: {len(self.questions)} questions")
+        print(f"{self.dataset_type} dataset loaded: {len(self.questions)} questions")
 
     def find_question(self, request_text: str) -> Optional[Dict]:
+        # Strip common template prefixes to get the actual question text
+        # Templates include things like "Solve the following math problem step by step..."
+        # The actual question usually follows a blank line or after the template instruction
+        cleaned = request_text
+        # Split on double newline and take the part that looks like the problem
+        parts = cleaned.split('\n\n')
+        if len(parts) > 1:
+            # Find the part that's longest (likely the actual problem text)
+            problem_parts = [p for p in parts if len(p.strip()) > 100]
+            if problem_parts:
+                cleaned = max(problem_parts, key=lambda x: len(x))
+
         best_match = None
         best_distance = -1
         best_index = -1
 
         for i, question in enumerate(self.questions):
-            question_text = question["problem"]
-            request_lower = request_text.lower()
+            question_text = self._get_question_text(question)
+            request_lower = cleaned.lower()
             question_lower = question_text.lower()
 
+            # Check if question text is contained in the cleaned request
+            if question_lower in request_lower or request_lower in question_lower:
+                debug_log(f"DEBUG: Found substring match at index {i}")
+                return question
+
             # Exact match
             if question_lower == request_lower:
                 debug_log(f"DEBUG: Found exact match at index {i}")
@@ -118,7 +154,7 @@ def find_question(self, request_text: str) -> Optional[Dict]:
             debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
             return best_match
 
-        debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
+        debug_log(f"DEBUG: No matching question found for cleaned: {cleaned[:100]}...")
         return None
 
     def get_answer(self, question: Dict) -> str:
@@ -134,15 +170,16 @@ def __init__(
         port: int = 8033,
         host: str = "localhost",
         success_rate: float = 0.8,
-        dataset_split: str = "train"
+        dataset_split: str = "train",
+        dataset_type: str = "aime"
     ):
         self.port = port
         self.host = host
         self.success_rate = success_rate
-        self.dataset = AimeDataset(dataset_split)
+        self.dataset = AimeDataset(dataset_split, dataset_type)
         self.eval_state = EvalState(
-            id="aime-2025",
-            tasks=["aime"],
+            id=dataset_type,
+            tasks=[dataset_type],
             task_states={},
             sampling_config={"temperature": 0, "max_tokens": 2048}
         )
@@ -159,6 +196,10 @@ def _generate_response(
         else:
             response_text = self._generate_wrong_answer(question)
 
+        comp_tokens = random.randint(10000, 60000)
+        tps_gen = random.uniform(90.0, 110.0)
+        t_gen_ms = comp_tokens / tps_gen * 1000
+
         return {
             "id": f"chatcmpl-{int(time.time())}",
             "object": "chat.completion",
@@ -176,8 +217,12 @@ def _generate_response(
             ],
             "usage": {
                 "prompt_tokens": 100,
-                "completion_tokens": 50,
-                "total_tokens": 150
+                "completion_tokens": comp_tokens,
+                "total_tokens": 100 + comp_tokens
+            },
+            "timings": {
+                "predicted_ms": t_gen_ms,
+                "predicted_per_second": tps_gen
             }
         }
 
@@ -218,6 +263,12 @@ def _process_request(self, request_data: Dict) -> Dict:
         return response
 
 class RequestHandler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path == "/v1/models":
+            self._send_json({"data": [{"id": "llama", "object": "model"}]}, 200)
+            return
+        self._send_json({"error": "Not found"}, 404)
+
     def do_POST(self):
         if self.path != "/v1/chat/completions":
             self._send_json({"error": "Not found"}, 404)
@@ -280,6 +331,13 @@ def main():
         default=0.8,
         help="Success rate 0-1 (default: 0.8)"
     )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="aime",
+        choices=["aime", "aime2025"],
+        help="Dataset type (default: aime)"
+    )
     parser.add_argument(
         "--dataset-split",
         type=str,
@@ -294,7 +352,8 @@ def main():
         port=args.port,
         host=args.host,
         success_rate=args.success_rate,
-        dataset_split=args.dataset_split
+        dataset_split=args.dataset_split,
+        dataset_type=args.dataset
     )
 
     server = HTTPServer((args.host, args.port), RequestHandler)
@@ -304,7 +363,7 @@ def main():
     print("\n=== llama-server-simulator ===")
     print(f"Server running on http://{args.host}:{args.port}")
     print(f"Success rate: {args.success_rate}")
-    print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
+    print(f"{args.dataset} dataset loaded: {len(simulator.dataset.questions)} questions")
     print("\nPress Ctrl+C to stop\n")
 
     try:

From cd963fee6a86387d598ebe3888017376d6e9e8f6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 19 May 2026 09:46:34 +0300
Subject: [PATCH 07/12] save-load-state : refactor tests and improve
 readability (#23196)

* save-load-state : refactor into separate phase functions

- Split monolithic main() into 4 self-contained phase functions, each
  managing its own context/sampler/batch lifecycle
- Each function tokenizes internally using its local ctx instance
- main() is now a clean orchestrator: init -> run phases -> assert results
- Proper resource cleanup on every exit path (return {} on error)

Assisted-by: llama.cpp:local pi

* save-load-state : use params.out_file instead of separate state_file

- Remove state_file parameter from all phase functions
- Each function accesses params.out_file directly
- Initialize params.out_file in main alongside params.prompt

Assisted-by: llama.cpp:local pi

* save-load-state : use smart pointers for ctx and smpl

- Replace raw llama_context* with llama_context_ptr
- Replace raw llama_sampler* with llama_sampler_ptr
- Remove all manual llama_free() and llama_sampler_free() calls
- Keep llama_batch as raw (managed manually with llama_batch_free)

Assisted-by: llama.cpp:local pi

* save-load-state : add local llama_batch_ptr RAII wrapper

- Add llama_batch_ptr struct holding llama_batch by value
- Calls llama_batch_free() in destructor
- Eliminates all manual llama_batch_free() calls

Assisted-by: llama.cpp:local pi

* save-load-state : replace printf/fprintf with logging macros

- Add log.h include
- Replace fprintf(stderr, ...) errors with LOG_ERR
- Replace fprintf(stderr, ...) info with LOG_TRC
- Replace printf output with LOG

Assisted-by: llama.cpp:local pi

* save-load-state : refactor tests to check results inline

Each follow-up phase now accepts an expected result and performs
the comparison internally instead of collecting results in main().

Assisted-by: llama.cpp:local pi

* save-load-state : improve test output readability

Add phase labels, remove redundant run prefixes, and show
PASS after each test.

Assisted-by: llama.cpp:local pi

* pi : add rule about git signing

* save-load-state : simplify llama_batch_ptr

Change get() to return a reference and remove operator*().
Use batch.get() throughout for consistency.

Assisted-by: llama.cpp:local pi

* save-load-state : extract generate_tokens helper

Factor out the repeated token generation loop into a shared
helper function used by all phases.

Assisted-by: llama.cpp:local pi

* save-load-state : update comments to use test terminology

Replace "Phase" with "Test" and list each test's steps
as bullet points.

Assisted-by: llama.cpp:local pi

* save-load-state : rename test functions

Rename to test_baseline, test_state_load, test_seq_cp_host,
test_seq_cp_device. Update comments and logs accordingly.

Assisted-by: llama.cpp:local pi

* pi : add rule to never git push without confirmation

Assisted-by: llama.cpp:local pi

* common : add model_only option to common_init_from_params

Add bool model_only parameter to skip context creation,
sampler init, and context-dependent setup.

Use in save-load-state to initialize only the model,
with each test creating its own context.

Assisted-by: llama.cpp:local pi

---------

Co-authored-by: ggerganov <ggerganov@users.noreply.github.com>
---
 .pi/gg/SYSTEM.md                             |   2 +
 common/common.cpp                            |  14 +-
 common/common.h                              |   4 +-
 examples/save-load-state/save-load-state.cpp | 453 ++++++++++---------
 4 files changed, 254 insertions(+), 219 deletions(-)

diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md
index 727a850b183..b7597a4c3ae 100644
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -22,6 +22,8 @@ Pull requests (PRs):
 Commits:
 - On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
 - Do not explicitly set the git author in commits - rely on the default git config
+- Always use `--no-gpg-sign` when committing
+- Never `git push` without explicit confirmation from the user
 
 Resources (read on demand):
 - [CONTRIBUTING.md](CONTRIBUTING.md)
diff --git a/common/common.cpp b/common/common.cpp
index 9cf11ea9f5f..aef06263e3f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1160,7 +1160,7 @@ struct common_init_result::impl {
     std::vector<llama_sampler_seq_config> samplers_seq_config;
 };
 
-common_init_result::common_init_result(common_params & params) :
+common_init_result::common_init_result(common_params & params, bool model_only) :
     pimpl(new impl{}) {
     auto mparams = common_model_params_to_llama(params);
     auto cparams = common_context_params_to_llama(params);
@@ -1183,6 +1183,10 @@ common_init_result::common_init_result(common_params & params) :
 
     pimpl->model.reset(model);
 
+    if (model_only) {
+        return;
+    }
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // load and optionally apply lora adapters
@@ -1309,8 +1313,8 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
     return pimpl->lora;
 }
 
-common_init_result_ptr common_init_from_params(common_params & params) {
-    common_init_result_ptr res(new common_init_result(params));
+common_init_result_ptr common_init_from_params(common_params & params, bool model_only) {
+    common_init_result_ptr res(new common_init_result(params, model_only));
 
     llama_model * model = res->model();
     if (model == NULL) {
@@ -1318,6 +1322,10 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         return res;
     }
 
+    if (model_only) {
+        return res;
+    }
+
     llama_context * lctx = res->context();
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
diff --git a/common/common.h b/common/common.h
index 1d3d788b2de..e03f7037454 100644
--- a/common/common.h
+++ b/common/common.h
@@ -857,7 +857,7 @@ struct common_sampler;
 
 // note: defines the model, context, samplers, ets. lifetimes
 struct common_init_result {
-    common_init_result(common_params & params);
+    common_init_result(common_params & params, bool model_only = false);
     ~common_init_result();
 
     llama_model * model();
@@ -875,7 +875,7 @@ struct common_init_result {
 
 using common_init_result_ptr = std::unique_ptr<common_init_result>;
 
-common_init_result_ptr common_init_from_params(common_params & params);
+common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false);
 
 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index e6f5e9802ab..97ab7c6de3b 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,320 +1,345 @@
 #include "arg.h"
 #include "common.h"
-#include "llama.h"
+#include "log.h"
+#include "llama-cpp.h"
 
 #include <clocale>
 #include <vector>
-#include <cstdio>
 
+struct llama_batch_ptr {
+    llama_batch batch; // held by value; released by the destructor
 
-int main(int argc, char ** argv) {
-    std::setlocale(LC_NUMERIC, "C");
+    llama_batch_ptr(int32_t n_tokens, int32_t embd, int32_t n_seq_max)
+        : batch{llama_batch_init(n_tokens, embd, n_seq_max)} {}
 
-    common_params params;
+    ~llama_batch_ptr() { llama_batch_free(batch); }
 
-    params.prompt = "The quick brown fox";
-    params.sampling.seed = 1234;
+    llama_batch_ptr(const llama_batch_ptr &) = delete;
+    llama_batch_ptr & operator=(const llama_batch_ptr &) = delete;
+    llama_batch_ptr(llama_batch_ptr &&) = delete;             // a defaulted move would copy `batch` as-is,
+    llama_batch_ptr & operator=(llama_batch_ptr &&) = delete; // so two destructors would llama_batch_free() it
 
-    const std::string_view state_file = "dump_state.bin";
+    llama_batch & get() { return batch; }
+    const llama_batch & get() const { return batch; }
+};
 
-    common_init();
+static std::string generate_tokens(llama_context * ctx, llama_sampler * smpl, int & n_past, int32_t n_predict, llama_seq_id seq_id) {
+    std::string result;
+    llama_batch_ptr batch(1, 0, 1);
 
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
-        return 1;
-    }
+    for (int i = 0; i < n_predict; i++) {
+        auto next_token     = llama_sampler_sample(smpl, ctx, -1);
+        auto next_token_str = common_token_to_piece(ctx, next_token);
 
-    if (params.n_parallel == 1) {
-        // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache
-        printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
-        params.kv_unified = true;
-    }
+        LOG("%s", next_token_str.c_str());
+        result += next_token_str;
 
-    if (params.n_predict < 0) {
-        params.n_predict = 16;
+        common_batch_clear(batch.get());
+        common_batch_add(batch.get(), next_token, n_past, {seq_id}, true);
+
+        if (llama_decode(ctx, batch.get())) {
+            LOG_ERR("\n%s: failed to evaluate\n", __func__);
+            return {};
+        }
+        n_past++;
     }
 
-    auto n_past = 0;
+    return result;
+}
 
-    std::string result0;
-    std::string result1;
-    std::string result2;
-    std::string result3;
+// Test 1: baseline
+// - tokenize the prompt
+// - decode all but the last token
+// - save state to disk
+// - decode the last token
+// - generate n_predict tokens
+static std::string test_baseline(struct llama_model * model, const struct common_params & params) {
+    auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))};
 
-    // init
+    auto sparams = llama_sampler_chain_default_params();
+    auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
+    llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
 
-    ggml_backend_load_all();
+    auto tokens = common_tokenize(ctx.get(), params.prompt, true);
 
-    auto llama_init = common_init_from_params(params);
+    auto n_past = 0;
+    if (!common_prompt_batch_decode(ctx.get(), tokens, n_past, params.n_batch, params.out_file, true)) {
+        LOG_ERR("%s: failed to decode prompt\n", __func__);
+        return {};
+    }
 
-    auto * model = llama_init->model();
-    auto * ctx   = llama_init->context();
+    LOG("\n=== Test 1: baseline ===\n");
+    LOG("%s", params.prompt.c_str());
 
-    if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
-        return 1;
+    auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0);
+    if (result.empty()) {
+        return {};
     }
 
-    auto sparams = llama_sampler_chain_default_params();
+    LOG("\n");
 
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    return result;
+}
 
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
 
-    // tokenize prompt
-    auto tokens = common_tokenize(ctx, params.prompt, true);
+// Test 2: state load
+// - create a new context
+// - load state from file
+// - replay the last prompt token
+// - generate n_predict tokens and compare against expected result
+static bool test_state_load(struct llama_model * model, const struct common_params & params, const std::string & expected_result) {
+    auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))};
 
-    const bool save_state = true;
-    if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) {
-        return 1;
-    }
+    auto sparams = llama_sampler_chain_default_params();
+    auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
+    llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
 
-    // first run
-    printf("\nfirst run: %s", params.prompt.c_str());
+    auto tokens = common_tokenize(ctx.get(), params.prompt, true);
 
-    llama_batch batch = llama_batch_init(1, 0, 1);
+    LOG("\n=== Test 2: state load ===\n");
+    LOG("%s", params.prompt.c_str());
 
-    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl, ctx, -1);
-        auto next_token_str = common_token_to_piece(ctx, next_token);
+    // Load state from file
+    std::vector<llama_token> unused_sts(tokens.size());
+    size_t n_token_count_out = 0;
 
-        printf("%s", next_token_str.c_str());
-        result0 += next_token_str;
+    if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
+        LOG_ERR("\n%s: failed to load state\n", __func__);
+        return false;
+    }
 
-        common_batch_clear(batch);
-        common_batch_add(batch, next_token, n_past, {0}, true);
+    LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out);
 
-        if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_batch_free(batch);
-            return 1;
-        }
-        n_past += 1;
+    // Replay last token
+    int n_past = (int) n_token_count_out;
+    if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) {
+        return false;
     }
+    n_past++;
+
+    // Generate tokens
+    auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0);
+    if (result.empty()) {
+        return false;
+    }
+
+    if (result != expected_result) {
+        LOG_ERR("\n%s: error: generation differs from expected\n", __func__);
+        return false;
+    }
+
+    LOG("\nPASS\n");
+    return true;
+}
 
-    printf("\n\n");
 
-    // make new context
-    llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params));
+// Test 3: seq copy (host)
+// - create a multi-seq context
+// - load state from file
+// - replay the last prompt token
+// - migrate KV cache from seq 0 to seq 1 via the CPU path
+// - generate n_predict tokens on seq 1 and compare against expected result
+static bool test_seq_cp_host(struct llama_model * model, const struct common_params & params, const std::string & expected_result) {
+    auto params_ctx = common_context_params_to_llama(params);
+    params_ctx.n_seq_max = 2; // the test uses two sequence ids: 0 (loaded state) and 1 (copy target)
+    auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)};
 
-    llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
+    auto sparams = llama_sampler_chain_default_params();
+    auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
+    llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
 
-    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));
+    auto tokens = common_tokenize(ctx.get(), params.prompt, true);
 
-    printf("\nsecond run: %s", params.prompt.c_str());
+    LOG("\n=== Test 3: seq copy (host) ===\n");
+    LOG("%s", params.prompt.c_str());
 
-    // load state from file
-    std::vector<llama_token> unused_sts(tokens.size()); // unused session tokens.
+    // Load state from file
+    std::vector<llama_token> unused_sts(tokens.size());
     size_t n_token_count_out = 0;
 
-    if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
-        fprintf(stderr, "\n%s : failed to load state\n", __func__);
-        return 1;
+    if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
+        LOG_ERR("\n%s: failed to load state\n", __func__);
+        return false;
     }
 
-    fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
+    LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out);
 
-    // restore state (last tokens)
-    n_past = n_token_count_out;
-    if (!common_replay_last_token(ctx2, tokens.back(), n_past)) {
-        return 1;
+    // Replay last token
+    int n_past = (int) n_token_count_out;
+    if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) {
+        return false;
     }
-    ++n_past;
+    n_past++;
 
-    // second run
-    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl2, ctx2, -1);
-        auto next_token_str = common_token_to_piece(ctx2, next_token);
-
-        printf("%s", next_token_str.c_str());
-        result1 += next_token_str;
+    // Migrate KV cache from seq 0 to seq 1 (CPU path)
+    {
+        std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx.get(), 0));
+        const size_t ncopy = llama_state_seq_get_data(ctx.get(), seq_store.data(), seq_store.size(), 0);
+        if (ncopy != seq_store.size()) {
+            LOG_ERR("\n%s: seq copy data length %zu does not match expected length %zu\n", __func__, ncopy, seq_store.size());
+            return false;
+        }
+        LOG_TRC("%s: seq 0 copied, %zu bytes\n", __func__, ncopy);
 
-        common_batch_clear(batch);
-        common_batch_add(batch, next_token, n_past, {0}, true);
+        llama_memory_clear(llama_get_memory(ctx.get()), true); // wipe the memory before writing seq_store back as seq 1
+        LOG_TRC("%s: kv cache cleared\n", __func__);
 
-        if (llama_decode(ctx2, batch)) {
-            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_batch_free(batch);
-            return 1;
+        const size_t nset = llama_state_seq_set_data(ctx.get(), seq_store.data(), seq_store.size(), 1);
+        if (nset != seq_store.size()) {
+            LOG_ERR("\n%s: seq set data length %zu does not match expected length %zu\n", __func__, nset, seq_store.size());
+            return false;
         }
-        n_past += 1;
+        LOG_TRC("%s: seq 1 restored, %zu bytes\n", __func__, nset);
     }
 
-    printf("\n\n");
+    // Generate tokens on seq 1
+    auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1);
+    if (result.empty()) {
+        return false;
+    }
 
-    if (result0 != result1) {
-        fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
-        return 1;
+    if (result != expected_result) {
+        LOG_ERR("\n%s: error: generation differs from expected\n", __func__);
+        return false;
     }
 
-    // make new context
-    auto params_ctx3 = common_context_params_to_llama(params);
-    params_ctx3.n_seq_max = 2;
-    llama_context * ctx3 = llama_init_from_model(model, params_ctx3);
+    LOG("\nPASS\n");
+    return true;
+}
+
 
-    llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
+// Test 4: seq copy (device)
+// - create a multi-seq context
+// - load state from file
+// - replay the last prompt token
+// - migrate KV cache from seq 0 to seq 1 via the on-device path
+// - generate n_predict tokens on seq 1 and compare against expected result
+static bool test_seq_cp_device(struct llama_model * model, const struct common_params & params, const std::string & expected_result) {
+    auto params_ctx = common_context_params_to_llama(params);
+    params_ctx.n_seq_max = 2; // the test uses two sequence ids: 0 (loaded state) and 1 (copy target)
+    auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)};
 
-    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));
+    auto sparams = llama_sampler_chain_default_params();
+    auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
+    llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
 
-    printf("\nsingle seq run: %s", params.prompt.c_str());
+    auto tokens = common_tokenize(ctx.get(), params.prompt, true);
 
-    // load state (rng, logits, embedding and kv_cache) from file
-    n_token_count_out = 0;
+    LOG("\n=== Test 4: seq copy (device) ===\n");
+    LOG("%s", params.prompt.c_str());
 
-    if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
-        fprintf(stderr, "\n%s : failed to load state\n", __func__);
-        return 1;
+    // Load state from file
+    std::vector<llama_token> unused_sts(tokens.size());
+    size_t n_token_count_out = 0;
+
+    if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
+        LOG_ERR("\n%s: failed to load state\n", __func__);
+        return false;
     }
 
-    fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
+    LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out);
 
-    // restore state (last tokens)
-    n_past = n_token_count_out;
-    if (!common_replay_last_token(ctx3, tokens.back(), n_past)) {
-        return 1;
+    // Replay last token
+    int n_past = (int) n_token_count_out;
+    if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) {
+        return false;
     }
-    ++n_past;
+    n_past++;
 
-    // save seq 0 and load into seq 1
+    // Migrate KV cache from seq 0 to seq 1 (on-device path)
     {
-        // save kv of seq 0
-        std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
-        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
+        std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx.get(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
+        const size_t ncopy = llama_state_seq_get_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
         if (ncopy != seq_store.size()) {
-            fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
-            return 1;
+            LOG_ERR("\n%s: seq copy data length %zu does not match expected length %zu\n", __func__, ncopy, seq_store.size());
+            return false;
         }
-        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
+        LOG_TRC("%s: seq 0 copied, %zu bytes\n", __func__, ncopy);
 
-        // erase whole kv
-        llama_memory_clear(llama_get_memory(ctx3), true);
-        fprintf(stderr, "%s : kv cache cleared\n", __func__);
+        llama_memory_clear(llama_get_memory(ctx.get()), true); // wipe the memory before writing seq_store back as seq 1
+        LOG_TRC("%s: kv cache cleared\n", __func__);
 
-        // restore kv into seq 1
-        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
+        const size_t nset = llama_state_seq_set_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
         if (nset != seq_store.size()) {
-            fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
-            return 1;
+            LOG_ERR("\n%s: seq set data length %zu does not match expected length %zu\n", __func__, nset, seq_store.size());
+            return false;
         }
-        fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
+        LOG_TRC("%s: seq 1 restored, %zu bytes\n", __func__, nset);
     }
 
-    // third run with seq 1 instead of 0
-    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl3, ctx3, -1);
-        auto next_token_str = common_token_to_piece(ctx3, next_token);
-
-        printf("%s", next_token_str.c_str());
-        result2 += next_token_str;
-
-        common_batch_clear(batch);
-        common_batch_add(batch, next_token, n_past, {1}, true);
+    // Generate tokens on seq 1
+    auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1);
+    if (result.empty()) {
+        return false;
+    }
 
-        if (llama_decode(ctx3, batch)) {
-            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_batch_free(batch);
-            return 1;
-        }
-        n_past += 1;
+    if (result != expected_result) {
+        LOG_ERR("\n%s: error: generation differs from expected\n", __func__);
+        return false;
     }
 
-    // test on-device state save/load
-    auto params_ctx4 = common_context_params_to_llama(params);
-    params_ctx4.n_seq_max = 2;
-    llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
+    LOG("\nPASS\n");
+    return true;
+}
 
-    llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
+int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
 
-    printf("\nsingle seq run: %s", params.prompt.c_str());
+    common_params params;
+    params.prompt = "The quick brown fox";
+    params.out_file = "dump_state.bin";
+    params.sampling.seed = 1234;
 
-    // load state (rng, logits, embedding and kv_cache) from file
-    n_token_count_out = 0;
+    common_init();
 
-    if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
-        fprintf(stderr, "\n%s : failed to load state\n", __func__);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
-
-    // restore state (last tokens)
-    n_past = n_token_count_out;
-    if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
-        return 1;
+    if (params.n_parallel == 1) {
+        LOG_TRC("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
+        params.kv_unified = true;
     }
-    ++n_past;
-
-    // save seq 0 and load into seq 1
-    {
-        // save kv of seq 0
-        std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
-        const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
-        if (ncopy != seq_store.size()) {
-            fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
-            return 1;
-        }
-        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
-
-        // erase whole kv
-        llama_memory_clear(llama_get_memory(ctx4), true);
-        fprintf(stderr, "%s : kv cache cleared\n", __func__);
 
-        // restore kv into seq 0
-        const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
-        if (nset != seq_store.size()) {
-            fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
-            return 1;
-        }
-        fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
+    if (params.n_predict < 0) {
+        params.n_predict = 16;
     }
 
-    // forth run
-    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl4, ctx4, -1);
-        auto next_token_str = common_token_to_piece(ctx4, next_token);
-
-        printf("%s", next_token_str.c_str());
-        result3 += next_token_str;
+    ggml_backend_load_all();
 
-        common_batch_clear(batch);
-        common_batch_add(batch, next_token, n_past, {1}, true);
+    auto llama_init = common_init_from_params(params, true);
+    auto * model = llama_init->model();
 
-        if (llama_decode(ctx4, batch)) {
-            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_batch_free(batch);
-            return 1;
-        }
-        n_past += 1;
+    if (model == nullptr) {
+        LOG_ERR("%s: failed to init\n", __func__);
+        return 1;
     }
 
-    printf("\n");
-
-    llama_sampler_free(smpl);
-    llama_sampler_free(smpl2);
-    llama_sampler_free(smpl3);
-    llama_sampler_free(smpl4);
+    GGML_ASSERT(llama_init->context() == nullptr);
 
-    llama_batch_free(batch);
-
-    // this one is managed by common_init_result
-    //llama_free(ctx);
+    // Test 1: baseline (saves state to disk)
+    auto result_baseline = test_baseline(model, params);
+    if (result_baseline.empty()) {
+        return 1;
+    }
 
-    llama_free(ctx2);
-    llama_free(ctx3);
-    llama_free(ctx4);
+    // Test 2: state load
+    if (!test_state_load(model, params, result_baseline)) {
+        return 1;
+    }
 
-    if (result0 != result2) {
-        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
+    // Test 3: seq copy (host)
+    if (!test_seq_cp_host(model, params, result_baseline)) {
         return 1;
     }
 
-    if (result0 != result3) {
-        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
+    // Test 4: seq copy (device)
+    if (!test_seq_cp_device(model, params, result_baseline)) {
         return 1;
     }
 
-    fprintf(stderr, "\n%s : success\n", __func__);
+    LOG("\nAll tests passed.\n");
 
     return 0;
 }

From 3c81c8deeabba01fa40869325ea80d07eef75fc6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 19 May 2026 09:46:58 +0300
Subject: [PATCH 08/12] server : print graphs reused in slot timings (#23279)

Add a graphs-reused counter to the per-slot timing output. The value is
read from llama_perf_context().

Assisted-by: llama.cpp:local pi

Co-authored-by: ggerganov <ggerganov@users.noreply.github.com>
---
 tools/server/server-context.cpp | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 6b16c6b4962..88b207ad556 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -467,20 +467,26 @@ struct server_slot {
         const double n_gen_second = 1e3 / t_token_generation * n_decoded;
 
         SLT_INF(*this,
-                "\n"
-                "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
-                "       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+                "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second);
+
+        SLT_INF(*this,
+                "       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                t_token_generation, n_decoded, t_gen, n_gen_second);
+
+        SLT_INF(*this,
                 "      total time = %10.2f ms / %5d tokens\n",
-                t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
-                t_token_generation, n_decoded, t_gen, n_gen_second,
                 t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
 
+        SLT_INF(*this,
+                "   graphs reused = %10d\n",
+                llama_perf_context(ctx_tgt).n_reused);
+
         if (n_draft_total > 0) {
             const float draft_ratio = (float) n_draft_accepted / n_draft_total;
-            SLT_CNT(*this,
-                    "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
-                    draft_ratio, n_draft_accepted, n_draft_total
-            );
+            SLT_INF(*this,
+                    "draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
+                    draft_ratio, n_draft_accepted, n_draft_total);
         }
 
         common_speculative_print_stats(spec);

From ccee42642677005555b28c6ef93760e2604348e8 Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Tue, 19 May 2026 08:49:01 +0200
Subject: [PATCH 09/12] server-context: guarantee there is at least 1 token to
 decode (#23280)

---
 tools/server/server-context.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 88b207ad556..dc3189e1705 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2589,9 +2589,9 @@ struct server_context_impl {
                             llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
 
                             // the largest pos_min required for a checkpoint to be useful
-                            const auto pos_min_thold = std::max(0, pos_next - n_swa);
+                            const auto pos_min_thold = std::max(0, pos_next - n_swa - 1);
 
-                            if (n_past > 0 && n_past < slot.prompt.n_tokens()) {
+                            if (n_past > 0 && n_past <= slot.prompt.n_tokens()) {
                                 const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
                                 if (pos_min == -1) {
                                     SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);

From 00c461ce1a9deb238eed40a8f869a72729fa3d4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Tue, 19 May 2026 09:06:56 +0200
Subject: [PATCH 10/12] ci : install server kleidiai runner dependencies
 (#23259)

---
 .github/workflows/server-self-hosted.yml | 26 ++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
index d06ad3d24c5..3522681d9d1 100644
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -152,6 +152,32 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
+      - name: Dependencies
+        id: depends
+        run: |
+          set -euxo pipefail
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+          apt-get install -y \
+           build-essential \
+           python3-venv \
+           gpg \
+           wget \
+           time \
+           git-lfs
+
+          git lfs install
+
+          # install the latest cmake
+          sudo install -d /usr/share/keyrings
+          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+           | gpg --dearmor \
+           | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+           | sudo tee /etc/apt/sources.list.d/kitware.list
+          sudo apt-get update
+          sudo apt-get install -y cmake
+
       - name: Build
         id: cmake_build
         run: |

From 4b262ab662d46fd9dd1d53671b82c09d8b0af024 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Tue, 19 May 2026 10:11:04 +0200
Subject: [PATCH 11/12] ci : install libssl-dev (#23325)

---
 .github/workflows/server-self-hosted.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
index 3522681d9d1..857c72a4619 100644
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -160,6 +160,7 @@ jobs:
           sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
           apt-get install -y \
            build-essential \
+           libssl-dev \
            python3-venv \
            gpg \
            wget \

From 6db130445d29b243ee2171efb8cd61b84a1c5322 Mon Sep 17 00:00:00 2001
From: Aleksander Grygier <aleksander.grygier@gmail.com>
Date: Tue, 19 May 2026 10:16:04 +0200
Subject: [PATCH 12/12] ui: Bump packages + address build warnings (#23300)

* chore: Update vulnerable packages

* chore: Formatting

* refactor: Update Tailwind CSS imports

* ci: Use `ubuntu-latest` for Unit/E2E UI tests

* chore: Bump package

* fix: Add missing tag

* refactor: Enums files naming
---
 .github/workflows/ui-ci.yml                   |   4 +-
 tools/ui/.gitignore                           |   2 +-
 tools/ui/eslint.config.js                     |   5 +-
 tools/ui/package-lock.json                    |  54 ++++----
 tools/ui/src/app.css                          |   5 +-
 ...tAttachmentsPreviewCurrentItemVideo.svelte |   1 +
 .../MarkdownContent/MarkdownContent.svelte    |   2 +-
 .../settings/SettingsChat/SettingsChat.svelte |   2 +-
 .../SettingsChat/SettingsChatFields.svelte    |   2 +-
 tools/ui/src/lib/constants/mcp.ts             |   2 +-
 .../ui/src/lib/constants/settings-registry.ts |   4 +-
 .../src/lib/constants/supported-file-types.ts |   2 +-
 tools/ui/src/lib/constants/tools.ts           |   2 +-
 .../enums/{agentic.ts => agentic.enums.ts}    |   0
 .../{attachment.ts => attachment.enums.ts}    |   0
 .../src/lib/enums/{chat.ts => chat.enums.ts}  |   0
 .../lib/enums/{files.ts => files.enums.ts}    |   0
 tools/ui/src/lib/enums/index.ts               |  22 ++--
 .../enums/{keyboard.ts => keyboard.enums.ts}  |   0
 .../ui/src/lib/enums/{mcp.ts => mcp.enums.ts} |   0
 .../lib/enums/{model.ts => model.enums.ts}    |   0
 .../lib/enums/{server.ts => server.enums.ts}  |   0
 .../enums/{settings.ts => settings.enums.ts}  |   0
 .../lib/enums/{tools.ts => tools.enums.ts}    |   0
 tools/ui/src/lib/enums/{ui.ts => ui.enums.ts} |   0
 tools/ui/src/lib/types/mcp.d.ts               |   4 +-
 tools/ui/src/routes/+layout.svelte            |  29 ++--
 tools/ui/vitest-setup-client.ts               | 124 +++++++++---------
 tools/ui/vitest.shims.d.ts                    |   1 +
 29 files changed, 138 insertions(+), 129 deletions(-)
 rename tools/ui/src/lib/enums/{agentic.ts => agentic.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{attachment.ts => attachment.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{chat.ts => chat.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{files.ts => files.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{keyboard.ts => keyboard.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{mcp.ts => mcp.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{model.ts => model.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{server.ts => server.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{settings.ts => settings.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{tools.ts => tools.enums.ts} (100%)
 rename tools/ui/src/lib/enums/{ui.ts => ui.enums.ts} (100%)
 create mode 100644 tools/ui/vitest.shims.d.ts

diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui-ci.yml
index 7f6f467ddaa..761a9319414 100644
--- a/.github/workflows/ui-ci.yml
+++ b/.github/workflows/ui-ci.yml
@@ -41,7 +41,7 @@ jobs:
   ui-checks:
     name: UI Checks
     needs: ui-build
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
     continue-on-error: true
     steps:
       - name: Checkout code
@@ -93,7 +93,7 @@ jobs:
   e2e-tests:
     name: E2E Tests
     needs: ui-build
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
     steps:
       - name: Checkout code
         uses: actions/checkout@v6
diff --git a/tools/ui/.gitignore b/tools/ui/.gitignore
index 051d884b08e..22ed6125f4c 100644
--- a/tools/ui/.gitignore
+++ b/tools/ui/.gitignore
@@ -25,4 +25,4 @@ vite.config.ts.timestamp-*
 
 *storybook.log
 storybook-static
-*.code-workspace
\ No newline at end of file
+*.code-workspace
diff --git a/tools/ui/eslint.config.js b/tools/ui/eslint.config.js
index 185da1dabbe..4ed9dd7ca3a 100644
--- a/tools/ui/eslint.config.js
+++ b/tools/ui/eslint.config.js
@@ -20,9 +20,7 @@ export default ts.config(
 	prettier,
 	...svelte.configs.prettier,
 	{
-		languageOptions: {
-			globals: { ...globals.browser, ...globals.node }
-		},
+		languageOptions: { globals: { ...globals.browser, ...globals.node } },
 		rules: {
 			// typescript-eslint strongly recommend that you do not use the no-undef lint rule on TypeScript projects.
 			// see: https://typescript-eslint.io/troubleshooting/faqs/eslint/#i-get-errors-from-the-no-undef-rule-about-global-variables-not-being-defined-even-though-there-are-no-typescript-errors
@@ -30,6 +28,7 @@ export default ts.config(
 			'svelte/no-at-html-tags': 'off',
 			// This app uses hash-based routing (#/) where resolve() from $app/paths does not apply
 			'svelte/no-navigation-without-resolve': 'off',
+
 			// Enforce empty line at end of file
 			'eol-last': 'error'
 		}
diff --git a/tools/ui/package-lock.json b/tools/ui/package-lock.json
index 3686eb3261e..4d012c81990 100644
--- a/tools/ui/package-lock.json
+++ b/tools/ui/package-lock.json
@@ -2307,9 +2307,9 @@
 			}
 		},
 		"node_modules/@sveltejs/kit": {
-			"version": "2.59.1",
-			"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz",
-			"integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==",
+			"version": "2.60.1",
+			"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.60.1.tgz",
+			"integrity": "sha512-mQjlkNo+rJvpln7V2IGY2j99BqhcFbS4UN0AQNKNYfhBAFZTuCDAdW3a1sgf330mvtNvsBXn3HpAhcmvdJTcIQ==",
 			"dev": true,
 			"license": "MIT",
 			"dependencies": {
@@ -2318,7 +2318,7 @@
 				"@types/cookie": "^0.6.0",
 				"acorn": "^8.14.1",
 				"cookie": "^0.6.0",
-				"devalue": "^5.6.4",
+				"devalue": "^5.8.1",
 				"esm-env": "^1.2.2",
 				"kleur": "^4.1.5",
 				"magic-string": "^0.30.5",
@@ -4296,9 +4296,9 @@
 			}
 		},
 		"node_modules/devalue": {
-			"version": "5.6.4",
-			"resolved": "https://registry.npmjs.org/devalue/-/devalue-5.6.4.tgz",
-			"integrity": "sha512-Gp6rDldRsFh/7XuouDbxMH3Mx8GMCcgzIb1pDTvNyn8pZGQ22u+Wa+lGV9dQCltFQ7uVw0MhRyb8XDskNFOReA==",
+			"version": "5.8.1",
+			"resolved": "https://registry.npmjs.org/devalue/-/devalue-5.8.1.tgz",
+			"integrity": "sha512-4CXDYRBGqN+57wVJkuXBYmpAVUSg3L6JAQa/DFqm238G73E1wuyc/JhGQJzN7vUf/CMphYau2zXbfWzDR5aTEw==",
 			"license": "MIT"
 		},
 		"node_modules/devlop": {
@@ -4856,12 +4856,12 @@
 			}
 		},
 		"node_modules/express-rate-limit": {
-			"version": "8.5.0",
-			"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.0.tgz",
-			"integrity": "sha512-XKhFohWaSBdVJNTi5TaHziqnPkv04I9UQV6q1Wy7Ui6GGQZVW12ojDFwqer14EvCXxjvPG0CyWXx7cAXpALB4Q==",
+			"version": "8.5.2",
+			"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.2.tgz",
+			"integrity": "sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==",
 			"license": "MIT",
 			"dependencies": {
-				"ip-address": "10.1.0"
+				"ip-address": "^10.2.0"
 			},
 			"engines": {
 				"node": ">= 16"
@@ -4909,9 +4909,9 @@
 			"license": "MIT"
 		},
 		"node_modules/fast-uri": {
-			"version": "3.1.0",
-			"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
-			"integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==",
+			"version": "3.1.2",
+			"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz",
+			"integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==",
 			"funding": [
 				{
 					"type": "github",
@@ -5541,9 +5541,9 @@
 			}
 		},
 		"node_modules/hono": {
-			"version": "4.12.14",
-			"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.14.tgz",
-			"integrity": "sha512-am5zfg3yu6sqn5yjKBNqhnTX7Cv+m00ox+7jbaKkrLMRJ4rAdldd1xPd/JzbBWspqaQv6RSTrgFN95EsfhC+7w==",
+			"version": "4.12.19",
+			"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.19.tgz",
+			"integrity": "sha512-xa3eYXYXx68XTT4hZ7dRzsXBhaq85ToSrlUJNoR0gwz/1Ap/CNwX47wfvV7pc/xWhjKVVkLT7zBJy8chhNguqQ==",
 			"license": "MIT",
 			"engines": {
 				"node": ">=16.9.0"
@@ -5722,9 +5722,9 @@
 			"license": "MIT"
 		},
 		"node_modules/ip-address": {
-			"version": "10.1.0",
-			"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
-			"integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
+			"version": "10.2.0",
+			"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz",
+			"integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==",
 			"license": "MIT",
 			"engines": {
 				"node": ">= 12"
@@ -9245,9 +9245,9 @@
 			}
 		},
 		"node_modules/svelte": {
-			"version": "5.55.1",
-			"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.1.tgz",
-			"integrity": "sha512-QjvU7EFemf6mRzdMGlAFttMWtAAVXrax61SZYHdkD6yoVGQ89VeyKfZD4H1JrV1WLmJBxWhFch9H6ig/87VGjw==",
+			"version": "5.55.7",
+			"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.7.tgz",
+			"integrity": "sha512-ymI5ykLPwIHW839E053FQbI1G+jnRFJEw3Kv5Y4njixVWywQBx+NUFpkkKyk5LIb36Fg9DVXSYpqiGekLD0hyw==",
 			"license": "MIT",
 			"dependencies": {
 				"@jridgewell/remapping": "^2.3.4",
@@ -9259,7 +9259,7 @@
 				"aria-query": "5.3.1",
 				"axobject-query": "^4.1.0",
 				"clsx": "^2.1.1",
-				"devalue": "^5.6.4",
+				"devalue": "^5.8.1",
 				"esm-env": "^1.2.1",
 				"esrap": "^2.2.4",
 				"is-reference": "^3.0.3",
@@ -10606,9 +10606,9 @@
 			"license": "ISC"
 		},
 		"node_modules/ws": {
-			"version": "8.18.3",
-			"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
-			"integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
+			"version": "8.20.1",
+			"resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz",
+			"integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==",
 			"dev": true,
 			"license": "MIT",
 			"engines": {
diff --git a/tools/ui/src/app.css b/tools/ui/src/app.css
index d6dc6670c0c..29b1d3c640b 100644
--- a/tools/ui/src/app.css
+++ b/tools/ui/src/app.css
@@ -1,6 +1,7 @@
 @import 'tailwindcss';
-@source ".";
-
+@source '.';
+@plugin '@tailwindcss/forms';
+@plugin '@tailwindcss/typography';
 @import 'tw-animate-css';
 
 @custom-variant dark (&:is(.dark *));
diff --git a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte
index 4ebbd592280..62040b36f9d 100644
--- a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte
@@ -15,6 +15,7 @@
 
 		{#if videoSrc}
 			<video controls class="mb-4 w-full" src={videoSrc}>
+				<track kind="captions" src="" />
 				Your browser does not support the video element.
 			</video>
 		{:else}
diff --git a/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte b/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte
index 3a11854b6e4..0412414ae39 100644
--- a/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte
+++ b/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte
@@ -28,7 +28,7 @@
 		SETTINGS_KEYS
 	} from '$lib/constants';
 	import { ColorMode, UrlProtocol } from '$lib/enums';
-	import { FileTypeText } from '$lib/enums/files';
+	import { FileTypeText } from '$lib/enums/files.enums';
 	import { highlightCode, detectIncompleteCodeBlock, type IncompleteCodeBlock } from '$lib/utils';
 	import '$styles/katex-custom.scss';
 	import githubDarkCss from 'highlight.js/styles/github-dark.css?inline';
diff --git a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte
index 109c8ff9dac..d017fe20469 100644
--- a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte
+++ b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte
@@ -17,7 +17,7 @@
 	} from '$lib/constants';
 	import { RouterService } from '$lib/services/router.service';
 	import { setMode } from 'mode-watcher';
-	import { ColorMode } from '$lib/enums/ui';
+	import { ColorMode } from '$lib/enums/ui.enums';
 	import { fade } from 'svelte/transition';
 	import { goto } from '$app/navigation';
 	import { page } from '$app/state';
diff --git a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte
index 069855eebef..7c1c5c89776 100644
--- a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte
+++ b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte
@@ -6,7 +6,7 @@
 	import * as Select from '$lib/components/ui/select';
 	import { Textarea } from '$lib/components/ui/textarea';
 	import { SETTING_CONFIG_INFO, SETTINGS_KEYS } from '$lib/constants';
-	import { SettingsFieldType } from '$lib/enums/settings';
+	import { SettingsFieldType } from '$lib/enums/settings.enums';
 	import { settingsStore } from '$lib/stores/settings.svelte';
 	import { serverStore } from '$lib/stores/server.svelte';
 	import { modelsStore, selectedModelName, propsCacheVersion } from '$lib/stores/models.svelte';
diff --git a/tools/ui/src/lib/constants/mcp.ts b/tools/ui/src/lib/constants/mcp.ts
index 19bdd92ea75..918eb9f94b7 100644
--- a/tools/ui/src/lib/constants/mcp.ts
+++ b/tools/ui/src/lib/constants/mcp.ts
@@ -2,7 +2,7 @@ import { Zap, Globe, Radio } from '@lucide/svelte';
 import { MCPTransportType } from '$lib/enums';
 import type { ClientCapabilities, Implementation } from '$lib/types';
 import type { Component } from 'svelte';
-import { MimeTypeImage } from '$lib/enums/files';
+import { MimeTypeImage } from '$lib/enums/files.enums';
 
 export const DEFAULT_CLIENT_VERSION = '1.0.0';
 export const MCP_CLIENT_NAME = 'llama-ui-mcp';
diff --git a/tools/ui/src/lib/constants/settings-registry.ts b/tools/ui/src/lib/constants/settings-registry.ts
index bdbb17d962c..93b3cd5edb5 100644
--- a/tools/ui/src/lib/constants/settings-registry.ts
+++ b/tools/ui/src/lib/constants/settings-registry.ts
@@ -1,5 +1,5 @@
-import { ColorMode } from '$lib/enums/ui';
-import { SettingsFieldType } from '$lib/enums/settings';
+import { ColorMode } from '$lib/enums/ui.enums';
+import { SettingsFieldType } from '$lib/enums/settings.enums';
 import { SyncableParameterType } from '$lib/enums';
 import {
 	Funnel,
diff --git a/tools/ui/src/lib/constants/supported-file-types.ts b/tools/ui/src/lib/constants/supported-file-types.ts
index 34505438916..4141161548d 100644
--- a/tools/ui/src/lib/constants/supported-file-types.ts
+++ b/tools/ui/src/lib/constants/supported-file-types.ts
@@ -18,7 +18,7 @@ import {
 	MimeTypeApplication,
 	MimeTypeText
 } from '$lib/enums';
-import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files';
+import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files.enums';
 
 // File type configuration using enums
 export const AUDIO_FILE_TYPES = {
diff --git a/tools/ui/src/lib/constants/tools.ts b/tools/ui/src/lib/constants/tools.ts
index 22b22309c39..efc3476cd79 100644
--- a/tools/ui/src/lib/constants/tools.ts
+++ b/tools/ui/src/lib/constants/tools.ts
@@ -1,4 +1,4 @@
-import { ToolSource } from '$lib/enums/tools';
+import { ToolSource } from '$lib/enums/tools.enums';
 
 export const TOOL_GROUP_LABELS = {
 	[ToolSource.BUILTIN]: 'Built-in',
diff --git a/tools/ui/src/lib/enums/agentic.ts b/tools/ui/src/lib/enums/agentic.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/agentic.ts
rename to tools/ui/src/lib/enums/agentic.enums.ts
diff --git a/tools/ui/src/lib/enums/attachment.ts b/tools/ui/src/lib/enums/attachment.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/attachment.ts
rename to tools/ui/src/lib/enums/attachment.enums.ts
diff --git a/tools/ui/src/lib/enums/chat.ts b/tools/ui/src/lib/enums/chat.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/chat.ts
rename to tools/ui/src/lib/enums/chat.enums.ts
diff --git a/tools/ui/src/lib/enums/files.ts b/tools/ui/src/lib/enums/files.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/files.ts
rename to tools/ui/src/lib/enums/files.enums.ts
diff --git a/tools/ui/src/lib/enums/index.ts b/tools/ui/src/lib/enums/index.ts
index 3cf81286bc1..a17cca1d8e1 100644
--- a/tools/ui/src/lib/enums/index.ts
+++ b/tools/ui/src/lib/enums/index.ts
@@ -4,9 +4,9 @@ export {
 	AttachmentItemEnabledWhen,
 	AttachmentAction,
 	AttachmentItemVisibleWhen
-} from './attachment';
+} from './attachment.enums';
 
-export { AgenticSectionType, ToolCallType } from './agentic';
+export { AgenticSectionType, ToolCallType } from './agentic.enums';
 
 export {
 	ChatMessageStatsView,
@@ -17,7 +17,7 @@ export {
 	MessageType,
 	PdfViewMode,
 	ReasoningFormat
-} from './chat';
+} from './chat.enums';
 
 export {
 	FileTypeCategory,
@@ -38,7 +38,7 @@ export {
 	MimeTypeImage,
 	MimeTypeText,
 	SpecialFileType
-} from './files';
+} from './files.enums';
 
 export {
 	MCPConnectionPhase,
@@ -48,16 +48,16 @@ export {
 	MCPContentType,
 	MCPRefType,
 	JsonSchemaType
-} from './mcp';
+} from './mcp.enums';
 
-export { ModelModality } from './model';
+export { ModelModality } from './model.enums';
 
-export { ServerRole, ServerModelStatus } from './server';
+export { ServerRole, ServerModelStatus } from './server.enums';
 
-export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings';
+export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings.enums';
 
-export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui';
+export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui.enums';
 
-export { KeyboardKey } from './keyboard';
+export { KeyboardKey } from './keyboard.enums';
 
-export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools';
+export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools.enums';
diff --git a/tools/ui/src/lib/enums/keyboard.ts b/tools/ui/src/lib/enums/keyboard.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/keyboard.ts
rename to tools/ui/src/lib/enums/keyboard.enums.ts
diff --git a/tools/ui/src/lib/enums/mcp.ts b/tools/ui/src/lib/enums/mcp.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/mcp.ts
rename to tools/ui/src/lib/enums/mcp.enums.ts
diff --git a/tools/ui/src/lib/enums/model.ts b/tools/ui/src/lib/enums/model.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/model.ts
rename to tools/ui/src/lib/enums/model.enums.ts
diff --git a/tools/ui/src/lib/enums/server.ts b/tools/ui/src/lib/enums/server.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/server.ts
rename to tools/ui/src/lib/enums/server.enums.ts
diff --git a/tools/ui/src/lib/enums/settings.ts b/tools/ui/src/lib/enums/settings.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/settings.ts
rename to tools/ui/src/lib/enums/settings.enums.ts
diff --git a/tools/ui/src/lib/enums/tools.ts b/tools/ui/src/lib/enums/tools.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/tools.ts
rename to tools/ui/src/lib/enums/tools.enums.ts
diff --git a/tools/ui/src/lib/enums/ui.ts b/tools/ui/src/lib/enums/ui.enums.ts
similarity index 100%
rename from tools/ui/src/lib/enums/ui.ts
rename to tools/ui/src/lib/enums/ui.enums.ts
diff --git a/tools/ui/src/lib/types/mcp.d.ts b/tools/ui/src/lib/types/mcp.d.ts
index 7aa050cdfa7..2a292614203 100644
--- a/tools/ui/src/lib/types/mcp.d.ts
+++ b/tools/ui/src/lib/types/mcp.d.ts
@@ -1,5 +1,5 @@
-import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp';
-import type { ToolSource } from '$lib/enums/tools';
+import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp.enums';
+import type { ToolSource } from '$lib/enums/tools.enums';
 import type {
 	Client,
 	ClientCapabilities as SDKClientCapabilities,
diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte
index b35d20a5cd5..78227df3ce7 100644
--- a/tools/ui/src/routes/+layout.svelte
+++ b/tools/ui/src/routes/+layout.svelte
@@ -7,11 +7,13 @@
 	import { untrack } from 'svelte';
 	import { onMount } from 'svelte';
 	import { fade } from 'svelte/transition';
+
 	import {
 		DesktopIconStrip,
 		DialogConversationTitleUpdate,
 		SidebarNavigation
 	} from '$lib/components/app';
+
 	import { conversationsStore } from '$lib/stores/conversations.svelte';
 	import * as Sidebar from '$lib/components/ui/sidebar/index.js';
 	import * as Tooltip from '$lib/components/ui/tooltip';
@@ -30,26 +32,29 @@
 	import { conversations } from '$lib/stores/conversations.svelte';
 
 	let { children } = $props();
-
 	let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
 	let isMobile = new IsMobile();
 	let isDesktop = $derived(!isMobile.current);
 	let sidebarOpen = $state(false);
 	let mounted = $state(false);
 	let innerHeight = $state<number | undefined>();
+
 	let chatSidebar:
-		| { activateSearchMode?: () => void; editActiveConversation?: () => void }
+		| {
+				activateSearchMode?: () => void;
+				editActiveConversation?: () => void;
+		  }
 		| undefined = $state();
 
 	let titleUpdateDialogOpen = $state(false);
 	let titleUpdateCurrentTitle = $state('');
 	let titleUpdateNewTitle = $state('');
 	let titleUpdateResolve: ((value: boolean) => void) | null = null;
-
 	const panelNav = useSettingsNavigation();
 
 	function navigateToConversation(direction: -1 | 1) {
 		const allConvs = conversations();
+
 		if (allConvs.length === 0) return;
 
 		const currentId = page.params.id;
@@ -61,6 +66,7 @@
 		}
 
 		const idx = allConvs.findIndex((c) => c.id === currentId);
+
 		if (idx === -1) return;
 
 		const targetIdx = idx + direction;
@@ -75,9 +81,7 @@
 	// Global keyboard shortcuts
 	const { handleKeydown } = useKeyboardShortcuts({
 		editActiveConversation: () => chatSidebar?.editActiveConversation?.(),
-
 		navigateToPrevConversation: () => navigateToConversation(-1),
-
 		navigateToNextConversation: () => navigateToConversation(1)
 	});
 
@@ -139,6 +143,7 @@
 	$effect(() => {
 		if (alwaysShowSidebarOnDesktop && isDesktop) {
 			sidebarOpen = true;
+
 			return;
 		}
 	});
@@ -175,6 +180,7 @@
 		// Only fetch router models once when we have models loaded and in router mode
 		if (isRouter && modelsCount > 0 && !routerModelsFetched) {
 			routerModelsFetched = true;
+
 			untrack(() => {
 				modelsStore.fetchRouterModels();
 			});
@@ -223,7 +229,6 @@
 
 <Tooltip.Provider delayDuration={TOOLTIP_DELAY_DURATION}>
 	<ModeWatcher />
-
 	<Toaster richColors />
 
 	<DialogConversationTitleUpdate
@@ -236,9 +241,9 @@
 
 	<Sidebar.Provider bind:open={sidebarOpen}>
 		<div class="flex h-screen w-full" style:height="{innerHeight}px">
-			<Sidebar.Root variant="floating" class="h-full">
-				<SidebarNavigation bind:this={chatSidebar} />
-			</Sidebar.Root>
+			<Sidebar.Root variant="floating" class="h-full"
+				><SidebarNavigation bind:this={chatSidebar} /></Sidebar.Root
+			>
 
 			{#if !(alwaysShowSidebarOnDesktop && isDesktop) && !(panelNav.isSettingsRoute && !isDesktop)}
 				{#if mounted}
@@ -266,9 +271,9 @@
 				/>
 			{/if}
 
-			<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden">
-				{@render children?.()}
-			</Sidebar.Inset>
+			<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden"
+				>{@render children?.()}</Sidebar.Inset
+			>
 		</div>
 	</Sidebar.Provider>
 </Tooltip.Provider>
diff --git a/tools/ui/vitest-setup-client.ts b/tools/ui/vitest-setup-client.ts
index 0b753db02b1..90994442eb2 100644
--- a/tools/ui/vitest-setup-client.ts
+++ b/tools/ui/vitest-setup-client.ts
@@ -9,70 +9,72 @@ import { beforeEach, vi } from 'vitest';
 beforeEach(() => {
 	const originalFetch = globalThis.fetch;
 
-	vi.spyOn(globalThis, 'fetch').mockImplementation(async (input: RequestInfo | URL, init?: RequestInit) => {
-		const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url;
+	vi.spyOn(globalThis, 'fetch').mockImplementation(
+		async (input: RequestInfo | URL, init?: RequestInit) => {
+			const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url;
 
-		// Mock server props endpoint
-		if (url.includes('/server')) {
-			return new Response(
-				JSON.stringify({
-					mode: 'router',
-					version: 'test',
-					git_commit: 'test',
-					git_branch: 'test'
-				}),
-				{ status: 200, headers: { 'Content-Type': 'application/json' } }
-			);
-		}
+			// Mock server props endpoint
+			if (url.includes('/server')) {
+				return new Response(
+					JSON.stringify({
+						mode: 'router',
+						version: 'test',
+						git_commit: 'test',
+						git_branch: 'test'
+					}),
+					{ status: 200, headers: { 'Content-Type': 'application/json' } }
+				);
+			}
 
-		// Mock models list endpoint
-		if (/\/v1\/models|\/models\b/.test(url)) {
-			return new Response(
-				JSON.stringify({
-					object: 'list',
-					data: [
-						{
-							id: 'test-model.gguf',
-							object: 'model',
-							owned_by: 'llamacpp',
-							created: 0,
-							in_cache: false,
-							path: 'models/test-model.gguf',
-							status: { value: 'unloaded' },
-							meta: {}
-						}
-					],
-					models: [
-						{
-							model: 'test-model.gguf',
-							name: 'Test Model',
-							details: {}
-						}
-					]
-				}),
-				{ status: 200, headers: { 'Content-Type': 'application/json' } }
-			);
-		}
+			// Mock models list endpoint
+			if (/\/v1\/models|\/models\b/.test(url)) {
+				return new Response(
+					JSON.stringify({
+						object: 'list',
+						data: [
+							{
+								id: 'test-model.gguf',
+								object: 'model',
+								owned_by: 'llamacpp',
+								created: 0,
+								in_cache: false,
+								path: 'models/test-model.gguf',
+								status: { value: 'unloaded' },
+								meta: {}
+							}
+						],
+						models: [
+							{
+								model: 'test-model.gguf',
+								name: 'Test Model',
+								details: {}
+							}
+						]
+					}),
+					{ status: 200, headers: { 'Content-Type': 'application/json' } }
+				);
+			}
 
-		// Mock /props endpoint (used for modalities)
-		if (url.includes('/props')) {
-			return new Response(
-				JSON.stringify({
-					default_generation_settings: { n_ctx: 2048 }
-				}),
-				{ status: 200, headers: { 'Content-Type': 'application/json' } }
-			);
-		}
+			// Mock /props endpoint (used for modalities)
+			if (url.includes('/props')) {
+				return new Response(
+					JSON.stringify({
+						default_generation_settings: { n_ctx: 2048 }
+					}),
+					{ status: 200, headers: { 'Content-Type': 'application/json' } }
+				);
+			}
 
-		// Mock /tools endpoint (used for built-in tools list)
-		if (url.includes('/tools')) {
-			return new Response(JSON.stringify([]), {
-				status: 200,
-				headers: { 'Content-Type': 'application/json' }
-			});
-		}
+			// Mock /tools endpoint (used for built-in tools list)
+			if (url.includes('/tools')) {
+				return new Response(JSON.stringify([]), {
+					status: 200,
+					headers: { 'Content-Type': 'application/json' }
+				});
+			}
 
-		// Default: use real fetch
-		return originalFetch(input, init);
-	});
+			// Default: use real fetch
+			return originalFetch(input, init);
+		}
+	);
 });
diff --git a/tools/ui/vitest.shims.d.ts b/tools/ui/vitest.shims.d.ts
new file mode 100644
index 00000000000..03b1801a60c
--- /dev/null
+++ b/tools/ui/vitest.shims.d.ts
@@ -0,0 +1 @@
+/// <reference types="@vitest/browser-playwright" />