CrazyForks · pull · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
@@ -152,6 +152,33 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
+      - name: Dependencies
+        id: depends
+        run: |
+          set -euxo pipefail
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+          apt-get install -y \
+           build-essential \
+           libssl-dev \
+           python3-venv \
+           gpg \
+           wget \
+           time \
+           git-lfs
+
+          git lfs install
+
+          # install the latest cmake
+          sudo install -d /usr/share/keyrings
+          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+           | gpg --dearmor \
+           | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+           | sudo tee /etc/apt/sources.list.d/kitware.list
+          sudo apt-get update
+          sudo apt-get install -y cmake
+
       - name: Build
         id: cmake_build
         run: |

diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui-ci.yml
@@ -41,7 +41,7 @@ jobs:
   ui-checks:
     name: UI Checks
     needs: ui-build
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
     continue-on-error: true
     steps:
       - name: Checkout code
@@ -93,7 +93,7 @@ jobs:
   e2e-tests:
     name: E2E Tests
     needs: ui-build
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
     steps:
       - name: Checkout code
         uses: actions/checkout@v6

diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md
@@ -22,6 +22,8 @@ Pull requests (PRs):
 Commits:
 - On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
 - Do not explicitly set the git author in commits - rely on the default git config
+- Always use `--no-gpg-sign` when committing
+- Never `git push` without explicit confirmation from the user
 
 Resources (read on demand):
 - [CONTRIBUTING.md](CONTRIBUTING.md)

diff --git a/README.md b/README.md
@@ -280,7 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Metal](docs/build.md#metal-build) | Apple Silicon |
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
-| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [SYCL](docs/backend/SYCL.md) | Intel GPU |
 | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |

diff --git a/common/common.cpp b/common/common.cpp
@@ -1160,7 +1160,7 @@ struct common_init_result::impl {
     std::vector<llama_sampler_seq_config> samplers_seq_config;
 };
 
-common_init_result::common_init_result(common_params & params) :
+common_init_result::common_init_result(common_params & params, bool model_only) :
     pimpl(new impl{}) {
     auto mparams = common_model_params_to_llama(params);
     auto cparams = common_context_params_to_llama(params);
@@ -1183,6 +1183,10 @@ common_init_result::common_init_result(common_params & params) :
 
     pimpl->model.reset(model);
 
+    if (model_only) {
+        return;
+    }
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // load and optionally apply lora adapters
@@ -1309,15 +1313,19 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
     return pimpl->lora;
 }
 
-common_init_result_ptr common_init_from_params(common_params & params) {
-    common_init_result_ptr res(new common_init_result(params));
+common_init_result_ptr common_init_from_params(common_params & params, bool model_only) {
+    common_init_result_ptr res(new common_init_result(params, model_only));
 
     llama_model * model = res->model();
     if (model == NULL) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
         return res;
     }
 
+    if (model_only) {
+        return res;
+    }
+
     llama_context * lctx = res->context();
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());

diff --git a/common/common.h b/common/common.h
@@ -857,7 +857,7 @@ struct common_sampler;
 
 // note: defines the model, context, samplers, ets. lifetimes
 struct common_init_result {
-    common_init_result(common_params & params);
+    common_init_result(common_params & params, bool model_only = false);
     ~common_init_result();
 
     llama_model * model();
@@ -875,7 +875,7 @@ struct common_init_result {
 
 using common_init_result_ptr = std::unique_ptr<common_init_result>;
 
-common_init_result_ptr common_init_from_params(common_params & params);
+common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false);
 
 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);

diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
@@ -445,6 +445,11 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                     if self.lazy:
                         tensor = LazyTorchTensor.from_eager(tensor)
                     base_name = get_base_tensor_name(name)
+                    # filter base name, ignore tensor transformations for now
+                    data_gen = lambda g=tensor: g  # noqa: E731
+                    if (titem := self.filter_tensors((base_name, data_gen))) is None:
+                        continue
+                    base_name, _ = titem
                     # note: mergekit-extract-lora also adds token embeddings to the adapter
                     is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
                     is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
@@ -5,6 +5,7 @@
 - [News](#news)
 - [OS](#os)
 - [Hardware](#hardware)
+- [Performance Reference](#performance-reference)
 - [Docker](#docker)
 - [Linux](#linux)
 - [Windows](#windows)
@@ -51,9 +52,8 @@ The packages for FP32 and FP16 would have different accuracy and performance on
 
 ## News
 
-- 2026.04
-
-  - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0.
+- 2026.04-05
+  - Optimize mul_mat with the reorder feature for data types: Q4_K, Q5_K, Q6_K, Q8_0.
   - Fused MoE.
   - Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package.
 
@@ -150,6 +150,13 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
 
 NA
 
+## Performance Reference
+
+
+For the list of supported LLMs and GPUs, along with reference performance numbers, see [Performance of llama.cpp on Intel GPU with SYCL backend](https://github.com/ggml-org/llama.cpp/discussions/23313).
+
+You can add your own test results to that discussion directly.
+
 ## Docker
 
 The docker build option is currently limited to *Intel GPU* targets.