Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/workflows/server-self-hosted.yml
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,33 @@ jobs:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

- name: Dependencies
id: depends
run: |
set -euxo pipefail
sudo apt-get update
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
apt-get install -y \
build-essential \
libssl-dev \
python3-venv \
gpg \
wget \
time \
git-lfs

git lfs install

# install the latest cmake
sudo install -d /usr/share/keyrings
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
| gpg --dearmor \
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
| sudo tee /etc/apt/sources.list.d/kitware.list
sudo apt-get update
sudo apt-get install -y cmake

- name: Build
id: cmake_build
run: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ui-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
ui-checks:
name: UI Checks
needs: ui-build
runs-on: ubuntu-slim
runs-on: ubuntu-latest
continue-on-error: true
steps:
- name: Checkout code
Expand Down Expand Up @@ -93,7 +93,7 @@ jobs:
e2e-tests:
name: E2E Tests
needs: ui-build
runs-on: ubuntu-slim
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v6
Expand Down
2 changes: 2 additions & 0 deletions .pi/gg/SYSTEM.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ Pull requests (PRs):
Commits:
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
- Do not explicitly set the git author in commits - rely on the default git config
- Always use `--no-gpg-sign` when committing
- Never `git push` without explicit confirmation from the user

Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Metal](docs/build.md#metal-build) | Apple Silicon |
| [BLAS](docs/build.md#blas-build) | All |
| [BLIS](docs/backend/BLIS.md) | All |
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [SYCL](docs/backend/SYCL.md) | Intel GPU |
| [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
Expand Down
14 changes: 11 additions & 3 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1160,7 +1160,7 @@ struct common_init_result::impl {
std::vector<llama_sampler_seq_config> samplers_seq_config;
};

common_init_result::common_init_result(common_params & params) :
common_init_result::common_init_result(common_params & params, bool model_only) :
pimpl(new impl{}) {
auto mparams = common_model_params_to_llama(params);
auto cparams = common_context_params_to_llama(params);
Expand All @@ -1183,6 +1183,10 @@ common_init_result::common_init_result(common_params & params) :

pimpl->model.reset(model);

if (model_only) {
return;
}

const llama_vocab * vocab = llama_model_get_vocab(model);

// load and optionally apply lora adapters
Expand Down Expand Up @@ -1309,15 +1313,19 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
return pimpl->lora;
}

common_init_result_ptr common_init_from_params(common_params & params) {
common_init_result_ptr res(new common_init_result(params));
common_init_result_ptr common_init_from_params(common_params & params, bool model_only) {
common_init_result_ptr res(new common_init_result(params, model_only));

llama_model * model = res->model();
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return res;
}

if (model_only) {
return res;
}

llama_context * lctx = res->context();
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
Expand Down
4 changes: 2 additions & 2 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -857,7 +857,7 @@ struct common_sampler;

// note: defines the model, context, samplers, ets. lifetimes
struct common_init_result {
common_init_result(common_params & params);
common_init_result(common_params & params, bool model_only = false);
~common_init_result();

llama_model * model();
Expand All @@ -875,7 +875,7 @@ struct common_init_result {

using common_init_result_ptr = std::unique_ptr<common_init_result>;

common_init_result_ptr common_init_from_params(common_params & params);
common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false);

struct llama_model_params common_model_params_to_llama ( common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
Expand Down
5 changes: 5 additions & 0 deletions convert_lora_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,11 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name)
# filter base name, ignore tensor transformations for now
data_gen = lambda g=tensor: g # noqa: E731
if (titem := self.filter_tensors((base_name, data_gen))) is None:
continue
base_name, _ = titem
# note: mergekit-extract-lora also adds token embeddings to the adapter
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
Expand Down
13 changes: 10 additions & 3 deletions docs/backend/SYCL.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Performance Reference](#performance-reference)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
Expand Down Expand Up @@ -51,9 +52,8 @@ The packages for FP32 and FP16 would have different accuracy and performance on

## News

- 2026.04

- Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0.
- 2026.04-05
- Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q6_K, Q8_0.
- Fused MoE.
- Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package.

Expand Down Expand Up @@ -150,6 +150,13 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the

NA

## Performance Reference


To get the supported LLMs, GPUs, and performance reference, please check [Performance of llama.cpp on Intel GPU with SYCL backend](https://github.com/ggml-org/llama.cpp/discussions/23313).

You could update your test result in it directly.

## Docker

The docker build option is currently limited to *Intel GPU* targets.
Expand Down
Loading
Loading