Skip to content

Commit 52f1096

Browse files
wine99, cavusmustafa, thedanhoffman, ravi9
authored
openvino: driver setup, CI split, thread safety, and NPU optimizations (#21944)
* Thread safety per request only * Fix ROPE yarn case * Fix sticky stateful config * Use i4/i8 directly for symmetric quant * Use weightless caching * Add WeightlessCacheAttribute to reduce NPU memory usage * Gelu tanh support (#125) * Imrope support (#126) * fix(openvino): explicit ov::Tensor frees in ggml_backend_openvino_free * add GPU,NPU support in OV Dockerfile * add build-openvino.yml ci * Fix sticky stateful config * add concurrency to ov-gpu ci runs. Move OV CI to build-openvino.yml * fix thread-safety of shared runtime context * rope type abstraction for frontend translations * fix editorconfig --------- Co-authored-by: Mustafa Cavus <mustafa.cavus@intel.com> Co-authored-by: Dan Hoffman <dhoff749@gmail.com> Co-authored-by: Ravi Panchumarthy <ravi.panchumarthy@intel.com>
1 parent 606fa42 commit 52f1096

21 files changed

Lines changed: 818 additions & 539 deletions

.devops/openvino.Dockerfile

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,19 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
22
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
33
ARG UBUNTU_VERSION=24.04
44

5-
# Optional proxy build arguments - empty by default
5+
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
6+
ARG IGC_VERSION=v2.30.1
7+
ARG IGC_VERSION_FULL=2_2.30.1+20950
8+
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
9+
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
10+
ARG IGDGMM_VERSION=22.9.0
11+
12+
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
13+
ARG NPU_DRIVER_VERSION=v1.32.0
14+
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
15+
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
16+
17+
# Optional proxy build arguments
618
ARG http_proxy=
719
ARG https_proxy=
820

@@ -78,13 +90,47 @@ ARG http_proxy
7890
ARG https_proxy
7991

8092
RUN apt-get update \
81-
&& apt-get install -y libgomp1 libtbb12 curl \
93+
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
8294
&& apt autoremove -y \
8395
&& apt clean -y \
8496
&& rm -rf /tmp/* /var/tmp/* \
8597
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
8698
&& find /var/cache -type f -delete
8799

100+
# Install GPU drivers
101+
ARG IGC_VERSION
102+
ARG IGC_VERSION_FULL
103+
ARG COMPUTE_RUNTIME_VERSION
104+
ARG COMPUTE_RUNTIME_VERSION_FULL
105+
ARG IGDGMM_VERSION
106+
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
107+
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
108+
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
109+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
110+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
111+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
112+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
113+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
114+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
115+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
116+
&& dpkg --install *.deb \
117+
&& rm -rf /tmp/neo/
118+
119+
# Install NPU drivers
120+
ARG NPU_DRIVER_VERSION
121+
ARG NPU_DRIVER_FULL
122+
ARG LIBZE1_VERSION
123+
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
124+
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
125+
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
126+
&& dpkg --install *.deb \
127+
&& rm -rf /tmp/npu/
128+
129+
RUN cd /tmp \
130+
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
131+
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
132+
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
133+
88134
COPY --from=build /app/lib/ /app/
89135

90136
### Full (all binaries)
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
name: CI (openvino)
2+
3+
on:
4+
workflow_dispatch: # allows manual triggering
5+
push:
6+
branches:
7+
- master
8+
paths: [
9+
'.github/workflows/build-openvino.yml',
10+
'**/CMakeLists.txt',
11+
'**/.cmake',
12+
'**/*.h',
13+
'**/*.hpp',
14+
'**/*.c',
15+
'**/*.cpp',
16+
]
17+
18+
pull_request:
19+
types: [opened, synchronize, reopened]
20+
paths: [
21+
'.github/workflows/build-openvino.yml',
22+
'ggml/src/ggml-openvino/**'
23+
]
24+
25+
concurrency:
26+
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
27+
cancel-in-progress: true
28+
29+
env:
30+
GGML_NLOOP: 3
31+
GGML_N_THREADS: 1
32+
LLAMA_LOG_COLORS: 1
33+
LLAMA_LOG_PREFIX: 1
34+
LLAMA_LOG_TIMESTAMPS: 1
35+
36+
jobs:
37+
ubuntu-24-openvino:
38+
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
39+
40+
concurrency:
41+
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
42+
cancel-in-progress: false
43+
44+
strategy:
45+
matrix:
46+
include:
47+
- variant: cpu
48+
runner: '"ubuntu-24.04"'
49+
openvino_device: "CPU"
50+
- variant: gpu
51+
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
52+
openvino_device: "GPU"
53+
54+
runs-on: ${{ fromJSON(matrix.runner) }}
55+
56+
env:
57+
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
58+
OPENVINO_VERSION_MAJOR: "2026.0"
59+
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
60+
61+
steps:
62+
- name: Clone
63+
id: checkout
64+
uses: actions/checkout@v6
65+
66+
- name: ccache
67+
if: runner.environment == 'github-hosted'
68+
uses: ggml-org/ccache-action@v1.2.21
69+
with:
70+
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
71+
evict-old-files: 1d
72+
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
73+
74+
- name: Dependencies
75+
id: depends
76+
run: |
77+
sudo apt-get update
78+
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
79+
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
80+
81+
- name: Use OpenVINO Toolkit Cache
82+
if: runner.environment == 'github-hosted'
83+
uses: actions/cache@v5
84+
id: cache-openvino
85+
with:
86+
path: ./openvino_toolkit
87+
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
88+
89+
- name: Setup OpenVINO Toolkit
90+
if: steps.cache-openvino.outputs.cache-hit != 'true'
91+
uses: ./.github/actions/linux-setup-openvino
92+
with:
93+
path: ./openvino_toolkit
94+
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
95+
version_full: ${{ env.OPENVINO_VERSION_FULL }}
96+
97+
- name: Install OpenVINO dependencies
98+
run: |
99+
cd ./openvino_toolkit
100+
chmod +x ./install_dependencies/install_openvino_dependencies.sh
101+
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
102+
103+
- name: Build
104+
id: cmake_build
105+
run: |
106+
source ./openvino_toolkit/setupvars.sh
107+
cmake -B build/ReleaseOV -G Ninja \
108+
-DCMAKE_BUILD_TYPE=Release \
109+
-DGGML_OPENVINO=ON
110+
time cmake --build build/ReleaseOV --config Release -j $(nproc)
111+
112+
- name: Test
113+
id: cmake_test
114+
# TODO: fix and re-enable the `test-llama-archs` test below
115+
run: |
116+
cd ${{ github.workspace }}
117+
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
118+
export GGML_OPENVINO_DEVICE=GPU
119+
fi
120+
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000

.github/workflows/build-self-hosted.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,10 @@ jobs:
265265
ggml-ci-intel-openvino-gpu-low-perf:
266266
runs-on: [self-hosted, Linux, Intel, OpenVINO]
267267

268+
concurrency:
269+
group: openvino-gpu-${{ github.head_ref || github.ref }}
270+
cancel-in-progress: false
271+
268272
env:
269273
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
270274
OPENVINO_VERSION_MAJOR: "2026.0"

.github/workflows/build.yml

Lines changed: 0 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -656,86 +656,6 @@ jobs:
656656
-DGGML_SYCL_F16=ON
657657
time cmake --build build --config Release -j $(nproc)
658658
659-
ubuntu-24-openvino:
660-
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
661-
strategy:
662-
matrix:
663-
include:
664-
- variant: cpu
665-
runner: '"ubuntu-24.04"'
666-
openvino_device: "CPU"
667-
- variant: gpu
668-
runner: '["self-hosted","Linux","X64","Intel"]'
669-
openvino_device: "GPU"
670-
671-
runs-on: ${{ fromJSON(matrix.runner) }}
672-
673-
env:
674-
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
675-
OPENVINO_VERSION_MAJOR: "2026.0"
676-
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
677-
678-
steps:
679-
- name: Clone
680-
id: checkout
681-
uses: actions/checkout@v6
682-
683-
- name: ccache
684-
if: runner.environment == 'github-hosted'
685-
uses: ggml-org/ccache-action@v1.2.21
686-
with:
687-
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
688-
evict-old-files: 1d
689-
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
690-
691-
- name: Dependencies
692-
id: depends
693-
run: |
694-
sudo apt-get update
695-
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
696-
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
697-
698-
- name: Use OpenVINO Toolkit Cache
699-
if: runner.environment == 'github-hosted'
700-
uses: actions/cache@v5
701-
id: cache-openvino
702-
with:
703-
path: ./openvino_toolkit
704-
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
705-
706-
- name: Setup OpenVINO Toolkit
707-
if: steps.cache-openvino.outputs.cache-hit != 'true'
708-
uses: ./.github/actions/linux-setup-openvino
709-
with:
710-
path: ./openvino_toolkit
711-
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
712-
version_full: ${{ env.OPENVINO_VERSION_FULL }}
713-
714-
- name: Install OpenVINO dependencies
715-
run: |
716-
cd ./openvino_toolkit
717-
chmod +x ./install_dependencies/install_openvino_dependencies.sh
718-
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
719-
720-
- name: Build
721-
id: cmake_build
722-
run: |
723-
source ./openvino_toolkit/setupvars.sh
724-
cmake -B build/ReleaseOV -G Ninja \
725-
-DCMAKE_BUILD_TYPE=Release \
726-
-DGGML_OPENVINO=ON
727-
time cmake --build build/ReleaseOV --config Release -j $(nproc)
728-
729-
- name: Test
730-
id: cmake_test
731-
# TODO: fix and re-enable the `test-llama-archs` test below
732-
run: |
733-
cd ${{ github.workspace }}
734-
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
735-
export GGML_OPENVINO_DEVICE=GPU
736-
fi
737-
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
738-
739659
windows-latest:
740660
runs-on: windows-2025
741661

docs/backend/OPENVINO.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,6 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
244244
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
245245
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
246246
- `llama-server` with the OpenVINO backend supports only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
247-
- For Intel GPU, NPU detection in containers, GPU, NPU user-space drivers/libraries must be present inside the image. We will include in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)
248247

249248
> [!NOTE]
250249
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
@@ -274,8 +273,6 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
274273
Run llama.cpp using the OpenVINO backend Docker container.
275274
Save sample models in `~/models` as [shown above](#3-download-sample-model). This directory will be mounted into the container in the examples below.
276275

277-
> [!NOTE]
278-
> Intel GPU, NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
279276

280277
```bash
281278
# Run Docker container

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
#include <iomanip>
2020
#include <map>
2121
#include <memory>
22-
#include <mutex>
2322
#include <openvino/core/dimension.hpp>
2423
#include <openvino/core/except.hpp>
2524
#include <openvino/core/node.hpp>
@@ -207,8 +206,22 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
207206
break;
208207
}
209208
case GGML_OP_ROPE: {
209+
const int mode = node->op_params[2];
210+
switch (mode) {
211+
case GGML_ROPE_TYPE_NEOX: {
212+
op_case = 0x00010000;
213+
break;
214+
}
215+
case GGML_ROPE_TYPE_IMROPE: {
216+
op_case = 0x00020000;
217+
break;
218+
}
219+
default:
220+
op_case = 0x00000000;
221+
break;
222+
}
210223
if (node->src[0]->op == GGML_OP_VIEW) {
211-
op_case = 2;
224+
op_case = (op_case | 0x00000002);
212225
}
213226
break;
214227
}
@@ -573,9 +586,6 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
573586
}
574587

575588
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
576-
static std::mutex weights_mutex;
577-
std::lock_guard<std::mutex> lock(weights_mutex);
578-
579589
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
580590
auto * nodes = cgraph->nodes;
581591
auto n_nodes = cgraph->n_nodes;

0 commit comments

Comments
 (0)