Skip to content

Commit c6136ba

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents 9e80780 + 82d3f4d commit c6136ba

123 files changed

Lines changed: 7533 additions & 5098 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.devops/openvino.Dockerfile

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,19 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
22
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
33
ARG UBUNTU_VERSION=24.04
44

5-
# Optional proxy build arguments - empty by default
5+
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
6+
ARG IGC_VERSION=v2.30.1
7+
ARG IGC_VERSION_FULL=2_2.30.1+20950
8+
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
9+
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
10+
ARG IGDGMM_VERSION=22.9.0
11+
12+
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
13+
ARG NPU_DRIVER_VERSION=v1.32.0
14+
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
15+
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
16+
17+
# Optional proxy build arguments
618
ARG http_proxy=
719
ARG https_proxy=
820

@@ -78,13 +90,47 @@ ARG http_proxy
7890
ARG https_proxy
7991

8092
RUN apt-get update \
81-
&& apt-get install -y libgomp1 libtbb12 curl \
93+
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
8294
&& apt autoremove -y \
8395
&& apt clean -y \
8496
&& rm -rf /tmp/* /var/tmp/* \
8597
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
8698
&& find /var/cache -type f -delete
8799

100+
# Install GPU drivers
101+
ARG IGC_VERSION
102+
ARG IGC_VERSION_FULL
103+
ARG COMPUTE_RUNTIME_VERSION
104+
ARG COMPUTE_RUNTIME_VERSION_FULL
105+
ARG IGDGMM_VERSION
106+
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
107+
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
108+
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
109+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
110+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
111+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
112+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
113+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
114+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
115+
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
116+
&& dpkg --install *.deb \
117+
&& rm -rf /tmp/neo/
118+
119+
# Install NPU drivers
120+
ARG NPU_DRIVER_VERSION
121+
ARG NPU_DRIVER_FULL
122+
ARG LIBZE1_VERSION
123+
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
124+
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
125+
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
126+
&& dpkg --install *.deb \
127+
&& rm -rf /tmp/npu/
128+
129+
RUN cd /tmp \
130+
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
131+
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
132+
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
133+
88134
COPY --from=build /app/lib/ /app/
89135

90136
### Full (all binaries)

.github/workflows/build-cross.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ jobs:
246246
apt-get install -y --no-install-recommends \
247247
build-essential \
248248
glslc \
249+
spirv-headers \
249250
gcc-14-loongarch64-linux-gnu \
250251
g++-14-loongarch64-linux-gnu \
251252
libvulkan-dev:loong64
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
name: CI (openvino)
2+
3+
on:
4+
workflow_dispatch: # allows manual triggering
5+
push:
6+
branches:
7+
- master
8+
paths: [
9+
'.github/workflows/build-openvino.yml',
10+
'**/CMakeLists.txt',
11+
'**/*.cmake',
12+
'**/*.h',
13+
'**/*.hpp',
14+
'**/*.c',
15+
'**/*.cpp',
16+
]
17+
18+
pull_request:
19+
types: [opened, synchronize, reopened]
20+
paths: [
21+
'.github/workflows/build-openvino.yml',
22+
'ggml/src/ggml-openvino/**'
23+
]
24+
25+
concurrency:
26+
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
27+
cancel-in-progress: true
28+
29+
env:
30+
GGML_NLOOP: 3
31+
GGML_N_THREADS: 1
32+
LLAMA_LOG_COLORS: 1
33+
LLAMA_LOG_PREFIX: 1
34+
LLAMA_LOG_TIMESTAMPS: 1
35+
36+
jobs:
37+
ubuntu-24-openvino:
38+
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
39+
40+
concurrency:
41+
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
42+
cancel-in-progress: false
43+
44+
strategy:
45+
matrix:
46+
include:
47+
- variant: cpu
48+
runner: '"ubuntu-24.04"'
49+
openvino_device: "CPU"
50+
- variant: gpu
51+
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
52+
openvino_device: "GPU"
53+
54+
runs-on: ${{ fromJSON(matrix.runner) }}
55+
56+
env:
57+
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
58+
OPENVINO_VERSION_MAJOR: "2026.0"
59+
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
60+
61+
steps:
62+
- name: Clone
63+
id: checkout
64+
uses: actions/checkout@v6
65+
66+
- name: ccache
67+
if: runner.environment == 'github-hosted'
68+
uses: ggml-org/ccache-action@v1.2.21
69+
with:
70+
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
71+
evict-old-files: 1d
72+
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
73+
74+
- name: Dependencies
75+
id: depends
76+
run: |
77+
sudo apt-get update
78+
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
79+
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
80+
81+
- name: Use OpenVINO Toolkit Cache
82+
if: runner.environment == 'github-hosted'
83+
uses: actions/cache@v5
84+
id: cache-openvino
85+
with:
86+
path: ./openvino_toolkit
87+
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
88+
89+
- name: Setup OpenVINO Toolkit
90+
if: steps.cache-openvino.outputs.cache-hit != 'true'
91+
uses: ./.github/actions/linux-setup-openvino
92+
with:
93+
path: ./openvino_toolkit
94+
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
95+
version_full: ${{ env.OPENVINO_VERSION_FULL }}
96+
97+
- name: Install OpenVINO dependencies
98+
run: |
99+
cd ./openvino_toolkit
100+
chmod +x ./install_dependencies/install_openvino_dependencies.sh
101+
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
102+
103+
- name: Build
104+
id: cmake_build
105+
run: |
106+
source ./openvino_toolkit/setupvars.sh
107+
cmake -B build/ReleaseOV -G Ninja \
108+
-DCMAKE_BUILD_TYPE=Release \
109+
-DGGML_OPENVINO=ON
110+
time cmake --build build/ReleaseOV --config Release -j $(nproc)
111+
112+
- name: Test
113+
id: cmake_test
114+
# TODO: fix and re-enable the `test-llama-archs` test below
115+
run: |
116+
cd ${{ github.workspace }}
117+
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
118+
export GGML_OPENVINO_DEVICE=GPU
119+
fi
120+
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000

.github/workflows/build-self-hosted.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,10 @@ jobs:
265265
ggml-ci-intel-openvino-gpu-low-perf:
266266
runs-on: [self-hosted, Linux, Intel, OpenVINO]
267267

268+
concurrency:
269+
group: openvino-gpu-${{ github.head_ref || github.ref }}
270+
cancel-in-progress: false
271+
268272
env:
269273
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
270274
OPENVINO_VERSION_MAJOR: "2026.0"

.github/workflows/build.yml

Lines changed: 0 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -656,86 +656,6 @@ jobs:
656656
-DGGML_SYCL_F16=ON
657657
time cmake --build build --config Release -j $(nproc)
658658
659-
ubuntu-24-openvino:
660-
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
661-
strategy:
662-
matrix:
663-
include:
664-
- variant: cpu
665-
runner: '"ubuntu-24.04"'
666-
openvino_device: "CPU"
667-
- variant: gpu
668-
runner: '["self-hosted","Linux","X64","Intel"]'
669-
openvino_device: "GPU"
670-
671-
runs-on: ${{ fromJSON(matrix.runner) }}
672-
673-
env:
674-
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
675-
OPENVINO_VERSION_MAJOR: "2026.0"
676-
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
677-
678-
steps:
679-
- name: Clone
680-
id: checkout
681-
uses: actions/checkout@v6
682-
683-
- name: ccache
684-
if: runner.environment == 'github-hosted'
685-
uses: ggml-org/ccache-action@v1.2.21
686-
with:
687-
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
688-
evict-old-files: 1d
689-
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
690-
691-
- name: Dependencies
692-
id: depends
693-
run: |
694-
sudo apt-get update
695-
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
696-
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
697-
698-
- name: Use OpenVINO Toolkit Cache
699-
if: runner.environment == 'github-hosted'
700-
uses: actions/cache@v5
701-
id: cache-openvino
702-
with:
703-
path: ./openvino_toolkit
704-
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
705-
706-
- name: Setup OpenVINO Toolkit
707-
if: steps.cache-openvino.outputs.cache-hit != 'true'
708-
uses: ./.github/actions/linux-setup-openvino
709-
with:
710-
path: ./openvino_toolkit
711-
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
712-
version_full: ${{ env.OPENVINO_VERSION_FULL }}
713-
714-
- name: Install OpenVINO dependencies
715-
run: |
716-
cd ./openvino_toolkit
717-
chmod +x ./install_dependencies/install_openvino_dependencies.sh
718-
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
719-
720-
- name: Build
721-
id: cmake_build
722-
run: |
723-
source ./openvino_toolkit/setupvars.sh
724-
cmake -B build/ReleaseOV -G Ninja \
725-
-DCMAKE_BUILD_TYPE=Release \
726-
-DGGML_OPENVINO=ON
727-
time cmake --build build/ReleaseOV --config Release -j $(nproc)
728-
729-
- name: Test
730-
id: cmake_test
731-
# TODO: fix and re-enable the `test-llama-archs` test below
732-
run: |
733-
cd ${{ github.workspace }}
734-
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
735-
export GGML_OPENVINO_DEVICE=GPU
736-
fi
737-
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
738-
739659
windows-latest:
740660
runs-on: windows-2025
741661

CODEOWNERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
/ci/ @ggerganov
2424
/cmake/ @ggerganov
2525
/common/ @ggml-org/llama-common
26+
/common/fit.* @JohannesGaessler
2627
/common/jinja/ @CISC
2728
/common/ngram-map.* @srogmann
2829
/convert_*.py @CISC

common/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ add_library(${TARGET}
7373
debug.h
7474
download.cpp
7575
download.h
76+
fit.cpp
77+
fit.h
7678
hf-cache.cpp
7779
hf-cache.h
7880
http.h

common/arg.cpp

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
292292
hf_tag = "default";
293293
}
294294

295-
std::string model_endpoint = get_model_endpoint();
295+
std::string model_endpoint = common_get_model_endpoint();
296296
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
297297

298298
// prepare local path for caching
@@ -1339,13 +1339,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
13391339
}
13401340
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
13411341
add_opt(common_arg(
1342-
{"--clear-idle"},
1343-
{"--no-clear-idle"},
1342+
{"--cache-idle-slots"},
1343+
{"--no-cache-idle-slots"},
13441344
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
13451345
[](common_params & params, bool value) {
1346-
params.clear_idle = value;
1346+
params.cache_idle_slots = value;
13471347
}
1348-
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
1348+
).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
13491349
add_opt(common_arg(
13501350
{"--context-shift"},
13511351
{"--no-context-shift"},
@@ -2449,6 +2449,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24492449
}
24502450
}
24512451
).set_env("LLAMA_ARG_FIT"));
2452+
add_opt(common_arg(
2453+
{ "-fitp", "--fit-print" }, "[on|off]",
2454+
string_format("print the estimated required memory ('on' or 'off', default: '%s')", params.fit_params_print ? "on" : "off"),
2455+
[](common_params & params, const std::string & value) {
2456+
if (is_truthy(value)) {
2457+
params.fit_params_print = true;
2458+
} else if (is_falsey(value)) {
2459+
params.fit_params_print = false;
2460+
} else {
2461+
throw std::runtime_error(
2462+
string_format("error: unknown value for --fit-print: '%s'\n", value.c_str()));
2463+
}
2464+
}
2465+
).set_examples({LLAMA_EXAMPLE_FIT_PARAMS}).set_env("LLAMA_ARG_FIT_ESTIMATE"));
24522466
add_opt(common_arg(
24532467
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
24542468
string_format("target margin per device for --fit, comma-separated list of values, "
@@ -3131,14 +3145,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
31313145
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
31323146
[](common_params & params, int value) {
31333147
if (value < -1) { throw std::invalid_argument("invalid value"); }
3134-
params.reasoning_budget = value;
3148+
params.sampling.reasoning_budget_tokens = value;
31353149
}
31363150
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
31373151
add_opt(common_arg(
31383152
{"--reasoning-budget-message"}, "MESSAGE",
31393153
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
31403154
[](common_params & params, const std::string & value) {
3141-
params.reasoning_budget_message = value;
3155+
params.sampling.reasoning_budget_message = value;
31423156
}
31433157
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
31443158
add_opt(common_arg(
@@ -3911,6 +3925,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
39113925
}
39123926
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
39133927

3928+
add_opt(common_arg(
3929+
{"--spec-default"},
3930+
string_format("enable default speculative decoding config"),
3931+
[](common_params & params) {
3932+
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
3933+
params.speculative.ngram_size_n = 24;
3934+
params.speculative.n_min = 48;
3935+
params.speculative.n_max = 64;
3936+
}
3937+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3938+
39143939
return ctx_arg;
39153940
}
39163941

0 commit comments

Comments
 (0)