Skip to content

Commit eddd7a1

Browse files
authored
[SYCL] Optimize Q4_0 mul_mat for Arc770, add scripts (#22291)
* opt arc770 for Q4_0 * add for Q4_0 * update the script * add help script for windows * update guide * fix format issue * convert from dos to unix for format issue * fix missed -sm parameter
1 parent dd2914d commit eddd7a1

9 files changed

Lines changed: 614 additions & 28 deletions

File tree

docs/backend/SYCL.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,12 @@ The packages for FP32 and FP16 would have different accuracy and performance on
5151

5252
## News
5353

54+
- 2026.04
55+
56+
- Optimize mul_mat by reorder feature for data types: Q4_K, Q5_K, Q6_K, Q8_0.
57+
- Fused MoE.
58+
- Upgrade CI and built packages to oneAPI 2025.3.3; support built packages for Ubuntu 24.04.
59+
5460
- 2026.03
5561
- Support Flash-Attention: less memory usage, performance impact depends on LLM.
5662

@@ -349,6 +355,12 @@ Choose one of following methods to run.
349355
./examples/sycl/test.sh
350356
```
351357

358+
- Run llama-server:
359+
360+
```sh
361+
./examples/sycl/start-svr.sh -m PATH/MODEL_FILE
362+
```
363+
352364
2. Command line
353365
Launch inference
354366

@@ -637,10 +649,18 @@ Choose one of following methods to run.
637649

638650
1. Script
639651

652+
- Run test:
653+
640654
```
641655
examples\sycl\win-test.bat
642656
```
643657

658+
- Run llama-server:
659+
660+
```
661+
examples\sycl\win-start-svr.bat -m PATH\MODEL_FILE
662+
```
663+
644664
2. Command line
645665

646666
Launch inference

examples/sycl/start-svr.sh

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
#!/bin/bash
2+
3+
# MIT license
4+
# Copyright (C) 2024 Intel Corporation
5+
# SPDX-License-Identifier: MIT
6+
7+
Help() {
8+
cat << EOF
9+
Usage: $(basename "$0") [OPTIONS]
10+
11+
This script processes files with specified options.
12+
13+
Options:
14+
-h, --help Display this help message and exit.
15+
-c, --context <value> Set context length. Bigger need more memory.
16+
-p, --promote <value> Prompt to start generation with.
17+
-m, --model <value> Full model file path.
18+
-mg,--main-gpu <value> Set main GPU ID (0 - n) for single GPU mode.
19+
-sm,--split-mode <value> How to split the model across multiple GPUs, one of:
20+
- none: use one GPU only
21+
- layer (default): split layers and KV across GPUs
22+
- row: split rows across GPUs
23+
-ngl,--n-gpu-layers <value> Max. number of layers to store in VRAM (default: -1)
24+
-lv,--log-verbosity <value> Set the verbosity threshold. Messages with a higher verbosity will be
25+
ignored. Values:
26+
- 0: generic output
27+
- 1: error
28+
- 2: warning
29+
- 3: info
30+
- 4: debug
31+
32+
33+
EOF
34+
}
35+
36+
BIN_FILE=./build/bin/llama-server
37+
SEED=0
38+
GPUS_SETTING=""
39+
40+
MODEL_FILE=../models/Qwen3.5-4B-Q4_0.gguf
41+
NGL=99
42+
CONTEXT=4096
43+
GGML_SYCL_DEVICE=-1
44+
SPLIT_MODE=layer
45+
LOG_VERBOSE=3
46+
while [[ $# -gt 0 ]]; do
47+
case "$1" in
48+
-c|--context)
49+
CONTEXT=$2
50+
# Shift twice to consume both the option flag and its value
51+
shift
52+
shift
53+
;;
54+
-m|--model)
55+
MODEL_FILE="$2"
56+
# Shift twice to consume both the option flag and its value
57+
shift
58+
shift
59+
;;
60+
-mg|--main-gpu)
61+
GGML_SYCL_DEVICE=$2
62+
SPLIT_MODE=none
63+
# Shift twice to consume both the option flag and its value
64+
shift
65+
shift
66+
;;
67+
-sm|--split-mode)
68+
SPLIT_MODE=$2
69+
# Shift twice to consume both the option flag and its value
70+
shift
71+
shift
72+
;;
73+
-ngl|--n-gpu-layers)
74+
NGL=$2
75+
# Shift twice to consume both the option flag and its value
76+
shift
77+
shift
78+
;;
79+
-lv|--log-verbosity)
80+
LOG_VERBOSE=$2
81+
# Shift twice to consume both the option flag and its value
82+
shift
83+
shift
84+
;;
85+
-h|--help)
86+
Help
87+
exit 0
88+
;;
89+
*)
90+
# Handle unknown options or stop processing options
91+
echo "Invalid option: $1"
92+
# Optional: exit script or shift to treat remaining as positional args
93+
exit 1
94+
;;
95+
esac
96+
done
97+
98+
99+
100+
source /opt/intel/oneapi/setvars.sh
101+
102+
#export GGML_SYCL_DEBUG=1
103+
104+
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
105+
106+
#support malloc device memory more than 4GB.
107+
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
108+
echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
109+
110+
if [ $GGML_SYCL_DEVICE -ne -1 ]; then
111+
echo "Use $GGML_SYCL_DEVICE as main GPU"
112+
#use single GPU only
113+
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
114+
# Export the device selector for the chosen main GPU on the level_zero backend.
# Note: the expansion must be ${VAR}; "${$VAR}" is a Bash "bad substitution" error.
export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
115+
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
116+
else
117+
echo "Use all Intel GPUs, including iGPU & dGPU"
118+
GPUS_SETTING="-sm ${SPLIT_MODE}"
119+
fi
120+
121+
# Print the exact llama-server command that is about to be launched.
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000"
122+
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000
123+
124+

examples/sycl/test.sh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ SEED=0
3838
GPUS_SETTING=""
3939

4040
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
41-
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
41+
MODEL_FILE=../models/llama-2-7b.Q4_0.gguf
4242
NGL=99
4343
CONTEXT=4096
4444
GGML_SYCL_DEVICE=-1
@@ -122,9 +122,10 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
122122
# Export the device selector for the chosen main GPU on the level_zero backend.
# Note: the expansion must be ${VAR}; "${$VAR}" is a Bash "bad substitution" error.
export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
123123
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
124124
else
125-
echo "Use all Intel GPUs, including iGPU & dGPU"
125+
echo "Use all Intel GPUs, including iGPU & dGPU"
126+
GPUS_SETTING="-sm ${SPLIT_MODE}"
126127
fi
127128

128-
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
129-
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
129+
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
130+
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
130131

examples/sycl/win-start-svr.bat

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
:: MIT license
2+
:: Copyright (C) 2024 Intel Corporation
3+
:: SPDX-License-Identifier: MIT
4+
5+
@echo off
6+
setlocal EnableExtensions EnableDelayedExpansion
7+
8+
set "BIN_FILE=.\build\bin\llama-server.exe"
9+
set "SEED=0"
10+
set "GPUS_SETTING="
11+
12+
set "MODEL_FILE=..\models\Qwen3.5-4B-Q4_0.gguf"
13+
set "NGL=99"
14+
set "CONTEXT=4096"
15+
set "GGML_SYCL_DEVICE=-1"
16+
set "SPLIT_MODE=layer"
17+
set "LOG_VERBOSE=3"
18+
19+
if "%~1"=="" goto after_args
20+
21+
:parse_args
22+
if "%~1"=="" goto after_args
23+
24+
if /I "%~1"=="-c" (
25+
if "%~2"=="" goto missing_value
26+
set "CONTEXT=%~2"
27+
shift
28+
shift
29+
goto parse_args
30+
)
31+
if /I "%~1"=="--context" (
32+
if "%~2"=="" goto missing_value
33+
set "CONTEXT=%~2"
34+
shift
35+
shift
36+
goto parse_args
37+
)
38+
39+
if /I "%~1"=="-m" (
40+
if "%~2"=="" goto missing_value
41+
set "MODEL_FILE=%~2"
42+
shift
43+
shift
44+
goto parse_args
45+
)
46+
if /I "%~1"=="--model" (
47+
if "%~2"=="" goto missing_value
48+
set "MODEL_FILE=%~2"
49+
shift
50+
shift
51+
goto parse_args
52+
)
53+
54+
if /I "%~1"=="-mg" (
55+
if "%~2"=="" goto missing_value
56+
set "GGML_SYCL_DEVICE=%~2"
57+
set "SPLIT_MODE=none"
58+
shift
59+
shift
60+
goto parse_args
61+
)
62+
if /I "%~1"=="--main-gpu" (
63+
if "%~2"=="" goto missing_value
64+
set "GGML_SYCL_DEVICE=%~2"
65+
set "SPLIT_MODE=none"
66+
shift
67+
shift
68+
goto parse_args
69+
)
70+
71+
if /I "%~1"=="-sm" (
72+
if "%~2"=="" goto missing_value
73+
set "SPLIT_MODE=%~2"
74+
shift
75+
shift
76+
goto parse_args
77+
)
78+
if /I "%~1"=="--split-mode" (
79+
if "%~2"=="" goto missing_value
80+
set "SPLIT_MODE=%~2"
81+
shift
82+
shift
83+
goto parse_args
84+
)
85+
86+
if /I "%~1"=="-ngl" (
87+
if "%~2"=="" goto missing_value
88+
set "NGL=%~2"
89+
shift
90+
shift
91+
goto parse_args
92+
)
93+
if /I "%~1"=="--n-gpu-layers" (
94+
if "%~2"=="" goto missing_value
95+
set "NGL=%~2"
96+
shift
97+
shift
98+
goto parse_args
99+
)
100+
101+
if /I "%~1"=="-lv" (
102+
if "%~2"=="" goto missing_value
103+
set "LOG_VERBOSE=%~2"
104+
shift
105+
shift
106+
goto parse_args
107+
)
108+
if /I "%~1"=="--log-verbosity" (
109+
if "%~2"=="" goto missing_value
110+
set "LOG_VERBOSE=%~2"
111+
shift
112+
shift
113+
goto parse_args
114+
)
115+
116+
if /I "%~1"=="-h" goto help
117+
if /I "%~1"=="--help" goto help
118+
119+
echo Invalid option: %~1
120+
exit /b 1
121+
122+
:missing_value
123+
echo Missing value for option: %~1
124+
exit /b 1
125+
126+
:help
127+
echo Usage: %~n0 [OPTIONS]
128+
echo.
129+
echo This script processes files with specified options.
130+
echo.
131+
echo Options:
132+
echo -h, --help Display this help message and exit.
133+
echo -c, --context ^<value^> Set context length. Bigger need more memory.
134+
echo -m, --model ^<value^> Full model file path.
135+
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
136+
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
137+
echo - none: use one GPU only
138+
echo - layer (default): split layers and KV across GPUs
139+
echo - row: split rows across GPUs
140+
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
141+
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
142+
echo ignored. Values:
143+
echo - 0: generic output
144+
echo - 1: error
145+
echo - 2: warning
146+
echo - 3: info
147+
echo - 4: debug
148+
exit /b 0
149+
150+
:after_args
151+
152+
REM In Windows CMD, source is not available; call oneAPI setvars if present.
153+
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
154+
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
155+
) else (
156+
echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
157+
)
158+
159+
REM Support malloc device memory more than 4GB.
160+
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
161+
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
162+
163+
if not "%GGML_SYCL_DEVICE%"=="-1" (
164+
echo Use %GGML_SYCL_DEVICE% as main GPU
165+
REM Use single GPU only.
166+
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
167+
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
168+
REM Use delayed expansion: %%VAR%% inside this parenthesised block is expanded
REM when the block is parsed, i.e. before the preceding "set" has executed.
echo ONEAPI_DEVICE_SELECTOR=!ONEAPI_DEVICE_SELECTOR!
169+
) else (
170+
echo Use all Intel GPUs, including iGPU ^& dGPU
171+
set "GPUS_SETTING=-sm %SPLIT_MODE%"
172+
)
173+
174+
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
175+
set "ZES_ENABLE_SYSMAN=1"
176+
%BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
177+
178+
endlocal
179+

0 commit comments

Comments
 (0)