Commit 09e42de

create test.sh to enhance the parameters for testing, update the guide, rm useless script
1 parent 41ea261 commit 09e42de

6 files changed: 149 additions & 59 deletions

docs/backend/SYCL.md

Lines changed: 12 additions & 22 deletions
@@ -119,7 +119,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
 *Notes:*

 - **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-completion`.
   - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.

 - **Execution Unit (EU)**
@@ -423,16 +423,12 @@ Choose one of following methods to run.
 - Use device 0:

 ```sh
-./examples/sycl/run-llama2.sh 0
-# OR
-./examples/sycl/run-llama3.sh 0
+./examples/sycl/test.sh -mg 0
 ```
 - Use multiple devices:

 ```sh
-./examples/sycl/run-llama2.sh
-# OR
-./examples/sycl/run-llama3.sh
+./examples/sycl/test.sh
 ```

 2. Command line
@@ -455,13 +451,13 @@ Examples:
 - Use device 0:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap
 ```

 - Use multiple devices:

 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer --mmap
 ```

 *Notes:*
@@ -577,13 +573,13 @@ Or, use CMake presets to build:

 ```sh
 cmake --preset x64-windows-sycl-release
-cmake --build build-x64-windows-sycl-release -j --target llama-cli
+cmake --build build-x64-windows-sycl-release -j --target llama-completion

 cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
-cmake --build build-x64-windows-sycl-release -j --target llama-cli
+cmake --build build-x64-windows-sycl-release -j --target llama-completion

 cmake --preset x64-windows-sycl-debug
-cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+cmake --build build-x64-windows-sycl-debug -j --target llama-completion
 ```

 #### 3. Visual Studio
@@ -608,7 +604,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro
 - For a minimal experimental setup, you can build only the inference executable using:

 ```Powershell
-cmake --build build --config Release -j --target llama-cli
+cmake --build build --config Release -j --target llama-completion
 ```

 ##### - Generating a Visual Studio Solution
@@ -714,13 +710,7 @@ Choose one of following methods to run.
 1. Script

 ```
-examples\sycl\win-run-llama-2.bat
-```
-
-or
-
-```
-examples\sycl\win-run-llama-3.bat
+examples\sycl\win-test.bat
 ```

 2. Command line
@@ -744,13 +734,13 @@ Examples:
 - Use device 0:

 ```
-build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0
+build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap
 ```

 - Use multiple devices:

 ```
-build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer
+build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer --mmap
 ```
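For readers who want to try the new script right away, the commands below are a minimal usage sketch that only uses flags defined by `examples/sycl/test.sh` in this commit; the model path and context length are illustrative values, not part of the diff.

```sh
# Single-GPU run: -mg selects device 0 and, per the script, forces --split-mode none.
./examples/sycl/test.sh -mg 0 -m models/llama-2-7b.Q4_0.gguf -c 4096

# Multi-GPU run with debug-level logging; split mode defaults to "layer".
./examples/sycl/test.sh -lv 4
```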

examples/sycl/run-llama2.sh

Lines changed: 3 additions & 2 deletions
@@ -18,13 +18,14 @@ CONTEXT=4096
 #support malloc device memory more than 4GB.
 export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1

+LOAD_MODE='--mmap'
 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1
     echo "use $GGML_SYCL_DEVICE as main GPU"
     #use single GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}

 else
     #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
 fi
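As a reminder (the argument handling above is unchanged by this commit), the script chooses its GPU mode from whether a device index is passed; a usage sketch:

```sh
# Use all available Intel GPUs (the else branch of the script).
./examples/sycl/run-llama2.sh

# Use only device 0 as the main GPU (the -mg 0 -sm none branch).
./examples/sycl/run-llama2.sh 0
```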

examples/sycl/run-llama3.sh

Lines changed: 0 additions & 31 deletions
This file was deleted.

examples/sycl/test.sh

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+Help() {
+    cat << EOF
+Usage: $(basename "$0") [OPTIONS]
+
+This script runs llama-completion with the specified options.
+
+Options:
+  -h,   --help                  Display this help message and exit.
+  -c,   --context <value>       Set the context length. Larger values need more memory.
+  -p,   --promote <value>       Prompt to start generation with.
+  -m,   --model <value>         Full model file path.
+  -mg,  --main-gpu <value>      Set main GPU ID (0 - n) for single GPU mode.
+  -sm,  --split-mode <value>    How to split the model across multiple GPUs, one of:
+                                  - none: use one GPU only
+                                  - layer (default): split layers and KV across GPUs
+                                  - row: split rows across GPUs
+  -ngl, --n-gpu-layers <value>  Max. number of layers to store in VRAM (default: -1)
+  -lv,  --log-verbosity <value> Set the verbosity threshold. Messages with a higher
+                                verbosity will be ignored. Values:
+                                  - 0: generic output
+                                  - 1: error
+                                  - 2: warning
+                                  - 3: info
+                                  - 4: debug
+
+EOF
+}
+
+BIN_FILE=./build/bin/llama-completion
+SEED=0
+GPUS_SETTING=""
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+NGL=99
+CONTEXT=4096
+GGML_SYCL_DEVICE=-1
+SPLIT_MODE=layer
+LOG_VERBOSE=3
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -c|--context)
+            CONTEXT=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -p|--promote)
+            INPUT_PROMPT="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -m|--model)
+            MODEL_FILE="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -mg|--main-gpu)
+            GGML_SYCL_DEVICE=$2
+            SPLIT_MODE=none
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -sm|--split-mode)
+            SPLIT_MODE=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -ngl|--n-gpu-layers)
+            NGL=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -lv|--log-verbosity)
+            LOG_VERBOSE=$2
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -h|--help)
+            Help
+            exit 0
+            ;;
+        *)
+            # Handle unknown options or stop processing options
+            echo "Invalid option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+#support malloc device memory more than 4GB.
+export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
+
+if [ $GGML_SYCL_DEVICE -ne -1 ]; then
+    echo "Use $GGML_SYCL_DEVICE as main GPU"
+    #use single GPU only
+    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
+    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
+    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
+else
+    echo "Use all Intel GPUs, including iGPU & dGPU"
+fi
+
+echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p \"${INPUT_PROMPT}\" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap"
+ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
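To make the control flow above concrete, here is a sketch of what the script resolves to for a single-GPU run such as `./examples/sycl/test.sh -mg 1`, assuming the defaults defined in the script (model path, seed 0, context 4096, 99 GPU layers); the actual command varies with the options you pass.

```sh
# Environment the script exports for a single-GPU run on device 1
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
export ONEAPI_DEVICE_SELECTOR="level_zero:1"

# Final command it executes with the default settings
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion \
    -m models/llama-2-7b.Q4_0.gguf -no-cnv \
    -p "Building a website can be done in 10 simple steps:\nStep 1:" \
    -n 400 -e -ngl 99 -s 0 -c 4096 -mg 1 -sm none -lv 3 --mmap
```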

examples/sycl/win-run-llama2.bat

Lines changed: 2 additions & 2 deletions
@@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"

 :: support malloc device memory more than 4GB.
 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
-.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0
+set LOAD_MODE="--mmap"
+.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%

Lines changed: 2 additions & 2 deletions
@@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"

 :: support malloc device memory more than 4GB.
 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
-.\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99
+set LOAD_MODE="--mmap"
+.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
