|
| 1 | +#!/bin/bash |
| 2 | + |
| 3 | +# MIT license |
| 4 | +# Copyright (C) 2024 Intel Corporation |
| 5 | +# SPDX-License-Identifier: MIT |
| 6 | + |
#######################################
# Print the usage text of this launcher.
# Only options that the argument parser actually accepts are listed, and the
# documented defaults are the ones this script sets (not the server's own).
# Globals:   none
# Arguments: none
# Outputs:   usage text on stdout
#######################################
Help() {
  cat << EOF
Usage: ${0##*/} [OPTIONS]

This script starts llama-server with the specified options.

Options:
  -h, --help                    Display this help message and exit.
  -c, --context <value>         Set context length. Bigger need more memory.
  -m, --model <value>           Full model file path.
  -mg,--main-gpu <value>        Set main GPU ID (0 - n) for single GPU mode.
  -sm,--split-mode <value>      How to split the model across multiple GPUs, one of:
                                  - none: use one GPU only
                                  - layer (default): split layers and KV across GPUs
                                  - row: split rows across GPUs
  -ngl,--n-gpu-layers <value>   Max. number of layers to store in VRAM (default: 99)
  -lv,--log-verbosity <value>   Set the verbosity threshold. Messages with a higher verbosity will be
                                ignored. Values:
                                  - 0: generic output
                                  - 1: error
                                  - 2: warning
                                  - 3: info
                                  - 4: debug

EOF
}
| 35 | + |
# --- Fixed settings: no command-line option changes these ---
BIN_FILE="./build/bin/llama-server"
SEED="0"
# Extra GPU-related arguments for the server; empty until a main GPU is chosen.
GPUS_SETTING=""

# --- Defaults that command-line options may override ---
MODEL_FILE="../models/Qwen3.5-4B-Q4_0.gguf"   # -m  / --model
CONTEXT="4096"                                # -c  / --context
NGL="99"                                      # -ngl / --n-gpu-layers
LOG_VERBOSE="3"                               # -lv / --log-verbosity
SPLIT_MODE="layer"                            # -sm / --split-mode
# -1 means "no main GPU selected" (-mg / --main-gpu replaces it with a GPU ID).
GGML_SYCL_DEVICE="-1"
#######################################
# Parse the command-line options into the launcher's configuration globals.
# Globals (written):
#   CONTEXT, MODEL_FILE, GGML_SYCL_DEVICE, SPLIT_MODE, NGL, LOG_VERBOSE
# Arguments:
#   the script's command-line arguments
# Outputs:
#   usage text on stdout for -h/--help; diagnostics on stderr
# Returns:
#   does not return on -h/--help (exit 0), on an unknown option (exit 1),
#   or when a value-taking option is given without a value (exit 1)
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    # First pass: handle help, reject unknown options, and make sure every
    # value-taking option really has a value following it.
    case "$1" in
      -h|--help)
        Help
        exit 0
        ;;
      -c|--context|-m|--model|-mg|--main-gpu|-sm|--split-mode|\
      -ngl|--n-gpu-layers|-lv|--log-verbosity)
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value" >&2
          exit 1
        fi
        ;;
      *)
        echo "Invalid option: $1" >&2
        exit 1
        ;;
    esac

    # Second pass: store the value.
    case "$1" in
      -c|--context)
        CONTEXT=$2
        ;;
      -m|--model)
        MODEL_FILE=$2
        ;;
      -mg|--main-gpu)
        GGML_SYCL_DEVICE=$2
        # Choosing a main GPU switches to single-GPU mode; a later
        # -sm/--split-mode on the command line can still override this.
        SPLIT_MODE=none
        ;;
      -sm|--split-mode)
        SPLIT_MODE=$2
        ;;
      -ngl|--n-gpu-layers)
        NGL=$2
        ;;
      -lv|--log-verbosity)
        LOG_VERBOSE=$2
        ;;
    esac

    # Consume both the option flag and its value (both are known to exist).
    shift 2
  done
}

parse_args "$@"
# The shifts inside parse_args only affect the function's own argument list.
# Clear the script-level arguments as well, so that a file sourced later
# without arguments does not see this script's options as its own.
set --
| 97 | + |
| 98 | + |
| 99 | + |
# Load the oneAPI environment into this shell.
source /opt/intel/oneapi/setvars.sh

#export GGML_SYCL_DEBUG=1

# ZES_ENABLE_SYSMAN=1 (set on the launch line below): support to get free
# memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use
# when --split-mode = layer.

# Support malloc of device memory of more than 4GB.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"

#######################################
# Assemble the llama-server command line and choose the GPU(s) to expose.
# Globals (read):
#   BIN_FILE, MODEL_FILE, NGL, SEED, CONTEXT, LOG_VERBOSE, SPLIT_MODE,
#   GGML_SYCL_DEVICE (-1 or empty means "no main GPU selected")
# Globals (written):
#   SERVER_CMD              array holding the full command line
#   ONEAPI_DEVICE_SELECTOR  exported only when a main GPU is selected
# Arguments: none
# Outputs:   progress messages on stdout
#######################################
build_server_cmd() {
  local main_gpu="${GGML_SYCL_DEVICE:--1}"
  # The split mode is always passed on, so -sm also works without -mg.
  local -a gpu_args=(-sm "${SPLIT_MODE}")

  if [[ "${main_gpu}" != "-1" ]]; then
    echo "Use ${main_gpu} as main GPU"
    # Use a single GPU only: restrict the runtime to the chosen device.
    export ONEAPI_DEVICE_SELECTOR="level_zero:${main_gpu}"
    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
    # NOTE(review): with the selector in place the chosen GPU should be the
    # only device the server can enumerate, so it is addressed as index 0
    # rather than by its system-wide ID - confirm on a multi-GPU host.
    gpu_args=(-mg 0 -sm "${SPLIT_MODE}")
  else
    echo "Use all Intel GPUs, including iGPU & dGPU"
  fi

  # An array keeps every argument intact, e.g. a model path with spaces.
  # The server listens on 0.0.0.0 (all IPv4 interfaces), port 8000.
  SERVER_CMD=(
    "${BIN_FILE}"
    -m "${MODEL_FILE}"
    -ngl "${NGL}"
    -s "${SEED}"
    -c "${CONTEXT}"
    "${gpu_args[@]}"
    -lv "${LOG_VERBOSE}"
    --mmap
    --host 0.0.0.0
    --port 8000
  )
}

build_server_cmd

# Print exactly the command that is about to run, shell-quoted.
printf 'run cmd: ZES_ENABLE_SYSMAN=1'
printf ' %q' "${SERVER_CMD[@]}"
printf '\n'

ZES_ENABLE_SYSMAN=1 "${SERVER_CMD[@]}"
| 122 | + |
| 123 | + |
0 commit comments