@@ -23,7 +23,7 @@ get_enable_function() {
2323}
2424
2525function start_vllm_services() {
26- COMPOSE_FILE=" compose_vllm .yaml"
26+ COMPOSE_FILE=" compose .yaml"
2727 echo " stop former service..."
2828 docker compose -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE down
2929
@@ -63,24 +63,11 @@ function start_vllm_services() {
6363 sudo chown -R 1000:1000 ${HF_CACHE}
6464 HF_ENDPOINT=https://hf-mirror.com
6565 # vllm ENV
66- export NGINX_PORT=8086
67- export vLLM_ENDPOINT=" http://${HOST_IP} :${NGINX_PORT} "
68- read -p " DP number(how many containers to run vLLM) [1] , press Enter to confirm, or type a new value:" DP_NUM; DP_NUM=${DP_NUM:- 1}
69- read -p " Tensor parallel size(your tp size [1]), press Enter to confirm, or type a new value:" TENSOR_PARALLEL_SIZE; TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:- 1}
70-
71- for (( x= 0 ; x< DP_NUM; x++ )) ; do
72- start_gpu=$(( x * TENSOR_PARALLEL_SIZE ))
73- default_gpu_list=$( seq -s, $start_gpu $(( start_gpu + TENSOR_PARALLEL_SIZE - 1 )) )
74-
75- read -p " selected XPU(your selected_XPU_${x} [${default_gpu_list} ]) , press Enter to confirm, or type a new value:" input_gpu_list
76- selected_gpu_list=${input_gpu_list:- $default_gpu_list }
66+ export VLLM_SERVICE_PORT_A770=8086
7767
78- export SELECTED_XPU_${x} =" $selected_gpu_list "
79- export VLLM_SERVICE_PORT_${x} =" 8$(( x+ 1 )) 00"
80- done
68+ read -p " Tensor parallel size(your tp size [1]), press Enter to confirm, or type a new value:" TENSOR_PARALLEL_SIZE; TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:- 1}
8169 CCL_DG2_USM=$( get_user_input " Set USM (Core=1, Xeon=0, default=0)" 0)
8270 export HOST_IP=${HOST_IP}
83- export VLLM_SERVICE_PORT_0=8100
8471 # export ENV
8572 export MODEL_PATH=${MODEL_PATH}
8673 export DOC_PATH=${DOC_PATH}
@@ -90,18 +77,14 @@ function start_vllm_services() {
9077 export no_proxy=" localhost, 127.0.0.1, 192.168.1.1, ${HOST_IP} "
9178 export MILVUS_ENABLED=${MILVUS_ENABLED}
9279 export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND}
93- export SELECTED_XPU_0=${SELECTED_XPU_0}
9480 export TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE}
9581 export CCL_DG2_USM=${CCL_DG2_USM}
9682 export VIDEOGROUPID=$( getent group video | cut -d: -f3)
9783 export RENDERGROUPID=$( getent group render | cut -d: -f3)
9884
99- bash $WORKPATH /nginx/nginx-conf-generator.sh $DP_NUM $WORKPATH /nginx/nginx.conf
100- export NGINX_CONFIG_PATH=" ${WORKPATH} /nginx/nginx.conf"
10185
10286 # Start Docker Containers
103- bash $WORKPATH /docker_compose/intel/gpu/arc/multi-arc-yaml-generator.sh $DP_NUM $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE
104- docker compose -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d
87+ docker compose --profile a770 -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d
10588 echo " ipex-llm-serving-xpu is booting, please wait..."
10689 n=0
10790 until [[ " $n " -ge 100 ]]; do
@@ -176,6 +159,7 @@ function start_services() {
176159 export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND}
177160 export VIDEOGROUPID=$( getent group video | cut -d: -f3)
178161 export RENDERGROUPID=$( getent group render | cut -d: -f3)
162+ export MAX_MODEL_LEN=5000
179163
180164 # Start Docker Containers
181165 COMPOSE_FILE=" compose.yaml"
@@ -199,10 +183,11 @@ function check_baai_folder() {
199183
200184function quick_start_vllm_services() {
201185 WORKPATH=$( dirname " $PWD " )
202- COMPOSE_FILE=" compose_vllm .yaml"
186+ COMPOSE_FILE=" compose .yaml"
203187 EC_RAG_SERVICE_PORT=16010
204188 docker compose -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE down
205189
190+ ip_address=$( hostname -I | awk ' {print $1}' )
206191 export HOST_IP=${HOST_IP:- " ${ip_address} " }
207192 export MODEL_PATH=${MODEL_PATH:- " ${PWD} /models" }
208193 export DOC_PATH=${DOC_PATH:- " $WORKPATH /tests" }
@@ -211,21 +196,17 @@ function quick_start_vllm_services() {
211196 export MILVUS_ENABLED=${MILVUS_ENABLED:- 1}
212197 export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:- 2}
213198 export HF_ENDPOINT=${HF_ENDPOINT:- https:// hf-mirror.com}
214- export NGINX_PORT=${NGINX_PORT:- 8086}
215- export NGINX_PORT_0=${NGINX_PORT_0:- 8100}
216- export VLLM_SERVICE_PORT_0=${VLLM_SERVICE_PORT_0:- 8100}
217199 export TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:- 1}
218- export SELECTED_XPU_0=${SELECTED_XPU_0:- 0}
219200 export MAX_NUM_SEQS=${MAX_NUM_SEQS:- 64}
220- export MAX_NUM_BATCHED_TOKENS =${MAX_NUM_BATCHED_TOKENS :- 4000 }
221- export MAX_MODEL_LEN =${MAX_MODEL_LEN :- 3000 }
201+ export MAX_MODEL_LEN =${MAX_MODEL_LEN :- 10240 }
202+ export MAX_NUM_BATCHED_TOKENS =${MAX_NUM_BATCHED_TOKENS :- 10240 }
222203 export LOAD_IN_LOW_BIT=${LOAD_IN_LOW_BIT:- fp8}
223204 export CCL_DG2_USM=${CCL_DG2_USM:- 0}
224- export vLLM_ENDPOINT=${vLLM_ENDPOINT:- " http://${HOST_IP} :${NGINX_PORT} " }
225205 export LLM_MODEL=${LLM_MODEL:- Qwen/ Qwen3-8B}
226206 export LLM_MODEL_PATH=${LLM_MODEL_PATH:- " ${MODEL_PATH} /Qwen/Qwen3-8B" }
227207 export VIDEOGROUPID=$( getent group video | cut -d: -f3)
228208 export RENDERGROUPID=$( getent group render | cut -d: -f3)
209+ export VLLM_SERVICE_PORT_A770=8086
229210
230211 check_baai_folder
231212 export HF_CACHE=${HF_CACHE:- " ${HOME} /.cache" }
@@ -237,11 +218,8 @@ function quick_start_vllm_services() {
237218 sudo chown -R 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH}
238219 sudo chown -R 1000:1000 ${HF_CACHE}
239220 cd $WORKPATH /docker_compose/intel/gpu/arc
240- bash $WORKPATH /nginx/nginx-conf-generator.sh $DP_NUM $WORKPATH /nginx/nginx.conf
241- export NGINX_CONFIG_PATH=${NGINX_CONFIG_PATH:- " $WORKPATH /nginx/nginx.conf" }
242221
243- bash $WORKPATH /docker_compose/intel/gpu/arc/multi-arc-yaml-generator.sh $DP_NUM $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE
244- docker compose -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d
222+ docker compose --profile a770 -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d
245223 echo " ipex-llm-serving-xpu is booting, please wait..."
246224 n=0
247225 until [[ " $n " -ge 100 ]]; do
@@ -272,6 +250,7 @@ function quick_start_ov_services() {
272250 export MODEL_PATH=${MODEL_PATH:- " ${PWD} /models" }
273251 export VIDEOGROUPID=$( getent group video | cut -d: -f3)
274252 export RENDERGROUPID=$( getent group render | cut -d: -f3)
253+ export MAX_MODEL_LEN=5000
275254
276255 check_baai_folder
277256 export HF_CACHE=${HF_CACHE:- " ${HOME} /.cache" }
@@ -292,7 +271,7 @@ function quick_start_ov_services() {
292271
293272
294273function start_vLLM_B60_services() {
295- COMPOSE_FILE=" compose_vllm_b60 .yaml"
274+ COMPOSE_FILE=" compose .yaml"
296275 echo " stop former service..."
297276 export MODEL_PATH=${MODEL_PATH:- " ${PWD} /models" }
298277 docker compose -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE down
@@ -339,7 +318,7 @@ function start_vLLM_B60_services() {
339318 NO_ENABLE_PREFIX_CACHING=$( get_user_input " NO_ENABLE_PREFIX_CACHING (disable prefix caching, 1=disable/0=enable)" " 1" )
340319 MAX_NUM_BATCHED_TOKENS=$( get_user_input " MAX_NUM_BATCHED_TOKENS (max number of batched tokens)" " 8192" )
341320 DISABLE_LOG_REQUESTS=$( get_user_input " DISABLE_LOG_REQUESTS (disable request logs, 1=disable/0=enable)" " 1" )
342- MAX_MODEL_LEN=$( get_user_input " MAX_MODEL_LEN (max model context length, e.g. 49152 /10240)" " 49152 " )
321+ MAX_MODEL_LEN=$( get_user_input " MAX_MODEL_LEN (max model context length, e.g. 40000 /10240)" " 40000 " )
343322 BLOCK_SIZE=$( get_user_input " BLOCK_SIZE (vLLM block size)" " 64" )
344323 QUANTIZATION=$( get_user_input " QUANTIZATION (model quantization method, e.g. fp8/int4)" " fp8" )
345324 # export ENV
@@ -371,7 +350,7 @@ function start_vLLM_B60_services() {
371350 export QUANTIZATION=${QUANTIZATION}
372351
373352 # Start Docker Containers
374- docker compose -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d
353+ docker compose --profile b60 - f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d
375354 echo " ipex-llm-serving-xpu is booting, please wait..."
376355 n=0
377356 until [[ " $n " -ge 100 ]]; do
@@ -389,10 +368,11 @@ function start_vLLM_B60_services() {
389368
390369function quick_start_vllm_B60_services() {
391370 WORKPATH=$( dirname " $PWD " )
392- COMPOSE_FILE=" compose_vllm_b60 .yaml"
371+ COMPOSE_FILE=" compose .yaml"
393372 EC_RAG_SERVICE_PORT=16010
394373 docker compose -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE down
395374
375+ ip_address=$( hostname -I | awk ' {print $1}' )
396376 export HOST_IP=${HOST_IP:- " ${ip_address} " }
397377 export MODEL_PATH=${MODEL_PATH:- " ${PWD} /models" }
398378 export DOC_PATH=${DOC_PATH:- " $WORKPATH /tests" }
@@ -403,26 +383,26 @@ function quick_start_vllm_B60_services() {
403383 export VIDEOGROUPID=$( getent group video | cut -d: -f3)
404384 export RENDERGROUPID=$( getent group render | cut -d: -f3)
405385 # export vllm ENV
406- export DP=${DP:- 4 }
386+ export DP=${DP:- 1 }
407387 export TP=${TP:- 1}
408388 export DTYPE=${DTYPE:- float16}
409- export ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:- 0,1,2,3 }
389+ export ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:- 0}
410390 export ENFORCE_EAGER=${ENFORCE_EAGER:- 1}
411391 export TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE:- 1}
412392 export DISABLE_SLIDING_WINDOW=${DISABLE_SLIDING_WINDOW:- 1}
413393 export GPU_MEMORY_UTIL=${GPU_MEMORY_UTIL:- 0.8}
414394 export NO_ENABLE_PREFIX_CACHING=${NO_ENABLE_PREFIX_CACHING:- 1}
415395 export MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:- 8192}
416396 export DISABLE_LOG_REQUESTS=${disable_LOG_REQUESTS:- 1}
417- export MAX_MODEL_LEN=${MAX_MODEL_LEN:- 49152 }
397+ export MAX_MODEL_LEN=${MAX_MODEL_LEN:- 40000 }
418398 export BLOCK_SIZE=${BLOCK_SIZE:- 64}
419399 export QUANTIZATION=${QUANTIZATION:- fp8}
420400
421401
422402 check_baai_folder
423403 export no_proxy=" localhost, 127.0.0.1, 192.168.1.1, ${HOST_IP} "
424404 sudo chown -R 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH}
425- docker compose -f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d
405+ docker compose --profile b60 - f $WORKPATH /docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d
426406 echo " ipex-llm-serving-xpu is booting, please wait..."
427407 n=0
428408 until [[ " $n " -ge 100 ]]; do
@@ -450,10 +430,10 @@ function main {
450430 start_services
451431 fi
452432 else
453- export SERVICE_TYPE =${SERVICE_TYPE :- " vLLM_A770 " }
454- if [[ " $SERVICE_TYPE " == " vLLM_A770" || " $SERVICE_TYPE " == " vLLM" ]]; then
433+ export COMPOSE_PROFILES =${COMPOSE_PROFILES :- " " }
434+ if [[ " $COMPOSE_PROFILES " == " vLLM_A770" || " $COMPOSE_PROFILES " == " vLLM" || " $COMPOSE_PROFILES " == " vllm_on_a770 " ]]; then
455435 quick_start_vllm_services
456- elif [[ " $SERVICE_TYPE " == " vLLM_B60" || " $SERVICE_TYPE " == " vLLM_b60" ]]; then
436+ elif [[ " $COMPOSE_PROFILES " == " vLLM_B60" || " $COMPOSE_PROFILES " == " vLLM_b60" || " $COMPOSE_PROFILES " == " vllm_on_b60 " ]]; then
457437 quick_start_vllm_B60_services
458438 else
459439 quick_start_ov_services
0 commit comments