# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
3+
34services :
45 etcd :
56 container_name : milvus-etcd
67 image : quay.io/coreos/etcd:v3.5.5
8+ restart : always
79 environment :
810 - ETCD_AUTO_COMPACTION_MODE=revision
911 - ETCD_AUTO_COMPACTION_RETENTION=1000
@@ -22,6 +24,7 @@ services:
2224 minio :
2325 container_name : milvus-minio
2426 image : minio/minio:RELEASE.2023-03-20T20-16-18Z
27+ restart : always
2528 environment :
2629 MINIO_ACCESS_KEY : minioadmin
2730 MINIO_SECRET_KEY : minioadmin
@@ -41,14 +44,15 @@ services:
4144 milvus-standalone :
4245 container_name : milvus-standalone
4346 image : milvusdb/milvus:v2.4.6
47+ restart : always
4448 command : ["milvus", "run", "standalone"]
4549 security_opt :
4650 - seccomp:unconfined
4751 environment :
4852 ETCD_ENDPOINTS : etcd:2379
4953 MINIO_ADDRESS : minio:9000
5054 volumes :
51- - ./milvus.yaml:/milvus/configs/milvus.yaml
55+ - ./milvus-config .yaml:/milvus/configs/milvus.yaml
5256 - ${DOCKER_VOLUME_DIRECTORY:-${PWD}}/volumes/milvus:/var/lib/milvus
5357 healthcheck :
5458 test : ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
@@ -71,10 +75,12 @@ services:
7175 no_proxy : ${no_proxy}
7276 http_proxy : ${http_proxy}
7377 https_proxy : ${https_proxy}
74- vLLM_ENDPOINT : ${vLLM_ENDPOINT:-http://${HOST_IP}:${NGINX_PORT:-8086}}
78+ vLLM_ENDPOINT : ${vLLM_ENDPOINT:-http://${HOST_IP}:${VLLM_SERVICE_PORT_B60:-8086}}
79+ LLM_MODEL : ${LLM_MODEL}
7580 ENABLE_BENCHMARK : ${ENABLE_BENCHMARK:-false}
76- MAX_MODEL_LEN : ${MAX_MODEL_LEN:-5000 }
81+ MAX_MODEL_LEN : ${MAX_MODEL_LEN:-49152 }
7782 CHAT_HISTORY_ROUND : ${CHAT_HISTORY_ROUND:-0}
83+ METADATA_DATABASE_URL : ${METADATA_DATABASE_URL:-""}
7884 volumes :
7985 - ${MODEL_PATH:-${PWD}}:/home/user/models
8086 - ${DOC_PATH:-${PWD}}:/home/user/docs
@@ -125,6 +131,96 @@ services:
125131 depends_on :
126132 - edgecraftrag-server
127133 - ecrag
# vLLM OpenAI-compatible API server, started only with the `b60` profile.
# The entrypoint changes into the mounted model directory before launching,
# so LLM_MODEL is resolved relative to /workspace/vllm/models.
llm-serving-xpu-b60:
  container_name: ipex-serving-xpu-container
  image: intel/llm-scaler-vllm:1.1-preview
  privileged: true
  restart: always
  ports:
    # Quoted so the mapping is always read as a string.
    - "${VLLM_SERVICE_PORT_B60:-8086}:${VLLM_SERVICE_PORT_B60:-8086}"
  volumes:
    # Falls back to the working directory when MODEL_PATH is unset, so the
    # host side of the bind mount is never empty.
    - "${MODEL_PATH:-${PWD}}:/workspace/vllm/models"
  devices:
    - /dev/dri:/dev/dri
  environment:
    DTYPE: "${DTYPE:-float16}"
    VLLM_SERVICE_PORT_B60: "${VLLM_SERVICE_PORT_B60:-8086}"
    ZE_AFFINITY_MASK: "${ZE_AFFINITY_MASK:-0}"
    # NOTE(review): ENFORCE_EAGER, TRUST_REMOTE_CODE, DISABLE_SLIDING_WINDOW,
    # NO_ENABLE_PREFIX_CACHING and DISABLE_LOG_REQUESTS are exported into the
    # container but are not referenced by the entrypoint below, which passes
    # the corresponding flags unconditionally.
    ENFORCE_EAGER: "${ENFORCE_EAGER:-1}"
    TRUST_REMOTE_CODE: "${TRUST_REMOTE_CODE:-1}"
    DISABLE_SLIDING_WINDOW: "${DISABLE_SLIDING_WINDOW:-1}"
    GPU_MEMORY_UTIL: "${GPU_MEMORY_UTIL:-0.8}"
    NO_ENABLE_PREFIX_CACHING: "${NO_ENABLE_PREFIX_CACHING:-1}"
    MAX_NUM_BATCHED_TOKENS: "${MAX_NUM_BATCHED_TOKENS:-8192}"
    DISABLE_LOG_REQUESTS: "${DISABLE_LOG_REQUESTS:-1}"
    MAX_MODEL_LEN: "${MAX_MODEL_LEN:-49152}"
    BLOCK_SIZE: "${BLOCK_SIZE:-64}"
    QUANTIZATION: "${QUANTIZATION:-fp8}"
    LLM_MODEL: "${LLM_MODEL}"
    TP: "${TP:-1}"
    DP: "${DP:-1}"
  # Exec-form entrypoint with a literal block scalar: bash receives the script
  # with its newlines intact, so each trailing backslash is an ordinary line
  # continuation. `$$` escapes Compose interpolation so the variables are
  # expanded by bash inside the container from the environment above.
  entrypoint:
    - /bin/bash
    - "-c"
    - |
      cd /workspace/vllm/models && source /opt/intel/oneapi/setvars.sh --force &&
      VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1 \
      TORCH_LLM_ALLREDUCE=1 \
      VLLM_USE_V1=1 \
      CCL_ZE_IPC_EXCHANGE=pidfd \
      VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
      VLLM_WORKER_MULTIPROC_METHOD=spawn \
      python3 -m vllm.entrypoints.openai.api_server \
        --model $${LLM_MODEL} \
        --dtype $${DTYPE} \
        --enforce-eager \
        --port $${VLLM_SERVICE_PORT_B60} \
        --trust-remote-code \
        --disable-sliding-window \
        --gpu-memory-util $${GPU_MEMORY_UTIL} \
        --no-enable-prefix-caching \
        --max-num-batched-tokens $${MAX_NUM_BATCHED_TOKENS} \
        --disable-log-requests \
        --max-model-len $${MAX_MODEL_LEN} \
        --block-size $${BLOCK_SIZE} \
        --quantization $${QUANTIZATION} \
        -tp=$${TP} \
        -dp=$${DP}
  profiles:
    - b60
# vLLM serving container started only with the `a770` profile. The model
# directory is mounted at /llm/models and the service is launched through the
# image's start-vllm-service.sh script, run from /llm.
llm-serving-xpu-770:
  container_name: ipex-llm-serving-xpu-770
  image: intelanalytics/ipex-llm-serving-xpu:0.8.3-b20
  privileged: true
  restart: always
  ports:
    # Quoted so the mapping is always read as a string.
    - "${VLLM_SERVICE_PORT_A770:-8086}:${VLLM_SERVICE_PORT_A770:-8086}"
  group_add:
    - video
    - "${VIDEOGROUPID:-44}"
    - "${RENDERGROUPID:-109}"
  volumes:
    # LLM_MODEL_PATH wins when set; otherwise the model is expected at
    # ${MODEL_PATH}/${LLM_MODEL} on the host.
    - "${LLM_MODEL_PATH:-${MODEL_PATH}/${LLM_MODEL}}:/llm/models"
  devices:
    - /dev/dri
  environment:
    no_proxy: "${no_proxy}"
    http_proxy: "${http_proxy}"
    https_proxy: "${https_proxy}"
    # In-container path; matches the bind-mount target above.
    MODEL_PATH: "/llm/models"
    SERVED_MODEL_NAME: "${LLM_MODEL}"
    TENSOR_PARALLEL_SIZE: "${TENSOR_PARALLEL_SIZE:-1}"
    MAX_NUM_SEQS: "${MAX_NUM_SEQS:-64}"
    MAX_NUM_BATCHED_TOKENS: "${MAX_NUM_BATCHED_TOKENS:-10240}"
    MAX_MODEL_LEN: "${MAX_MODEL_LEN:-10240}"
    LOAD_IN_LOW_BIT: "${LOAD_IN_LOW_BIT:-fp8}"
    # Empty fallback written as `:-` with nothing after it, so the value is
    # an empty string rather than two literal quote characters.
    CCL_DG2_USM: "${CCL_DG2_USM:-}"
    PORT: "${VLLM_SERVICE_PORT_A770:-8086}"
    ZE_AFFINITY_MASK: "${SELECTED_XPU_0:-0}"
  shm_size: "32g"
  # Exec form: no reliance on how a folded, backslash-laden string is split.
  entrypoint:
    - /bin/bash
    - "-c"
    - cd /llm && bash start-vllm-service.sh
  profiles:
    - a770
# Declares the project's `default` network with the bridge driver.
networks:
  default:
    driver: bridge
0 commit comments