Cogniware-Inc
diff --git a/‎.github/code_spell_ignore.txt‎
Lines changed: 2 additions & 1 deletion b/‎.github/code_spell_ignore.txt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎EdgeCraftRAG/Dockerfile.server‎
100755100644
Lines changed: 2 additions & 0 deletions b/‎EdgeCraftRAG/Dockerfile.server‎
100755100644
Lines changed: 2 additions & 0 deletions
diff --git a/‎EdgeCraftRAG/README.md‎
100755100644
Lines changed: 4 additions & 3 deletions b/‎EdgeCraftRAG/README.md‎
100755100644
Lines changed: 4 additions & 3 deletions
diff --git a/‎EdgeCraftRAG/chatqna.py‎
100755100644
Lines changed: 1 addition & 1 deletion b/‎EdgeCraftRAG/chatqna.py‎
100755100644
Lines changed: 1 addition & 1 deletion
diff --git a/‎EdgeCraftRAG/docker_compose/intel/gpu/arc/README.md‎
Lines changed: 8 additions & 14 deletions b/‎EdgeCraftRAG/docker_compose/intel/gpu/arc/README.md‎
Lines changed: 8 additions & 14 deletions
diff --git a/‎EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml‎
100755100644
Lines changed: 99 additions & 3 deletions b/‎EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml‎
100755100644
Lines changed: 99 additions & 3 deletions
@@ -1,4 +1,5 @@
 ModelIn
 modelin
 pressEnter
-PromptIn
+PromptIn
+OT
@@ -35,4 +35,6 @@ WORKDIR /home/user/
 RUN git clone https://github.com/openvinotoolkit/openvino.genai.git genai
 ENV PYTHONPATH="$PYTHONPATH:/home/user/genai/tools/llm_bench"
 
+RUN python3 -m nltk.downloader -d /home/user/nltk_data punkt_tab averaged_perceptron_tagger_eng
+
 ENTRYPOINT ["python3", "-m", "edgecraftrag.server"]
@@ -7,9 +7,10 @@ quality and performance.
 
 ## What's New
 
-1. Support Intel Arc B60 for model inference
-2. support KBadmin for knowledge base management
-3. support Experience Injection module in UI
+1. Support the Agent component and enable the deep_search agent
+2. Optimize pipeline execution performance with an asynchronous API
+3. Support session list display in the UI
+4. Support a vLLM-based embedding service
 
 ## Table of contents
 
 
@@ -44,7 +44,7 @@ async def handle_request(self, request: Request):
         input = await request.json()
         stream_opt = input.get("stream", False)
         input["user"] = request.headers.get("sessionid", None)
-        chat_request = ChatCompletionRequest.parse_obj(input)
+        chat_request = ChatCompletionRequest.construct(**input)
         parameters = LLMParams(
             max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
             top_k=chat_request.top_k if chat_request.top_k else 10,
 
@@ -14,7 +14,7 @@ This section describes how to quickly deploy and test the EdgeCraftRAG service m
 2. [Access the Code](#2-access-the-code)
 3. [Prepare models](#3-prepare-models)
 4. [Prepare env variables and configurations](#4-prepare-env-variables-and-configurations)
-5. [Deploy the Service on Arc A770 Using Docker Compose](#5-deploy-the-service-on-intel-gpu-using-docker-compose)
+5. [Deploy the Service on Arc GPU Using Docker Compose](#5-deploy-the-service-on-intel-gpu-using-docker-compose)
 6. [Access UI](#6-access-ui)
 7. [Cleanup the Deployment](#7-cleanup-the-deployment)
 
@@ -66,8 +66,6 @@ modelscope download --model $LLM_MODEL --local_dir "${MODEL_PATH}/${LLM_MODEL}"
 
 ### 4. Prepare env variables and configurations
 
-Below steps are for single Intel Arc GPU inference, if you want to setup multi Intel Arc GPUs inference, please refer to [Multi-ARC Setup](../../../../docs/Advanced_Setup.md#multi-arc-setup)
-
 #### Prepare env variables for vLLM deployment
 
 ```bash
@@ -86,7 +84,9 @@ export NO_PROXY=${NO_PROXY},${HOST_IP},edgecraftrag,edgecraftrag-server
 # export HF_ENDPOINT=https://hf-mirror.com # your HF mirror endpoint"
 
 # Make sure all 3 folders have 1000:1000 permission, otherwise
-chown 1000:1000 ${MODEL_PATH} ${PWD} # the default value of DOC_PATH and TMPFILE_PATH is PWD ,so here we give permission to ${PWD}
+export DOC_PATH=${PWD}/tests
+export TMPFILE_PATH=${PWD}/tests
+chown 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH}
 # In addition, also make sure the .cache folder has 1000:1000 permission, otherwise
 chown 1000:1000 -R $HOME/.cache
 ```
@@ -110,15 +110,10 @@ export MILVUS_ENABLED=0
 #### option a. Deploy the Service on Arc A770 Using Docker Compose
 
 ```bash
-export VLLM_SERVICE_PORT_0=8100 # You can set your own port for vllm service
-# Generate your nginx config file
-# nginx-conf-generator.sh requires 2 parameters: DP_NUM and output filepath
-bash nginx/nginx-conf-generator.sh 1 nginx/nginx.conf
-# set NGINX_CONFIG_PATH
-export NGINX_CONFIG_PATH="${PWD}/nginx/nginx.conf"
+export VLLM_SERVICE_PORT_A770=8086 # You can set your own port for vllm service
 
 # Launch EC-RAG service with compose
-docker compose -f docker_compose/intel/gpu/arc/compose_vllm.yaml up -d
+docker compose --profile a770 -f docker_compose/intel/gpu/arc/compose.yaml up -d
 ```
 
 #### option b. Deploy the Service on Arc B60 Using Docker Compose
@@ -140,7 +135,7 @@ docker compose -f docker_compose/intel/gpu/arc/compose_vllm.yaml up -d
 # export MAX_MODEL_LEN=49152
 # export BLOCK_SIZE=64
 # export QUANTIZATION=fp8
-docker compose -f docker_compose/intel/gpu/arc/compose_vllm_b60.yaml up -d
+docker compose --profile b60 -f docker_compose/intel/gpu/arc/compose.yaml up -d
 ```
 
 ### 6. Access UI
@@ -157,8 +152,7 @@ Below is the UI front page, for detailed operations on UI and EC-RAG settings, p
 To stop the containers associated with the deployment, execute the following command:
 
 ```
-docker compose -f docker_compose/intel/gpu/arc/compose_vllm.yaml down
-# or docker compose -f docker_compose/intel/gpu/arc/compose_vllm_b60.yaml down
+docker compose -f docker_compose/intel/gpu/arc/compose.yaml down
 ```
 
 All the EdgeCraftRAG containers will be stopped and then removed on completion of the "down" command.
 
@@ -1,9 +1,11 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
+
 services:
   etcd:
     container_name: milvus-etcd
     image: quay.io/coreos/etcd:v3.5.5
+    restart: always
     environment:
       - ETCD_AUTO_COMPACTION_MODE=revision
       - ETCD_AUTO_COMPACTION_RETENTION=1000
@@ -22,6 +24,7 @@ services:
   minio:
     container_name: milvus-minio
     image: minio/minio:RELEASE.2023-03-20T20-16-18Z
+    restart: always
     environment:
       MINIO_ACCESS_KEY: minioadmin
       MINIO_SECRET_KEY: minioadmin
@@ -41,14 +44,15 @@ services:
   milvus-standalone:
     container_name: milvus-standalone
     image: milvusdb/milvus:v2.4.6
+    restart: always
     command: ["milvus", "run", "standalone"]
     security_opt:
       - seccomp:unconfined
     environment:
       ETCD_ENDPOINTS: etcd:2379
       MINIO_ADDRESS: minio:9000
     volumes:
-      - ./milvus.yaml:/milvus/configs/milvus.yaml
+      - ./milvus-config.yaml:/milvus/configs/milvus.yaml
       - ${DOCKER_VOLUME_DIRECTORY:-${PWD}}/volumes/milvus:/var/lib/milvus
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
@@ -71,10 +75,12 @@ services:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
-      vLLM_ENDPOINT: ${vLLM_ENDPOINT:-http://${HOST_IP}:${NGINX_PORT:-8086}}
+      vLLM_ENDPOINT: ${vLLM_ENDPOINT:-http://${HOST_IP}:${VLLM_SERVICE_PORT_B60:-8086}}
+      LLM_MODEL: ${LLM_MODEL}
       ENABLE_BENCHMARK: ${ENABLE_BENCHMARK:-false}
-      MAX_MODEL_LEN: ${MAX_MODEL_LEN:-5000}
+      MAX_MODEL_LEN: ${MAX_MODEL_LEN:-49152}
       CHAT_HISTORY_ROUND: ${CHAT_HISTORY_ROUND:-0}
+      METADATA_DATABASE_URL: ${METADATA_DATABASE_URL:-""}
     volumes:
       - ${MODEL_PATH:-${PWD}}:/home/user/models
       - ${DOC_PATH:-${PWD}}:/home/user/docs
@@ -125,6 +131,96 @@ services:
     depends_on:
       - edgecraftrag-server
       - ecrag
+  llm-serving-xpu-b60:  # vLLM OpenAI-compatible API server; listed under the `b60` profile only
+    container_name: ipex-serving-xpu-container
+    image: intel/llm-scaler-vllm:1.1-preview
+    privileged: true
+    restart: always
+    ports:
+      - ${VLLM_SERVICE_PORT_B60:-8086}:${VLLM_SERVICE_PORT_B60:-8086}  # same port number on host and in container
+    volumes:
+      - ${MODEL_PATH}:/workspace/vllm/models  # the entrypoint changes into this directory before launching
+    devices:
+      - /dev/dri:/dev/dri
+    environment:  # several of these are read back by the entrypoint command as $${NAME}
+      DTYPE: ${DTYPE:-float16}  # passed to --dtype
+      VLLM_SERVICE_PORT_B60: ${VLLM_SERVICE_PORT_B60:-8086}  # passed to --port
+      ZE_AFFINITY_MASK: ${ZE_AFFINITY_MASK:-0}  # not referenced by the entrypoint; presumably read by the GPU runtime - TODO confirm
+      ENFORCE_EAGER: ${ENFORCE_EAGER:-1}  # NOTE(review): entrypoint passes --enforce-eager unconditionally and never reads this
+      TRUST_REMOTE_CODE: ${TRUST_REMOTE_CODE:-1}  # NOTE(review): --trust-remote-code is likewise hard-coded in the entrypoint
+      DISABLE_SLIDING_WINDOW: ${DISABLE_SLIDING_WINDOW:-1}  # NOTE(review): --disable-sliding-window is hard-coded in the entrypoint
+      GPU_MEMORY_UTIL: ${GPU_MEMORY_UTIL:-0.8}  # passed to --gpu-memory-util
+      NO_ENABLE_PREFIX_CACHING: ${NO_ENABLE_PREFIX_CACHING:-1}  # NOTE(review): --no-enable-prefix-caching is hard-coded in the entrypoint
+      MAX_NUM_BATCHED_TOKENS: ${MAX_NUM_BATCHED_TOKENS:-8192}  # passed to --max-num-batched-tokens
+      DISABLE_LOG_REQUESTS: ${DISABLE_LOG_REQUESTS:-1}  # NOTE(review): --disable-log-requests is hard-coded in the entrypoint
+      MAX_MODEL_LEN: ${MAX_MODEL_LEN:-49152}  # passed to --max-model-len
+      BLOCK_SIZE: ${BLOCK_SIZE:-64}  # passed to --block-size
+      QUANTIZATION: ${QUANTIZATION:-fp8}  # passed to --quantization
+      LLM_MODEL: ${LLM_MODEL}  # passed to --model; no default, so it must be set by the caller
+      TP: ${TP:-1}  # passed to -tp
+      DP: ${DP:-1}  # passed to -dp
+    entrypoint:  # one folded command string; "$$" is Compose's escape for "$", so $${NAME} is expanded by bash in the container
+      /bin/bash -c "
+      cd  /workspace/vllm/models && source /opt/intel/oneapi/setvars.sh --force &&
+      VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1 \
+      TORCH_LLM_ALLREDUCE=1 \
+      VLLM_USE_V1=1 \
+      CCL_ZE_IPC_EXCHANGE=pidfd \
+      VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+      VLLM_WORKER_MULTIPROC_METHOD=spawn \
+      python3 -m vllm.entrypoints.openai.api_server \
+      --model $${LLM_MODEL} \
+      --dtype $${DTYPE} \
+      --enforce-eager \
+      --port $${VLLM_SERVICE_PORT_B60} \
+      --trust-remote-code \
+      --disable-sliding-window \
+      --gpu-memory-util $${GPU_MEMORY_UTIL} \
+      --no-enable-prefix-caching \
+      --max-num-batched-tokens $${MAX_NUM_BATCHED_TOKENS} \
+      --disable-log-requests \
+      --max-model-len $${MAX_MODEL_LEN} \
+      --block-size $${BLOCK_SIZE} \
+      --quantization $${QUANTIZATION} \
+      -tp=$${TP} \
+      -dp=$${DP}"
+    profiles:  # opt-in service: started with `docker compose --profile b60 ...`
+      - b60
+  # vLLM serving container for Arc A770. Listed under the `a770` profile only,
+  # so it is started with `docker compose --profile a770 ...`.
+  llm-serving-xpu-770:
+    container_name: ipex-llm-serving-xpu-770
+    image: intelanalytics/ipex-llm-serving-xpu:0.8.3-b20
+    privileged: true
+    restart: always
+    ports:
+      # Quoted so the HOST:CONTAINER mapping is always read as a string.
+      # The same port number is used on the host and inside the container.
+      - "${VLLM_SERVICE_PORT_A770:-8086}:${VLLM_SERVICE_PORT_A770:-8086}"
+    group_add:
+      - video
+      - ${VIDEOGROUPID:-44}
+      - ${RENDERGROUPID:-109}
+    volumes:
+      # Falls back to ${MODEL_PATH}/${LLM_MODEL} when LLM_MODEL_PATH is unset or empty.
+      - ${LLM_MODEL_PATH:-${MODEL_PATH}/${LLM_MODEL}}:/llm/models
+    devices:
+      - /dev/dri
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      # Container-side mount point of the volume above, not the host MODEL_PATH.
+      MODEL_PATH: "/llm/models"
+      SERVED_MODEL_NAME: ${LLM_MODEL}
+      TENSOR_PARALLEL_SIZE: ${TENSOR_PARALLEL_SIZE:-1}
+      MAX_NUM_SEQS: ${MAX_NUM_SEQS:-64}
+      MAX_NUM_BATCHED_TOKENS: ${MAX_NUM_BATCHED_TOKENS:-10240}
+      MAX_MODEL_LEN: ${MAX_MODEL_LEN:-10240}
+      LOAD_IN_LOW_BIT: ${LOAD_IN_LOW_BIT:-fp8}
+      # Empty default. Compose does not strip quotes inside an interpolation
+      # default, so `:-""` would pass the literal two-character value `""`.
+      CCL_DG2_USM: ${CCL_DG2_USM:-}
+      PORT: ${VLLM_SERVICE_PORT_A770:-8086}
+      # Taken from SELECTED_XPU_0; defaults to 0 when that is unset or empty.
+      ZE_AFFINITY_MASK: ${SELECTED_XPU_0:-0}
+    shm_size: '32g'
+    # Runs the start-vllm-service.sh script from /llm inside the image.
+    entrypoint: /bin/bash -c "\
+      cd /llm && \
+      bash start-vllm-service.sh"
+    profiles:
+      - a770
 networks:
   default:
     driver: bridge