Adapt to latest changes in llm microservice family (#696)

lianhao · web-flow · commit 70ad650d78f9 · 2025-01-15T13:58:48.000+08:00
Signed-off-by: Lianhao Lu &lt;lianhao.lu@intel.com&gt;
diff --git a/helm-charts/common/llm-uservice/.helmignore b/helm-charts/common/llm-uservice/.helmignore
@@ -21,3 +21,5 @@
 .idea/
 *.tmproj
 .vscode/
+# CI values
+ci*-values.yaml
diff --git a/helm-charts/common/llm-uservice/README.md b/helm-charts/common/llm-uservice/README.md
@@ -1,55 +1,90 @@
 # llm-uservice
 
-Helm chart for deploying LLM microservice.
+Helm chart for deploying OPEA LLM microservices.
 
-llm-uservice depends on TGI, you should set TGI_LLM_ENDPOINT as tgi endpoint.
+## Installing the chart
 
-## (Option1): Installing the chart separately
+`llm-uservice` depends on one of the following inference backend services:
 
-First, you need to install the tgi chart, please refer to the [tgi](../tgi) chart for more information.
+- TGI: please refer to [tgi](../tgi) chart for more information
 
-After you've deployted the tgi chart successfully, please run `kubectl get svc` to get the tgi service endpoint, i.e. `http://tgi`.
+- vLLM: please refer to [vllm](../vllm) chart for more information
 
-To install the chart, run the following:
+First, you need to install one of the dependent charts, i.e. `tgi` or `vllm` helm chart.
 
-```console
-cd GenAIInfra/helm-charts/common/llm-uservice
-export HFTOKEN="insert-your-huggingface-token-here"
-export TGI_LLM_ENDPOINT="http://tgi"
-helm dependency update
-helm install llm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set TGI_LLM_ENDPOINT=${TGI_LLM_ENDPOINT} --wait
-```
+After you've deployed the dependent chart successfully, please run `kubectl get svc` to get the backend inference service endpoint, e.g. `http://tgi`, `http://vllm`.
 
-## (Option2): Installing the chart with dependencies automatically
+To install the `llm-uservice` chart, run the following:
 
 ```console
 cd GenAIInfra/helm-charts/common/llm-uservice
-export HFTOKEN="insert-your-huggingface-token-here"
 helm dependency update
-helm install llm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set tgi.enabled=true --wait
+export HFTOKEN="insert-your-huggingface-token-here"
+# set backend inference service endpoint URL
+# for tgi
+export LLM_ENDPOINT="http://tgi"
+# for vllm
+# export LLM_ENDPOINT="http://vllm"
+
+# set the same model used by the backend inference service
+export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+
+# install llm-textgen with TGI backend
+helm install llm-uservice . --set TEXTGEN_BACKEND="TGI" --set LLM_ENDPOINT=${LLM_ENDPOINT} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --wait
+
+# install llm-textgen with vLLM backend
+# helm install llm-uservice . --set TEXTGEN_BACKEND="vLLM" --set LLM_ENDPOINT=${LLM_ENDPOINT} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --wait
+
+# install llm-docsum with TGI backend
+# helm install llm-uservice . --set image.repository="opea/llm-docsum" --set DOCSUM_BACKEND="TGI" --set LLM_ENDPOINT=${LLM_ENDPOINT} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set MAX_INPUT_TOKENS=2048 --set MAX_TOTAL_TOKENS=4096 --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --wait
+
+# install llm-docsum with vLLM backend
+# helm install llm-uservice . --set image.repository="opea/llm-docsum" --set DOCSUM_BACKEND="vLLM" --set LLM_ENDPOINT=${LLM_ENDPOINT} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set MAX_INPUT_TOKENS=2048 --set MAX_TOTAL_TOKENS=4096 --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --wait
+
+# install llm-faqgen with TGI backend
+# helm install llm-uservice . --set image.repository="opea/llm-faqgen" --set FAQGEN_BACKEND="TGI" --set LLM_ENDPOINT=${LLM_ENDPOINT} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --wait
+
+# install llm-faqgen with vLLM backend
+# helm install llm-uservice . --set image.repository="opea/llm-faqgen" --set FAQGEN_BACKEND="vLLM" --set LLM_ENDPOINT=${LLM_ENDPOINT} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --wait
 ```
 
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
 
-Then run the command `kubectl port-forward svc/llm-uservice 9000:9000` to expose the llm-uservice service for access.
+Then run the command `kubectl port-forward svc/llm-uservice 9000:9000` to expose the service for access.
 
 Open another terminal and run the following command to verify the service if working:
 
 ```console
+# for llm-textgen service
 curl http://localhost:9000/v1/chat/completions \
-    -X POST \
-    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-    -H 'Content-Type: application/json'
+  -X POST \
+  -d '{"model": "'"${LLM_MODEL_ID}"'", "messages": "What is Deep Learning?", "max_tokens":17}' \
+  -H 'Content-Type: application/json'
+
+# for llm-docsum service
+curl http://localhost:9000/v1/docsum \
+  -X POST \
+  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
+  -H 'Content-Type: application/json'
+
+# for llm-faqgen service
+curl http://localhost:9000/v1/faqgen \
+  -X POST \
+  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 128}' \
+  -H 'Content-Type: application/json'
 ```
 
 ## Values
 
-| Key                             | Type   | Default          | Description                     |
-| ------------------------------- | ------ | ---------------- | ------------------------------- |
-| global.HUGGINGFACEHUB_API_TOKEN | string | `""`             | Your own Hugging Face API token |
-| image.repository                | string | `"opea/llm-tgi"` |                                 |
-| service.port                    | string | `"9000"`         |                                 |
-| TGI_LLM_ENDPOINT                | string | `""`             | LLM endpoint                    |
-| global.monitoring               | bool   | `false`          | Service usage metrics           |
+| Key                             | Type   | Default                       | Description                                                                      |
+| ------------------------------- | ------ | ----------------------------- | -------------------------------------------------------------------------------- |
+| global.HUGGINGFACEHUB_API_TOKEN | string | `""`                          | Your own Hugging Face API token                                                  |
+| image.repository                | string | `"opea/llm-textgen"`          | one of "opea/llm-textgen", "opea/llm-docsum", "opea/llm-faqgen"                  |
+| LLM_ENDPOINT                    | string | `""`                          | backend inference service endpoint                                               |
+| LLM_MODEL_ID                    | string | `"Intel/neural-chat-7b-v3-3"` | model used by the inference backend                                              |
+| TEXTGEN_BACKEND                 | string | `"TGI"`                       | backend inference engine, only valid for llm-textgen image, one of "TGI", "vLLM" |
+| DOCSUM_BACKEND                  | string | `"TGI"`                       | backend inference engine, only valid for llm-docsum image, one of "TGI", "vLLM"  |
+| FAQGEN_BACKEND                  | string | `"TGI"`                       | backend inference engine, only valid for llm-faqgen image, one of "TGI", "vLLM"  |
+| global.monitoring               | bool   | `false`                       | Service usage metrics                                                            |
diff --git a/helm-charts/common/llm-uservice/ci-docsum-values.yaml b/helm-charts/common/llm-uservice/ci-docsum-values.yaml
@@ -2,13 +2,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 image:
-  repository: opea/llm-docsum-tgi
+  repository: opea/llm-docsum
   tag: "latest"
 
+LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3"
 MAX_INPUT_TOKENS: 2048
 MAX_TOTAL_TOKENS: 4096
 
 tgi:
+  LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3"
   enabled: true
   MAX_INPUT_LENGTH: 2048
   MAX_TOTAL_TOKENS: 4096
diff --git a/helm-charts/common/llm-uservice/ci-faqgen-values.yaml b/helm-charts/common/llm-uservice/ci-faqgen-values.yaml
@@ -2,9 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 image:
-  repository: opea/llm-faqgen-tgi
+  repository: opea/llm-faqgen
   tag: "latest"
 
+LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+
 tgi:
   enabled: true
   LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
diff --git a/helm-charts/common/llm-uservice/ci-vllm-docsum-gaudi-values.yaml b/helm-charts/common/llm-uservice/ci-vllm-docsum-gaudi-values.yaml
@@ -0,0 +1,26 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+image:
+  repository: opea/llm-docsum
+  tag: "latest"
+
+DOCSUM_BACKEND: "vLLM"
+LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3"
+MAX_INPUT_TOKENS: 2048
+MAX_TOTAL_TOKENS: 4096
+
+
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  image:
+    repository: opea/vllm-gaudi
+    tag: "latest"
+  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
+  resources:
+    limits:
+      habana.ai/gaudi: 1
diff --git a/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml b/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml
@@ -18,8 +18,5 @@ vllm:
     limits:
       habana.ai/gaudi: 1
 
-vLLM_ENDPOINT: ""
+TEXTGEN_BACKEND: "vLLM"
 LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-image:
-  repository: opea/llm-vllm
-  tag: "latest"
diff --git a/helm-charts/common/llm-uservice/templates/configmap.yaml b/helm-charts/common/llm-uservice/templates/configmap.yaml
@@ -8,30 +8,59 @@ metadata:
   labels:
     {{- include "llm-uservice.labels" . | nindent 4 }}
 data:
-  {{- if .Values.TGI_LLM_ENDPOINT }}
-  TGI_LLM_ENDPOINT: {{ .Values.TGI_LLM_ENDPOINT | quote }}
+  {{- if hasSuffix "llm-textgen" .Values.image.repository }}
+  {{- if eq "TGI" .Values.TEXTGEN_BACKEND }}
+  LLM_COMPONENT_NAME: "OPEA_LLM"
+  {{- if not .Values.LLM_ENDPOINT }}
+  LLM_ENDPOINT: "http://{{ .Release.Name }}-tgi"
+  {{- end }}
+  {{- else if eq "vLLM" .Values.TEXTGEN_BACKEND }}
+  LLM_COMPONENT_NAME: "OPEA_LLM"
+  {{- if not .Values.LLM_ENDPOINT }}
+  LLM_ENDPOINT: "http://{{ .Release.Name }}-vllm"
+  {{- end }}
   {{- else }}
-  TGI_LLM_ENDPOINT: "http://{{ .Release.Name }}-tgi"
+  {{- cat "Invalid TEXTGEN_BACKEND:" .Values.TEXTGEN_BACKEND | fail }}
+  {{- end }}
+  {{- else if hasSuffix "llm-docsum" .Values.image.repository }}
+  MAX_INPUT_TOKENS: {{ .Values.MAX_INPUT_TOKENS | default "" | quote }}
+  MAX_TOTAL_TOKENS: {{ .Values.MAX_TOTAL_TOKENS | default "" | quote }}
+  {{- if eq "TGI" .Values.DOCSUM_BACKEND }}
+  DocSum_COMPONENT_NAME: "OPEADocSum_TGI"
+  {{- if not .Values.LLM_ENDPOINT }}
+  LLM_ENDPOINT: "http://{{ .Release.Name }}-tgi"
+  {{- end }}
+  {{- else if eq "vLLM" .Values.DOCSUM_BACKEND }}
+  DocSum_COMPONENT_NAME: "OPEADocSum_vLLM"
+  {{- if not .Values.LLM_ENDPOINT }}
+  LLM_ENDPOINT: "http://{{ .Release.Name }}-vllm"
   {{- end }}
-  {{- if .Values.vLLM_ENDPOINT }}
-  vLLM_ENDPOINT: {{ .Values.vLLM_ENDPOINT | quote }}
   {{- else }}
-  vLLM_ENDPOINT: "http://{{ .Release.Name }}-vllm"
+  {{- cat "Invalid DOCSUM_BACKEND:" .Values.DOCSUM_BACKEND | fail }}
   {{- end }}
-  {{- if .Values.LLM_MODEL_ID }}
-  # NOTE:
-  # delete LLM_MODEL once https://github.com/opea-project/GenAIComps/pull/1089 is merged
-  LLM_MODEL: {{ .Values.LLM_MODEL_ID | quote }}
-  LLM_MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }}
+  {{- else if hasSuffix "llm-faqgen" .Values.image.repository }}
+  {{- if eq "TGI" .Values.FAQGEN_BACKEND }}
+  FAQGen_COMPONENT_NAME: "OPEAFAQGen_TGI"
+  {{- if not .Values.LLM_ENDPOINT }}
+  LLM_ENDPOINT: "http://{{ .Release.Name }}-tgi"
   {{- end }}
-  {{- if .Values.MAX_INPUT_TOKENS }}
-  MAX_INPUT_TOKENS: {{ .Values.MAX_INPUT_TOKENS | quote }}
+  {{- else if eq "vLLM" .Values.FAQGEN_BACKEND }}
+  FAQGen_COMPONENT_NAME: "OPEAFAQGen_vLLM"
+  {{- if not .Values.LLM_ENDPOINT }}
+  LLM_ENDPOINT: "http://{{ .Release.Name }}-vllm"
   {{- end }}
-  {{- if .Values.MAX_TOTAL_TOKENS }}
-  MAX_TOTAL_TOKENS: {{ .Values.MAX_TOTAL_TOKENS | quote }}
+  {{- else }}
+  {{- cat "Invalid FAQGEN_BACKEND:" .Values.FAQGEN_BACKEND | fail }}
+  {{- end }}
+  {{- end }}
+  {{- if .Values.LLM_ENDPOINT }}
+  LLM_ENDPOINT: {{ tpl .Values.LLM_ENDPOINT . | quote }}
+  {{- end }}
+  {{- if .Values.LLM_MODEL_ID }}
+  LLM_MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }}
   {{- end }}
-  HUGGINGFACEHUB_API_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote }}
   HF_HOME: "/tmp/.cache/huggingface"
+  HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote }}
   {{- if .Values.global.HF_ENDPOINT }}
   HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote }}
   {{- end }}
diff --git a/helm-charts/common/llm-uservice/templates/deployment.yaml b/helm-charts/common/llm-uservice/templates/deployment.yaml
@@ -28,8 +28,40 @@ spec:
       serviceAccountName: {{ include "llm-uservice.serviceAccountName" . }}
       securityContext:
         {{- toYaml .Values.podSecurityContext | nindent 8 }}
+      initContainers:
+        - name: wait-for-llm
+          envFrom:
+            - configMapRef:
+                name: {{ include "llm-uservice.fullname" . }}-config
+            {{- if .Values.global.extraEnvConfig }}
+            - configMapRef:
+                name: {{ .Values.global.extraEnvConfig }}
+                optional: true
+            {{- end }}
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          image: busybox:1.36
+          command: ["sh", "-c"]
+          args:
+            - |
+              # Parse scheme, host and port out of LLM_ENDPOINT; the host ends at ':' or '/' so a URL path is never taken as part of the host name.
+              proto=$(echo "${LLM_ENDPOINT}" | sed -n 's#.*\(http[s]\?\)://\([^ :/]\+\):\?\([0-9]*\).*#\1#p');
+              host=$(echo "${LLM_ENDPOINT}" | sed -n 's#.*\(http[s]\?\)://\([^ :/]\+\):\?\([0-9]*\).*#\2#p');
+              port=$(echo "${LLM_ENDPOINT}" | sed -n 's#.*\(http[s]\?\)://\([^ :/]\+\):\?\([0-9]*\).*#\3#p');
+              if [ -z "$port" ]; then
+                  port=80;
+                  [ "$proto" = "https" ] && port=443;
+              fi;
+              retry_count={{ .Values.retryCount | default 60 }};
+              j=1;
+              # Probe the backend once per second and abort the pod start after retry_count failed attempts.
+              while ! nc -z "${host}" "${port}"; do
+                [ "$j" -ge "${retry_count}" ] && echo "ERROR: ${host}:${port} is NOT reachable in $j seconds!" && exit 1;
+                j=$((j+1)); sleep 1;
+              done;
+              echo "${host}:${port} is reachable within $j seconds.";
       containers:
-        - name: {{ .Release.Name }}
+        - name: {{ .Chart.Name }}
           envFrom:
             - configMapRef:
                 name: {{ include "llm-uservice.fullname" . }}-config
diff --git a/helm-charts/common/llm-uservice/templates/tests/test-pod.yaml b/helm-charts/common/llm-uservice/templates/tests/test-pod.yaml
@@ -17,26 +17,22 @@ spec:
       command: ['bash', '-c']
       args:
         - |
+          {{- if contains "llm-docsum" .Values.image.repository }}
+          url="http://{{ include "llm-uservice.fullname" . }}:{{ .Values.service.port }}/v1/docsum";
+          body='{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":17}';
+          {{- else if contains "llm-faqgen" .Values.image.repository }}
+          url="http://{{ include "llm-uservice.fullname" . }}:{{ .Values.service.port }}/v1/faqgen";
+          body='{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":17}';
+          {{- else }}
+          url="http://{{ include "llm-uservice.fullname" . }}:{{ .Values.service.port }}/v1/chat/completions";
+          body='{"model": "{{ .Values.LLM_MODEL_ID }}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}';
+          {{- end }}
           max_retry=20;
           for ((i=1; i<=max_retry; i++)); do
-          {{- if contains "llm-docsum-tgi" .Values.image.repository }}
-          # Try with docsum endpoint
-            curl http://{{ include "llm-uservice.fullname" . }}:{{ .Values.service.port }}/v1/chat/docsum -sS --fail-with-body \
-              -X POST \
-              -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":17}' \
-              -H 'Content-Type: application/json' && break;
-          {{- else if contains "llm-faqgen-tgi" .Values.image.repository }}
-          # Try with faqgen endpoint
-            curl http://{{ include "llm-uservice.fullname" . }}:{{ .Values.service.port }}/v1/faqgen -sS --fail-with-body \
+            curl "$url" -sS --fail-with-body \
               -X POST \
-              -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":17}' \
+              -d "$body" \
               -H 'Content-Type: application/json' && break;
-          {{- else }}
-            curl http://{{ include "llm-uservice.fullname" . }}:{{ .Values.service.port }}/v1/chat/completions -sS --fail-with-body \
-              -X POST \
-              -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
-              -H 'Content-Type: application/json' && break;
-          {{- end }}
             curlcode=$?
             if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi;
           done;
diff --git a/helm-charts/common/llm-uservice/values.yaml b/helm-charts/common/llm-uservice/values.yaml
diff --git a/helm-charts/common/llm-uservice/variant_docsum-values.yaml b/helm-charts/common/llm-uservice/variant_docsum-values.yaml
diff --git a/helm-charts/common/llm-uservice/variant_faqgen-values.yaml b/helm-charts/common/llm-uservice/variant_faqgen-values.yaml