Sync helm charts values with GenAIInfra (#2219)

chensuyue · web-flow · commit b5e844e4f725 · 2025-08-22T14:43:08.000+08:00
Signed-off-by: chensuyue <suyue.chen@intel.com>
diff --git a/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml b/AgentQnA/kubernetes/helm/rocm-tgi-values.yaml
@@ -0,0 +1,57 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+vllm:
+  enabled: false
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "3.0.0-rocm"
+  LLM_MODEL_ID: meta-llama/Llama-3.3-70B-Instruct
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
+  PYTORCH_TUNABLEOP_ENABLED: "0"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "false"
+  HIP_VISIBLE_DEVICES: "0,1"
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard","2" ]
+  resources:
+    limits:
+      amd.com/gpu: "2"
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+  model: "meta-llama/Llama-3.3-70B-Instruct"
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+  model: "meta-llama/Llama-3.3-70B-Instruct"
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+  model: "meta-llama/Llama-3.3-70B-Instruct"
diff --git a/AgentQnA/kubernetes/helm/rocm-values.yaml b/AgentQnA/kubernetes/helm/rocm-values.yaml
@@ -0,0 +1,52 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  env:
+    LLM_MODEL_ID: meta-llama/Llama-3.3-70B-Instruct
+    HIP_VISIBLE_DEVICES: "0,1"
+    TENSOR_PARALLEL_SIZE: "2"
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "2"
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Llama-3.3-70B-Instruct"
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Llama-3.3-70B-Instruct"
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Llama-3.3-70B-Instruct"
diff --git a/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
@@ -49,8 +49,8 @@ teirerank:
   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
   MAX_WARMUP_SEQUENCE_LENGTH: "512"
   image:
-    repository: ghcr.io/huggingface/tei-gaudi
-    tag: 1.5.0
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: hpu-1.7
   resources:
     limits:
       habana.ai/gaudi: 1
diff --git a/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
@@ -42,8 +42,8 @@ teirerank:
   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
   MAX_WARMUP_SEQUENCE_LENGTH: "512"
   image:
-    repository: ghcr.io/huggingface/tei-gaudi
-    tag: 1.5.0
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: hpu-1.7
   resources:
     limits:
       habana.ai/gaudi: 1
diff --git a/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml
@@ -0,0 +1,52 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+CHATQNA_TYPE: "CHATQNA_FAQGEN"
+llm-uservice:
+  enabled: true
+  image:
+    repository: opea/llm-faqgen
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  FAQGEN_BACKEND: "TGI"
+  service:
+    port: 80
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "3.0.0-rocm"
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "false"
+  PYTORCH_TUNABLEOP_ENABLED: "0"
+  HIP_VISIBLE_DEVICES: "0,1"
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard","2" ]
+  resources:
+    limits:
+      amd.com/gpu: "2"
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+vllm:
+  enabled: false
diff --git a/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml b/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml
@@ -0,0 +1,45 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+CHATQNA_TYPE: "CHATQNA_FAQGEN"
+llm-uservice:
+  enabled: true
+  image:
+    repository: opea/llm-faqgen
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  FAQGEN_BACKEND: "vLLM"
+  service:
+    port: 80
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  env:
+    HIP_VISIBLE_DEVICES: "0"
+    TENSOR_PARALLEL_SIZE: "1"
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "1"
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
diff --git a/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -43,8 +43,8 @@ teirerank:
   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
   MAX_WARMUP_SEQUENCE_LENGTH: "512"
   image:
-    repository: ghcr.io/huggingface/tei-gaudi
-    tag: 1.5.0
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: hpu-1.7
   resources:
     limits:
       habana.ai/gaudi: 1
@@ -60,8 +60,8 @@ teirerank:
 #   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
 #   MAX_WARMUP_SEQUENCE_LENGTH: "512"
 #   image:
-#     repository: ghcr.io/huggingface/tei-gaudi
-#     tag: 1.5.0
+#     repository: ghcr.io/huggingface/text-embeddings-inference
+#     tag: hpu-1.7
 #   resources:
 #     limits:
 #       habana.ai/gaudi: 1
diff --git a/ChatQnA/kubernetes/helm/gaudi-values.yaml b/ChatQnA/kubernetes/helm/gaudi-values.yaml
@@ -37,8 +37,8 @@ teirerank:
   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
   MAX_WARMUP_SEQUENCE_LENGTH: "512"
   image:
-    repository: ghcr.io/huggingface/tei-gaudi
-    tag: 1.5.0
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: hpu-1.7
   resources:
     limits:
       habana.ai/gaudi: 1
diff --git a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
@@ -19,8 +19,8 @@ guardrails-usvc:
 # tei:
 #   accelDevice: "gaudi"
 #   image:
-#     repository: ghcr.io/huggingface/tei-gaudi
-#     tag: 1.5.0
+#     repository: ghcr.io/huggingface/text-embeddings-inference
+#     tag: hpu-1.7
 #   resources:
 #     limits:
 #       habana.ai/gaudi: 1
@@ -32,8 +32,8 @@ teirerank:
   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
   MAX_WARMUP_SEQUENCE_LENGTH: "512"
   image:
-    repository: ghcr.io/huggingface/tei-gaudi
-    tag: "1.5.0"
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: hpu-1.7
   resources:
     limits:
       habana.ai/gaudi: 1
diff --git a/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml b/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml
@@ -0,0 +1,47 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: ghcr.io/huggingface/text-generation-inference
+    tag: "3.0.0-rocm"
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
+  PYTORCH_TUNABLEOP_ENABLED: "0"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  HIP_VISIBLE_DEVICES: "0,1"
+  MAX_BATCH_SIZE: "4"
+  extraCmdArgs: [ "--num-shard","2" ]
+  resources:
+    limits:
+      amd.com/gpu: "2"
+    requests:
+      cpu: 1
+      memory: 16Gi
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add:
+        - SYS_PTRACE
+  readinessProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  startupProbe:
+    initialDelaySeconds: 60
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+
+vllm:
+  enabled: false
diff --git a/ChatQnA/kubernetes/helm/rocm-values.yaml b/ChatQnA/kubernetes/helm/rocm-values.yaml
@@ -0,0 +1,39 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  accelDevice: "rocm"
+  image:
+    repository: opea/vllm-rocm
+    tag: latest
+  env:
+    HIP_VISIBLE_DEVICES: "0"
+    TENSOR_PARALLEL_SIZE: "1"
+    HF_HUB_DISABLE_PROGRESS_BARS: "1"
+    HF_HUB_ENABLE_HF_TRANSFER: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
+    PYTORCH_JIT: "0"
+    HF_HOME: "/data"
+  extraCmd:
+    command: [ "python3", "/workspace/api_server.py" ]
+  extraCmdArgs: [ "--swap-space", "16",
+                  "--disable-log-requests",
+                  "--dtype", "float16",
+                  "--num-scheduler-steps", "1",
+                  "--distributed-executor-backend", "mp" ]
+  resources:
+    limits:
+      amd.com/gpu: "1"
+  startupProbe:
+    failureThreshold: 180
+  securityContext:
+    readOnlyRootFilesystem: false
+    runAsNonRoot: false
+    runAsUser: 0
diff --git a/SearchQnA/kubernetes/helm/gaudi-values.yaml b/SearchQnA/kubernetes/helm/gaudi-values.yaml
@@ -31,8 +31,8 @@ tgi:
 tei:
   accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tei-gaudi
-    tag: "1.5.0"
+    repository: ghcr.io/huggingface/text-embeddings-inference
+    tag: "hpu-1.7"
   OMPI_MCA_btl_vader_single_copy_mechanism: none
   MAX_WARMUP_SEQUENCE_LENGTH: 512
   securityContext: