Skip to content

Commit b5e844e

Browse files
authored
Sync helm charts values with GenAIInfra (#2219)
Signed-off-by: chensuyue <suyue.chen@intel.com>
1 parent 46236c7 commit b5e844e

12 files changed

Lines changed: 308 additions & 16 deletions
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

# vLLM is switched off in this profile; TGI is the enabled LLM backend.
vllm:
  enabled: false

tgi:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: ghcr.io/huggingface/text-generation-inference
    tag: "3.0.0-rocm"
  LLM_MODEL_ID: meta-llama/Llama-3.3-70B-Instruct
  MAX_INPUT_LENGTH: "2048"
  MAX_TOTAL_TOKENS: "4096"
  PYTORCH_TUNABLEOP_ENABLED: "0"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "false"
  # Two devices are exposed; keep this in step with --num-shard and the
  # amd.com/gpu limit below.
  HIP_VISIBLE_DEVICES: "0,1"
  MAX_BATCH_SIZE: "4"
  extraCmdArgs:
    - "--num-shard"
    - "2"
  resources:
    limits:
      amd.com/gpu: "2"
    requests:
      cpu: 1
      memory: 16Gi
  # Container runs as UID 0 with a writable root filesystem and the
  # SYS_PTRACE capability added.
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0
    capabilities:
      add:
        - SYS_PTRACE
  # Probe timing: first check after 60s, then every 5s with a 1s timeout;
  # up to 120 failures are tolerated.
  readinessProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
  startupProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120

# Each agent below is pointed at the "<release>-tgi" endpoint and uses the
# same model ID as the tgi section above.
supervisor:
  llm_endpoint_url: "http://{{ .Release.Name }}-tgi"
  llm_engine: tgi
  model: "meta-llama/Llama-3.3-70B-Instruct"
ragagent:
  llm_endpoint_url: "http://{{ .Release.Name }}-tgi"
  llm_engine: tgi
  model: "meta-llama/Llama-3.3-70B-Instruct"
sqlagent:
  llm_endpoint_url: "http://{{ .Release.Name }}-tgi"
  llm_engine: tgi
  model: "meta-llama/Llama-3.3-70B-Instruct"
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

# TGI is switched off in this profile; vLLM is the enabled LLM backend.
tgi:
  enabled: false

vllm:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: opea/vllm-rocm
    tag: latest
  env:
    LLM_MODEL_ID: meta-llama/Llama-3.3-70B-Instruct
    # Two devices are exposed; keep this in step with TENSOR_PARALLEL_SIZE
    # and the amd.com/gpu limit below.
    HIP_VISIBLE_DEVICES: "0,1"
    TENSOR_PARALLEL_SIZE: "2"
    HF_HUB_DISABLE_PROGRESS_BARS: "1"
    HF_HUB_ENABLE_HF_TRANSFER: "0"
    VLLM_USE_TRITON_FLASH_ATTN: "0"
    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
    PYTORCH_JIT: "0"
    HF_HOME: "/data"
  # Container command override.
  extraCmd:
    command: ["python3", "/workspace/api_server.py"]
  extraCmdArgs:
    - "--swap-space"
    - "16"
    - "--disable-log-requests"
    - "--dtype"
    - "float16"
    - "--num-scheduler-steps"
    - "1"
    - "--distributed-executor-backend"
    - "mp"
  resources:
    limits:
      amd.com/gpu: "2"
  # Only the failure threshold is overridden here; the other probe
  # settings are not set in this file.
  startupProbe:
    failureThreshold: 180
  # Container runs as UID 0 with a writable root filesystem.
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0

# Each agent below is pointed at the "<release>-vllm" endpoint and uses the
# same model ID as the vllm section above.
supervisor:
  llm_endpoint_url: "http://{{ .Release.Name }}-vllm"
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
ragagent:
  llm_endpoint_url: "http://{{ .Release.Name }}-vllm"
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
sqlagent:
  llm_endpoint_url: "http://{{ .Release.Name }}-vllm"
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"

ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ teirerank:
4949
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
5050
MAX_WARMUP_SEQUENCE_LENGTH: "512"
5151
image:
52-
repository: ghcr.io/huggingface/tei-gaudi
53-
tag: 1.5.0
52+
repository: ghcr.io/huggingface/text-embeddings-inference
53+
tag: hpu-1.7
5454
resources:
5555
limits:
5656
habana.ai/gaudi: 1

ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ teirerank:
4242
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
4343
MAX_WARMUP_SEQUENCE_LENGTH: "512"
4444
image:
45-
repository: ghcr.io/huggingface/tei-gaudi
46-
tag: 1.5.0
45+
repository: ghcr.io/huggingface/text-embeddings-inference
46+
tag: hpu-1.7
4747
resources:
4848
limits:
4949
habana.ai/gaudi: 1
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

CHATQNA_TYPE: "CHATQNA_FAQGEN"

# FAQ-generation LLM microservice, configured with the TGI backend.
llm-uservice:
  enabled: true
  image:
    repository: opea/llm-faqgen
  # Same model ID as the tgi section below.
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  FAQGEN_BACKEND: "TGI"
  service:
    port: 80

tgi:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: ghcr.io/huggingface/text-generation-inference
    tag: "3.0.0-rocm"
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  MAX_INPUT_LENGTH: "2048"
  MAX_TOTAL_TOKENS: "4096"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "false"
  PYTORCH_TUNABLEOP_ENABLED: "0"
  # Two devices are exposed; keep this in step with --num-shard and the
  # amd.com/gpu limit below.
  HIP_VISIBLE_DEVICES: "0,1"
  MAX_BATCH_SIZE: "4"
  extraCmdArgs:
    - "--num-shard"
    - "2"
  resources:
    limits:
      amd.com/gpu: "2"
    requests:
      cpu: 1
      memory: 16Gi
  # Container runs as UID 0 with a writable root filesystem and the
  # SYS_PTRACE capability added.
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0
    capabilities:
      add:
        - SYS_PTRACE
  # Probe timing: first check after 60s, then every 5s with a 1s timeout;
  # up to 120 failures are tolerated.
  readinessProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
  startupProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120

# vLLM is switched off in this profile.
vllm:
  enabled: false
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

CHATQNA_TYPE: "CHATQNA_FAQGEN"

# FAQ-generation LLM microservice, configured with the vLLM backend.
llm-uservice:
  enabled: true
  image:
    repository: opea/llm-faqgen
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  FAQGEN_BACKEND: "vLLM"
  service:
    port: 80

# TGI is switched off in this profile.
tgi:
  enabled: false

vllm:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: opea/vllm-rocm
    tag: latest
  env:
    # A single device is exposed; keep this in step with
    # TENSOR_PARALLEL_SIZE and the amd.com/gpu limit below.
    HIP_VISIBLE_DEVICES: "0"
    TENSOR_PARALLEL_SIZE: "1"
    HF_HUB_DISABLE_PROGRESS_BARS: "1"
    HF_HUB_ENABLE_HF_TRANSFER: "0"
    VLLM_USE_TRITON_FLASH_ATTN: "0"
    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
    PYTORCH_JIT: "0"
    HF_HOME: "/data"
  # Container command override.
  extraCmd:
    command: ["python3", "/workspace/api_server.py"]
  extraCmdArgs:
    - "--swap-space"
    - "16"
    - "--disable-log-requests"
    - "--dtype"
    - "float16"
    - "--num-scheduler-steps"
    - "1"
    - "--distributed-executor-backend"
    - "mp"
  resources:
    limits:
      amd.com/gpu: "1"
  # Only the failure threshold is overridden here; the other probe
  # settings are not set in this file.
  startupProbe:
    failureThreshold: 180
  # Container runs as UID 0 with a writable root filesystem.
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0

ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ teirerank:
4343
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
4444
MAX_WARMUP_SEQUENCE_LENGTH: "512"
4545
image:
46-
repository: ghcr.io/huggingface/tei-gaudi
47-
tag: 1.5.0
46+
repository: ghcr.io/huggingface/text-embeddings-inference
47+
tag: hpu-1.7
4848
resources:
4949
limits:
5050
habana.ai/gaudi: 1
@@ -60,8 +60,8 @@ teirerank:
6060
# OMPI_MCA_btl_vader_single_copy_mechanism: "none"
6161
# MAX_WARMUP_SEQUENCE_LENGTH: "512"
6262
# image:
63-
# repository: ghcr.io/huggingface/tei-gaudi
64-
# tag: 1.5.0
63+
# repository: ghcr.io/huggingface/text-embeddings-inference
64+
# tag: hpu-1.7
6565
# resources:
6666
# limits:
6767
# habana.ai/gaudi: 1

ChatQnA/kubernetes/helm/gaudi-values.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ teirerank:
3737
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
3838
MAX_WARMUP_SEQUENCE_LENGTH: "512"
3939
image:
40-
repository: ghcr.io/huggingface/tei-gaudi
41-
tag: 1.5.0
40+
repository: ghcr.io/huggingface/text-embeddings-inference
41+
tag: hpu-1.7
4242
resources:
4343
limits:
4444
habana.ai/gaudi: 1

ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ guardrails-usvc:
1919
# tei:
2020
# accelDevice: "gaudi"
2121
# image:
22-
# repository: ghcr.io/huggingface/tei-gaudi
23-
# tag: 1.5.0
22+
# repository: ghcr.io/huggingface/text-embeddings-inference
23+
# tag: hpu-1.7
2424
# resources:
2525
# limits:
2626
# habana.ai/gaudi: 1
@@ -32,8 +32,8 @@ teirerank:
3232
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
3333
MAX_WARMUP_SEQUENCE_LENGTH: "512"
3434
image:
35-
repository: ghcr.io/huggingface/tei-gaudi
36-
tag: "1.5.0"
35+
repository: ghcr.io/huggingface/text-embeddings-inference
36+
tag: hpu-1.7
3737
resources:
3838
limits:
3939
habana.ai/gaudi: 1
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

tgi:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: ghcr.io/huggingface/text-generation-inference
    tag: "3.0.0-rocm"
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  MAX_INPUT_LENGTH: "2048"
  MAX_TOTAL_TOKENS: "4096"
  PYTORCH_TUNABLEOP_ENABLED: "0"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  # Two devices are exposed; keep this in step with --num-shard and the
  # amd.com/gpu limit below.
  HIP_VISIBLE_DEVICES: "0,1"
  MAX_BATCH_SIZE: "4"
  extraCmdArgs:
    - "--num-shard"
    - "2"
  resources:
    limits:
      amd.com/gpu: "2"
    requests:
      cpu: 1
      memory: 16Gi
  # Container runs as UID 0 with a writable root filesystem and the
  # SYS_PTRACE capability added.
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0
    capabilities:
      add:
        - SYS_PTRACE
  # Probe timing: first check after 60s, then every 5s with a 1s timeout;
  # up to 120 failures are tolerated.
  readinessProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
  startupProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120

# vLLM is switched off in this profile.
vllm:
  enabled: false

0 commit comments

Comments
 (0)