@@ -10,8 +10,9 @@ CHATQNA_TYPE: "CHATQNA_GUARDRAILS"
 # guardrails related config
 guardrails-usvc:
   enabled: true
-  # SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi-guardrails"
+  SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-vllm-guardrails"
   SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
+  retryTimeoutSeconds: 720
 
 # gaudi related config
 # tei running on CPU by default
@@ -41,33 +42,24 @@ teirerank:
   readinessProbe:
     timeoutSeconds: 1
 
-tgi-guardrails:
+vllm-guardrails:
   enabled: true
   accelDevice: "gaudi"
-  LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+  LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq-len-to-capture", "2048"
+    ]
   startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+    failureThreshold: 360
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
 
 tgi:
   enabled: false
0 commit comments