|
| 1 | +# Copyright (C) 2024 Intel Corporation |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +# Accelerate inference in the heaviest components to improve performance |
| 5 | +# by overriding their subchart values. |
| 6 | + |
# Turns the TGI subchart off (enabled: false); this file enables the vllm
# section instead, so only one of the two serving backends is switched on here.
| 7 | +tgi: |
| 8 | + enabled: false |
# Overrides for the vllm subchart: enables it, selects the "rocm" accelerator
# device and the opea/vllm-rocm image, and sets the environment, command line,
# GPU limit, startup probe and security context listed below.
# NOTE(review): indentation is collapsed in this view, so key nesting (for
# example whether extraCmdArgs sits beside or inside extraCmd) cannot be read
# from here - confirm against the vllm chart's values schema.
| 9 | +vllm: |
| 10 | + enabled: true |
| 11 | + accelDevice: "rocm" |
| 12 | + image: |
| 13 | + repository: opea/vllm-rocm |
# NOTE(review): "latest" is an unpinned tag; consider pinning a release tag
# if reproducible deployments are required.
| 14 | + tag: latest |
| 15 | + env: |
# Model to serve; the same model string is repeated in the supervisor,
# ragagent and sqlagent sections of this file.
| 16 | + LLM_MODEL_ID: meta-llama/Llama-3.3-70B-Instruct |
# Two device indices, a tensor-parallel size of "2" and an amd.com/gpu limit
# of "2" (further down) all encode the same GPU count - presumably they must be
# changed together; verify when scaling.
| 17 | + HIP_VISIBLE_DEVICES: "0,1" |
| 18 | + TENSOR_PARALLEL_SIZE: "2" |
| 19 | + HF_HUB_DISABLE_PROGRESS_BARS: "1" |
| 20 | + HF_HUB_ENABLE_HF_TRANSFER: "0" |
| 21 | + VLLM_USE_TRITON_FLASH_ATTN: "0" |
| 22 | + VLLM_WORKER_MULTIPROC_METHOD: "spawn" |
| 23 | + PYTORCH_JIT: "0" |
| 24 | + HF_HOME: "/data" |
# Container command: runs /workspace/api_server.py with python3, followed by
# the extra arguments listed in extraCmdArgs.
| 25 | + extraCmd: |
| 26 | + command: [ "python3", "/workspace/api_server.py" ] |
| 27 | + extraCmdArgs: [ "--swap-space", "16", |
| 28 | + "--disable-log-requests", |
| 29 | + "--dtype", "float16", |
| 30 | + "--num-scheduler-steps", "1", |
| 31 | + "--distributed-executor-backend", "mp" ] |
| 32 | + resources: |
| 33 | + limits: |
| 34 | + amd.com/gpu: "2" |
# NOTE(review): a failure threshold of 180 presumably allows for a long model
# download/load before the pod is restarted; the effective timeout also depends
# on the probe period, which is not set here - confirm the chart default.
| 35 | + startupProbe: |
| 36 | + failureThreshold: 180 |
# Runs the container as UID 0 with a writable root filesystem.
# NOTE(review): presumably required by the ROCm image - confirm, since this
# relaxes the usual non-root / read-only hardening.
| 37 | + securityContext: |
| 38 | + readOnlyRootFilesystem: false |
| 39 | + runAsNonRoot: false |
| 40 | + runAsUser: 0 |
# Points the supervisor component at the "<release name>-vllm" endpoint, with
# vllm as the engine and the same model string as vllm's LLM_MODEL_ID.
# NOTE(review): the URL embeds a Helm template expression inside a values file;
# this presumably relies on the chart rendering the value with `tpl` - confirm.
| 41 | +supervisor: |
| 42 | + llm_endpoint_url: http://{{ .Release.Name }}-vllm |
| 43 | + llm_engine: vllm |
| 44 | + model: "meta-llama/Llama-3.3-70B-Instruct" |
# Points the ragagent component at the "<release name>-vllm" endpoint, with
# vllm as the engine and the same model string as vllm's LLM_MODEL_ID.
# NOTE(review): the URL embeds a Helm template expression inside a values file;
# this presumably relies on the chart rendering the value with `tpl` - confirm.
| 45 | +ragagent: |
| 46 | + llm_endpoint_url: http://{{ .Release.Name }}-vllm |
| 47 | + llm_engine: vllm |
| 48 | + model: "meta-llama/Llama-3.3-70B-Instruct" |
# Points the sqlagent component at the "<release name>-vllm" endpoint, with
# vllm as the engine and the same model string as vllm's LLM_MODEL_ID.
# NOTE(review): the URL embeds a Helm template expression inside a values file;
# this presumably relies on the chart rendering the value with `tpl` - confirm.
| 49 | +sqlagent: |
| 50 | + llm_endpoint_url: http://{{ .Release.Name }}-vllm |
| 51 | + llm_engine: vllm |
| 52 | + model: "meta-llama/Llama-3.3-70B-Instruct" |
0 commit comments