# Source: tutorials/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.yaml
# (triton-inference-server/tutorials at commit c52dcc15d8f6133af358569bd9fc8eef2cb9e695)
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The GPU SKU that supports the configured `model` and to which Triton Server instances can be deployed.
# Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label.
# Run `kubectl get nodes` to find node names.
# Run `kubectl describe node <node_name>` to inspect a node's labels.
# Example values: NVIDIA-A100-SXM4-40GB, NVIDIA-A10G, Tesla-V100-SXM2-16GB, Tesla-T4
gpu: # (required)

# Configuration options related to the AI model to be deployed.
model: # (required)
  # Name of the model to be served by Triton Server instances.
  # Supported values are:
  # - gpt2
  # - llama-2-7b
  # - llama-2-70b
  # - llama-2-7b-chat
  # - llama-2-70b-chat
  # - llama-3-8b
  # - llama-3-70b
  # - llama-3-8b-instruct
  # - llama-3-70b-instruct
  # - opt125m
  name: # (required)
  # Persistent volume claim where model content will be persisted.
  # Expected to support read/write many access.
  persistentVolumeClaim: # (required)
  # Name of the secret used to download the model from Hugging Face.
  # GPT2 does not require an access token to download.
  # Other models may require per-repository permissions to be granted.
  pullSecret: # (optional)
  # When `false`, a model conversion job is created and the leader pod waits for the job to complete before
  # starting Triton; otherwise no conversion job is created and the leader pod does not wait.
  # When not relying on the model conversion job, the following must exist on the persistent volume:
  # - models: "/var/run/models/{model_name}/{pipeline_parallelism}x{tensor_parallelism}/model"
  # - engine: "/var/run/models/{model_name}/{pipeline_parallelism}x{tensor_parallelism}/engine"
  skipConversion: # (default: false)
  # Configuration options related to the conversion of a non-optimized model into TensorRT format.
  tensorrtLlm: # (optional)
    # Configuration options related to conversion of non-TensorRT models to TensorRT engine and plan files.
    # Ignored when `model.skipConversion` is `true`.
    conversion: # (optional)
      # Number of logical CPU cores reserved for, and assigned to, the model conversion job.
      cpu: # (default: 4)
      # Number of GPUs reserved for, and assigned to, the model conversion job.
      gpu: # (default: 1)
      # Amount of CPU-visible system memory allocated to, and reserved for, the model conversion job.
      memory: # (default: 32Gi)
    # Data type used when compiling and optimizing the model for TensorRT.
    # Supported options are float16, bfloat16, and float32.
    dataType: # (default: float16)
    # When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.
    # When `false`, the init container will fall back to vLLM and parallelism options are ignored.
    enable: true # (default: true)
    # Parallelism configuration options which affect how the model is converted to
    # TensorRT-LLM format, specifically if/how the model is partitioned for deployment to multiple GPUs.
    parallelism: # (optional)
      # Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a
      # subset of layers that is executed on a separate device.
      # The main limitation of this method is that, due to the sequential nature of the processing, some devices or
      # layers may remain idle while waiting for the output of the preceding stage.
      pipeline: # (default: 1)
      # Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller,
      # independent blocks of computation that can be executed on different devices.
      # Attention blocks and multi-layer perceptron (MLP) layers are major components of transformers that can take
      # advantage of tensor parallelism.
      # In multi-head attention blocks, each head or group of heads can be assigned to a different device so they can
      # be computed independently and in parallel.
      tensor: # (default: 1)

# Configuration options for Triton Server.
triton: # (required)
  # Configuration options related to the container image for Triton Server.
  image: # (required)
    # Optional list of pull secrets to be used when downloading the Triton Server container image.
    # The commented-out line below the key is an example entry.
    pullSecrets: # (optional)
    # - name: ngc-container-pull
    # Name of the container image containing the version of Triton Server to be used.
    name: # (required)
  # Configuration options managing the resources assigned to individual Triton Server instances.
  resources: # (optional)
    # Number of logical CPU cores reserved for, and assigned to, each instance of Triton Server.
    cpu: # (default: 4)
    # Amount of CPU-visible system memory allocated to, and reserved for, each instance of Triton Server.
    memory: # (default: 32Gi)

# Configuration options related to how various components generate logs.
logging: # (optional)
  # Logging configuration options specific to the initialization container.
  initialization:
    # When `true`, the model download and the generation of the TensorRT engine and plan files use verbose logging;
    # otherwise standard logging is used.
    verbose: # (default: false)
  # Logging configuration options specific to Triton Server.
  tritonServer:
    # When `true`, Triton Server logs are formatted using the ISO8601 standard; otherwise Triton's default format
    # is used.
    useIso8601: # (default: false)
    # When `true`, Triton Server uses verbose logging; otherwise standard logging is used.
    verbose: # (default: false)

# Configuration options related to the Kubernetes objects created by the chart.
kubernetes: # (optional)
  # Root file-system path used when mounting content to the underlying host.
  hostRootPath: # (default: /triton)
  # Optional set of labels to be applied to created Kubernetes objects.
  # These labels can be used for association with a preexisting service object.
  # The commented-out line below the key is an example entry.
  labels: # (optional)
    # customLabel: exampleValue
  # When `false`, a service will not be created when the chart is installed; otherwise a service will be created.
  noService: # (default: false)
  # Name of the service account to use when deploying components.
  # When not provided, a service account will be created.
  serviceAccount: # (optional)
  # Tolerations applied to every pod deployed as part of this deployment.
  # Template already includes `nvidia.com/gpu=present:NoSchedule`.
  tolerations: # (optional)