From e1d2a25f3b64322e55030319b62a17a7418f95ea Mon Sep 17 00:00:00 2001 From: greg pereira Date: Mon, 5 May 2025 21:05:10 -0700 Subject: [PATCH 1/8] using new NIXL only connector Signed-off-by: greg pereira --- charts/llm-d/README.md | 2 +- .../presets/basic-gpu-with-nixl-preset.yaml | 128 +++++++----------- charts/llm-d/values.yaml | 4 +- 3 files changed, 49 insertions(+), 85 deletions(-) diff --git a/charts/llm-d/README.md b/charts/llm-d/README.md index 30ab979..bde8013 100644 --- a/charts/llm-d/README.md +++ b/charts/llm-d/README.md @@ -165,7 +165,7 @@ Kubernetes: `>= 1.25.0-0` | ingress.tls.enabled | Enable TLS configuration for the host defined at `ingress.host` parameter | bool | `false` | | ingress.tls.secretName | The name to which the TLS Secret will be called | string | `""` | | kubeVersion | Override Kubernetes version | string | `""` | -| modelservice | Model service controller configuration | object | See below | +| modelservice | Model service controller configuration | object | 
`{"annotations":{},"enabled":true,"epp":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-gateway-api-inference-extension-dev","tag":"0.0.5-amd64"},"metrics":{"enabled":true}},"fullnameOverride":"","image":{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-model-service","tag":"0.0.6"},"metrics":{"enabled":true},"nameOverride":"","podAnnotations":{},"podLabels":{},"rbac":{"create":true},"replicas":1,"routingProxy":{"image":{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-routing-sidecar-dev","tag":"0.0.6"}},"service":{"enabled":true,"port":8443,"type":"ClusterIP"},"serviceAccount":{"annotations":{},"create":true,"fullnameOverride":"","labels":{},"nameOverride":""},"vllm":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"vllm-nixl-0.0.6"},"metrics":{"enabled":true}},"vllmSim":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/vllm-sim-dev","tag":"0.0.4"}}}` | | modelservice.annotations | Annotations to add to all modelservice resources | object | `{}` | | modelservice.decode | Decode options | object | See below | | modelservice.decode.tolerations | Tolerations configuration to deploy decode pods to tainted nodes | list | See below | diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index 2d0198b..6b4e1a0 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -16,42 +16,6 @@ metadata: {{- include "common.tplvalues.render" ( dict "value" .Values.modelservice.annotations "context" $) | nindent 4 }} {{- end }} data: - configMaps: | - - apiVersion: v1 - kind: ConfigMap - metadata: - name: {{ include "modelservice.fullname" . 
-}}-config-decoder - data: - lmcache-decoder-config.yaml: | - local_cpu: False - max_local_cpu_size: 0 - max_local_disk_size: 0 - remote_serde: NULL - enable_nixl: True - nixl_role: receiver - nixl_peer_host: 0.0.0.0 - nixl_peer_port: 55555 - nixl_buffer_size: 524288 - nixl_buffer_device: "cuda" - nixl_enable_gc: True - - apiVersion: v1 - kind: ConfigMap - metadata: - name: {{ include "modelservice.fullname" . -}}-config-prefiller - data: - lmcache-prefiller-config.yaml: | - local_cpu: False - max_local_cpu_size: 0 - max_local_disk_size: 0 - remote_serde: NULL - enable_nixl: True - nixl_role: "sender" - nixl_peer_host: {{`"{{ .DecodeServiceName }}"`}} - nixl_peer_port: 55555 - nixl_buffer_size: 524288 - nixl_buffer_device: "cuda" - nixl_enable_gc: True - decodeDeployment: | apiVersion: apps/v1 kind: Deployment @@ -62,18 +26,20 @@ data: tolerations: {{- toYaml .Values.modelservice.decode.tolerations | nindent 12 }} {{- end }} - containers: + initContainers: - name: routing-proxy image: {{ include "modelservice.routingProxyImage" . }} securityContext: allowPrivilegeEscalation: false runAsNonRoot: true args: - - "--port=8001" - - "--vllm-port=8000" + - "--port=8000" + - "--vllm-port=8001" + - "--connector=nixl" ports: - containerPort: 8000 protocol: TCP + containers: - name: vllm image: {{ include "modelservice.vllmImage" . 
}} imagePullPolicy: {{ .Values.modelservice.vllm.image.imagePullPolicy }} @@ -85,31 +51,32 @@ data: - {{ `{{ default (print "/models/" .ModelPath) .HFModelName }}` }} args: - "--port" - - "8000" + - "8001" - "--kv-transfer-config" - - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}' + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--enforce-eager" env: - name: HOME value: /home + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG - name: POD_IP valueFrom: fieldRef: apiVersion: v1 fieldPath: status.podIP - name: LMCACHE_DISTRIBUTED_URL - value: ${POD_IP}:80 + value: {{ `"${POD_IP}:80"` }} - name: CUDA_VISIBLE_DEVICES value: "0" - name: UCX_TLS value: "cuda_ipc,cuda_copy,tcp" - - name: LMCACHE_USE_EXPERIMENTAL - value: "True" - - name: VLLM_ENABLE_V1_MULTIPROCESSING - value: "1" - - name: VLLM_WORKER_MULTIPROC_METHOD - value: spawn - - name: LMCACHE_CONFIG_FILE - value: /vllm-workspace/lmcache-decoder-config.yaml {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} @@ -121,8 +88,6 @@ data: volumeMounts: - name: home mountPath: /home - - name: config-decoder - mountPath: /vllm-workspace {{ `{{- if .HFModelName }}` }} - name: model-cache mountPath: /models @@ -132,16 +97,11 @@ data: readOnly: true {{ `{{- end }}` }} ports: - - containerPort: 8001 - protocol: TCP - - containerPort: 55555 + - containerPort: 5557 protocol: TCP volumes: - name: home emptyDir: {} - - name: config-decoder - configMap: - name: {{ include "modelservice.fullname" . 
-}}-config-decoder {{ `{{- if .HFModelName }}` }} - name: model-cache emptyDir: {} @@ -157,18 +117,20 @@ data: tolerations: {{- toYaml .Values.modelservice.prefill.tolerations | nindent 12 }} {{- end }} - containers: + initContainers: - name: "routing-proxy" image: {{ include "modelservice.routingProxyImage" . }} securityContext: allowPrivilegeEscalation: false runAsNonRoot: true args: - - "--port=8001" - - "--vllm-port=8000" + - "--port=8000" + - "--vllm-port=8001" + - "--connector=nixl" ports: - containerPort: 8000 protocol: TCP + containers: - name: vllm image: {{ include "modelservice.vllmImage" . }} imagePullPolicy: {{ .Values.modelservice.vllm.image.imagePullPolicy }} @@ -180,31 +142,32 @@ data: - {{ `{{ default (print "/models/" .ModelPath) .HFModelName }}` }} args: - "--port" - - "8000" + - "8001" - "--kv-transfer-config" - - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}' + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--enforce-eager" env: - name: HOME value: /home + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP - name: POD_IP valueFrom: fieldRef: apiVersion: v1 fieldPath: status.podIP - name: LMCACHE_DISTRIBUTED_URL - value: "${POD_IP}:80" + value: {{ `"${POD_IP}:80"` }} - name: CUDA_VISIBLE_DEVICES value: "0" - name: UCX_TLS value: "cuda_ipc,cuda_copy,tcp" - - name: LMCACHE_USE_EXPERIMENTAL - value: "True" - - name: VLLM_ENABLE_V1_MULTIPROCESSING - value: "1" - - name: VLLM_WORKER_MULTIPROC_METHOD - value: spawn - - name: LMCACHE_CONFIG_FILE - value: /vllm-workspace/lmcache-prefiller-config.yaml {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} @@ -216,8 +179,6 @@ data: volumeMounts: - name: home mountPath: /home - - name: 
config-prefiller - mountPath: /vllm-workspace {{ `{{- if .HFModelName }}` }} - name: model-cache mountPath: /models @@ -227,16 +188,11 @@ data: readOnly: true {{ `{{- end }}` }} ports: - - containerPort: 8001 - protocol: TCP - - containerPort: 55555 + - containerPort: 5557 protocol: TCP volumes: - name: home emptyDir: {} - - name: config-prefiller - configMap: - name: {{ include "modelservice.fullname" . -}}-config-prefiller {{ `{{ if .HFModelName }}` }} - name: model-cache emptyDir: {} @@ -254,7 +210,7 @@ data: clusterIP: None ports: - name: nixl - port: 55555 + port: 5557 protocol: TCP - name: vllm port: 8000 @@ -272,7 +228,7 @@ data: clusterIP: None ports: - name: nixl - port: 55555 + port: 5557 protocol: TCP - name: vllm port: 8000 @@ -332,6 +288,14 @@ data: - --grpcHealthPort - "9003" env: + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + {{- if .Values.redis.enabled }} + - name: KVCACHE_INDEXER_REDIS_ADDR + value: {{ include "redis.master.service.fullurl" . 
-}} + {{- end -}} {{/* HACK, waiting on: https://github.com/neuralmagic/llm-d-model-service/issues/123 */}} {{ `{{- if .HFModelName }}` }} - name: HF_TOKEN diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml index 589847e..be30bb2 100644 --- a/charts/llm-d/values.yaml +++ b/charts/llm-d/values.yaml @@ -300,8 +300,8 @@ modelservice: # -- vLLM image used in ModelService CR presets image: registry: quay.io - repository: llm-d/llm-d-dev - tag: "0.0.5" + repository: "llm-d/llm-d-dev" + tag: "vllm-nixl-0.0.6" imagePullPolicy: "IfNotPresent" metrics: From d13a7904767cd579919c996c06ea0bcbd0adddc1 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Thu, 8 May 2025 17:44:18 -0700 Subject: [PATCH 2/8] runs but no cache hit Signed-off-by: greg pereira --- .../modelservice/presets/basic-gpu-with-nixl-preset.yaml | 8 ++++---- charts/llm-d/values.yaml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index 6b4e1a0..1be7c3c 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -52,9 +52,9 @@ data: args: - "--port" - "8001" + - "--enforce-eager" - "--kv-transfer-config" - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' - - "--enforce-eager" env: - name: HOME value: /home @@ -143,9 +143,9 @@ data: args: - "--port" - "8001" + - "--enforce-eager" - "--kv-transfer-config" - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' - - "--enforce-eager" env: - name: HOME value: /home @@ -213,7 +213,7 @@ data: port: 5557 protocol: TCP - name: vllm - port: 8000 + port: 8001 protocol: TCP prefillService: | @@ -231,7 +231,7 @@ data: port: 5557 protocol: TCP - name: vllm - port: 8000 + port: 8001 protocol: TCP eppService: | diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml 
index be30bb2..d896f23 100644 --- a/charts/llm-d/values.yaml +++ b/charts/llm-d/values.yaml @@ -301,7 +301,7 @@ modelservice: image: registry: quay.io repository: "llm-d/llm-d-dev" - tag: "vllm-nixl-0.0.6" + tag: "vllm-nixl-0.0.6-amd64" imagePullPolicy: "IfNotPresent" metrics: From 151cf330f8fb432f199c54ba89fd225035593bf5 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 9 May 2025 06:26:52 -0700 Subject: [PATCH 3/8] no p/d services in prod example Signed-off-by: greg pereira --- .../presets/basic-gpu-with-nixl-preset.yaml | 77 ++++++++++--------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index 1be7c3c..b9884e0 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -30,8 +30,11 @@ data: - name: routing-proxy image: {{ include "modelservice.routingProxyImage" . }} securityContext: - allowPrivilegeEscalation: false + capabilities: + drop: + - MKNOD runAsNonRoot: true + allowPrivilegeEscalation: false args: - "--port=8000" - "--vllm-port=8001" @@ -39,6 +42,8 @@ data: ports: - containerPort: 8000 protocol: TCP + restartPolicy: Always + imagePullPolicy: Always containers: - name: vllm image: {{ include "modelservice.vllmImage" . }} @@ -130,6 +135,8 @@ data: ports: - containerPort: 8000 protocol: TCP + restartPolicy: Always + imagePullPolicy: Always containers: - name: vllm image: {{ include "modelservice.vllmImage" . }} @@ -198,41 +205,41 @@ data: emptyDir: {} {{ `{{ end }}` }} - decodeService: | - apiVersion: v1 - kind: Service - metadata: - labels: - {{- if .Values.modelservice.vllm.metrics.enabled }} - {{ include "metrics.label" . 
}} - {{- end }} - spec: - clusterIP: None - ports: - - name: nixl - port: 5557 - protocol: TCP - - name: vllm - port: 8001 - protocol: TCP + # decodeService: | + # apiVersion: v1 + # kind: Service + # metadata: + # labels: + # {{- if .Values.modelservice.vllm.metrics.enabled }} + # {{ include "metrics.label" . }} + # {{- end }} + # spec: + # clusterIP: None + # ports: + # - name: nixl + # port: 5557 + # protocol: TCP + # - name: vllm + # port: 8000 + # protocol: TCP - prefillService: | - apiVersion: v1 - kind: Service - metadata: - labels: - {{- if .Values.modelservice.vllm.metrics.enabled }} - {{ include "metrics.label" . }} - {{- end }} - spec: - clusterIP: None - ports: - - name: nixl - port: 5557 - protocol: TCP - - name: vllm - port: 8001 - protocol: TCP + # prefillService: | + # apiVersion: v1 + # kind: Service + # metadata: + # labels: + # {{- if .Values.modelservice.vllm.metrics.enabled }} + # {{ include "metrics.label" . }} + # {{- end }} + # spec: + # clusterIP: None + # ports: + # - name: nixl + # port: 5557 + # protocol: TCP + # - name: vllm + # port: 8000 + # protocol: TCP eppService: | apiVersion: v1 From 39c7a6d263860a01e2202ad475276615dc6598d9 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 9 May 2025 08:27:03 -0700 Subject: [PATCH 4/8] restore pd services deemed non-invasive Signed-off-by: greg pereira --- charts/llm-d/README.md | 4 +- .../presets/basic-gpu-with-nixl-preset.yaml | 72 ++++++++++--------- charts/llm-d/values.yaml | 5 +- 3 files changed, 43 insertions(+), 38 deletions(-) diff --git a/charts/llm-d/README.md b/charts/llm-d/README.md index bde8013..d70542e 100644 --- a/charts/llm-d/README.md +++ b/charts/llm-d/README.md @@ -165,7 +165,7 @@ Kubernetes: `>= 1.25.0-0` | ingress.tls.enabled | Enable TLS configuration for the host defined at `ingress.host` parameter | bool | `false` | | ingress.tls.secretName | The name to which the TLS Secret will be called | string | `""` | | kubeVersion | Override Kubernetes version | string | `""` 
| -| modelservice | Model service controller configuration | object | `{"annotations":{},"enabled":true,"epp":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-gateway-api-inference-extension-dev","tag":"0.0.5-amd64"},"metrics":{"enabled":true}},"fullnameOverride":"","image":{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-model-service","tag":"0.0.6"},"metrics":{"enabled":true},"nameOverride":"","podAnnotations":{},"podLabels":{},"rbac":{"create":true},"replicas":1,"routingProxy":{"image":{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-routing-sidecar-dev","tag":"0.0.6"}},"service":{"enabled":true,"port":8443,"type":"ClusterIP"},"serviceAccount":{"annotations":{},"create":true,"fullnameOverride":"","labels":{},"nameOverride":""},"vllm":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"vllm-nixl-0.0.6"},"metrics":{"enabled":true}},"vllmSim":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/vllm-sim-dev","tag":"0.0.4"}}}` | +| modelservice | Model service controller configuration | object | See below | | modelservice.annotations | Annotations to add to all modelservice resources | object | `{}` | | modelservice.decode | Decode options | object | See below | | modelservice.decode.tolerations | Tolerations configuration to deploy decode pods to tainted nodes | list | See below | @@ -207,7 +207,7 @@ Kubernetes: `>= 1.25.0-0` | modelservice.serviceMonitor.port | ServiceMonitor endpoint port | string | `"vllm"` | | modelservice.serviceMonitor.selector | ServiceMonitor selector matchLabels
matchLabels must match labels on modelservice Services | object | `{"matchLabels":{}}` | | modelservice.vllm | vLLM container options | object | See below | -| modelservice.vllm.image | vLLM image used in ModelService CR presets | object | `{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"0.0.5"}` | +| modelservice.vllm.image | vLLM image used in ModelService CR presets | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"vllm-nixl-0.0.6"}` | | modelservice.vllm.metrics.enabled | Enable metrics scraping from vllm service, see `modelservice.serviceMonitor` for configuration | bool | `true` | | modelservice.vllmSim | vLL sim container options | object | See below | | modelservice.vllmSim.image | vLLM sim image used in ModelService CR presets | object | `{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/vllm-sim","tag":"0.0.4"}` | diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index b9884e0..e0ef382 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -49,6 +49,10 @@ data: image: {{ include "modelservice.vllmImage" . }} imagePullPolicy: {{ .Values.modelservice.vllm.image.imagePullPolicy }} securityContext: + capabilities: + drop: + - MKNOD + runAsNonRoot: true allowPrivilegeEscalation: false command: - vllm @@ -205,41 +209,41 @@ data: emptyDir: {} {{ `{{ end }}` }} - # decodeService: | - # apiVersion: v1 - # kind: Service - # metadata: - # labels: - # {{- if .Values.modelservice.vllm.metrics.enabled }} - # {{ include "metrics.label" . 
}} - # {{- end }} - # spec: - # clusterIP: None - # ports: - # - name: nixl - # port: 5557 - # protocol: TCP - # - name: vllm - # port: 8000 - # protocol: TCP + decodeService: | + apiVersion: v1 + kind: Service + metadata: + labels: + {{- if .Values.modelservice.vllm.metrics.enabled }} + {{ include "metrics.label" . }} + {{- end }} + spec: + clusterIP: None + ports: + - name: nixl + port: 5557 + protocol: TCP + - name: vllm + port: 8000 + protocol: TCP - # prefillService: | - # apiVersion: v1 - # kind: Service - # metadata: - # labels: - # {{- if .Values.modelservice.vllm.metrics.enabled }} - # {{ include "metrics.label" . }} - # {{- end }} - # spec: - # clusterIP: None - # ports: - # - name: nixl - # port: 5557 - # protocol: TCP - # - name: vllm - # port: 8000 - # protocol: TCP + prefillService: | + apiVersion: v1 + kind: Service + metadata: + labels: + {{- if .Values.modelservice.vllm.metrics.enabled }} + {{ include "metrics.label" . }} + {{- end }} + spec: + clusterIP: None + ports: + - name: nixl + port: 5557 + protocol: TCP + - name: vllm + port: 8000 + protocol: TCP eppService: | apiVersion: v1 diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml index d896f23..24bddc9 100644 --- a/charts/llm-d/values.yaml +++ b/charts/llm-d/values.yaml @@ -301,8 +301,9 @@ modelservice: image: registry: quay.io repository: "llm-d/llm-d-dev" - tag: "vllm-nixl-0.0.6-amd64" - imagePullPolicy: "IfNotPresent" + tag: "vllm-nixl-0.0.6" + imagePullPolicy: "Always" + # imagePullPolicy: "IfNotPresent" metrics: From 8126af13a287b788f2a044244bfc2d4b7563b201 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 9 May 2025 09:07:44 -0700 Subject: [PATCH 5/8] keeping confimaps around but not using them in lmcache for dual connectors later Signed-off-by: greg pereira --- .../presets/basic-gpu-with-nixl-preset.yaml | 58 ++++--- notes/testing-nixl-and-epp.md | 142 ++++++++++++++++++ 2 files changed, 182 insertions(+), 18 deletions(-) create mode 100644 
notes/testing-nixl-and-epp.md diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index e0ef382..f8d361d 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -16,6 +16,28 @@ metadata: {{- include "common.tplvalues.render" ( dict "value" .Values.modelservice.annotations "context" $) | nindent 4 }} {{- end }} data: + configMaps: | + - apiVersion: v1 + kind: ConfigMap + metadata: + name: {{ include "modelservice.fullname" . -}}-config-decoder + data: + lmcache-decoder-config.yaml: | + # local_cpu: False + # max_local_cpu_size: 0 + # max_local_disk_size: 0 + # remote_serde: NULL + - apiVersion: v1 + kind: ConfigMap + metadata: + name: {{ include "modelservice.fullname" . -}}-config-prefiller + data: + lmcache-prefiller-config.yaml: | + # local_cpu: False + # max_local_cpu_size: 0 + # max_local_disk_size: 0 + # remote_serde: NULL + decodeDeployment: | apiVersion: apps/v1 kind: Deployment @@ -81,11 +103,13 @@ data: apiVersion: v1 fieldPath: status.podIP - name: LMCACHE_DISTRIBUTED_URL - value: {{ `"${POD_IP}:80"` }} + value: ${POD_IP}:8200 - name: CUDA_VISIBLE_DEVICES value: "0" - name: UCX_TLS value: "cuda_ipc,cuda_copy,tcp" + # - name: LMCACHE_CONFIG_FILE + # value: /vllm-workspace/lmcache-decoder-config.yaml {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} @@ -97,6 +121,8 @@ data: volumeMounts: - name: home mountPath: /home + # - name: config-decoder + # mountPath: /vllm-workspace {{ `{{- if .HFModelName }}` }} - name: model-cache mountPath: /models @@ -111,6 +137,9 @@ data: volumes: - name: home emptyDir: {} + # - name: config-decoder + # configMap: + # name: {{ include "modelservice.fullname" . 
-}}-config-decoder {{ `{{- if .HFModelName }}` }} - name: model-cache emptyDir: {} @@ -126,21 +155,6 @@ data: tolerations: {{- toYaml .Values.modelservice.prefill.tolerations | nindent 12 }} {{- end }} - initContainers: - - name: "routing-proxy" - image: {{ include "modelservice.routingProxyImage" . }} - securityContext: - allowPrivilegeEscalation: false - runAsNonRoot: true - args: - - "--port=8000" - - "--vllm-port=8001" - - "--connector=nixl" - ports: - - containerPort: 8000 - protocol: TCP - restartPolicy: Always - imagePullPolicy: Always containers: - name: vllm image: {{ include "modelservice.vllmImage" . }} @@ -153,7 +167,7 @@ data: - {{ `{{ default (print "/models/" .ModelPath) .HFModelName }}` }} args: - "--port" - - "8001" + - "8000" - "--enforce-eager" - "--kv-transfer-config" - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' @@ -174,11 +188,14 @@ data: apiVersion: v1 fieldPath: status.podIP - name: LMCACHE_DISTRIBUTED_URL - value: {{ `"${POD_IP}:80"` }} + value: ${POD_IP}:8200 - name: CUDA_VISIBLE_DEVICES value: "0" - name: UCX_TLS value: "cuda_ipc,cuda_copy,tcp" + ### Keep ability to enable LMCache configs but don't use them right now + # - name: LMCACHE_CONFIG_FILE + # value: /vllm-workspace/lmcache-prefiller-config.yaml {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} @@ -190,6 +207,9 @@ data: volumeMounts: - name: home mountPath: /home + # - name: config-prefiller + # configMap: + # name: {{ include "modelservice.fullname" . 
-}}-config-prefiller {{ `{{- if .HFModelName }}` }} - name: model-cache mountPath: /models @@ -204,6 +224,8 @@ data: volumes: - name: home emptyDir: {} + # - name: config-prefiller + # mountPath: /vllm-workspace {{ `{{ if .HFModelName }}` }} - name: model-cache emptyDir: {} diff --git a/notes/testing-nixl-and-epp.md b/notes/testing-nixl-and-epp.md new file mode 100644 index 0000000..c9f0ec9 --- /dev/null +++ b/notes/testing-nixl-and-epp.md @@ -0,0 +1,142 @@ +# notes + +Helper scripts + +```bash +export LLM_PROMPT_1="I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind." + +export LLM_PROMPT_2="Now that we have implemented benchmarks, I was hoping you could help me understand how I would track these manifests in GitOps. Ideally I would openshift gitops but would also support vanilla argocd for non OCP environments. Do you have any suggestions on the topic?" + +export LLM_PROMPT_3="Lets talk about dolphins! What are some unique characteristics of dolphins compared to other acquatic animals?" + +export LLM_PROMPT_4="speaking of aquatic animals, what is your favourite aquatic animal and why?" + +export LLM_PROMPT_5="How might I gather metrics on how much energy consumption my OCP cluster uses?" 
+ +curl llm-d-inference-gateway.apps.summit-gpu.octo-emerging.redhataicoe.com/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Llama-3.2-3B-Instruct", + "prompt": "'${LLM_PROMPT_1}'", + "max_tokens": 500, + "temperature": 0 + }' | jq + +curl llm-d-inference-gateway.apps.summit-gpu.octo-emerging.redhataicoe.com/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Llama-3.2-3B-Instruct", + "prompt": "'${LLM_PROMPT_2}'", + "max_tokens": 500, + "temperature": 0 + }' | jq + +DECODE_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=decode" | tail -n 1 | awk '{print $1}') +PREFILL_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=prefill" | tail -n 1 | awk '{print $1}') +EPP_POD=$(kubectl get pods -l "llm-d.ai/epp" | tail -n 1 | awk '{print $1}') + + +# grab logs together p/D +stern -n $(oc project -q) "$PREFILL_POD|$DECODE_POD" -c vllm | grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate" +``` + +## Debugging and testing NIXL KV cache + +Debugging KV cache through logs: + +#### Terminal 1 EPP + +Follow EPP logs to see if it can hit Decode routing sidecar + +```bash +EPP_POD=$(kubectl get pods -l "llm-d.ai/epp" | tail -n 1 | awk '{print $1}') +kubectl logs pod/${EPP_POD} -f | grep -v "Failed to refreshed metrics\|Refreshed metrics\|gRPC health check serving\|Refreshing Prometheus Metrics" +``` + +### Terminal 2 Routing sidecar (Decode) + +Follow the routing sidecar in the decode pod to see if it can post to prefill if needed + +```bash +DECODE_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=decode" | tail -n 1 | awk '{print $1}') +kubectl logs pod/${DECODE_POD} -c routing-proxy -f | grep -v "http: proxy error: dial tcp \[::1\]:8001: connect: connection refused" +``` + +### Terminal 3 Decode inference + 
+Follow the decode vllm logs: + +```bash +DECODE_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=decode" | tail -n 1 | awk '{print $1}') +kubectl logs pod/${DECODE_POD} -c vllm -f | grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s" +``` + +### Terminal 4 Prefill + +Check to see that prefill logs are getting hit by decode: + +```bash +PREFILL_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=prefill" | tail -n 1 | awk '{print $1}') +kubectl logs pod/${PREFILL_POD} -f | grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s" +``` + +At this point you should be able to send a request through the gateway and track the relevant logs: + +```bash +INGRESS_ADDRESS=$(kubectl get ingress llm-d-inference-gateway | tail -n 1 | awk '{print $3}') +curl ${INGRESS_ADDRESS}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Llama-3.2-3B-Instruct", + "prompt": "'${LLM_PROMPT_1}'", + "max_tokens": 500, + "temperature": 0 + }' | jq +``` + +EPP should filter out Prefill pods, and only target decode first. 
You should see this between the 2nd and 3rd steps in EPP when it applies the filter plugin: +- Scheduling a request (step 2) has both pods as candidates, ex: +```log +{"level":"info","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:129","msg":"Scheduling a request, Metrics: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g Address:10.131.10.180 Role:1} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.29171375 +0000 UTC m=+388.303255999}} {Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-84667878f9-lwb47 Address:10.128.13.52 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.316489317 +0000 UTC m=+388.328031566}}]","pd-schedule":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"} +``` +- Apply filter plugin (step 3), only has decode as candidate to target sidecar first: +```log +{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:160","msg":"Before running filter plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 
388","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g"},"Address":"10.131.10.180","Role":1,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.29171375Z"},{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-84667878f9-lwb47"},"Address":"10.128.13.52","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.316489317Z"}]} +{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:163","msg":"Running filter plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","plugin":"prefill_filter"} +``` + +Our next stop is the decode proxy sidecar (terminal 2) where you should notice communication being orchestrated between P/D pods, ex: +```log +I0509 19:43:44.077499 1 chat_completions.go:110] "running NIXL protocol" logger="proxy server" +I0509 19:43:44.077593 1 chat_completions.go:172] "sending request to prefiller" logger="proxy server" url="http://10.128.13.52:8000" body="{\"do_remote_decode\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. 
Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"stream\":false,\"temperature\":0}" +I0509 19:43:44.099979 1 chat_completions.go:217] "received prefiller response" logger="proxy server" remote_block_ids=[1,2,3,4] remote_engine_id="81eb3201-d5c2-4642-8131-7849f2e955ce" remote_host="10.128.13.52" remote_port=5557 +I0509 19:43:44.100082 1 chat_completions.go:252] "sending request to decoder" logger="proxy server" body="{\"do_remote_prefill\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"remote_block_ids\":[1,2,3,4],\"remote_engine_id\":\"81eb3201-d5c2-4642-8131-7849f2e955ce\",\"remote_host\":\"10.128.13.52\",\"remote_port\":5557,\"temperature\":0}" +``` + +Finally in the decode inference pod (terminal 3) we should see the logs on KV transfer: + +```log +INFO 05-09 19:26:20 [logger.py:39] Received request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. 
Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None. +INFO 05-09 19:26:20 [async_llm.py:255] Added request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0. +DEBUG 05-09 19:26:20 [core.py:431] EngineCore loop active. +DEBUG 05-09 19:26:20 [nixl_connector.py:559] start_load_kv for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0 from remote engine 81eb3201-d5c2-4642-8131-7849f2e955ce. Num local_block_ids: 4. Num remote_block_ids: 4. 
+DEBUG 05-09 19:26:20 [nixl_connector.py:313] Querying metadata on path: tcp://10.128.13.52:5557 +DEBUG 05-09 19:26:20 [nixl_connector.py:422] Created 1055264 blocks for src engine 6d177cac-6a93-4396-8c06-a5af03e9ace7 and rank 0 +DEBUG 05-09 19:26:21 [nixl_connector.py:439] Created 1055264 blocks for dst engine 81eb3201-d5c2-4642-8131-7849f2e955ce and rank 0 +DEBUG 05-09 19:26:22 [nixl_connector.py:326] NIXL handshake: get metadata took: 0.0025545399985276163 +DEBUG 05-09 19:26:22 [nixl_connector.py:328] NIXL handshake: add agent took: 2.2907175269938307 +DEBUG 05-09 19:26:22 [nixl_connector.py:463] Rank 0, get_finished: 0 requests done sending and 1 requests done recving +DEBUG 05-09 19:26:22 [scheduler.py:862] Finished recving KV transfer for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0 +``` + +If you are debugging networking you can finally observe the prefill pod logs to see how it recieves the request from decode, and sends back the KVs + +```log +INFO 05-09 19:43:44 [logger.py:39] Received request cmpl-bb24666d-5d42-46a2-997b-63f352d5bbdb-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. 
Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None. +INFO 05-09 19:43:44 [async_llm.py:255] Added request cmpl-bb24666d-5d42-46a2-997b-63f352d5bbdb-0. +DEBUG 05-09 19:43:44 [core.py:431] EngineCore loop active. +DEBUG 05-09 19:43:44 [nixl_connector.py:463] Rank 0, get_finished: 1 requests done sending and 0 requests done recving +DEBUG 05-09 19:43:44 [scheduler.py:865] Finished sending KV transfer for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0 +DEBUG 05-09 19:43:44 [core.py:425] EngineCore waiting for work. 
+INFO: 10.131.10.180:44514 - "POST /v1/completions HTTP/1.1" 200 OK +``` From 44b4607d799c91f281873377a0e2b99311f16d7a Mon Sep 17 00:00:00 2001 From: greg pereira Date: Fri, 9 May 2025 13:17:01 -0700 Subject: [PATCH 6/8] downgrade to working image Signed-off-by: greg pereira --- charts/llm-d/README.md | 2 +- .../presets/basic-gpu-with-nixl-preset.yaml | 28 +++---- .../sample-application/modelservice.yaml | 10 +++ charts/llm-d/values.yaml | 6 +- notes/testing-nixl-and-epp.md | 74 ++++++++++++++++--- 5 files changed, 87 insertions(+), 33 deletions(-) diff --git a/charts/llm-d/README.md b/charts/llm-d/README.md index d70542e..d4c7b9b 100644 --- a/charts/llm-d/README.md +++ b/charts/llm-d/README.md @@ -174,7 +174,7 @@ Kubernetes: `>= 1.25.0-0` | modelservice.epp | Endpoint picker configuration | object | See below | | modelservice.epp.defaultEnvVars | Default environment variables for endpoint picker, use `extraEnvVars` to override default behavior by defining the same variable again. Ref: https://github.com/neuralmagic/gateway-api-inference-extension/tree/dev?tab=readme-ov-file#temporary-fork-configuration | list | `[{"name":"ENABLE_KVCACHE_AWARE_SCORER","value":"{{ .Values.redis.enabled }}"},{"name":"KVCACHE_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"KVCACHE_INDEXER_REDIS_ADDR","value":"{{ if .Values.redis.enabled }}{{ include \"redis.master.service.fullurl\" . 
}}{{ end }}"},{"name":"ENABLE_PREFIX_AWARE_SCORER","value":"true"},{"name":"PREFIX_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"ENABLE_LOAD_AWARE_SCORER","value":"true"},{"name":"LOAD_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"ENABLE_SESSION_AWARE_SCORER","value":"true"},{"name":"SESSION_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"PD_ENABLED","value":"true"},{"name":"PD_PROMPT_LEN_THRESHOLD","value":"10"},{"name":"PREFILL_ENABLE_KVCACHE_AWARE_SCORER","value":"true"},{"name":"PREFILL_KVCACHE_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"PREFILL_ENABLE_LOAD_AWARE_SCORER","value":"true"},{"name":"PREFILL_LOAD_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"PREFILL_ENABLE_PREFIX_AWARE_SCORER","value":"true"},{"name":"PREFILL_PREFIX_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"DECODE_ENABLE_KVCACHE_AWARE_SCORER","value":"true"},{"name":"DECODE_KVCACHE_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"DECODE_ENABLE_LOAD_AWARE_SCORER","value":"true"},{"name":"DECODE_LOAD_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"DECODE_ENABLE_PREFIX_AWARE_SCORER","value":"true"},{"name":"DECODE_PREFIX_AWARE_SCORER_WEIGHT","value":"1.0"}]` | | modelservice.epp.extraEnvVars | Additional environment variables for endpoint picker | list | `[]` | -| modelservice.epp.image | Endpoint picker image used in ModelService CR presets | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-inference-scheduler","tag":"0.0.1"}` | +| modelservice.epp.image | Endpoint picker image used in ModelService CR presets | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-gateway-api-inference-extension-dev","tag":"0.0.5"}` | | modelservice.epp.metrics.enabled | Enable metrics scraping from endpoint picker service, see `modelservice.serviceMonitor` for configuration | bool | `true` | | modelservice.fullnameOverride | String to fully override modelservice.fullname | string | `""` | | modelservice.image | Modelservice controller image, please change only 
if appropriate adjustments to the CRD are being made | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-model-service","tag":"0.0.8"}` | diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index f8d361d..28d2300 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -121,8 +121,8 @@ data: volumeMounts: - name: home mountPath: /home - # - name: config-decoder - # mountPath: /vllm-workspace + - name: config-decoder + mountPath: /vllm-workspace {{ `{{- if .HFModelName }}` }} - name: model-cache mountPath: /models @@ -137,9 +137,9 @@ data: volumes: - name: home emptyDir: {} - # - name: config-decoder - # configMap: - # name: {{ include "modelservice.fullname" . -}}-config-decoder + - name: config-decoder + configMap: + name: {{ include "modelservice.fullname" . -}}-config-decoder {{ `{{- if .HFModelName }}` }} - name: model-cache emptyDir: {} @@ -207,9 +207,8 @@ data: volumeMounts: - name: home mountPath: /home - # - name: config-prefiller - # configMap: - # name: {{ include "modelservice.fullname" . -}}-config-prefiller + - name: config-prefiller + mountPath: /vllm-workspace {{ `{{- if .HFModelName }}` }} - name: model-cache mountPath: /models @@ -224,8 +223,9 @@ data: volumes: - name: home emptyDir: {} - # - name: config-prefiller - # mountPath: /vllm-workspace + - name: config-prefiller + configMap: + name: {{ include "modelservice.fullname" . -}}-config-prefiller {{ `{{ if .HFModelName }}` }} - name: model-cache emptyDir: {} @@ -321,14 +321,6 @@ data: - --grpcHealthPort - "9003" env: - - name: PD_ENABLED - value: "true" - - name: PD_PROMPT_LEN_THRESHOLD - value: "10" - {{- if .Values.redis.enabled }} - - name: KVCACHE_INDEXER_REDIS_ADDR - value: {{ include "redis.master.service.fullurl" . 
-}} - {{- end -}} {{/* HACK, waiting on: https://github.com/neuralmagic/llm-d-model-service/issues/123 */}} {{ `{{- if .HFModelName }}` }} - name: HF_TOKEN diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml index 3190fd2..5597fd2 100644 --- a/charts/llm-d/templates/sample-application/modelservice.yaml +++ b/charts/llm-d/templates/sample-application/modelservice.yaml @@ -52,4 +52,14 @@ spec: name: {{ .Values.sampleApplication.model.auth.hfToken.name }} key: {{ .Values.sampleApplication.model.auth.hfToken.key }} {{- end }} + epp: + defaultEnvVars: + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + {{- if .Values.redis.enabled }} + - name: KVCACHE_INDEXER_REDIS_ADDR + value: {{ include "redis.master.service.fullurl" . -}} + {{- end -}} {{- end }} diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml index 24bddc9..26bb420 100644 --- a/charts/llm-d/values.yaml +++ b/charts/llm-d/values.yaml @@ -207,8 +207,10 @@ modelservice: # -- Endpoint picker image used in ModelService CR presets image: registry: quay.io - repository: llm-d/llm-d-inference-scheduler - tag: "0.0.1" + repository: llm-d/llm-d-gateway-api-inference-extension-dev + tag: 0.0.5 + # repository: llm-d/llm-d-inference-scheduler + # tag: 0.0.1 imagePullPolicy: "Always" metrics: diff --git a/notes/testing-nixl-and-epp.md b/notes/testing-nixl-and-epp.md index c9f0ec9..27f1fc2 100644 --- a/notes/testing-nixl-and-epp.md +++ b/notes/testing-nixl-and-epp.md @@ -1,6 +1,6 @@ # notes -Helper scritps +Helper scripts ```bash export LLM_PROMPT_1="I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. 
Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind." @@ -44,9 +44,9 @@ stern -n $(oc project -q) "$PREFILL_POD|$DECODE_POD" -c vllm | grep -v "\"GET /m Debugging KV cache through logs: -#### Terminal 1 EPP +### Terminal 1 EPP -Follow EPP logs to see if it can hit Decode routing sidecar +Follow EPP logs to see the logic around which inferencing pods are picked up: ```bash EPP_POD=$(kubectl get pods -l "llm-d.ai/epp" | tail -n 1 | awk '{print $1}') @@ -80,10 +80,13 @@ PREFILL_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role= kubectl logs pod/${PREFILL_POD} -f | grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s" ``` -At this point you should be able to send a request through the gatway and track the relevant logs: +At this point you should be able to send a request through the gateway and track the relevant logs: ```bash INGRESS_ADDRESS=$(kubectl get ingress llm-d-inference-gateway | tail -n 1 | awk '{print $3}') + +export LLM_PROMPT_1="I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind." + curl ${INGRESS_ADDRESS}/v1/completions \ -H "Content-Type: application/json" \ -d '{ @@ -94,18 +97,54 @@ curl ${INGRESS_ADDRESS}/v1/completions \ }' | jq ``` -Epp should filter out Prefill pods, and only target decode first. 
You should see this between the 2nd and 3rd steps in EPP when it applies the filter plugin: -- Scheduling a request (step 2) has both pods as candidates, ex: +### Investigating our EPP logs + +First the EPP logs will identify that a `LLM Request has been assembled` and then will schedule the request. + +```log +{"level":"Level(-4)","ts":"2025-05-09T21:36:04Z","caller":"handlers/request.go:75","msg":"LLM request assembled","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 0"} +{"level":"info","ts":"2025-05-09T21:36:04Z","caller":"scheduling/scheduler.go:129","msg":"Scheduling a request, Metrics: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-decode-6588b8d59c-5tccf Address:10.131.10.188 Role:1} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:36:04.729232544 +0000 UTC m=+572.692052157}} {Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb Address:10.128.13.72 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:36:04.750653805 +0000 UTC m=+572.713473418}}]","pd-schedule":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"} +``` + +#### Role specific logs (prefill) + +After this, Epp will run the filter plugin with desired model to grab the pool of all prefill nodes for that model (in this case, we only have 1 prefill pod for that model): + +```log +{"level":"Level(-4)","ts":"2025-05-09T21:36:04Z","caller":"scheduling/scheduler.go:160","msg":"Before running filter plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 
388","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-decode-6588b8d59c-5tccf"},"Address":"10.131.10.188","Role":1,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:36:04.729232544Z"},{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb"},"Address":"10.128.13.72","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:36:04.750653805Z"}]} + +{"level":"Level(-4)","ts":"2025-05-09T21:36:04Z","caller":"scheduling/scheduler.go:163","msg":"Running filter plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","plugin":"prefill_filter"} + +{"level":"Level(-4)","ts":"2025-05-09T21:36:04Z","caller":"scheduling/scheduler.go:167","msg":"Filter plugin result","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","plugin":"prefill_filter","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb"},"Address":"10.128.13.72","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:36:04.750653805Z"}]} +``` + +Next, EPP will apply the `scorer plugin`, to score each of the prefill pods available for that model: + ```log -{"level":"info","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:129","msg":"Scheduling a request, Metrics: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g Address:10.131.10.180 Role:1} Metrics:{ActiveModels:map[] WaitingModels:map[] 
MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.29171375 +0000 UTC m=+388.303255999}} {Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-84667878f9-lwb47 Address:10.128.13.52 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.316489317 +0000 UTC m=+388.328031566}}]","pd-schedule":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"} +{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:179","msg":"Before running scorer plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb"},"Address":"10.128.13.72","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0.0003184037359371672,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:54:50.70102255Z"}]} + +{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:196","msg":"After running scorer plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"} ``` -- Apply filter plugin (step 3), only has decode as candidate to target sidecar first: + +Finally, EPP will run the `picker plugin` to select the best canidated within the pool based on the scores: + ```log -{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:160","msg":"Before running filter plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, 
Critical: false, PromptLength: 388","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g"},"Address":"10.131.10.180","Role":1,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.29171375Z"},{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-84667878f9-lwb47"},"Address":"10.128.13.52","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.316489317Z"}]} -{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:163","msg":"Running filter plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","plugin":"prefill_filter"} +{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:210","msg":"Before running picker plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","podsError":"json: unsupported type: map[types.Pod]float64"} + +{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"picker/max_score_picker.go:31","msg":"Selecting a pod with the max score from 1 candidates: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb Address:10.128.13.72 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0.0003184037359371672 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:54:50.70102255 +0000 UTC m=+1698.663842173}}]","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"} + 
+{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:214","msg":"After running picker plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","result":{"TargetPod":{"Pod":{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb"},"Address":"10.128.13.72","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0.0003184037359371672,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:54:50.70102255Z"},"Score":0},"MutatedHeaders":null}} + +{"level":"info","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:129","msg":"Scheduling a request, Metrics: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-decode-6588b8d59c-5tccf Address:10.131.10.188 Role:1} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:5.30672893228612e-05 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:54:50.677717088 +0000 UTC m=+1698.640536701}} {Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb Address:10.128.13.72 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0.0003184037359371672 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:54:50.70102255 +0000 UTC m=+1698.663842173}}]","pd-schedule":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"} ``` -Our next stop is the decode proxy sidecar (terminal 2) where you should notice communication being orchistrated between P/D pods, ex: +All of these role specific logs will then repeat until we also have a `decode` pod we can hit. 
+ +### Sidecar router logs + + +Our next stop is the decode proxy sidecar (terminal 2) where you should notice communication being orchestrated between P/D pods, ex: + ```log I0509 19:43:44.077499 1 chat_completions.go:110] "running NIXL protocol" logger="proxy server" I0509 19:43:44.077593 1 chat_completions.go:172] "sending request to prefiller" logger="proxy server" url="http://10.128.13.52:8000" body="{\"do_remote_decode\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"stream\":false,\"temperature\":0}" @@ -113,9 +152,14 @@ I0509 19:43:44.099979 1 chat_completions.go:217] "received prefiller respo I0509 19:43:44.100082 1 chat_completions.go:252] "sending request to decoder" logger="proxy server" body="{\"do_remote_prefill\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"remote_block_ids\":[1,2,3,4],\"remote_engine_id\":\"81eb3201-d5c2-4642-8131-7849f2e955ce\",\"remote_host\":\"10.128.13.52\",\"remote_port\":5557,\"temperature\":0}" ``` +Here you can see how the request comes into the routing sidecar. It routes its request to the prefiller first, and receives a response. 
The sidecard then receives the response from prefiller, and sends the request over to decode. + +### Decode VLLM logs + Finally in the decode inference pod (terminal 3) we should see the logs on KV transfer: ```log + INFO 05-09 19:26:20 [logger.py:39] Received request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None. INFO 05-09 19:26:20 [async_llm.py:255] Added request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0. DEBUG 05-09 19:26:20 [core.py:431] EngineCore loop active. 
@@ -129,7 +173,11 @@ DEBUG 05-09 19:26:22 [nixl_connector.py:463] Rank 0, get_finished: 0 requests do DEBUG 05-09 19:26:22 [scheduler.py:862] Finished recving KV transfer for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0 ``` -If you are debugging networking you can finally observe the prefill pod logs to see how it recieves the request from decode, and sends back the KVs +Here you can see a request comes in from the sidecar (after the sidecar received response from prefill). The decode vllm pod receives a `kv_load` request from the `prefill` node, does the NIXL handshake, and receives the KV transfer from `prefill`. + +### Prefil VLLM logs + +Finally to get the full picture we can see the prefill logs. ```log INFO 05-09 19:43:44 [logger.py:39] Received request cmpl-bb24666d-5d42-46a2-997b-63f352d5bbdb-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. 
Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None. @@ -140,3 +188,5 @@ DEBUG 05-09 19:43:44 [scheduler.py:865] Finished sending KV transfer for request DEBUG 05-09 19:43:44 [core.py:425] EngineCore waiting for work. INFO: 10.131.10.180:44514 - "POST /v1/completions HTTP/1.1" 200 OK ``` + +It receives the orrigional request from the routing sidecar, finishes doing the inference and then sends the KV transfer to decode. 
From 2b7ce2304a20e71dc7cd815f289ec11f42e073e8 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Sun, 11 May 2025 08:53:55 -0700 Subject: [PATCH 7/8] removing dead code placeholder sections Signed-off-by: greg pereira --- .../presets/basic-gpu-with-nixl-preset.yaml | 37 ------------------- .../sample-application/modelservice.yaml | 10 ----- 2 files changed, 47 deletions(-) diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml index 28d2300..31e7862 100644 --- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml +++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml @@ -16,28 +16,6 @@ metadata: {{- include "common.tplvalues.render" ( dict "value" .Values.modelservice.annotations "context" $) | nindent 4 }} {{- end }} data: - configMaps: | - - apiVersion: v1 - kind: ConfigMap - metadata: - name: {{ include "modelservice.fullname" . -}}-config-decoder - data: - lmcache-decoder-config.yaml: | - # local_cpu: False - # max_local_cpu_size: 0 - # max_local_disk_size: 0 - # remote_serde: NULL - - apiVersion: v1 - kind: ConfigMap - metadata: - name: {{ include "modelservice.fullname" . 
-}}-config-prefiller - data: - lmcache-prefiller-config.yaml: | - # local_cpu: False - # max_local_cpu_size: 0 - # max_local_disk_size: 0 - # remote_serde: NULL - decodeDeployment: | apiVersion: apps/v1 kind: Deployment @@ -108,8 +86,6 @@ data: value: "0" - name: UCX_TLS value: "cuda_ipc,cuda_copy,tcp" - # - name: LMCACHE_CONFIG_FILE - # value: /vllm-workspace/lmcache-decoder-config.yaml {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} @@ -121,8 +97,6 @@ data: volumeMounts: - name: home mountPath: /home - - name: config-decoder - mountPath: /vllm-workspace {{ `{{- if .HFModelName }}` }} - name: model-cache mountPath: /models @@ -137,9 +111,6 @@ data: volumes: - name: home emptyDir: {} - - name: config-decoder - configMap: - name: {{ include "modelservice.fullname" . -}}-config-decoder {{ `{{- if .HFModelName }}` }} - name: model-cache emptyDir: {} @@ -193,9 +164,6 @@ data: value: "0" - name: UCX_TLS value: "cuda_ipc,cuda_copy,tcp" - ### Keep ability to enable LMCache configs but don't use them right now - # - name: LMCACHE_CONFIG_FILE - # value: /vllm-workspace/lmcache-prefiller-config.yaml {{- if .Values.redis.enabled }} - name: LMCACHE_LOOKUP_URL value: {{ include "redis.master.service.fullurl" .}} @@ -207,8 +175,6 @@ data: volumeMounts: - name: home mountPath: /home - - name: config-prefiller - mountPath: /vllm-workspace {{ `{{- if .HFModelName }}` }} - name: model-cache mountPath: /models @@ -223,9 +189,6 @@ data: volumes: - name: home emptyDir: {} - - name: config-prefiller - configMap: - name: {{ include "modelservice.fullname" . 
-}}-config-prefiller {{ `{{ if .HFModelName }}` }} - name: model-cache emptyDir: {} diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml index 5597fd2..3190fd2 100644 --- a/charts/llm-d/templates/sample-application/modelservice.yaml +++ b/charts/llm-d/templates/sample-application/modelservice.yaml @@ -52,14 +52,4 @@ spec: name: {{ .Values.sampleApplication.model.auth.hfToken.name }} key: {{ .Values.sampleApplication.model.auth.hfToken.key }} {{- end }} - epp: - defaultEnvVars: - - name: PD_ENABLED - value: "true" - - name: PD_PROMPT_LEN_THRESHOLD - value: "10" - {{- if .Values.redis.enabled }} - - name: KVCACHE_INDEXER_REDIS_ADDR - value: {{ include "redis.master.service.fullurl" . -}} - {{- end -}} {{- end }} From 0dab3ba813e4965cd32b16caf2f27a6e3808f9a3 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Sun, 11 May 2025 08:56:19 -0700 Subject: [PATCH 8/8] linting Signed-off-by: greg pereira --- charts/llm-d/Chart.yaml | 2 +- charts/llm-d/README.md | 2 +- notes/testing-nixl-and-epp.md | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/charts/llm-d/Chart.yaml b/charts/llm-d/Chart.yaml index 5c52de7..fd8f519 100644 --- a/charts/llm-d/Chart.yaml +++ b/charts/llm-d/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: llm-d type: application -version: 0.7.0 +version: 0.7.1 appVersion: "0.0.1" icon: data:null description: A Helm chart for llm-d diff --git a/charts/llm-d/README.md b/charts/llm-d/README.md index d4c7b9b..737b374 100644 --- a/charts/llm-d/README.md +++ b/charts/llm-d/README.md @@ -1,7 +1,7 @@ # llm-d Helm Chart for OpenShift -![Version: 0.7.0](https://img.shields.io/badge/Version-0.7.0-informational?style=flat-square) +![Version: 0.7.1](https://img.shields.io/badge/Version-0.7.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) A Helm chart for llm-d diff --git 
a/notes/testing-nixl-and-epp.md b/notes/testing-nixl-and-epp.md index 27f1fc2..3372d51 100644 --- a/notes/testing-nixl-and-epp.md +++ b/notes/testing-nixl-and-epp.md @@ -142,7 +142,6 @@ All of these role specific logs will then repeat until we also have a `decode` p ### Sidecar router logs - Our next stop is the decode proxy sidecar (terminal 2) where you should notice communication being orchestrated between P/D pods, ex: ```log @@ -189,4 +188,4 @@ DEBUG 05-09 19:43:44 [core.py:425] EngineCore waiting for work. INFO: 10.131.10.180:44514 - "POST /v1/completions HTTP/1.1" 200 OK ``` -It receives the orrigional request from the routing sidecar, finishes doing the inference and then sends the KV transfer to decode. +It receives the original request from the routing sidecar, finishes doing the inference and then sends the KV transfer to decode.