From e1d2a25f3b64322e55030319b62a17a7418f95ea Mon Sep 17 00:00:00 2001
From: greg pereira <grpereir@redhat.com>
Date: Mon, 5 May 2025 21:05:10 -0700
Subject: [PATCH 1/8] using new NIXL only connector

Signed-off-by: greg pereira <grpereir@redhat.com>
---
 charts/llm-d/README.md                        |   2 +-
 .../presets/basic-gpu-with-nixl-preset.yaml   | 128 +++++++-----------
 charts/llm-d/values.yaml                      |   4 +-
 3 files changed, 49 insertions(+), 85 deletions(-)

diff --git a/charts/llm-d/README.md b/charts/llm-d/README.md
index 30ab979..bde8013 100644
--- a/charts/llm-d/README.md
+++ b/charts/llm-d/README.md
@@ -165,7 +165,7 @@ Kubernetes: `>= 1.25.0-0`
 | ingress.tls.enabled | Enable TLS configuration for the host defined at `ingress.host` parameter | bool | `false` |
 | ingress.tls.secretName | The name to which the TLS Secret will be called | string | `""` |
 | kubeVersion | Override Kubernetes version | string | `""` |
-| modelservice | Model service controller configuration | object | See below |
+| modelservice | Model service controller configuration | object | `{"annotations":{},"enabled":true,"epp":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-gateway-api-inference-extension-dev","tag":"0.0.5-amd64"},"metrics":{"enabled":true}},"fullnameOverride":"","image":{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-model-service","tag":"0.0.6"},"metrics":{"enabled":true},"nameOverride":"","podAnnotations":{},"podLabels":{},"rbac":{"create":true},"replicas":1,"routingProxy":{"image":{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-routing-sidecar-dev","tag":"0.0.6"}},"service":{"enabled":true,"port":8443,"type":"ClusterIP"},"serviceAccount":{"annotations":{},"create":true,"fullnameOverride":"","labels":{},"nameOverride":""},"vllm":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"vllm-nixl-0.0.6"},"metrics":{"enabled":true}},"vllmSim":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/vllm-sim-dev","tag":"0.0.4"}}}` |
 | modelservice.annotations | Annotations to add to all modelservice resources | object | `{}` |
 | modelservice.decode | Decode options | object | See below |
 | modelservice.decode.tolerations | Tolerations configuration to deploy decode pods to tainted nodes | list | See below |
diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index 2d0198b..6b4e1a0 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -16,42 +16,6 @@ metadata:
     {{- include "common.tplvalues.render" ( dict "value" .Values.modelservice.annotations "context" $) | nindent 4 }}
     {{- end }}
 data:
-  configMaps: |
-    - apiVersion: v1
-      kind: ConfigMap
-      metadata:
-        name: {{ include "modelservice.fullname" . -}}-config-decoder
-      data:
-        lmcache-decoder-config.yaml: |
-          local_cpu: False
-          max_local_cpu_size: 0
-          max_local_disk_size: 0
-          remote_serde: NULL
-          enable_nixl: True
-          nixl_role: receiver
-          nixl_peer_host: 0.0.0.0
-          nixl_peer_port: 55555
-          nixl_buffer_size: 524288
-          nixl_buffer_device: "cuda"
-          nixl_enable_gc: True
-    - apiVersion: v1
-      kind: ConfigMap
-      metadata:
-        name: {{ include "modelservice.fullname" . -}}-config-prefiller
-      data:
-        lmcache-prefiller-config.yaml: |
-          local_cpu: False
-          max_local_cpu_size: 0
-          max_local_disk_size: 0
-          remote_serde: NULL
-          enable_nixl: True
-          nixl_role: "sender"
-          nixl_peer_host: {{`"{{ .DecodeServiceName }}"`}}
-          nixl_peer_port: 55555
-          nixl_buffer_size: 524288
-          nixl_buffer_device: "cuda"
-          nixl_enable_gc: True
-
   decodeDeployment: |
     apiVersion: apps/v1
     kind: Deployment
@@ -62,18 +26,20 @@ data:
           tolerations:
             {{- toYaml .Values.modelservice.decode.tolerations | nindent 12 }}
           {{- end }}
-          containers:
+          initContainers:
             - name: routing-proxy
               image: {{ include "modelservice.routingProxyImage" . }}
               securityContext:
                 allowPrivilegeEscalation: false
                 runAsNonRoot: true
               args:
-                - "--port=8001"
-                - "--vllm-port=8000"
+                - "--port=8000"
+                - "--vllm-port=8001"
+                - "--connector=nixl"
               ports:
                 - containerPort: 8000
                   protocol: TCP
+          containers:
             - name: vllm
               image: {{ include "modelservice.vllmImage" . }}
               imagePullPolicy: {{ .Values.modelservice.vllm.image.imagePullPolicy }}
@@ -85,31 +51,32 @@ data:
                 - {{ `{{ default (print "/models/" .ModelPath) .HFModelName }}` }}
               args:
                 - "--port"
-                - "8000"
+                - "8001"
                 - "--kv-transfer-config"
-                - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'
+                - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+                - "--enforce-eager"
               env:
                 - name: HOME
                   value: /home
+                - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+                  value: "5557"
+                - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: status.podIP
+                - name: VLLM_LOGGING_LEVEL
+                  value: DEBUG
                 - name: POD_IP
                   valueFrom:
                     fieldRef:
                       apiVersion: v1
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
-                  value: ${POD_IP}:80
+                  value: {{ `"${POD_IP}:80"` }}
                 - name: CUDA_VISIBLE_DEVICES
                   value: "0"
                 - name: UCX_TLS
                   value: "cuda_ipc,cuda_copy,tcp"
-                - name: LMCACHE_USE_EXPERIMENTAL
-                  value: "True"
-                - name: VLLM_ENABLE_V1_MULTIPROCESSING
-                  value: "1"
-                - name: VLLM_WORKER_MULTIPROC_METHOD
-                  value: spawn
-                - name: LMCACHE_CONFIG_FILE
-                  value: /vllm-workspace/lmcache-decoder-config.yaml
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -121,8 +88,6 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
-                - name: config-decoder
-                  mountPath: /vllm-workspace
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -132,16 +97,11 @@ data:
                   readOnly: true
                 {{ `{{- end }}` }}
               ports:
-                - containerPort: 8001
-                  protocol: TCP
-                - containerPort: 55555
+                - containerPort: 5557
                   protocol: TCP
           volumes:
             - name: home
               emptyDir: {}
-            - name: config-decoder
-              configMap:
-                name: {{ include "modelservice.fullname" . -}}-config-decoder
             {{ `{{- if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -157,18 +117,20 @@ data:
           tolerations:
             {{- toYaml .Values.modelservice.prefill.tolerations | nindent 12 }}
           {{- end }}
-          containers:
+          initContainers:
             - name: "routing-proxy"
               image: {{ include "modelservice.routingProxyImage" . }}
               securityContext:
                 allowPrivilegeEscalation: false
                 runAsNonRoot: true
               args:
-                - "--port=8001"
-                - "--vllm-port=8000"
+                - "--port=8000"
+                - "--vllm-port=8001"
+                - "--connector=nixl"
               ports:
                 - containerPort: 8000
                   protocol: TCP
+          containers:
             - name: vllm
               image: {{ include "modelservice.vllmImage" . }}
               imagePullPolicy: {{ .Values.modelservice.vllm.image.imagePullPolicy }}
@@ -180,31 +142,32 @@ data:
                 - {{ `{{ default (print "/models/" .ModelPath) .HFModelName }}` }}
               args:
                 - "--port"
-                - "8000"
+                - "8001"
                 - "--kv-transfer-config"
-                - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'
+                - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+                - "--enforce-eager"
               env:
                 - name: HOME
                   value: /home
+                - name: VLLM_LOGGING_LEVEL
+                  value: DEBUG
+                - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+                  value: "5557"
+                - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: status.podIP
                 - name: POD_IP
                   valueFrom:
                     fieldRef:
                       apiVersion: v1
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
-                  value: "${POD_IP}:80"
+                  value: {{ `"${POD_IP}:80"` }}
                 - name: CUDA_VISIBLE_DEVICES
                   value: "0"
                 - name: UCX_TLS
                   value: "cuda_ipc,cuda_copy,tcp"
-                - name: LMCACHE_USE_EXPERIMENTAL
-                  value: "True"
-                - name: VLLM_ENABLE_V1_MULTIPROCESSING
-                  value: "1"
-                - name: VLLM_WORKER_MULTIPROC_METHOD
-                  value: spawn
-                - name: LMCACHE_CONFIG_FILE
-                  value: /vllm-workspace/lmcache-prefiller-config.yaml
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -216,8 +179,6 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
-                - name: config-prefiller
-                  mountPath: /vllm-workspace
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -227,16 +188,11 @@ data:
                   readOnly: true
                 {{ `{{- end }}` }}
               ports:
-                - containerPort: 8001
-                  protocol: TCP
-                - containerPort: 55555
+                - containerPort: 5557
                   protocol: TCP
           volumes:
             - name: home
               emptyDir: {}
-            - name: config-prefiller
-              configMap:
-                name: {{ include "modelservice.fullname" . -}}-config-prefiller
             {{ `{{ if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -254,7 +210,7 @@ data:
       clusterIP: None
       ports:
       - name: nixl
-        port: 55555
+        port: 5557
         protocol: TCP
       - name: vllm
         port: 8000
@@ -272,7 +228,7 @@ data:
       clusterIP: None
       ports:
       - name: nixl
-        port: 55555
+        port: 5557
         protocol: TCP
       - name: vllm
         port: 8000
@@ -332,6 +288,14 @@ data:
                 - --grpcHealthPort
                 - "9003"
               env:
+              - name: PD_ENABLED
+                value: "true"
+              - name: PD_PROMPT_LEN_THRESHOLD
+                value: "10"
+              {{- if .Values.redis.enabled }}
+              - name: KVCACHE_INDEXER_REDIS_ADDR
+                value: {{ include "redis.master.service.fullurl" . -}}
+              {{- end -}}
               {{/* HACK, waiting on: https://github.com/neuralmagic/llm-d-model-service/issues/123 */}}
               {{ `{{- if .HFModelName }}` }}
               - name: HF_TOKEN
diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml
index 589847e..be30bb2 100644
--- a/charts/llm-d/values.yaml
+++ b/charts/llm-d/values.yaml
@@ -300,8 +300,8 @@ modelservice:
     # -- vLLM image used in ModelService CR presets
     image:
       registry: quay.io
-      repository: llm-d/llm-d-dev
-      tag: "0.0.5"
+      repository: "llm-d/llm-d-dev"
+      tag: "vllm-nixl-0.0.6"
       imagePullPolicy: "IfNotPresent"
 
     metrics:

From d13a7904767cd579919c996c06ea0bcbd0adddc1 Mon Sep 17 00:00:00 2001
From: greg pereira <grpereir@redhat.com>
Date: Thu, 8 May 2025 17:44:18 -0700
Subject: [PATCH 2/8] runs but no cache hit

Signed-off-by: greg pereira <grpereir@redhat.com>
---
 .../modelservice/presets/basic-gpu-with-nixl-preset.yaml  | 8 ++++----
 charts/llm-d/values.yaml                                  | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index 6b4e1a0..1be7c3c 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -52,9 +52,9 @@ data:
               args:
                 - "--port"
                 - "8001"
+                - "--enforce-eager"
                 - "--kv-transfer-config"
                 - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
-                - "--enforce-eager"
               env:
                 - name: HOME
                   value: /home
@@ -143,9 +143,9 @@ data:
               args:
                 - "--port"
                 - "8001"
+                - "--enforce-eager"
                 - "--kv-transfer-config"
                 - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
-                - "--enforce-eager"
               env:
                 - name: HOME
                   value: /home
@@ -213,7 +213,7 @@ data:
         port: 5557
         protocol: TCP
       - name: vllm
-        port: 8000
+        port: 8001
         protocol: TCP
 
   prefillService: |
@@ -231,7 +231,7 @@ data:
         port: 5557
         protocol: TCP
       - name: vllm
-        port: 8000
+        port: 8001
         protocol: TCP
 
   eppService: |
diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml
index be30bb2..d896f23 100644
--- a/charts/llm-d/values.yaml
+++ b/charts/llm-d/values.yaml
@@ -301,7 +301,7 @@ modelservice:
     image:
       registry: quay.io
       repository: "llm-d/llm-d-dev"
-      tag: "vllm-nixl-0.0.6"
+      tag: "vllm-nixl-0.0.6-amd64"
       imagePullPolicy: "IfNotPresent"
 
     metrics:

From 151cf330f8fb432f199c54ba89fd225035593bf5 Mon Sep 17 00:00:00 2001
From: greg pereira <grpereir@redhat.com>
Date: Fri, 9 May 2025 06:26:52 -0700
Subject: [PATCH 3/8] no p/d services in prod example

Signed-off-by: greg pereira <grpereir@redhat.com>
---
 .../presets/basic-gpu-with-nixl-preset.yaml   | 77 ++++++++++---------
 1 file changed, 42 insertions(+), 35 deletions(-)

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index 1be7c3c..b9884e0 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -30,8 +30,11 @@ data:
             - name: routing-proxy
               image: {{ include "modelservice.routingProxyImage" . }}
               securityContext:
-                allowPrivilegeEscalation: false
+                capabilities:
+                  drop:
+                    - MKNOD
                 runAsNonRoot: true
+                allowPrivilegeEscalation: false
               args:
                 - "--port=8000"
                 - "--vllm-port=8001"
@@ -39,6 +42,8 @@ data:
               ports:
                 - containerPort: 8000
                   protocol: TCP
+              restartPolicy: Always
+              imagePullPolicy: Always
           containers:
             - name: vllm
               image: {{ include "modelservice.vllmImage" . }}
@@ -130,6 +135,8 @@ data:
               ports:
                 - containerPort: 8000
                   protocol: TCP
+              restartPolicy: Always
+              imagePullPolicy: Always
           containers:
             - name: vllm
               image: {{ include "modelservice.vllmImage" . }}
@@ -198,41 +205,41 @@ data:
               emptyDir: {}
             {{ `{{ end }}` }}
 
-  decodeService: |
-    apiVersion: v1
-    kind: Service
-    metadata:
-      labels:
-        {{- if .Values.modelservice.vllm.metrics.enabled }}
-        {{ include "metrics.label" . }}
-        {{- end }}
-    spec:
-      clusterIP: None
-      ports:
-      - name: nixl
-        port: 5557
-        protocol: TCP
-      - name: vllm
-        port: 8001
-        protocol: TCP
+  # decodeService: |
+  #   apiVersion: v1
+  #   kind: Service
+  #   metadata:
+  #     labels:
+  #       {{- if .Values.modelservice.vllm.metrics.enabled }}
+  #       {{ include "metrics.label" . }}
+  #       {{- end }}
+  #   spec:
+  #     clusterIP: None
+  #     ports:
+  #     - name: nixl
+  #       port: 5557
+  #       protocol: TCP
+  #     - name: vllm
+  #       port: 8000
+  #       protocol: TCP
 
-  prefillService: |
-    apiVersion: v1
-    kind: Service
-    metadata:
-      labels:
-        {{- if .Values.modelservice.vllm.metrics.enabled }}
-        {{ include "metrics.label" . }}
-        {{- end }}
-    spec:
-      clusterIP: None
-      ports:
-      - name: nixl
-        port: 5557
-        protocol: TCP
-      - name: vllm
-        port: 8001
-        protocol: TCP
+  # prefillService: |
+  #   apiVersion: v1
+  #   kind: Service
+  #   metadata:
+  #     labels:
+  #       {{- if .Values.modelservice.vllm.metrics.enabled }}
+  #       {{ include "metrics.label" . }}
+  #       {{- end }}
+  #   spec:
+  #     clusterIP: None
+  #     ports:
+  #     - name: nixl
+  #       port: 5557
+  #       protocol: TCP
+  #     - name: vllm
+  #       port: 8000
+  #       protocol: TCP
 
   eppService: |
     apiVersion: v1

From 39c7a6d263860a01e2202ad475276615dc6598d9 Mon Sep 17 00:00:00 2001
From: greg pereira <grpereir@redhat.com>
Date: Fri, 9 May 2025 08:27:03 -0700
Subject: [PATCH 4/8] restore pd services deemed non-invasive

Signed-off-by: greg pereira <grpereir@redhat.com>
---
 charts/llm-d/README.md                        |  4 +-
 .../presets/basic-gpu-with-nixl-preset.yaml   | 72 ++++++++++---------
 charts/llm-d/values.yaml                      |  5 +-
 3 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/charts/llm-d/README.md b/charts/llm-d/README.md
index bde8013..d70542e 100644
--- a/charts/llm-d/README.md
+++ b/charts/llm-d/README.md
@@ -165,7 +165,7 @@ Kubernetes: `>= 1.25.0-0`
 | ingress.tls.enabled | Enable TLS configuration for the host defined at `ingress.host` parameter | bool | `false` |
 | ingress.tls.secretName | The name to which the TLS Secret will be called | string | `""` |
 | kubeVersion | Override Kubernetes version | string | `""` |
-| modelservice | Model service controller configuration | object | `{"annotations":{},"enabled":true,"epp":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-gateway-api-inference-extension-dev","tag":"0.0.5-amd64"},"metrics":{"enabled":true}},"fullnameOverride":"","image":{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-model-service","tag":"0.0.6"},"metrics":{"enabled":true},"nameOverride":"","podAnnotations":{},"podLabels":{},"rbac":{"create":true},"replicas":1,"routingProxy":{"image":{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-routing-sidecar-dev","tag":"0.0.6"}},"service":{"enabled":true,"port":8443,"type":"ClusterIP"},"serviceAccount":{"annotations":{},"create":true,"fullnameOverride":"","labels":{},"nameOverride":""},"vllm":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"vllm-nixl-0.0.6"},"metrics":{"enabled":true}},"vllmSim":{"image":{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/vllm-sim-dev","tag":"0.0.4"}}}` |
+| modelservice | Model service controller configuration | object | See below |
 | modelservice.annotations | Annotations to add to all modelservice resources | object | `{}` |
 | modelservice.decode | Decode options | object | See below |
 | modelservice.decode.tolerations | Tolerations configuration to deploy decode pods to tainted nodes | list | See below |
@@ -207,7 +207,7 @@ Kubernetes: `>= 1.25.0-0`
 | modelservice.serviceMonitor.port | ServiceMonitor endpoint port | string | `"vllm"` |
 | modelservice.serviceMonitor.selector | ServiceMonitor selector matchLabels </br> matchLabels must match labels on modelservice Services | object | `{"matchLabels":{}}` |
 | modelservice.vllm | vLLM container options | object | See below |
-| modelservice.vllm.image | vLLM image used in ModelService CR presets | object | `{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"0.0.5"}` |
+| modelservice.vllm.image | vLLM image used in ModelService CR presets | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"vllm-nixl-0.0.6"}` |
 | modelservice.vllm.metrics.enabled | Enable metrics scraping from vllm service, see `modelservice.serviceMonitor` for configuration | bool | `true` |
 | modelservice.vllmSim | vLL sim container options | object | See below |
 | modelservice.vllmSim.image | vLLM sim image used in ModelService CR presets | object | `{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/vllm-sim","tag":"0.0.4"}` |
diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index b9884e0..e0ef382 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -49,6 +49,10 @@ data:
               image: {{ include "modelservice.vllmImage" . }}
               imagePullPolicy: {{ .Values.modelservice.vllm.image.imagePullPolicy }}
               securityContext:
+                capabilities:
+                  drop:
+                    - MKNOD
+                runAsNonRoot: true
                 allowPrivilegeEscalation: false
               command:
                 - vllm
@@ -205,41 +209,41 @@ data:
               emptyDir: {}
             {{ `{{ end }}` }}
 
-  # decodeService: |
-  #   apiVersion: v1
-  #   kind: Service
-  #   metadata:
-  #     labels:
-  #       {{- if .Values.modelservice.vllm.metrics.enabled }}
-  #       {{ include "metrics.label" . }}
-  #       {{- end }}
-  #   spec:
-  #     clusterIP: None
-  #     ports:
-  #     - name: nixl
-  #       port: 5557
-  #       protocol: TCP
-  #     - name: vllm
-  #       port: 8000
-  #       protocol: TCP
+  decodeService: |
+    apiVersion: v1
+    kind: Service
+    metadata:
+      labels:
+        {{- if .Values.modelservice.vllm.metrics.enabled }}
+        {{ include "metrics.label" . }}
+        {{- end }}
+    spec:
+      clusterIP: None
+      ports:
+      - name: nixl
+        port: 5557
+        protocol: TCP
+      - name: vllm
+        port: 8000
+        protocol: TCP
 
-  # prefillService: |
-  #   apiVersion: v1
-  #   kind: Service
-  #   metadata:
-  #     labels:
-  #       {{- if .Values.modelservice.vllm.metrics.enabled }}
-  #       {{ include "metrics.label" . }}
-  #       {{- end }}
-  #   spec:
-  #     clusterIP: None
-  #     ports:
-  #     - name: nixl
-  #       port: 5557
-  #       protocol: TCP
-  #     - name: vllm
-  #       port: 8000
-  #       protocol: TCP
+  prefillService: |
+    apiVersion: v1
+    kind: Service
+    metadata:
+      labels:
+        {{- if .Values.modelservice.vllm.metrics.enabled }}
+        {{ include "metrics.label" . }}
+        {{- end }}
+    spec:
+      clusterIP: None
+      ports:
+      - name: nixl
+        port: 5557
+        protocol: TCP
+      - name: vllm
+        port: 8000
+        protocol: TCP
 
   eppService: |
     apiVersion: v1
diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml
index d896f23..24bddc9 100644
--- a/charts/llm-d/values.yaml
+++ b/charts/llm-d/values.yaml
@@ -301,8 +301,9 @@ modelservice:
     image:
       registry: quay.io
       repository: "llm-d/llm-d-dev"
-      tag: "vllm-nixl-0.0.6-amd64"
-      imagePullPolicy: "IfNotPresent"
+      tag: "vllm-nixl-0.0.6"
+      imagePullPolicy: "Always"
+      # imagePullPolicy: "IfNotPresent"
 
     metrics:
 

From 8126af13a287b788f2a044244bfc2d4b7563b201 Mon Sep 17 00:00:00 2001
From: greg pereira <grpereir@redhat.com>
Date: Fri, 9 May 2025 09:07:44 -0700
Subject: [PATCH 5/8] keeping configmaps around but not using them in lmcache
 for dual connectors later

Signed-off-by: greg pereira <grpereir@redhat.com>
---
 .../presets/basic-gpu-with-nixl-preset.yaml   |  58 ++++---
 notes/testing-nixl-and-epp.md                 | 142 ++++++++++++++++++
 2 files changed, 182 insertions(+), 18 deletions(-)
 create mode 100644 notes/testing-nixl-and-epp.md

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index e0ef382..f8d361d 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -16,6 +16,28 @@ metadata:
     {{- include "common.tplvalues.render" ( dict "value" .Values.modelservice.annotations "context" $) | nindent 4 }}
     {{- end }}
 data:
+  configMaps: |
+    - apiVersion: v1
+      kind: ConfigMap
+      metadata:
+        name: {{ include "modelservice.fullname" . -}}-config-decoder
+      data:
+        lmcache-decoder-config.yaml: |
+          # local_cpu: False
+          # max_local_cpu_size: 0
+          # max_local_disk_size: 0
+          # remote_serde: NULL
+    - apiVersion: v1
+      kind: ConfigMap
+      metadata:
+        name: {{ include "modelservice.fullname" . -}}-config-prefiller
+      data:
+        lmcache-prefiller-config.yaml: |
+          # local_cpu: False
+          # max_local_cpu_size: 0
+          # max_local_disk_size: 0
+          # remote_serde: NULL
+
   decodeDeployment: |
     apiVersion: apps/v1
     kind: Deployment
@@ -81,11 +103,13 @@ data:
                       apiVersion: v1
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
-                  value: {{ `"${POD_IP}:80"` }}
+                  value: ${POD_IP}:8200
                 - name: CUDA_VISIBLE_DEVICES
                   value: "0"
                 - name: UCX_TLS
                   value: "cuda_ipc,cuda_copy,tcp"
+                # - name: LMCACHE_CONFIG_FILE
+                #   value: /vllm-workspace/lmcache-decoder-config.yaml
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -97,6 +121,8 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
+                # - name: config-decoder
+                #   mountPath: /vllm-workspace
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -111,6 +137,9 @@ data:
           volumes:
             - name: home
               emptyDir: {}
+            # - name: config-decoder
+            #   configMap:
+            #     name: {{ include "modelservice.fullname" . -}}-config-decoder
             {{ `{{- if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -126,21 +155,6 @@ data:
           tolerations:
             {{- toYaml .Values.modelservice.prefill.tolerations | nindent 12 }}
           {{- end }}
-          initContainers:
-            - name: "routing-proxy"
-              image: {{ include "modelservice.routingProxyImage" . }}
-              securityContext:
-                allowPrivilegeEscalation: false
-                runAsNonRoot: true
-              args:
-                - "--port=8000"
-                - "--vllm-port=8001"
-                - "--connector=nixl"
-              ports:
-                - containerPort: 8000
-                  protocol: TCP
-              restartPolicy: Always
-              imagePullPolicy: Always
           containers:
             - name: vllm
               image: {{ include "modelservice.vllmImage" . }}
@@ -153,7 +167,7 @@ data:
                 - {{ `{{ default (print "/models/" .ModelPath) .HFModelName }}` }}
               args:
                 - "--port"
-                - "8001"
+                - "8000"
                 - "--enforce-eager"
                 - "--kv-transfer-config"
                 - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
@@ -174,11 +188,14 @@ data:
                       apiVersion: v1
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
-                  value: {{ `"${POD_IP}:80"` }}
+                  value: ${POD_IP}:8200
                 - name: CUDA_VISIBLE_DEVICES
                   value: "0"
                 - name: UCX_TLS
                   value: "cuda_ipc,cuda_copy,tcp"
+                ### Keep ability to enable LMCache configs but don't use them right now
+                # - name: LMCACHE_CONFIG_FILE
+                #   value: /vllm-workspace/lmcache-prefiller-config.yaml
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -190,6 +207,9 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
+                # - name: config-prefiller
+                #   configMap:
+                #     name: {{ include "modelservice.fullname" . -}}-config-prefiller
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -204,6 +224,8 @@ data:
           volumes:
             - name: home
               emptyDir: {}
+            # - name: config-prefiller
+            #   mountPath: /vllm-workspace
             {{ `{{ if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
diff --git a/notes/testing-nixl-and-epp.md b/notes/testing-nixl-and-epp.md
new file mode 100644
index 0000000..c9f0ec9
--- /dev/null
+++ b/notes/testing-nixl-and-epp.md
@@ -0,0 +1,142 @@
+# notes
+
+Helper scritps
+
+```bash
+export LLM_PROMPT_1="I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind."
+
+export LLM_PROMPT_2="Now that we have implemented benchmarks, I was hoping you could help me understand how I would track these manifests in GitOps. Ideally I would openshift gitops but would also support vanilla argocd for non OCP environments. Do you have any suggestions on the topic?"
+
+export LLM_PROMPT_3="Lets talk about dolphins! What are some unique characteristics of dolphins compared to other acquatic animals?"
+
+export LLM_PROMPT_4="speaking of aquatic animals, what is your favourite aquatic animal and why?"
+
+export LLM_PROMPT_5="How might I gather metrics on how much energy consumption my OCP cluster uses?"
+
+curl llm-d-inference-gateway.apps.summit-gpu.octo-emerging.redhataicoe.com/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Llama-3.2-3B-Instruct",
+    "prompt": "'${LLM_PROMPT_1}'",
+    "max_tokens": 500,
+    "temperature": 0
+  }' | jq
+
+curl llm-d-inference-gateway.apps.summit-gpu.octo-emerging.redhataicoe.com/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Llama-3.2-3B-Instruct",
+    "prompt": "'${LLM_PROMPT_2}'",
+    "max_tokens": 500,
+    "temperature": 0
+  }' | jq
+
+DECODE_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=decode" | tail -n 1 | awk '{print $1}')
+PREFILL_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=prefill" | tail -n 1 | awk '{print $1}')
+EPP_POD=$(kubectl get pods -l "llm-d.ai/epp" | tail -n 1 | awk '{print $1}')
+
+
+# grab logs together p/D
+stern -n $(oc project -q) "$PREFILL_POD|$DECODE_POD" -c vllm | grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate"
+```
+
+## Debugging and testing NIXL KV cache
+
+Debugging KV cache through logs:
+
+#### Terminal 1 EPP
+
+Follow EPP logs to see if it can hit Decode routing sidecar
+
+```bash
+EPP_POD=$(kubectl get pods -l "llm-d.ai/epp" | tail -n 1 | awk '{print $1}')
+kubectl logs pod/${EPP_POD} -f | grep -v "Failed to refreshed metrics\|Refreshed metrics\|gRPC health check serving\|Refreshing Prometheus Metrics"
+```
+
+### Terminal 2 Routing sidecar (Decode)
+
+Follow the routing sidecar in the decode pod to see if it can post to prefill if needed
+
+```bash
+DECODE_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=decode" | tail -n 1 | awk '{print $1}')
+kubectl logs pod/${DECODE_POD} -c routing-proxy -f | grep -v "http: proxy error: dial tcp \[::1\]:8001: connect: connection refused"
+```
+
+### Terminal 3 Decode inference
+
+Follow the decode vllm logs:
+
+```bash
+DECODE_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=decode" | tail -n 1 | awk '{print $1}')
+kubectl logs pod/${DECODE_POD} -c vllm -f | grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s"
+```
+
+### Terminal 4 Prefill
+
+Check to see that prefill logs are getting hit by decode:
+
+```bash
+PREFILL_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=prefill" | tail -n 1 | awk '{print $1}')
+kubectl logs pod/${PREFILL_POD} -f |  grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s"
+```
+
+At this point you should be able to send a request through the gatway and track the relevant logs:
+
+```bash
+INGRESS_ADDRESS=$(kubectl get ingress llm-d-inference-gateway | tail -n 1 | awk '{print $3}')
+curl ${INGRESS_ADDRESS}/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Llama-3.2-3B-Instruct",
+    "prompt": "'${LLM_PROMPT_1}'",
+    "max_tokens": 500,
+    "temperature": 0
+  }' | jq
+```
+
+Epp should filter out Prefill pods, and only target decode first. You should see this between the 2nd and 3rd steps in EPP when it applies the filter plugin:
+- Scheduling a request (step 2) has both pods as candidates, ex:
+```log
+{"level":"info","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:129","msg":"Scheduling a request, Metrics: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g Address:10.131.10.180 Role:1} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.29171375 +0000 UTC m=+388.303255999}} {Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-84667878f9-lwb47 Address:10.128.13.52 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.316489317 +0000 UTC m=+388.328031566}}]","pd-schedule":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"}
+```
+- Apply filter plugin (step 3), only has decode as candidate to target sidecar first:
+```log
+{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:160","msg":"Before running filter plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g"},"Address":"10.131.10.180","Role":1,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.29171375Z"},{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-84667878f9-lwb47"},"Address":"10.128.13.52","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.316489317Z"}]}
+{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:163","msg":"Running filter plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","plugin":"prefill_filter"}
+```
+
+Our next stop is the decode proxy sidecar (terminal 2) where you should notice communication being orchistrated between P/D pods, ex:
+```log
+I0509 19:43:44.077499       1 chat_completions.go:110] "running NIXL protocol" logger="proxy server"
+I0509 19:43:44.077593       1 chat_completions.go:172] "sending request to prefiller" logger="proxy server" url="http://10.128.13.52:8000" body="{\"do_remote_decode\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"stream\":false,\"temperature\":0}"
+I0509 19:43:44.099979       1 chat_completions.go:217] "received prefiller response" logger="proxy server" remote_block_ids=[1,2,3,4] remote_engine_id="81eb3201-d5c2-4642-8131-7849f2e955ce" remote_host="10.128.13.52" remote_port=5557
+I0509 19:43:44.100082       1 chat_completions.go:252] "sending request to decoder" logger="proxy server" body="{\"do_remote_prefill\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"remote_block_ids\":[1,2,3,4],\"remote_engine_id\":\"81eb3201-d5c2-4642-8131-7849f2e955ce\",\"remote_host\":\"10.128.13.52\",\"remote_port\":5557,\"temperature\":0}"
+```
+
+Finally in the decode inference pod (terminal 3) we should see the logs on KV transfer:
+
+```log
+INFO 05-09 19:26:20 [logger.py:39] Received request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None.
+INFO 05-09 19:26:20 [async_llm.py:255] Added request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0.
+DEBUG 05-09 19:26:20 [core.py:431] EngineCore loop active.
+DEBUG 05-09 19:26:20 [nixl_connector.py:559] start_load_kv for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0 from remote engine 81eb3201-d5c2-4642-8131-7849f2e955ce. Num local_block_ids: 4. Num remote_block_ids: 4.
+DEBUG 05-09 19:26:20 [nixl_connector.py:313] Querying metadata on path: tcp://10.128.13.52:5557
+DEBUG 05-09 19:26:20 [nixl_connector.py:422] Created 1055264 blocks for src engine 6d177cac-6a93-4396-8c06-a5af03e9ace7 and rank 0
+DEBUG 05-09 19:26:21 [nixl_connector.py:439] Created 1055264 blocks for dst engine 81eb3201-d5c2-4642-8131-7849f2e955ce and rank 0
+DEBUG 05-09 19:26:22 [nixl_connector.py:326] NIXL handshake: get metadata took: 0.0025545399985276163
+DEBUG 05-09 19:26:22 [nixl_connector.py:328] NIXL handshake: add agent took: 2.2907175269938307
+DEBUG 05-09 19:26:22 [nixl_connector.py:463] Rank 0, get_finished: 0 requests done sending and 1 requests done recving
+DEBUG 05-09 19:26:22 [scheduler.py:862] Finished recving KV transfer for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0
+```
+
+If you are debugging networking you can finally observe the prefill pod logs to see how it recieves the request from decode, and sends back the KVs
+
+```log
+INFO 05-09 19:43:44 [logger.py:39] Received request cmpl-bb24666d-5d42-46a2-997b-63f352d5bbdb-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None.
+INFO 05-09 19:43:44 [async_llm.py:255] Added request cmpl-bb24666d-5d42-46a2-997b-63f352d5bbdb-0.
+DEBUG 05-09 19:43:44 [core.py:431] EngineCore loop active.
+DEBUG 05-09 19:43:44 [nixl_connector.py:463] Rank 0, get_finished: 1 requests done sending and 0 requests done recving
+DEBUG 05-09 19:43:44 [scheduler.py:865] Finished sending KV transfer for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0
+DEBUG 05-09 19:43:44 [core.py:425] EngineCore waiting for work.
+INFO:     10.131.10.180:44514 - "POST /v1/completions HTTP/1.1" 200 OK
+```

From 44b4607d799c91f281873377a0e2b99311f16d7a Mon Sep 17 00:00:00 2001
From: greg pereira <grpereir@redhat.com>
Date: Fri, 9 May 2025 13:17:01 -0700
Subject: [PATCH 6/8] downgrade to working image

Signed-off-by: greg pereira <grpereir@redhat.com>
---
 charts/llm-d/README.md                        |  2 +-
 .../presets/basic-gpu-with-nixl-preset.yaml   | 28 +++----
 .../sample-application/modelservice.yaml      | 10 +++
 charts/llm-d/values.yaml                      |  6 +-
 notes/testing-nixl-and-epp.md                 | 74 ++++++++++++++++---
 5 files changed, 87 insertions(+), 33 deletions(-)

diff --git a/charts/llm-d/README.md b/charts/llm-d/README.md
index d70542e..d4c7b9b 100644
--- a/charts/llm-d/README.md
+++ b/charts/llm-d/README.md
@@ -174,7 +174,7 @@ Kubernetes: `>= 1.25.0-0`
 | modelservice.epp | Endpoint picker configuration | object | See below |
 | modelservice.epp.defaultEnvVars | Default environment variables for endpoint picker, use `extraEnvVars` to override default behavior by defining the same variable again. Ref: https://github.com/neuralmagic/gateway-api-inference-extension/tree/dev?tab=readme-ov-file#temporary-fork-configuration | list | `[{"name":"ENABLE_KVCACHE_AWARE_SCORER","value":"{{ .Values.redis.enabled }}"},{"name":"KVCACHE_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"KVCACHE_INDEXER_REDIS_ADDR","value":"{{ if .Values.redis.enabled }}{{ include \"redis.master.service.fullurl\" . }}{{ end }}"},{"name":"ENABLE_PREFIX_AWARE_SCORER","value":"true"},{"name":"PREFIX_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"ENABLE_LOAD_AWARE_SCORER","value":"true"},{"name":"LOAD_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"ENABLE_SESSION_AWARE_SCORER","value":"true"},{"name":"SESSION_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"PD_ENABLED","value":"true"},{"name":"PD_PROMPT_LEN_THRESHOLD","value":"10"},{"name":"PREFILL_ENABLE_KVCACHE_AWARE_SCORER","value":"true"},{"name":"PREFILL_KVCACHE_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"PREFILL_ENABLE_LOAD_AWARE_SCORER","value":"true"},{"name":"PREFILL_LOAD_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"PREFILL_ENABLE_PREFIX_AWARE_SCORER","value":"true"},{"name":"PREFILL_PREFIX_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"DECODE_ENABLE_KVCACHE_AWARE_SCORER","value":"true"},{"name":"DECODE_KVCACHE_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"DECODE_ENABLE_LOAD_AWARE_SCORER","value":"true"},{"name":"DECODE_LOAD_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"DECODE_ENABLE_PREFIX_AWARE_SCORER","value":"true"},{"name":"DECODE_PREFIX_AWARE_SCORER_WEIGHT","value":"1.0"}]` |
 | modelservice.epp.extraEnvVars | Additional environment variables for endpoint picker | list | `[]` |
-| modelservice.epp.image | Endpoint picker image used in ModelService CR presets | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-inference-scheduler","tag":"0.0.1"}` |
+| modelservice.epp.image | Endpoint picker image used in ModelService CR presets | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-gateway-api-inference-extension-dev","tag":"0.0.5"}` |
 | modelservice.epp.metrics.enabled | Enable metrics scraping from endpoint picker service, see `modelservice.serviceMonitor` for configuration | bool | `true` |
 | modelservice.fullnameOverride | String to fully override modelservice.fullname | string | `""` |
 | modelservice.image | Modelservice controller image, please change only if appropriate adjustments to the CRD are being made | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-model-service","tag":"0.0.8"}` |
diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index f8d361d..28d2300 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -121,8 +121,8 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
-                # - name: config-decoder
-                #   mountPath: /vllm-workspace
+                - name: config-decoder
+                  mountPath: /vllm-workspace
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -137,9 +137,9 @@ data:
           volumes:
             - name: home
               emptyDir: {}
-            # - name: config-decoder
-            #   configMap:
-            #     name: {{ include "modelservice.fullname" . -}}-config-decoder
+            - name: config-decoder
+              configMap:
+                name: {{ include "modelservice.fullname" . -}}-config-decoder
             {{ `{{- if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -207,9 +207,8 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
-                # - name: config-prefiller
-                #   configMap:
-                #     name: {{ include "modelservice.fullname" . -}}-config-prefiller
+                - name: config-prefiller
+                  mountPath: /vllm-workspace
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -224,8 +223,9 @@ data:
           volumes:
             - name: home
               emptyDir: {}
-            # - name: config-prefiller
-            #   mountPath: /vllm-workspace
+            - name: config-prefiller
+              configMap:
+                name: {{ include "modelservice.fullname" . -}}-config-prefiller
             {{ `{{ if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -321,14 +321,6 @@ data:
                 - --grpcHealthPort
                 - "9003"
               env:
-              - name: PD_ENABLED
-                value: "true"
-              - name: PD_PROMPT_LEN_THRESHOLD
-                value: "10"
-              {{- if .Values.redis.enabled }}
-              - name: KVCACHE_INDEXER_REDIS_ADDR
-                value: {{ include "redis.master.service.fullurl" . -}}
-              {{- end -}}
               {{/* HACK, waiting on: https://github.com/neuralmagic/llm-d-model-service/issues/123 */}}
               {{ `{{- if .HFModelName }}` }}
               - name: HF_TOKEN
diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml
index 3190fd2..5597fd2 100644
--- a/charts/llm-d/templates/sample-application/modelservice.yaml
+++ b/charts/llm-d/templates/sample-application/modelservice.yaml
@@ -52,4 +52,14 @@ spec:
             name: {{ .Values.sampleApplication.model.auth.hfToken.name }}
             key: {{ .Values.sampleApplication.model.auth.hfToken.key }}
       {{- end }}
+  epp:
+    defaultEnvVars:
+      - name: PD_ENABLED
+        value: "true"
+      - name: PD_PROMPT_LEN_THRESHOLD
+        value: "10"
+    {{- if .Values.redis.enabled }}
+      - name: KVCACHE_INDEXER_REDIS_ADDR
+        value: {{ include "redis.master.service.fullurl" . -}}
+    {{- end -}}
 {{- end }}
diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml
index 24bddc9..26bb420 100644
--- a/charts/llm-d/values.yaml
+++ b/charts/llm-d/values.yaml
@@ -207,8 +207,10 @@ modelservice:
     # -- Endpoint picker image used in ModelService CR presets
     image:
       registry: quay.io
-      repository: llm-d/llm-d-inference-scheduler
-      tag: "0.0.1"
+      repository: llm-d/llm-d-gateway-api-inference-extension-dev
+      tag: 0.0.5
+      # repository: llm-d/llm-d-inference-scheduler
+      # tag: 0.0.1
       imagePullPolicy: "Always"
 
     metrics:
diff --git a/notes/testing-nixl-and-epp.md b/notes/testing-nixl-and-epp.md
index c9f0ec9..27f1fc2 100644
--- a/notes/testing-nixl-and-epp.md
+++ b/notes/testing-nixl-and-epp.md
@@ -1,6 +1,6 @@
 # notes
 
-Helper scritps
+Helper scripts
 
 ```bash
 export LLM_PROMPT_1="I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind."
@@ -44,9 +44,9 @@ stern -n $(oc project -q) "$PREFILL_POD|$DECODE_POD" -c vllm | grep -v "\"GET /m
 
 Debugging KV cache through logs:
 
-#### Terminal 1 EPP
+### Terminal 1 EPP
 
-Follow EPP logs to see if it can hit Decode routing sidecar
+Follow EPP logs to see the logic around which inferencing pods are picked up:
 
 ```bash
 EPP_POD=$(kubectl get pods -l "llm-d.ai/epp" | tail -n 1 | awk '{print $1}')
@@ -80,10 +80,13 @@ PREFILL_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=
 kubectl logs pod/${PREFILL_POD} -f |  grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s"
 ```
 
-At this point you should be able to send a request through the gatway and track the relevant logs:
+At this point you should be able to send a request through the gateway and track the relevant logs:
 
 ```bash
 INGRESS_ADDRESS=$(kubectl get ingress llm-d-inference-gateway | tail -n 1 | awk '{print $3}')
+
+export LLM_PROMPT_1="I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind."
+
 curl ${INGRESS_ADDRESS}/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
@@ -94,18 +97,54 @@ curl ${INGRESS_ADDRESS}/v1/completions \
   }' | jq
 ```
 
-Epp should filter out Prefill pods, and only target decode first. You should see this between the 2nd and 3rd steps in EPP when it applies the filter plugin:
-- Scheduling a request (step 2) has both pods as candidates, ex:
+### Investigating our EPP logs
+
+First, the EPP logs will report `LLM request assembled`, and then the EPP will schedule the request.
+
+```log
+{"level":"Level(-4)","ts":"2025-05-09T21:36:04Z","caller":"handlers/request.go:75","msg":"LLM request assembled","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 0"}
+{"level":"info","ts":"2025-05-09T21:36:04Z","caller":"scheduling/scheduler.go:129","msg":"Scheduling a request, Metrics: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-decode-6588b8d59c-5tccf Address:10.131.10.188 Role:1} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:36:04.729232544 +0000 UTC m=+572.692052157}} {Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb Address:10.128.13.72 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:36:04.750653805 +0000 UTC m=+572.713473418}}]","pd-schedule":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"}
+```
+
+#### Role specific logs (prefill)
+
+After this, EPP will run the filter plugin for the requested model to select the pool of all prefill pods for that model (in this case, we only have 1 prefill pod for that model):
+
+```log
+{"level":"Level(-4)","ts":"2025-05-09T21:36:04Z","caller":"scheduling/scheduler.go:160","msg":"Before running filter plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-decode-6588b8d59c-5tccf"},"Address":"10.131.10.188","Role":1,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:36:04.729232544Z"},{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb"},"Address":"10.128.13.72","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:36:04.750653805Z"}]}
+
+{"level":"Level(-4)","ts":"2025-05-09T21:36:04Z","caller":"scheduling/scheduler.go:163","msg":"Running filter plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","plugin":"prefill_filter"}
+
+{"level":"Level(-4)","ts":"2025-05-09T21:36:04Z","caller":"scheduling/scheduler.go:167","msg":"Filter plugin result","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","plugin":"prefill_filter","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb"},"Address":"10.128.13.72","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:36:04.750653805Z"}]}
+```
+
+Next, EPP will apply the `scorer plugin`, to score each of the prefill pods available for that model:
+
 ```log
-{"level":"info","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:129","msg":"Scheduling a request, Metrics: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g Address:10.131.10.180 Role:1} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.29171375 +0000 UTC m=+388.303255999}} {Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-84667878f9-lwb47 Address:10.128.13.52 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.316489317 +0000 UTC m=+388.328031566}}]","pd-schedule":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"}
+{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:179","msg":"Before running scorer plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb"},"Address":"10.128.13.72","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0.0003184037359371672,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:54:50.70102255Z"}]}
+
+{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:196","msg":"After running scorer plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"}
 ```
-- Apply filter plugin (step 3), only has decode as candidate to target sidecar first:
+
+Finally, EPP will run the `picker plugin` to select the best candidate within the pool based on the scores:
+
 ```log
-{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:160","msg":"Before running filter plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g"},"Address":"10.131.10.180","Role":1,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.29171375Z"},{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-84667878f9-lwb47"},"Address":"10.128.13.52","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.316489317Z"}]}
-{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:163","msg":"Running filter plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","plugin":"prefill_filter"}
+{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:210","msg":"Before running picker plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","podsError":"json: unsupported type: map[types.Pod]float64"}
+
+{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"picker/max_score_picker.go:31","msg":"Selecting a pod with the max score from 1 candidates: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb Address:10.128.13.72 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0.0003184037359371672 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:54:50.70102255 +0000 UTC m=+1698.663842173}}]","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"}
+
+{"level":"Level(-4)","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:214","msg":"After running picker plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","result":{"TargetPod":{"Pod":{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb"},"Address":"10.128.13.72","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0.0003184037359371672,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T21:54:50.70102255Z"},"Score":0},"MutatedHeaders":null}}
+
+{"level":"info","ts":"2025-05-09T21:54:50Z","caller":"scheduling/scheduler.go:129","msg":"Scheduling a request, Metrics: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-decode-6588b8d59c-5tccf Address:10.131.10.188 Role:1} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:5.30672893228612e-05 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:54:50.677717088 +0000 UTC m=+1698.640536701}} {Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-57f7cc59b5-bxjnb Address:10.128.13.72 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0.0003184037359371672 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 21:54:50.70102255 +0000 UTC m=+1698.663842173}}]","pd-schedule":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"}
 ```
 
-Our next stop is the decode proxy sidecar (terminal 2) where you should notice communication being orchistrated between P/D pods, ex:
+All of these role specific logs will then repeat until we also have a `decode` pod we can hit.
+
+### Sidecar router logs
+
+
+Our next stop is the decode proxy sidecar (terminal 2) where you should notice communication being orchestrated between P/D pods, ex:
+
 ```log
 I0509 19:43:44.077499       1 chat_completions.go:110] "running NIXL protocol" logger="proxy server"
 I0509 19:43:44.077593       1 chat_completions.go:172] "sending request to prefiller" logger="proxy server" url="http://10.128.13.52:8000" body="{\"do_remote_decode\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"stream\":false,\"temperature\":0}"
@@ -113,9 +152,14 @@ I0509 19:43:44.099979       1 chat_completions.go:217] "received prefiller respo
 I0509 19:43:44.100082       1 chat_completions.go:252] "sending request to decoder" logger="proxy server" body="{\"do_remote_prefill\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"remote_block_ids\":[1,2,3,4],\"remote_engine_id\":\"81eb3201-d5c2-4642-8131-7849f2e955ce\",\"remote_host\":\"10.128.13.52\",\"remote_port\":5557,\"temperature\":0}"
 ```
 
+Here you can see how the request comes into the routing sidecar. The sidecar first routes the request to the prefiller, receives the prefiller's response, and then sends the request over to decode.
+
+### Decode VLLM logs
+
 Finally in the decode inference pod (terminal 3) we should see the logs on KV transfer:
 
 ```log
+
 INFO 05-09 19:26:20 [logger.py:39] Received request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None.
 INFO 05-09 19:26:20 [async_llm.py:255] Added request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0.
 DEBUG 05-09 19:26:20 [core.py:431] EngineCore loop active.
@@ -129,7 +173,11 @@ DEBUG 05-09 19:26:22 [nixl_connector.py:463] Rank 0, get_finished: 0 requests do
 DEBUG 05-09 19:26:22 [scheduler.py:862] Finished recving KV transfer for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0
 ```
 
-If you are debugging networking you can finally observe the prefill pod logs to see how it recieves the request from decode, and sends back the KVs
+Here you can see a request come in from the sidecar (after the sidecar received the response from prefill). The decode vllm pod receives a `kv_load` request from the `prefill` node, does the NIXL handshake, and receives the KV transfer from `prefill`.
+
+### Prefill VLLM logs
+
+Finally, to get the full picture, we can look at the prefill logs.
 
 ```log
 INFO 05-09 19:43:44 [logger.py:39] Received request cmpl-bb24666d-5d42-46a2-997b-63f352d5bbdb-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None.
@@ -140,3 +188,5 @@ DEBUG 05-09 19:43:44 [scheduler.py:865] Finished sending KV transfer for request
 DEBUG 05-09 19:43:44 [core.py:425] EngineCore waiting for work.
 INFO:     10.131.10.180:44514 - "POST /v1/completions HTTP/1.1" 200 OK
 ```
+
+It receives the orrigional request from the routing sidecar, finishes doing the inference and then sends the KV transfer to decode.

From 2b7ce2304a20e71dc7cd815f289ec11f42e073e8 Mon Sep 17 00:00:00 2001
From: greg pereira <grpereir@redhat.com>
Date: Sun, 11 May 2025 08:53:55 -0700
Subject: [PATCH 7/8] removing dead code placeholder sections

Signed-off-by: greg pereira <grpereir@redhat.com>
---
 .../presets/basic-gpu-with-nixl-preset.yaml   | 37 -------------------
 .../sample-application/modelservice.yaml      | 10 -----
 2 files changed, 47 deletions(-)

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
index 28d2300..31e7862 100644
--- a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
+++ b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -16,28 +16,6 @@ metadata:
     {{- include "common.tplvalues.render" ( dict "value" .Values.modelservice.annotations "context" $) | nindent 4 }}
     {{- end }}
 data:
-  configMaps: |
-    - apiVersion: v1
-      kind: ConfigMap
-      metadata:
-        name: {{ include "modelservice.fullname" . -}}-config-decoder
-      data:
-        lmcache-decoder-config.yaml: |
-          # local_cpu: False
-          # max_local_cpu_size: 0
-          # max_local_disk_size: 0
-          # remote_serde: NULL
-    - apiVersion: v1
-      kind: ConfigMap
-      metadata:
-        name: {{ include "modelservice.fullname" . -}}-config-prefiller
-      data:
-        lmcache-prefiller-config.yaml: |
-          # local_cpu: False
-          # max_local_cpu_size: 0
-          # max_local_disk_size: 0
-          # remote_serde: NULL
-
   decodeDeployment: |
     apiVersion: apps/v1
     kind: Deployment
@@ -108,8 +86,6 @@ data:
                   value: "0"
                 - name: UCX_TLS
                   value: "cuda_ipc,cuda_copy,tcp"
-                # - name: LMCACHE_CONFIG_FILE
-                #   value: /vllm-workspace/lmcache-decoder-config.yaml
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -121,8 +97,6 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
-                - name: config-decoder
-                  mountPath: /vllm-workspace
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -137,9 +111,6 @@ data:
           volumes:
             - name: home
               emptyDir: {}
-            - name: config-decoder
-              configMap:
-                name: {{ include "modelservice.fullname" . -}}-config-decoder
             {{ `{{- if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -193,9 +164,6 @@ data:
                   value: "0"
                 - name: UCX_TLS
                   value: "cuda_ipc,cuda_copy,tcp"
-                ### Keep ability to enable LMCache configs but don't use them right now
-                # - name: LMCACHE_CONFIG_FILE
-                #   value: /vllm-workspace/lmcache-prefiller-config.yaml
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -207,8 +175,6 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
-                - name: config-prefiller
-                  mountPath: /vllm-workspace
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -223,9 +189,6 @@ data:
           volumes:
             - name: home
               emptyDir: {}
-            - name: config-prefiller
-              configMap:
-                name: {{ include "modelservice.fullname" . -}}-config-prefiller
             {{ `{{ if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
diff --git a/charts/llm-d/templates/sample-application/modelservice.yaml b/charts/llm-d/templates/sample-application/modelservice.yaml
index 5597fd2..3190fd2 100644
--- a/charts/llm-d/templates/sample-application/modelservice.yaml
+++ b/charts/llm-d/templates/sample-application/modelservice.yaml
@@ -52,14 +52,4 @@ spec:
             name: {{ .Values.sampleApplication.model.auth.hfToken.name }}
             key: {{ .Values.sampleApplication.model.auth.hfToken.key }}
       {{- end }}
-  epp:
-    defaultEnvVars:
-      - name: PD_ENABLED
-        value: "true"
-      - name: PD_PROMPT_LEN_THRESHOLD
-        value: "10"
-    {{- if .Values.redis.enabled }}
-      - name: KVCACHE_INDEXER_REDIS_ADDR
-        value: {{ include "redis.master.service.fullurl" . -}}
-    {{- end -}}
 {{- end }}

From 0dab3ba813e4965cd32b16caf2f27a6e3808f9a3 Mon Sep 17 00:00:00 2001
From: greg pereira <grpereir@redhat.com>
Date: Sun, 11 May 2025 08:56:19 -0700
Subject: [PATCH 8/8] linting

Signed-off-by: greg pereira <grpereir@redhat.com>
---
 charts/llm-d/Chart.yaml       | 2 +-
 charts/llm-d/README.md        | 2 +-
 notes/testing-nixl-and-epp.md | 3 +--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/charts/llm-d/Chart.yaml b/charts/llm-d/Chart.yaml
index 5c52de7..fd8f519 100644
--- a/charts/llm-d/Chart.yaml
+++ b/charts/llm-d/Chart.yaml
@@ -1,7 +1,7 @@
 apiVersion: v2
 name: llm-d
 type: application
-version: 0.7.0
+version: 0.7.1
 appVersion: "0.0.1"
 icon: data:null
 description: A Helm chart for llm-d
diff --git a/charts/llm-d/README.md b/charts/llm-d/README.md
index d4c7b9b..737b374 100644
--- a/charts/llm-d/README.md
+++ b/charts/llm-d/README.md
@@ -1,7 +1,7 @@
 
 # llm-d Helm Chart for OpenShift
 
-![Version: 0.7.0](https://img.shields.io/badge/Version-0.7.0-informational?style=flat-square)
+![Version: 0.7.1](https://img.shields.io/badge/Version-0.7.1-informational?style=flat-square)
 ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
 
 A Helm chart for llm-d
diff --git a/notes/testing-nixl-and-epp.md b/notes/testing-nixl-and-epp.md
index 27f1fc2..3372d51 100644
--- a/notes/testing-nixl-and-epp.md
+++ b/notes/testing-nixl-and-epp.md
@@ -142,7 +142,6 @@ All of these role specific logs will then repeat until we also have a `decode` p
 
 ### Sidecar router logs
 
-
 Our next stop is the decode proxy sidecar (terminal 2) where you should notice communication being orchestrated between P/D pods, ex:
 
 ```log
@@ -189,4 +188,4 @@ DEBUG 05-09 19:43:44 [core.py:425] EngineCore waiting for work.
 INFO:     10.131.10.180:44514 - "POST /v1/completions HTTP/1.1" 200 OK
 ```
 
-It receives the orrigional request from the routing sidecar, finishes doing the inference and then sends the KV transfer to decode.
+It receives the original request from the routing sidecar, finishes doing the inference and then sends the KV transfer to decode.