llm-d · tumido · May 12, 2025 · May 6, 2025 · May 9, 2025 · May 9, 2025
diff --git a/charts/llm-d/Chart.yaml b/charts/llm-d/Chart.yaml
@@ -1,7 +1,7 @@
 apiVersion: v2
 name: llm-d
 type: application
-version: 0.7.0
+version: 0.7.1
 appVersion: "0.0.1"
 icon: data:null
 description: A Helm chart for llm-d

diff --git a/charts/llm-d/README.md b/charts/llm-d/README.md
@@ -1,7 +1,7 @@
 
 # llm-d Helm Chart for OpenShift
 
-![Version: 0.7.0](https://img.shields.io/badge/Version-0.7.0-informational?style=flat-square)
+![Version: 0.7.1](https://img.shields.io/badge/Version-0.7.1-informational?style=flat-square)
 ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
 
 A Helm chart for llm-d
@@ -174,7 +174,7 @@ Kubernetes: `>= 1.25.0-0`
 | modelservice.epp | Endpoint picker configuration | object | See below |
 | modelservice.epp.defaultEnvVars | Default environment variables for endpoint picker, use `extraEnvVars` to override default behavior by defining the same variable again. Ref: https://github.com/neuralmagic/gateway-api-inference-extension/tree/dev?tab=readme-ov-file#temporary-fork-configuration | list | `[{"name":"ENABLE_KVCACHE_AWARE_SCORER","value":"{{ .Values.redis.enabled }}"},{"name":"KVCACHE_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"KVCACHE_INDEXER_REDIS_ADDR","value":"{{ if .Values.redis.enabled }}{{ include \"redis.master.service.fullurl\" . }}{{ end }}"},{"name":"ENABLE_PREFIX_AWARE_SCORER","value":"true"},{"name":"PREFIX_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"ENABLE_LOAD_AWARE_SCORER","value":"true"},{"name":"LOAD_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"ENABLE_SESSION_AWARE_SCORER","value":"true"},{"name":"SESSION_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"PD_ENABLED","value":"true"},{"name":"PD_PROMPT_LEN_THRESHOLD","value":"10"},{"name":"PREFILL_ENABLE_KVCACHE_AWARE_SCORER","value":"true"},{"name":"PREFILL_KVCACHE_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"PREFILL_ENABLE_LOAD_AWARE_SCORER","value":"true"},{"name":"PREFILL_LOAD_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"PREFILL_ENABLE_PREFIX_AWARE_SCORER","value":"true"},{"name":"PREFILL_PREFIX_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"DECODE_ENABLE_KVCACHE_AWARE_SCORER","value":"true"},{"name":"DECODE_KVCACHE_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"DECODE_ENABLE_LOAD_AWARE_SCORER","value":"true"},{"name":"DECODE_LOAD_AWARE_SCORER_WEIGHT","value":"1.0"},{"name":"DECODE_ENABLE_PREFIX_AWARE_SCORER","value":"true"},{"name":"DECODE_PREFIX_AWARE_SCORER_WEIGHT","value":"1.0"}]` |
 | modelservice.epp.extraEnvVars | Additional environment variables for endpoint picker | list | `[]` |
-| modelservice.epp.image | Endpoint picker image used in ModelService CR presets | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-inference-scheduler","tag":"0.0.1"}` |
+| modelservice.epp.image | Endpoint picker image used in ModelService CR presets | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-gateway-api-inference-extension-dev","tag":"0.0.5"}` |
 | modelservice.epp.metrics.enabled | Enable metrics scraping from endpoint picker service, see `modelservice.serviceMonitor` for configuration | bool | `true` |
 | modelservice.fullnameOverride | String to fully override modelservice.fullname | string | `""` |
 | modelservice.image | Modelservice controller image, please change only if appropriate adjustments to the CRD are being made | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-model-service","tag":"0.0.8"}` |
@@ -207,7 +207,7 @@ Kubernetes: `>= 1.25.0-0`
 | modelservice.serviceMonitor.port | ServiceMonitor endpoint port | string | `"vllm"` |
 | modelservice.serviceMonitor.selector | ServiceMonitor selector matchLabels </br> matchLabels must match labels on modelservice Services | object | `{"matchLabels":{}}` |
 | modelservice.vllm | vLLM container options | object | See below |
-| modelservice.vllm.image | vLLM image used in ModelService CR presets | object | `{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"0.0.5"}` |
+| modelservice.vllm.image | vLLM image used in ModelService CR presets | object | `{"imagePullPolicy":"Always","registry":"quay.io","repository":"llm-d/llm-d-dev","tag":"vllm-nixl-0.0.6"}` |
 | modelservice.vllm.metrics.enabled | Enable metrics scraping from vllm service, see `modelservice.serviceMonitor` for configuration | bool | `true` |
 | modelservice.vllmSim | vLLM sim container options | object | See below |
 | modelservice.vllmSim.image | vLLM sim image used in ModelService CR presets | object | `{"imagePullPolicy":"IfNotPresent","registry":"quay.io","repository":"llm-d/vllm-sim","tag":"0.0.4"}` |

diff --git a/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml b/charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml
@@ -16,42 +16,6 @@ metadata:
     {{- include "common.tplvalues.render" ( dict "value" .Values.modelservice.annotations "context" $) | nindent 4 }}
     {{- end }}
 data:
-  configMaps: |
-    - apiVersion: v1
-      kind: ConfigMap
-      metadata:
-        name: {{ include "modelservice.fullname" . -}}-config-decoder
-      data:
-        lmcache-decoder-config.yaml: |
-          local_cpu: False
-          max_local_cpu_size: 0
-          max_local_disk_size: 0
-          remote_serde: NULL
-          enable_nixl: True
-          nixl_role: receiver
-          nixl_peer_host: 0.0.0.0
-          nixl_peer_port: 55555
-          nixl_buffer_size: 524288
-          nixl_buffer_device: "cuda"
-          nixl_enable_gc: True
-    - apiVersion: v1
-      kind: ConfigMap
-      metadata:
-        name: {{ include "modelservice.fullname" . -}}-config-prefiller
-      data:
-        lmcache-prefiller-config.yaml: |
-          local_cpu: False
-          max_local_cpu_size: 0
-          max_local_disk_size: 0
-          remote_serde: NULL
-          enable_nixl: True
-          nixl_role: "sender"
-          nixl_peer_host: {{`"{{ .DecodeServiceName }}"`}}
-          nixl_peer_port: 55555
-          nixl_buffer_size: 524288
-          nixl_buffer_device: "cuda"
-          nixl_enable_gc: True
-
   decodeDeployment: |
     apiVersion: apps/v1
     kind: Deployment
@@ -62,54 +26,66 @@ data:
           tolerations:
             {{- toYaml .Values.modelservice.decode.tolerations | nindent 12 }}
           {{- end }}
-          containers:
+          initContainers:
             - name: routing-proxy
               image: {{ include "modelservice.routingProxyImage" . }}
               securityContext:
-                allowPrivilegeEscalation: false
+                capabilities:
+                  drop:
+                    - MKNOD
                 runAsNonRoot: true
+                allowPrivilegeEscalation: false
               args:
-                - "--port=8001"
-                - "--vllm-port=8000"
+                - "--port=8000"
+                - "--vllm-port=8001"
+                - "--connector=nixl"
               ports:
                 - containerPort: 8000
                   protocol: TCP
+              restartPolicy: Always
+              imagePullPolicy: Always
+          containers:
             - name: vllm
               image: {{ include "modelservice.vllmImage" . }}
               imagePullPolicy: {{ .Values.modelservice.vllm.image.imagePullPolicy }}
               securityContext:
+                capabilities:
+                  drop:
+                    - MKNOD
+                runAsNonRoot: true
                 allowPrivilegeEscalation: false
               command:
                 - vllm
                 - serve
                 - {{ `{{ default (print "/models/" .ModelPath) .HFModelName }}` }}
               args:
                 - "--port"
-                - "8000"
+                - "8001"
+                - "--enforce-eager"
                 - "--kv-transfer-config"
-                - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'
+                - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
               env:
                 - name: HOME
                   value: /home
+                - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+                  value: "5557"
+                - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: status.podIP
+                - name: VLLM_LOGGING_LEVEL
+                  value: DEBUG
                 - name: POD_IP
                   valueFrom:
                     fieldRef:
                       apiVersion: v1
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
-                  value: ${POD_IP}:80
+                  value: "$(POD_IP):8200"  # Kubernetes expands $(VAR) references, not shell-style ${VAR}
                 - name: CUDA_VISIBLE_DEVICES
                   value: "0"
                 - name: UCX_TLS
                   value: "cuda_ipc,cuda_copy,tcp"
-                - name: LMCACHE_USE_EXPERIMENTAL
-                  value: "True"
-                - name: VLLM_ENABLE_V1_MULTIPROCESSING
-                  value: "1"
-                - name: VLLM_WORKER_MULTIPROC_METHOD
-                  value: spawn
-                - name: LMCACHE_CONFIG_FILE
-                  value: /vllm-workspace/lmcache-decoder-config.yaml
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -121,8 +97,6 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
-                - name: config-decoder
-                  mountPath: /vllm-workspace
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -132,16 +106,11 @@ data:
                   readOnly: true
                 {{ `{{- end }}` }}
               ports:
-                - containerPort: 8001
-                  protocol: TCP
-                - containerPort: 55555
+                - containerPort: 5557
                   protocol: TCP
           volumes:
             - name: home
               emptyDir: {}
-            - name: config-decoder
-              configMap:
-                name: {{ include "modelservice.fullname" . -}}-config-decoder
             {{ `{{- if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -158,17 +127,6 @@ data:
             {{- toYaml .Values.modelservice.prefill.tolerations | nindent 12 }}
           {{- end }}
           containers:
-            - name: "routing-proxy"
-              image: {{ include "modelservice.routingProxyImage" . }}
-              securityContext:
-                allowPrivilegeEscalation: false
-                runAsNonRoot: true
-              args:
-                - "--port=8001"
-                - "--vllm-port=8000"
-              ports:
-                - containerPort: 8000
-                  protocol: TCP
             - name: vllm
               image: {{ include "modelservice.vllmImage" . }}
               imagePullPolicy: {{ .Values.modelservice.vllm.image.imagePullPolicy }}
@@ -181,30 +139,31 @@ data:
               args:
                 - "--port"
                 - "8000"
+                - "--enforce-eager"
                 - "--kv-transfer-config"
-                - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'
+                - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
               env:
                 - name: HOME
                   value: /home
+                - name: VLLM_LOGGING_LEVEL
+                  value: DEBUG
+                - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+                  value: "5557"
+                - name: VLLM_NIXL_SIDE_CHANNEL_HOST
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: status.podIP
                 - name: POD_IP
                   valueFrom:
                     fieldRef:
                       apiVersion: v1
                       fieldPath: status.podIP
                 - name: LMCACHE_DISTRIBUTED_URL
-                  value: "${POD_IP}:80"
+                  value: "$(POD_IP):8200"  # Kubernetes expands $(VAR) references, not shell-style ${VAR}
                 - name: CUDA_VISIBLE_DEVICES
                   value: "0"
                 - name: UCX_TLS
                   value: "cuda_ipc,cuda_copy,tcp"
-                - name: LMCACHE_USE_EXPERIMENTAL
-                  value: "True"
-                - name: VLLM_ENABLE_V1_MULTIPROCESSING
-                  value: "1"
-                - name: VLLM_WORKER_MULTIPROC_METHOD
-                  value: spawn
-                - name: LMCACHE_CONFIG_FILE
-                  value: /vllm-workspace/lmcache-prefiller-config.yaml
                 {{- if .Values.redis.enabled }}
                 - name: LMCACHE_LOOKUP_URL
                   value: {{ include "redis.master.service.fullurl" .}}
@@ -216,8 +175,6 @@ data:
               volumeMounts:
                 - name: home
                   mountPath: /home
-                - name: config-prefiller
-                  mountPath: /vllm-workspace
                 {{ `{{- if .HFModelName }}` }}
                 - name: model-cache
                   mountPath: /models
@@ -227,16 +184,11 @@ data:
                   readOnly: true
                 {{ `{{- end }}` }}
               ports:
-                - containerPort: 8001
-                  protocol: TCP
-                - containerPort: 55555
+                - containerPort: 5557
                   protocol: TCP
           volumes:
             - name: home
               emptyDir: {}
-            - name: config-prefiller
-              configMap:
-                name: {{ include "modelservice.fullname" . -}}-config-prefiller
             {{ `{{ if .HFModelName }}` }}
             - name: model-cache
               emptyDir: {}
@@ -254,7 +206,7 @@ data:
       clusterIP: None
       ports:
       - name: nixl
-        port: 55555
+        port: 5557
         protocol: TCP
       - name: vllm
         port: 8000
@@ -272,7 +224,7 @@ data:
       clusterIP: None
       ports:
       - name: nixl
-        port: 55555
+        port: 5557
         protocol: TCP
       - name: vllm
         port: 8000

diff --git a/charts/llm-d/values.yaml b/charts/llm-d/values.yaml
@@ -207,8 +207,10 @@ modelservice:
     # -- Endpoint picker image used in ModelService CR presets
     image:
       registry: quay.io
-      repository: llm-d/llm-d-inference-scheduler
-      tag: "0.0.1"
+      repository: llm-d/llm-d-gateway-api-inference-extension-dev
+      tag: "0.0.5"
+      # repository: llm-d/llm-d-inference-scheduler
+      # tag: 0.0.1
       imagePullPolicy: "Always"
 
     metrics:
@@ -300,9 +302,10 @@ modelservice:
     # -- vLLM image used in ModelService CR presets
     image:
       registry: quay.io
-      repository: llm-d/llm-d-dev
-      tag: "0.0.5"
-      imagePullPolicy: "IfNotPresent"
+      repository: "llm-d/llm-d-dev"
+      tag: "vllm-nixl-0.0.6"
+      imagePullPolicy: "Always"
+      # imagePullPolicy: "IfNotPresent"
 
     metrics: