Skip to content
This repository was archived by the owner on Oct 15, 2025. It is now read-only.

Commit e8b3df6

Browse files
keeping configmaps around but not using them in lmcache for dual connectors later
Signed-off-by: greg pereira <grpereir@redhat.com>
1 parent 218ef1a commit e8b3df6

2 files changed

Lines changed: 182 additions & 18 deletions

File tree

charts/llm-d/templates/modelservice/presets/basic-gpu-with-nixl-preset.yaml

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,28 @@ metadata:
1616
{{- include "common.tplvalues.render" ( dict "value" .Values.modelservice.annotations "context" $) | nindent 4 }}
1717
{{- end }}
1818
data:
19+
configMaps: |
20+
- apiVersion: v1
21+
kind: ConfigMap
22+
metadata:
23+
name: {{ include "modelservice.fullname" . -}}-config-decoder
24+
data:
25+
lmcache-decoder-config.yaml: |
26+
# local_cpu: False
27+
# max_local_cpu_size: 0
28+
# max_local_disk_size: 0
29+
# remote_serde: NULL
30+
- apiVersion: v1
31+
kind: ConfigMap
32+
metadata:
33+
name: {{ include "modelservice.fullname" . -}}-config-prefiller
34+
data:
35+
lmcache-prefiller-config.yaml: |
36+
# local_cpu: False
37+
# max_local_cpu_size: 0
38+
# max_local_disk_size: 0
39+
# remote_serde: NULL
40+
1941
decodeDeployment: |
2042
apiVersion: apps/v1
2143
kind: Deployment
@@ -81,11 +103,13 @@ data:
81103
apiVersion: v1
82104
fieldPath: status.podIP
83105
- name: LMCACHE_DISTRIBUTED_URL
84-
value: {{ `"${POD_IP}:80"` }}
106+
value: ${POD_IP}:8200
85107
- name: CUDA_VISIBLE_DEVICES
86108
value: "0"
87109
- name: UCX_TLS
88110
value: "cuda_ipc,cuda_copy,tcp"
111+
# - name: LMCACHE_CONFIG_FILE
112+
# value: /vllm-workspace/lmcache-decoder-config.yaml
89113
{{- if .Values.redis.enabled }}
90114
- name: LMCACHE_LOOKUP_URL
91115
value: {{ include "redis.master.service.fullurl" .}}
@@ -97,6 +121,8 @@ data:
97121
volumeMounts:
98122
- name: home
99123
mountPath: /home
124+
# - name: config-decoder
125+
# mountPath: /vllm-workspace
100126
{{ `{{- if .HFModelName }}` }}
101127
- name: model-cache
102128
mountPath: /models
@@ -111,6 +137,9 @@ data:
111137
volumes:
112138
- name: home
113139
emptyDir: {}
140+
# - name: config-decoder
141+
# configMap:
142+
# name: {{ include "modelservice.fullname" . -}}-config-decoder
114143
{{ `{{- if .HFModelName }}` }}
115144
- name: model-cache
116145
emptyDir: {}
@@ -126,21 +155,6 @@ data:
126155
tolerations:
127156
{{- toYaml .Values.modelservice.prefill.tolerations | nindent 12 }}
128157
{{- end }}
129-
initContainers:
130-
- name: "routing-proxy"
131-
image: {{ include "modelservice.routingProxyImage" . }}
132-
securityContext:
133-
allowPrivilegeEscalation: false
134-
runAsNonRoot: true
135-
args:
136-
- "--port=8000"
137-
- "--vllm-port=8001"
138-
- "--connector=nixl"
139-
ports:
140-
- containerPort: 8000
141-
protocol: TCP
142-
restartPolicy: Always
143-
imagePullPolicy: Always
144158
containers:
145159
- name: vllm
146160
image: {{ include "modelservice.vllmImage" . }}
@@ -153,7 +167,7 @@ data:
153167
- {{ `{{ default (print "/models/" .ModelPath) .HFModelName }}` }}
154168
args:
155169
- "--port"
156-
- "8001"
170+
- "8000"
157171
- "--enforce-eager"
158172
- "--kv-transfer-config"
159173
- '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
@@ -174,11 +188,14 @@ data:
174188
apiVersion: v1
175189
fieldPath: status.podIP
176190
- name: LMCACHE_DISTRIBUTED_URL
177-
value: {{ `"${POD_IP}:80"` }}
191+
value: ${POD_IP}:8200
178192
- name: CUDA_VISIBLE_DEVICES
179193
value: "0"
180194
- name: UCX_TLS
181195
value: "cuda_ipc,cuda_copy,tcp"
196+
### Keep ability to enable LMCache configs but don't use them right now
197+
# - name: LMCACHE_CONFIG_FILE
198+
# value: /vllm-workspace/lmcache-prefiller-config.yaml
182199
{{- if .Values.redis.enabled }}
183200
- name: LMCACHE_LOOKUP_URL
184201
value: {{ include "redis.master.service.fullurl" .}}
@@ -190,6 +207,9 @@ data:
190207
volumeMounts:
191208
- name: home
192209
mountPath: /home
210+
# - name: config-prefiller
211+
# configMap:
212+
# name: {{ include "modelservice.fullname" . -}}-config-prefiller
193213
{{ `{{- if .HFModelName }}` }}
194214
- name: model-cache
195215
mountPath: /models
@@ -204,6 +224,8 @@ data:
204224
volumes:
205225
- name: home
206226
emptyDir: {}
227+
# - name: config-prefiller
228+
# mountPath: /vllm-workspace
207229
{{ `{{ if .HFModelName }}` }}
208230
- name: model-cache
209231
emptyDir: {}

notes/testing-nixl-and-epp.md

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# notes
2+
3+
Helper scripts
4+
5+
```bash
6+
export LLM_PROMPT_1="I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind."
7+
8+
export LLM_PROMPT_2="Now that we have implemented benchmarks, I was hoping you could help me understand how I would track these manifests in GitOps. Ideally I would openshift gitops but would also support vanilla argocd for non OCP environments. Do you have any suggestions on the topic?"
9+
10+
export LLM_PROMPT_3="Lets talk about dolphins! What are some unique characteristics of dolphins compared to other acquatic animals?"
11+
12+
export LLM_PROMPT_4="speaking of aquatic animals, what is your favourite aquatic animal and why?"
13+
14+
export LLM_PROMPT_5="How might I gather metrics on how much energy consumption my OCP cluster uses?"
15+
16+
curl llm-d-inference-gateway.apps.summit-gpu.octo-emerging.redhataicoe.com/v1/completions \
17+
-H "Content-Type: application/json" \
18+
-d '{
19+
"model": "Llama-3.2-3B-Instruct",
20+
"prompt": "'${LLM_PROMPT_1}'",
21+
"max_tokens": 500,
22+
"temperature": 0
23+
}' | jq
24+
25+
curl llm-d-inference-gateway.apps.summit-gpu.octo-emerging.redhataicoe.com/v1/completions \
26+
-H "Content-Type: application/json" \
27+
-d '{
28+
"model": "Llama-3.2-3B-Instruct",
29+
"prompt": "'${LLM_PROMPT_2}'",
30+
"max_tokens": 500,
31+
"temperature": 0
32+
}' | jq
33+
34+
DECODE_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=decode" | tail -n 1 | awk '{print $1}')
35+
PREFILL_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=prefill" | tail -n 1 | awk '{print $1}')
36+
EPP_POD=$(kubectl get pods -l "llm-d.ai/epp" | tail -n 1 | awk '{print $1}')
37+
38+
39+
# grab logs together p/D
40+
stern -n $(oc project -q) "$PREFILL_POD|$DECODE_POD" -c vllm | grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate"
41+
```
42+
43+
## Debugging and testing NIXL KV cache
44+
45+
Debugging KV cache through logs:
46+
47+
### Terminal 1 EPP
48+
49+
Follow the EPP logs to see whether it can reach the decode routing sidecar:
50+
51+
```bash
52+
EPP_POD=$(kubectl get pods -l "llm-d.ai/epp" | tail -n 1 | awk '{print $1}')
53+
kubectl logs pod/${EPP_POD} -f | grep -v "Failed to refreshed metrics\|Refreshed metrics\|gRPC health check serving\|Refreshing Prometheus Metrics"
54+
```
55+
56+
### Terminal 2 Routing sidecar (Decode)
57+
58+
Follow the routing sidecar in the decode pod to see if it can post to prefill if needed
59+
60+
```bash
61+
DECODE_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=decode" | tail -n 1 | awk '{print $1}')
62+
kubectl logs pod/${DECODE_POD} -c routing-proxy -f | grep -v "http: proxy error: dial tcp \[::1\]:8001: connect: connection refused"
63+
```
64+
65+
### Terminal 3 Decode inference
66+
67+
Follow the decode vllm logs:
68+
69+
```bash
70+
DECODE_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=decode" | tail -n 1 | awk '{print $1}')
71+
kubectl logs pod/${DECODE_POD} -c vllm -f | grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s"
72+
```
73+
74+
### Terminal 4 Prefill
75+
76+
Check to see that prefill logs are getting hit by decode:
77+
78+
```bash
79+
PREFILL_POD=$(kubectl get pods -l "llm-d.ai/inferenceServing=true,llm-d.ai/role=prefill" | tail -n 1 | awk '{print $1}')
80+
kubectl logs pod/${PREFILL_POD} -f | grep -v "\"GET /metrics HTTP/1.1\" 200 OK\|Avg prompt throughput: 0.0 tokens/s"
81+
```
82+
83+
At this point you should be able to send a request through the gateway and track the relevant logs:
84+
85+
```bash
86+
INGRESS_ADDRESS=$(kubectl get ingress llm-d-inference-gateway | tail -n 1 | awk '{print $3}')
87+
curl ${INGRESS_ADDRESS}/v1/completions \
88+
-H "Content-Type: application/json" \
89+
-d '{
90+
"model": "Llama-3.2-3B-Instruct",
91+
"prompt": "'${LLM_PROMPT_1}'",
92+
"max_tokens": 500,
93+
"temperature": 0
94+
}' | jq
95+
```
96+
97+
The EPP should filter out prefill pods and target only decode pods first. You should see this between the 2nd and 3rd steps in the EPP logs, when it applies the filter plugin:
98+
- Scheduling a request (step 2) has both pods as candidates, ex:
99+
```log
100+
{"level":"info","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:129","msg":"Scheduling a request, Metrics: [{Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g Address:10.131.10.180 Role:1} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.29171375 +0000 UTC m=+388.303255999}} {Pod:{NamespacedName:e2e-helm/llama-3-2-3b-instruct-prefill-84667878f9-lwb47 Address:10.128.13.52 Role:0} Metrics:{ActiveModels:map[] WaitingModels:map[] MaxActiveModels:0 RunningQueueSize:0 WaitingQueueSize:0 KVCacheUsagePercent:0 KvCacheMaxTokenCapacity:0 UpdateTime:2025-05-09 19:26:20.316489317 +0000 UTC m=+388.328031566}}]","pd-schedule":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388"}
101+
```
102+
- Apply filter plugin (step 3), only has decode as candidate to target sidecar first:
103+
```log
104+
{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:160","msg":"Before running filter plugins","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","pods":[{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-decode-6f9b99b5cd-dlm7g"},"Address":"10.131.10.180","Role":1,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.29171375Z"},{"NamespacedName":{"Namespace":"e2e-helm","Name":"llama-3-2-3b-instruct-prefill-84667878f9-lwb47"},"Address":"10.128.13.52","Role":0,"ActiveModels":{},"WaitingModels":{},"MaxActiveModels":0,"RunningQueueSize":0,"WaitingQueueSize":0,"KVCacheUsagePercent":0,"KvCacheMaxTokenCapacity":0,"UpdateTime":"2025-05-09T19:26:20.316489317Z"}]}
105+
{"level":"Level(-4)","ts":"2025-05-09T19:26:20Z","caller":"scheduling/scheduler.go:163","msg":"Running filter plugin","request":"Model: Llama-3.2-3B-Instruct, TargetModels: map[], ResolvedTargetModel: Llama-3.2-3B-Instruct, Critical: false, PromptLength: 388","plugin":"prefill_filter"}
106+
```
107+
108+
Our next stop is the decode proxy sidecar (terminal 2), where you should see communication being orchestrated between the P/D pods, e.g.:
109+
```log
110+
I0509 19:43:44.077499 1 chat_completions.go:110] "running NIXL protocol" logger="proxy server"
111+
I0509 19:43:44.077593 1 chat_completions.go:172] "sending request to prefiller" logger="proxy server" url="http://10.128.13.52:8000" body="{\"do_remote_decode\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"stream\":false,\"temperature\":0}"
112+
I0509 19:43:44.099979 1 chat_completions.go:217] "received prefiller response" logger="proxy server" remote_block_ids=[1,2,3,4] remote_engine_id="81eb3201-d5c2-4642-8131-7849f2e955ce" remote_host="10.128.13.52" remote_port=5557
113+
I0509 19:43:44.100082 1 chat_completions.go:252] "sending request to decoder" logger="proxy server" body="{\"do_remote_prefill\":true,\"max_tokens\":500,\"model\":\"Llama-3.2-3B-Instruct\",\"prompt\":\"I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.\",\"remote_block_ids\":[1,2,3,4],\"remote_engine_id\":\"81eb3201-d5c2-4642-8131-7849f2e955ce\",\"remote_host\":\"10.128.13.52\",\"remote_port\":5557,\"temperature\":0}"
114+
```
115+
116+
Finally, in the decode inference pod (terminal 3), we should see the logs on KV transfer:
117+
118+
```log
119+
INFO 05-09 19:26:20 [logger.py:39] Received request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None.
120+
INFO 05-09 19:26:20 [async_llm.py:255] Added request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0.
121+
DEBUG 05-09 19:26:20 [core.py:431] EngineCore loop active.
122+
DEBUG 05-09 19:26:20 [nixl_connector.py:559] start_load_kv for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0 from remote engine 81eb3201-d5c2-4642-8131-7849f2e955ce. Num local_block_ids: 4. Num remote_block_ids: 4.
123+
DEBUG 05-09 19:26:20 [nixl_connector.py:313] Querying metadata on path: tcp://10.128.13.52:5557
124+
DEBUG 05-09 19:26:20 [nixl_connector.py:422] Created 1055264 blocks for src engine 6d177cac-6a93-4396-8c06-a5af03e9ace7 and rank 0
125+
DEBUG 05-09 19:26:21 [nixl_connector.py:439] Created 1055264 blocks for dst engine 81eb3201-d5c2-4642-8131-7849f2e955ce and rank 0
126+
DEBUG 05-09 19:26:22 [nixl_connector.py:326] NIXL handshake: get metadata took: 0.0025545399985276163
127+
DEBUG 05-09 19:26:22 [nixl_connector.py:328] NIXL handshake: add agent took: 2.2907175269938307
128+
DEBUG 05-09 19:26:22 [nixl_connector.py:463] Rank 0, get_finished: 0 requests done sending and 1 requests done recving
129+
DEBUG 05-09 19:26:22 [scheduler.py:862] Finished recving KV transfer for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0
130+
```
131+
132+
If you are debugging networking, you can finally observe the prefill pod logs to see how it receives the request from decode and sends back the KVs:
133+
134+
```log
135+
INFO 05-09 19:43:44 [logger.py:39] Received request cmpl-bb24666d-5d42-46a2-997b-63f352d5bbdb-0: prompt: 'I am working on learning to run benchmarks in my openshift cluster. I was wondering if you could provide me a list of best practices when collecting metrics on the k8s platform, and furthermore, any OCP specific optimizations that are applicable here. Finally please help me construct a plan to support testing metrics collection for testing and dev environments such as minikube or kind.', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=500, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None), prompt_token_ids: [128000, 40, 1097, 3318, 389, 6975, 311, 1629, 63119, 304, 856, 16264, 48933, 10879, 13, 358, 574, 20910, 422, 499, 1436, 3493, 757, 264, 1160, 315, 1888, 12659, 994, 26984, 17150, 389, 279, 597, 23, 82, 5452, 11, 323, 78637, 11, 904, 507, 7269, 3230, 82278, 430, 527, 8581, 1618, 13, 17830, 4587, 1520, 757, 9429, 264, 3197, 311, 1862, 7649, 17150, 4526, 369, 7649, 323, 3567, 22484, 1778, 439, 1332, 1609, 3845, 477, 3169, 13], lora_request: None, prompt_adapter_request: None.
136+
INFO 05-09 19:43:44 [async_llm.py:255] Added request cmpl-bb24666d-5d42-46a2-997b-63f352d5bbdb-0.
137+
DEBUG 05-09 19:43:44 [core.py:431] EngineCore loop active.
138+
DEBUG 05-09 19:43:44 [nixl_connector.py:463] Rank 0, get_finished: 1 requests done sending and 0 requests done recving
139+
DEBUG 05-09 19:43:44 [scheduler.py:865] Finished sending KV transfer for request cmpl-0dec7ca2-42c8-4b79-a753-d355181114f2-0
140+
DEBUG 05-09 19:43:44 [core.py:425] EngineCore waiting for work.
141+
INFO: 10.131.10.180:44514 - "POST /v1/completions HTTP/1.1" 200 OK
142+
```

0 commit comments

Comments
 (0)