Skip to content

Commit 1d2a084

Browse files
chiang-danielclaude
andcommitted
fix: exclude consumer processing time from streaming latency
Fix streaming path to only measure time waiting on LLM chunks, not time the consumer spends processing yielded chunks. Also fix misleading test that claimed to test multi-call accumulation but only exercised a single LLM call. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4b99322 commit 1d2a084

2 files changed

Lines changed: 44 additions & 16 deletions

File tree

libs/core/kiln_ai/adapters/model_adapters/adapter_stream.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,10 +168,16 @@ async def _stream_model_turn(
168168
)
169169

170170
stream = StreamingCompletion(**completion_kwargs)
171-
start = time.monotonic()
171+
call_latency_seconds = 0.0
172+
chunk_wait_start = time.monotonic()
172173
async for chunk in stream:
174+
# Accumulate time spent waiting on the LLM for this chunk
175+
call_latency_seconds += time.monotonic() - chunk_wait_start
173176
yield chunk
174-
call_latency_ms = int((time.monotonic() - start) * 1000)
177+
# Reset timer after yield returns — excludes consumer processing time
178+
chunk_wait_start = time.monotonic()
179+
180+
call_latency_ms = int(call_latency_seconds * 1000)
175181

176182
response, response_choice = _validate_response(stream.response)
177183
usage += self._adapter.usage_from_response(response)

libs/core/kiln_ai/adapters/model_adapters/test_litellm_adapter.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2502,8 +2502,9 @@ async def test_run_model_turn_sets_latency_on_trace_message(
25022502
async def test_run_model_turn_accumulates_latency_across_tool_calls(
25032503
self, adapter, provider
25042504
):
2505-
"""Latency should accumulate across multiple tool-call iterations."""
2506-
tool_response = ModelResponse(
2505+
"""Latency should accumulate across multiple LLM calls in a tool-call loop."""
2506+
# First LLM call: model requests a regular tool (not task_response)
2507+
tool_call_response = ModelResponse(
25072508
model="test-model",
25082509
choices=[
25092510
{
@@ -2514,32 +2515,53 @@ async def test_run_model_turn_accumulates_latency_across_tool_calls(
25142515
"id": "call_1",
25152516
"type": "function",
25162517
"function": {
2517-
"name": "task_response",
2518-
"arguments": '{"test": "result"}',
2518+
"name": "some_tool",
2519+
"arguments": '{"arg": "val"}',
25192520
},
25202521
}
25212522
],
25222523
}
25232524
}
25242525
],
25252526
)
2527+
# Second LLM call: model returns final content
2528+
final_response = ModelResponse(
2529+
model="test-model",
2530+
choices=[
2531+
{
2532+
"message": {
2533+
"content": "Final answer",
2534+
}
2535+
}
2536+
],
2537+
)
25262538

2527-
monotonic_values = [0.0, 0.3] # 300ms
2539+
# monotonic: start1, end1 (200ms), start2, end2 (300ms)
2540+
monotonic_values = [0.0, 0.2, 0.2, 0.5]
25282541
with patch.object(adapter, "build_completion_kwargs", return_value={}):
25292542
with patch.object(
25302543
adapter,
25312544
"acompletion_checking_response",
2532-
return_value=(tool_response, tool_response.choices[0]),
2545+
side_effect=[
2546+
(tool_call_response, tool_call_response.choices[0]),
2547+
(final_response, final_response.choices[0]),
2548+
],
25332549
):
2534-
with patch(
2535-
"kiln_ai.adapters.model_adapters.litellm_adapter.time.monotonic",
2536-
side_effect=monotonic_values,
2550+
with patch.object(
2551+
adapter,
2552+
"process_tool_calls",
2553+
return_value=(None, [{"role": "tool", "content": "tool result", "tool_call_id": "call_1"}]),
25372554
):
2538-
result = await adapter._run_model_turn(
2539-
provider, [{"role": "user", "content": "Hi"}], None, False
2540-
)
2541-
2542-
assert result.usage.total_llm_latency_ms == 300
2555+
with patch(
2556+
"kiln_ai.adapters.model_adapters.litellm_adapter.time.monotonic",
2557+
side_effect=monotonic_values,
2558+
):
2559+
result = await adapter._run_model_turn(
2560+
provider, [{"role": "user", "content": "Hi"}], None, False
2561+
)
2562+
2563+
# 200ms + 300ms = 500ms total
2564+
assert result.usage.total_llm_latency_ms == 500
25432565

25442566
def test_litellm_message_to_trace_message_includes_latency(self, adapter):
25452567
"""litellm_message_to_trace_message should include latency_ms when _latency_ms is set."""

0 commit comments

Comments
 (0)