fix: exclude consumer processing time from streaming latency

chiang-daniel · claude · chiang-daniel · commit 1d2a0845896d · 2026-04-23T16:47:31.000-07:00
Fix streaming path to only measure time waiting on LLM chunks, not
time the consumer spends processing yielded chunks. Also fix misleading
test that claimed to test multi-call accumulation but only exercised a
single LLM call.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/libs/core/kiln_ai/adapters/model_adapters/adapter_stream.py b/libs/core/kiln_ai/adapters/model_adapters/adapter_stream.py
@@ -168,10 +168,16 @@ async def _stream_model_turn(
             )
 
             stream = StreamingCompletion(**completion_kwargs)
-            start = time.monotonic()
+            call_latency_seconds = 0.0
+            chunk_wait_start = time.monotonic()
             async for chunk in stream:
+                # Accumulate time spent waiting on the LLM for this chunk
+                call_latency_seconds += time.monotonic() - chunk_wait_start
                 yield chunk
-            call_latency_ms = int((time.monotonic() - start) * 1000)
+                # Reset timer after yield returns — excludes consumer processing time
+                chunk_wait_start = time.monotonic()
+
+            call_latency_ms = int(call_latency_seconds * 1000)
 
             response, response_choice = _validate_response(stream.response)
             usage += self._adapter.usage_from_response(response)
diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_litellm_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_litellm_adapter.py
@@ -2502,8 +2502,9 @@ async def test_run_model_turn_sets_latency_on_trace_message(
     async def test_run_model_turn_accumulates_latency_across_tool_calls(
         self, adapter, provider
     ):
-        """Latency should accumulate across multiple tool-call iterations."""
-        tool_response = ModelResponse(
+        """Latency should accumulate across multiple LLM calls in a tool-call loop."""
+        # First LLM call: model requests a regular tool (not task_response)
+        tool_call_response = ModelResponse(
             model="test-model",
             choices=[
                 {
@@ -2514,32 +2515,53 @@ async def test_run_model_turn_accumulates_latency_across_tool_calls(
                                 "id": "call_1",
                                 "type": "function",
                                 "function": {
-                                    "name": "task_response",
-                                    "arguments": '{"test": "result"}',
+                                    "name": "some_tool",
+                                    "arguments": '{"arg": "val"}',
                                 },
                             }
                         ],
                     }
                 }
             ],
         )
+        # Second LLM call: model returns final content
+        final_response = ModelResponse(
+            model="test-model",
+            choices=[
+                {
+                    "message": {
+                        "content": "Final answer",
+                    }
+                }
+            ],
+        )
 
-        monotonic_values = [0.0, 0.3]  # 300ms
+        # monotonic: start1, end1 (200ms), start2, end2 (300ms)
+        monotonic_values = [0.0, 0.2, 0.2, 0.5]
         with patch.object(adapter, "build_completion_kwargs", return_value={}):
             with patch.object(
                 adapter,
                 "acompletion_checking_response",
-                return_value=(tool_response, tool_response.choices[0]),
+                side_effect=[
+                    (tool_call_response, tool_call_response.choices[0]),
+                    (final_response, final_response.choices[0]),
+                ],
             ):
-                with patch(
-                    "kiln_ai.adapters.model_adapters.litellm_adapter.time.monotonic",
-                    side_effect=monotonic_values,
+                with patch.object(
+                    adapter,
+                    "process_tool_calls",
+                    return_value=(None, [{"role": "tool", "content": "tool result", "tool_call_id": "call_1"}]),
                 ):
-                    result = await adapter._run_model_turn(
-                        provider, [{"role": "user", "content": "Hi"}], None, False
-                    )
-
-        assert result.usage.total_llm_latency_ms == 300
+                    with patch(
+                        "kiln_ai.adapters.model_adapters.litellm_adapter.time.monotonic",
+                        side_effect=monotonic_values,
+                    ):
+                        result = await adapter._run_model_turn(
+                            provider, [{"role": "user", "content": "Hi"}], None, False
+                        )
+
+        # 200ms + 300ms = 500ms total
+        assert result.usage.total_llm_latency_ms == 500
 
     def test_litellm_message_to_trace_message_includes_latency(self, adapter):
         """litellm_message_to_trace_message should include latency_ms when _latency_ms is set."""