Skip to content

Commit 1a6582a

Browse files
committed
clean up
1 parent 6323f6a commit 1a6582a

12 files changed

Lines changed: 597 additions & 14 deletions

File tree

app/web_ui/src/lib/components/compare_chart.svelte

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
getRunConfigPromptDisplayName,
1414
} from "$lib/utils/run_config_formatters"
1515
import ChartNoData from "./chart_no_data.svelte"
16+
import { formatLatency } from "$lib/utils/formatters"
1617
1718
// Type for comparison features (same as parent page)
1819
type ComparisonFeature = {
@@ -123,8 +124,7 @@
123124
return `$${value.toFixed(6)}`
124125
}
125126
if (dataKey.includes("latency")) {
126-
if (value < 1000) return `${Math.round(value)}ms`
127-
return `${(value / 1000).toFixed(1)}s`
127+
return formatLatency(value)
128128
}
129129
if (dataKey.includes("tokens")) {
130130
return value.toFixed(0)

app/web_ui/src/lib/components/compare_radar_chart.svelte

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
return key === COST_KEY || key === LATENCY_KEY
4444
}
4545
46-
export function lowerIsBetterToScore(
46+
export function metricToScore(
4747
cost: number,
4848
costs: number[],
4949
{
@@ -168,13 +168,13 @@
168168
if (rawValue === null) {
169169
html += `<div>${label}: N/A</div>`
170170
} else if (key === COST_KEY) {
171-
const displayValue = lowerIsBetterToScore(
171+
const displayValue = metricToScore(
172172
rawValue,
173173
lowerIsBetterValues[key] || [],
174174
)
175175
html += `<div>${label}: ${displayValue.toFixed(1)} <span style="color: #888;">(Mean Cost: $${rawValue.toFixed(6)})</span></div>`
176176
} else if (key === LATENCY_KEY) {
177-
const displayValue = lowerIsBetterToScore(
177+
const displayValue = metricToScore(
178178
rawValue,
179179
lowerIsBetterValues[key] || [],
180180
)
@@ -247,7 +247,7 @@
247247
if (rawValue === null) {
248248
displayValue = 0
249249
} else if (isLowerIsBetterMetric(key)) {
250-
displayValue = lowerIsBetterToScore(rawValue, lowerIsBetterValues[key] || [])
250+
displayValue = metricToScore(rawValue, lowerIsBetterValues[key] || [])
251251
} else {
252252
displayValue = rawValue
253253
}

app/web_ui/src/lib/utils/formatters.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,3 +326,8 @@ export function formatEvalConfigName(
326326
]
327327
return eval_config.name + " — " + parts.join(", ")
328328
}
329+
330+
export function formatLatency(ms: number): string {
331+
if (ms < 1000) return `${Math.round(ms)}ms`
332+
return `${(ms / 1000).toFixed(1)}s`
333+
}

app/web_ui/src/routes/(app)/run/run.svelte

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import { client } from "$lib/api_client"
1212
import Output from "$lib/ui/output.svelte"
1313
import { KilnError, createKilnError } from "$lib/utils/error_handlers"
14-
import { formatDate } from "$lib/utils/formatters"
14+
import { formatDate, formatLatency } from "$lib/utils/formatters"
1515
import { bounceOut } from "svelte/easing"
1616
import { fly } from "svelte/transition"
1717
import { onMount } from "svelte"
@@ -526,11 +526,6 @@
526526
)
527527
}
528528
529-
function formatLatency(ms: number): string {
530-
if (ms < 1000) return `${Math.round(ms)}ms`
531-
return `${(ms / 1000).toFixed(1)}s`
532-
}
533-
534529
function get_usage_properties(
535530
run: TaskRun | null,
536531
subtask_cost: number | null,

app/web_ui/src/routes/(app)/specs/[project_id]/[task_id]/compare/+page.svelte

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import { goto } from "$app/navigation"
66
import { client } from "$lib/api_client"
77
import { createKilnError, KilnError } from "$lib/utils/error_handlers"
8+
import { formatLatency } from "$lib/utils/formatters"
89
import type { Task, TaskRunConfig, Eval } from "$lib/types"
910
import type { components } from "$lib/api_schema"
1011
import CompareChart from "$lib/components/compare_chart.svelte"
@@ -497,8 +498,7 @@
497498
if (scoreKey === "mean_cost") {
498499
return `$${value.toFixed(7)}`
499500
} else if (scoreKey === "mean_total_llm_latency_ms") {
500-
if (value < 1000) return `${Math.round(value)}ms`
501-
return `${(value / 1000).toFixed(1)}s`
501+
return formatLatency(value)
502502
} else {
503503
return value.toFixed(1)
504504
}
Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
---
2+
status: complete
3+
---
4+
5+
# Architecture: Make Performance a First-Class Citizen
6+
7+
## Overview
8+
9+
This is a "follow the pattern" change. Every modification mirrors an existing cost/token pattern. No new architectural concepts, no new dependencies, no new infrastructure.
10+
11+
## Data Model Changes
12+
13+
### 1. `ChatCompletionAssistantMessageParamWrapper` (`libs/core/kiln_ai/utils/open_ai_types.py`)
14+
15+
Add one optional field to the existing TypedDict:
16+
17+
```python
18+
latency_ms: Optional[int]
19+
"""Time spent waiting on this specific LLM API call in milliseconds."""
20+
```
21+
22+
This follows the same pattern as `reasoning_content` on the same TypedDict. TypedDicts with `total=False` allow optional fields naturally.
23+
24+
### 2. `Usage` class (`libs/core/kiln_ai/datamodel/task_run.py`)
25+
26+
Add one field:
27+
28+
```python
29+
total_llm_latency_ms: int | None = Field(
30+
default=None,
31+
description="Total time spent waiting on LLM API calls in milliseconds. Sum of per-call latencies, excludes tool execution time.",
32+
ge=0,
33+
)
34+
```
35+
36+
Update `__add__`:
37+
38+
```python
39+
return Usage(
40+
# ... existing fields ...
41+
total_llm_latency_ms=_add_optional_int(self.total_llm_latency_ms, other.total_llm_latency_ms),
42+
)
43+
```
44+
45+
### 3. `MeanUsage` class (`app/desktop/studio_server/eval_api.py`)
46+
47+
Add one field:
48+
49+
```python
50+
mean_total_llm_latency_ms: float | None = Field(
51+
default=None,
52+
description="Average total LLM latency per run in milliseconds.",
53+
)
54+
```
55+
56+
## Computation
57+
58+
### Timing in `litellm_adapter.py`
59+
60+
**In `_run_model_turn()` — time each LLM call:**
61+
62+
The timing wraps `acompletion_checking_response()` inside the existing `while tool_calls_count < MAX_TOOL_CALLS_PER_TURN` loop. Each iteration is one LLM call.
63+
64+
```python
65+
import time
66+
67+
# Inside the while loop, around the existing completion call:
68+
start = time.monotonic()
69+
model_response, response_choice = await self.acompletion_checking_response(**completion_kwargs)
70+
call_latency_ms = int((time.monotonic() - start) * 1000)
71+
72+
# Add to turn's usage accumulator
73+
usage += self.usage_from_response(model_response)
74+
usage.total_llm_latency_ms = (usage.total_llm_latency_ms or 0) + call_latency_ms
75+
```
76+
77+
Note: `usage_from_response()` does NOT set `total_llm_latency_ms` — that method extracts data from LiteLLM's response object (cost, tokens). Latency is measured externally by us. The addition happens after `usage_from_response` so we don't overwrite.
78+
79+
**Storing `latency_ms` on trace messages:**
80+
81+
In `litellm_message_to_trace_message()`, the LiteLLM message is converted to our wrapper type. We need to pass the `call_latency_ms` through to this method.
82+
83+
Approach: Store the latency on the LiteLLM message object or pass it alongside. Since `_run_model_turn()` appends `response_choice.message` to `messages` (line 156), and later `all_messages_to_trace()` converts these, we need a way to carry the latency.
84+
85+
**Implementation**: After timing the call, attach `latency_ms` as an attribute on the response message before appending:
86+
87+
```python
88+
# After timing the call:
89+
response_choice.message._latency_ms = call_latency_ms # type: ignore
90+
messages.append(response_choice.message)
91+
```
92+
93+
Then in `litellm_message_to_trace_message()`, read it:
94+
95+
```python
96+
message: ChatCompletionAssistantMessageParamWrapper = {
97+
"role": "assistant",
98+
# ... existing fields ...
99+
}
100+
# Add latency if available
101+
latency_ms = getattr(litellm_msg, "_latency_ms", None)
102+
if latency_ms is not None:
103+
message["latency_ms"] = latency_ms
104+
```
105+
106+
This is a pragmatic approach — `_latency_ms` is a transient attribute that only lives between timing and trace conversion within the same method call chain. It's not persisted or serialized by LiteLLM.
107+
108+
**Alternative considered**: Passing latency as a separate dict keyed by message index. Rejected — more complex, harder to maintain, and the attribute approach is isolated to `_run_model_turn()``litellm_message_to_trace_message()`.
109+
110+
### Streaming path
111+
112+
The streaming path (`_create_run_stream`) constructs messages differently. Review needed during implementation to determine if/how latency tracking applies to streaming. Streaming typically reports usage via `FinishEvent` metadata, and the same timing approach (wrap the LLM call) applies. This should be handled in the same phase as the non-streaming path.
113+
114+
## Eval Aggregation (`eval_api.py`)
115+
116+
In `get_run_config_eval_scores()`, exact same pattern as cost:
117+
118+
```python
119+
# Initialize (alongside existing accumulators at ~line 1424):
120+
total_llm_latency_ms_sum = 0.0
121+
latency_ms_count = 0
122+
123+
# Accumulate (alongside existing cost accumulation at ~line 1506):
124+
if usage.total_llm_latency_ms is not None:
125+
total_llm_latency_ms_sum += usage.total_llm_latency_ms
126+
latency_ms_count += 1
127+
128+
# Mean calculation (alongside existing mean_cost at ~line 1578):
129+
mean_total_llm_latency_ms=total_llm_latency_ms_sum / latency_ms_count if latency_ms_count >= threshold else None,
130+
```
131+
132+
## Frontend Changes
133+
134+
### Run Page (`app/web_ui/src/routes/(app)/run/run.svelte`)
135+
136+
**`calculate_subtask_usage()`**: Add `latency_ms` to return type and accumulation:
137+
138+
```typescript
139+
async function calculate_subtask_usage(
140+
trace: Trace | null | undefined,
141+
visited: Set<string> = new Set(),
142+
): Promise<{ cost: number; tokens: number; latency_ms: number }> {
143+
// ... existing logic ...
144+
total_llm_latency_ms += response.data.usage?.total_llm_latency_ms ?? 0
145+
// ... recurse for subtasks ...
146+
return { cost: total_cost, tokens: total_tokens, latency_ms: total_llm_latency_ms }
147+
}
148+
```
149+
150+
**`get_usage_properties()`**: Add latency entries after cost and token entries, same pattern:
151+
152+
```typescript
153+
// After token entries:
154+
const run_latency = run?.usage?.total_llm_latency_ms ?? 0
155+
const total_latency = run_latency + (subtask_latency ?? 0)
156+
if (total_latency > 0) {
157+
properties.push({
158+
name: "Total Latency",
159+
value: formatLatency(total_latency),
160+
})
161+
}
162+
```
163+
164+
**Format helper**:
165+
166+
```typescript
167+
function formatLatency(ms: number): string {
168+
if (ms < 1000) return `${Math.round(ms)}ms`
169+
return `${(ms / 1000).toFixed(1)}s`
170+
}
171+
```
172+
173+
### Compare Page (`app/web_ui/src/routes/(app)/specs/[project_id]/[task_id]/compare/+page.svelte`)
174+
175+
**`generateComparisonFeatures()`**: Add one item to `costItems` array:
176+
177+
```typescript
178+
{ label: "Latency (ms)", key: "cost::mean_total_llm_latency_ms" },
179+
```
180+
181+
Rename section: `"Average Usage & Cost"``"Average Usage, Cost & Latency"`.
182+
183+
**`getModelValueRaw()`**: Add switch case:
184+
185+
```typescript
186+
case "mean_total_llm_latency_ms":
187+
return meanUsage.mean_total_llm_latency_ms ?? null
188+
```
189+
190+
**`getModelValue()`**: Add formatting:
191+
192+
```typescript
193+
if (scoreKey === "mean_total_llm_latency_ms") {
194+
if (value < 1000) return `${Math.round(value)}ms`
195+
return `${(value / 1000).toFixed(1)}s`
196+
}
197+
```
198+
199+
### Radar Chart (`app/web_ui/src/lib/components/compare_radar_chart.svelte`)
200+
201+
```typescript
202+
const LATENCY_KEY = "cost::mean_total_llm_latency_ms"
203+
204+
// Rename isCostMetric → isLowerIsBetterMetric
205+
function isLowerIsBetterMetric(key: string): boolean {
206+
return key === COST_KEY || key === LATENCY_KEY
207+
}
208+
```
209+
210+
Add `LATENCY_KEY` to `dataKeys` alongside `COST_KEY`:
211+
212+
```typescript
213+
$: dataKeys = [
214+
...comparisonFeatures
215+
.filter((f) => f.eval_id !== "kiln_cost_section")
216+
.flatMap((f) => f.items.map((item) => item.key)),
217+
COST_KEY,
218+
LATENCY_KEY,
219+
]
220+
```
221+
222+
Label: `"Speed"` for the radar axis. Use same `costToScore()` function (lower is better).
223+
224+
### Bar Chart (`app/web_ui/src/lib/components/compare_chart.svelte`)
225+
226+
Add latency formatting in tooltip:
227+
228+
```typescript
229+
if (key.includes("latency")) {
230+
if (value < 1000) return `${Math.round(value)}ms`
231+
return `${(value / 1000).toFixed(1)}s`
232+
}
233+
```
234+
235+
## API Schema Regeneration
236+
237+
After Python changes, run `app/web_ui/src/lib/generate_schema.sh` to regenerate `api_schema.d.ts`. This is automated — no manual TypeScript type changes needed.
238+
239+
## Testing Strategy
240+
241+
### Python unit tests
242+
243+
**`libs/core/kiln_ai/datamodel/test_task_run.py`** (or equivalent):
244+
- `Usage.__add__` with `total_llm_latency_ms`: None+None=None, None+val=val, val+val=sum
245+
- `Usage` serialization/deserialization with `total_llm_latency_ms`
246+
- Backwards compat: deserialize old JSON without `total_llm_latency_ms` → None
247+
248+
**`libs/core/kiln_ai/adapters/model_adapters/test_litellm_adapter.py`**:
249+
- `_run_model_turn()` sets `latency_ms` on trace assistant messages
250+
- `_run_model_turn()` accumulates `total_llm_latency_ms` on Usage across tool-call iterations
251+
- `_run()` accumulates `total_llm_latency_ms` across turns
252+
- Timing values are positive integers (sanity check, not exact values — mock `time.monotonic`)
253+
254+
**`app/desktop/studio_server/test_eval_api.py`**:
255+
- `mean_total_llm_latency_ms` calculated correctly with sample data
256+
- 50% threshold applied correctly
257+
- None when insufficient data
258+
259+
### Frontend tests
260+
261+
**Compare page tests**: Verify `mean_total_llm_latency_ms` appears in comparison features and formats correctly.
262+
263+
**Run page tests**: Verify latency appears in usage properties.
264+
265+
### Test approach for timing
266+
267+
Mock `time.monotonic` to return controlled values. Don't assert exact ms values from real calls — they're non-deterministic. Instead:
268+
- Mock returns incrementing values (e.g., 0.0, 0.5, 0.5, 1.2) so we get predictable deltas
269+
- Assert `latency_ms` and `total_llm_latency_ms` match expected values from mocked time
270+
271+
## Error Handling
272+
273+
No new error paths. Latency computation cannot fail — `time.monotonic()` always returns a float, subtraction always works, `int()` always works. If `acompletion_checking_response()` raises, the timing code is skipped (no latency recorded for failed calls). This is the correct behavior.
274+
275+
## Dependencies
276+
277+
None. Only uses `time.monotonic()` from the standard library.

0 commit comments

Comments
 (0)