Kiln-AI
diff --git a/app/web_ui/src/lib/components/compare_chart.svelte b/app/web_ui/src/lib/components/compare_chart.svelte
Lines changed: 2 additions & 2 deletions
diff --git a/app/web_ui/src/lib/components/compare_radar_chart.svelte b/app/web_ui/src/lib/components/compare_radar_chart.svelte
Lines changed: 4 additions & 4 deletions
diff --git a/app/web_ui/src/lib/utils/formatters.ts b/app/web_ui/src/lib/utils/formatters.ts
Lines changed: 5 additions & 0 deletions
diff --git a/app/web_ui/src/routes/(app)/run/run.svelte b/app/web_ui/src/routes/(app)/run/run.svelte
Lines changed: 1 addition & 6 deletions
diff --git a/app/web_ui/src/routes/(app)/specs/[project_id]/[task_id]/compare/+page.svelte b/app/web_ui/src/routes/(app)/specs/[project_id]/[task_id]/compare/+page.svelte
Lines changed: 2 additions & 2 deletions
diff --git a/specs/projects/performance-tracking/architecture.md b/specs/projects/performance-tracking/architecture.md
Lines changed: 277 additions & 0 deletions
@@ -13,6 +13,7 @@
     getRunConfigPromptDisplayName,
   } from "$lib/utils/run_config_formatters"
   import ChartNoData from "./chart_no_data.svelte"
+  import { formatLatency } from "$lib/utils/formatters"
 
   // Type for comparison features (same as parent page)
   type ComparisonFeature = {
@@ -123,8 +124,7 @@
       return `$${value.toFixed(6)}`
     }
     if (dataKey.includes("latency")) {
-      if (value < 1000) return `${Math.round(value)}ms`
-      return `${(value / 1000).toFixed(1)}s`
+      return formatLatency(value)
     }
     if (dataKey.includes("tokens")) {
       return value.toFixed(0)
 
@@ -43,7 +43,7 @@
     return key === COST_KEY || key === LATENCY_KEY
   }
 
-  export function lowerIsBetterToScore(
+  export function metricToScore(
     cost: number,
     costs: number[],
     {
@@ -168,13 +168,13 @@
       if (rawValue === null) {
         html += `<div>${label}: N/A</div>`
       } else if (key === COST_KEY) {
-        const displayValue = lowerIsBetterToScore(
+        const displayValue = metricToScore(
           rawValue,
           lowerIsBetterValues[key] || [],
         )
         html += `<div>${label}: ${displayValue.toFixed(1)} <span style="color: #888;">(Mean Cost: $${rawValue.toFixed(6)})</span></div>`
       } else if (key === LATENCY_KEY) {
-        const displayValue = lowerIsBetterToScore(
+        const displayValue = metricToScore(
           rawValue,
           lowerIsBetterValues[key] || [],
         )
@@ -247,7 +247,7 @@
         if (rawValue === null) {
           displayValue = 0
         } else if (isLowerIsBetterMetric(key)) {
-          displayValue = lowerIsBetterToScore(rawValue, lowerIsBetterValues[key] || [])
+          displayValue = metricToScore(rawValue, lowerIsBetterValues[key] || [])
         } else {
           displayValue = rawValue
         }
 
@@ -326,3 +326,8 @@ export function formatEvalConfigName(
       ]
   return eval_config.name + " — " + parts.join(", ")
 }
+
+export function formatLatency(ms: number): string {
+  if (ms < 1000) return `${Math.round(ms)}ms`
+  return `${(ms / 1000).toFixed(1)}s`
+}
@@ -11,7 +11,7 @@
   import { client } from "$lib/api_client"
   import Output from "$lib/ui/output.svelte"
   import { KilnError, createKilnError } from "$lib/utils/error_handlers"
-  import { formatDate } from "$lib/utils/formatters"
+  import { formatDate, formatLatency } from "$lib/utils/formatters"
   import { bounceOut } from "svelte/easing"
   import { fly } from "svelte/transition"
   import { onMount } from "svelte"
@@ -526,11 +526,6 @@
     )
   }
 
-  function formatLatency(ms: number): string {
-    if (ms < 1000) return `${Math.round(ms)}ms`
-    return `${(ms / 1000).toFixed(1)}s`
-  }
-
   function get_usage_properties(
     run: TaskRun | null,
     subtask_cost: number | null,
 
@@ -5,6 +5,7 @@
   import { goto } from "$app/navigation"
   import { client } from "$lib/api_client"
   import { createKilnError, KilnError } from "$lib/utils/error_handlers"
+  import { formatLatency } from "$lib/utils/formatters"
   import type { Task, TaskRunConfig, Eval } from "$lib/types"
   import type { components } from "$lib/api_schema"
   import CompareChart from "$lib/components/compare_chart.svelte"
@@ -497,8 +498,7 @@
       if (scoreKey === "mean_cost") {
         return `$${value.toFixed(7)}`
       } else if (scoreKey === "mean_total_llm_latency_ms") {
-        if (value < 1000) return `${Math.round(value)}ms`
-        return `${(value / 1000).toFixed(1)}s`
+        return formatLatency(value)
       } else {
         return value.toFixed(1)
       }
 
@@ -0,0 +1,277 @@
+---
+status: complete
+---
+
+# Architecture: Make Performance a First-Class Citizen
+
+## Overview
+
+This is a "follow the pattern" change. Every modification mirrors an existing cost/token pattern. No new architectural concepts, no new dependencies, no new infrastructure.
+
+## Data Model Changes
+
+### 1. `ChatCompletionAssistantMessageParamWrapper` (`libs/core/kiln_ai/utils/open_ai_types.py`)
+
+Add one optional field to the existing TypedDict:
+
+```python
+latency_ms: Optional[int]
+"""Time spent waiting on this specific LLM API call in milliseconds."""
+```
+
+This follows the same pattern as `reasoning_content` on the same TypedDict. TypedDicts with `total=False` allow optional fields naturally.
+
+### 2. `Usage` class (`libs/core/kiln_ai/datamodel/task_run.py`)
+
+Add one field:
+
+```python
+total_llm_latency_ms: int | None = Field(
+    default=None,
+    description="Total time spent waiting on LLM API calls in milliseconds. Sum of per-call latencies, excludes tool execution time.",
+    ge=0,
+)
+```
+
+Update `__add__`:
+
+```python
+return Usage(
+    # ... existing fields ...
+    total_llm_latency_ms=_add_optional_int(self.total_llm_latency_ms, other.total_llm_latency_ms),
+)
+```
+
+### 3. `MeanUsage` class (`app/desktop/studio_server/eval_api.py`)
+
+Add one field:
+
+```python
+mean_total_llm_latency_ms: float | None = Field(
+    default=None,
+    description="Average total LLM latency per run in milliseconds.",
+)
+```
+
+## Computation
+
+### Timing in `litellm_adapter.py`
+
+**In `_run_model_turn()` — time each LLM call:**
+
+The timing wraps `acompletion_checking_response()` inside the existing `while tool_calls_count < MAX_TOOL_CALLS_PER_TURN` loop. Each iteration is one LLM call.
+
+```python
+import time
+
+# Inside the while loop, around the existing completion call:
+start = time.monotonic()
+model_response, response_choice = await self.acompletion_checking_response(**completion_kwargs)
+call_latency_ms = int((time.monotonic() - start) * 1000)
+
+# Add to turn's usage accumulator
+usage += self.usage_from_response(model_response)
+usage.total_llm_latency_ms = (usage.total_llm_latency_ms or 0) + call_latency_ms
+```
+
+Note: `usage_from_response()` does NOT set `total_llm_latency_ms` — that method extracts data from LiteLLM's response object (cost, tokens). Latency is measured externally by us. The addition happens after `usage_from_response` so we don't overwrite.
+
+**Storing `latency_ms` on trace messages:**
+
+In `litellm_message_to_trace_message()`, the LiteLLM message is converted to our wrapper type. We need to pass the `call_latency_ms` through to this method.
+
+Approach: Store the latency on the LiteLLM message object or pass it alongside. Since `_run_model_turn()` appends `response_choice.message` to `messages` (line 156), and later `all_messages_to_trace()` converts these, we need a way to carry the latency.
+
+**Implementation**: After timing the call, attach `latency_ms` as an attribute on the response message before appending:
+
+```python
+# After timing the call:
+response_choice.message._latency_ms = call_latency_ms  # type: ignore
+messages.append(response_choice.message)
+```
+
+Then in `litellm_message_to_trace_message()`, read it:
+
+```python
+message: ChatCompletionAssistantMessageParamWrapper = {
+    "role": "assistant",
+    # ... existing fields ...
+}
+# Add latency if available
+latency_ms = getattr(litellm_msg, "_latency_ms", None)
+if latency_ms is not None:
+    message["latency_ms"] = latency_ms
+```
+
+This is a pragmatic approach — `_latency_ms` is a transient attribute that only lives between timing and trace conversion within the same method call chain. It's not persisted or serialized by LiteLLM.
+
+**Alternative considered**: Passing latency as a separate dict keyed by message index. Rejected — more complex, harder to maintain, and the attribute approach is isolated to `_run_model_turn()` → `litellm_message_to_trace_message()`.
+
+### Streaming path
+
+The streaming path (`_create_run_stream`) constructs messages differently, and streaming typically reports usage via `FinishEvent` metadata. The expectation is that the same timing approach (wrap the LLM call) applies, but this needs to be reviewed during implementation to confirm if and how latency tracking fits the streaming path. This should be handled in the same phase as the non-streaming path.
+
+## Eval Aggregation (`eval_api.py`)
+
+In `get_run_config_eval_scores()`, exact same pattern as cost:
+
+```python
+# Initialize (alongside existing accumulators at ~line 1424):
+total_llm_latency_ms_sum = 0.0
+latency_ms_count = 0
+
+# Accumulate (alongside existing cost accumulation at ~line 1506):
+if usage.total_llm_latency_ms is not None:
+    total_llm_latency_ms_sum += usage.total_llm_latency_ms
+    latency_ms_count += 1
+
+# Mean calculation (alongside existing mean_cost at ~line 1578):
+mean_total_llm_latency_ms=total_llm_latency_ms_sum / latency_ms_count if latency_ms_count >= threshold else None,
+```
+
+## Frontend Changes
+
+### Run Page (`app/web_ui/src/routes/(app)/run/run.svelte`)
+
+**`calculate_subtask_usage()`**: Add `latency_ms` to return type and accumulation:
+
+```typescript
+async function calculate_subtask_usage(
+  trace: Trace | null | undefined,
+  visited: Set<string> = new Set(),
+): Promise<{ cost: number; tokens: number; latency_ms: number }> {
+  // ... existing logic ...
+  total_llm_latency_ms += response.data.usage?.total_llm_latency_ms ?? 0
+  // ... recurse for subtasks ...
+  return { cost: total_cost, tokens: total_tokens, latency_ms: total_llm_latency_ms }
+}
+```
+
+**`get_usage_properties()`**: Add latency entries after cost and token entries, same pattern:
+
+```typescript
+// After token entries:
+const run_latency = run?.usage?.total_llm_latency_ms ?? 0
+const total_latency = run_latency + (subtask_latency ?? 0)
+if (total_latency > 0) {
+  properties.push({
+    name: "Total Latency",
+    value: formatLatency(total_latency),
+  })
+}
+```
+
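+**Format helper** (shared: exported from `$lib/utils/formatters` and imported by the run page, the compare page and the bar chart, so latency is formatted identically everywhere):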
+
+```typescript
+function formatLatency(ms: number): string {
+  if (ms < 1000) return `${Math.round(ms)}ms`
+  return `${(ms / 1000).toFixed(1)}s`
+}
+```
+
+### Compare Page (`app/web_ui/src/routes/(app)/specs/[project_id]/[task_id]/compare/+page.svelte`)
+
+**`generateComparisonFeatures()`**: Add one item to `costItems` array:
+
+```typescript
+{ label: "Latency (ms)", key: "cost::mean_total_llm_latency_ms" },
+```
+
+Rename section: `"Average Usage & Cost"` → `"Average Usage, Cost & Latency"`.
+
+**`getModelValueRaw()`**: Add switch case:
+
+```typescript
+case "mean_total_llm_latency_ms":
+  return meanUsage.mean_total_llm_latency_ms ?? null
+```
+
+**`getModelValue()`**: Add formatting:
+
+```typescript
+if (scoreKey === "mean_total_llm_latency_ms") {
+  if (value < 1000) return `${Math.round(value)}ms`
+  return `${(value / 1000).toFixed(1)}s`
+}
+```
+
+### Radar Chart (`app/web_ui/src/lib/components/compare_radar_chart.svelte`)
+
+```typescript
+const LATENCY_KEY = "cost::mean_total_llm_latency_ms"
+
+// Rename isCostMetric → isLowerIsBetterMetric
+function isLowerIsBetterMetric(key: string): boolean {
+  return key === COST_KEY || key === LATENCY_KEY
+}
+```
+
+Add `LATENCY_KEY` to `dataKeys` alongside `COST_KEY`:
+
+```typescript
+$: dataKeys = [
+  ...comparisonFeatures
+    .filter((f) => f.eval_id !== "kiln_cost_section")
+    .flatMap((f) => f.items.map((item) => item.key)),
+  COST_KEY,
+  LATENCY_KEY,
+]
+```
+
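+Label: `"Speed"` for the radar axis. Use the same scoring function as cost (lower is better) — called `costToScore()` in this design, and named `metricToScore()` in the radar chart component.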
+
+### Bar Chart (`app/web_ui/src/lib/components/compare_chart.svelte`)
+
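+Add latency formatting in the tooltip (the same rule as the `formatLatency` helper: whole milliseconds below 1000 ms, otherwise seconds with one decimal place):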
+
+```typescript
+if (key.includes("latency")) {
+  if (value < 1000) return `${Math.round(value)}ms`
+  return `${(value / 1000).toFixed(1)}s`
+}
+```
+
+## API Schema Regeneration
+
+After Python changes, run `app/web_ui/src/lib/generate_schema.sh` to regenerate `api_schema.d.ts`. This is automated — no manual TypeScript type changes needed.
+
+## Testing Strategy
+
+### Python unit tests
+
+**`libs/core/kiln_ai/datamodel/test_task_run.py`** (or equivalent):
+- `Usage.__add__` with `total_llm_latency_ms`: None+None=None, None+val=val, val+val=sum
+- `Usage` serialization/deserialization with `total_llm_latency_ms`
+- Backwards compat: deserialize old JSON without `total_llm_latency_ms` → None
+
+**`libs/core/kiln_ai/adapters/model_adapters/test_litellm_adapter.py`**:
+- `_run_model_turn()` sets `latency_ms` on trace assistant messages
+- `_run_model_turn()` accumulates `total_llm_latency_ms` on Usage across tool-call iterations
+- `_run()` accumulates `total_llm_latency_ms` across turns
+- Timing values are non-negative integers (a sanity check only when calls are unmocked; exact values are asserted by mocking `time.monotonic`, as described under "Test approach for timing")
+
+**`app/desktop/studio_server/test_eval_api.py`**:
+- `mean_total_llm_latency_ms` calculated correctly with sample data
+- 50% threshold applied correctly
+- None when insufficient data
+
+### Frontend tests
+
+**Compare page tests**: Verify `mean_total_llm_latency_ms` appears in comparison features and formats correctly.
+
+**Run page tests**: Verify latency appears in usage properties.
+
+### Test approach for timing
+
+Mock `time.monotonic` to return controlled values. Don't assert exact ms values from real calls — they're non-deterministic. Instead:
+- Mock returns non-decreasing values (e.g., 0.0, 0.5, 0.5, 1.2) so we get predictable deltas (here, two LLM calls measured as 500 ms and 700 ms)
+- Assert `latency_ms` and `total_llm_latency_ms` match expected values from mocked time
+
+## Error Handling
+
+No new error paths. Latency computation cannot fail — `time.monotonic()` always returns a float, subtraction always works, `int()` always works. If `acompletion_checking_response()` raises, the timing code is skipped (no latency recorded for failed calls). This is the correct behavior.
+
+## Dependencies
+
+None. Only uses `time.monotonic()` from the standard library.
Original file line number	Diff line number	Diff line change
`@@ -326,3 +326,8 @@ export function formatEvalConfigName(`
`326`	`326`	`]`
`327`	`327`	`return eval_config.name + " — " + parts.join(", ")`
`328`	`328`	`}`
	`329`	`+`
	`330`	`+export function formatLatency(ms: number): string {`
	`331`	+ if (ms < 1000) return `${Math.round(ms)}ms`
	`332`	+ return `${(ms / 1000).toFixed(1)}s`
	`333`	`+}`