GoogleCloudPlatform
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 1 addition & 1 deletion
diff --git a/SDK.md b/SDK.md
Lines changed: 3 additions & 3 deletions
diff --git a/dashboard/app.py b/dashboard/app.py
Lines changed: 4 additions & 3 deletions
diff --git a/deploy/remote_function/dispatch.py b/deploy/remote_function/dispatch.py
Lines changed: 1 addition & 1 deletion
diff --git a/deploy/streaming_evaluation/main.py b/deploy/streaming_evaluation/main.py
Lines changed: 0 additions & 1 deletion
diff --git a/docs/design.md b/docs/design.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/implementation_plan_remote_function.md b/docs/implementation_plan_remote_function.md
Lines changed: 7 additions & 24 deletions
diff --git a/examples/agent_improvement_cycle/DEMO_NARRATION.md b/examples/agent_improvement_cycle/DEMO_NARRATION.md
Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Added `SystemEvaluator` as the preferred name for deterministic/code-defined metrics.
-- Kept `CodeEvaluator` as a backward-compatible alias (deprecated but supported).
+- Kept `CodeEvaluator` as a backward-compatible alias.
 - **Compiled-extractor rollout guide** at
   [`docs/extractor_compilation_rollout_guide.md`](docs/extractor_compilation_rollout_guide.md).
   Operational playbook for the Phase C pipeline (issue
 
@@ -121,13 +121,13 @@ The SDK ships with seven ready-to-use evaluators:
 ```python
 from bigquery_agent_analytics import SystemEvaluator
 
-# Latency: score degrades linearly as avg latency approaches threshold
+# Latency: fails when average latency exceeds the threshold
 evaluator = SystemEvaluator.latency(threshold_ms=5000)
 
-# Turn count: penalizes sessions with too many back-and-forth turns
+# Turn count: fails when session turns exceed the max turns
 evaluator = SystemEvaluator.turn_count(max_turns=10)
 
-# Error rate: penalizes high tool error rates
+# Error rate: fails when tool error rate exceeds the max error rate
 evaluator = SystemEvaluator.error_rate(max_error_rate=0.1)
 
 # Token efficiency: checks total token usage stays within budget
 
@@ -1,8 +1,9 @@
-import streamlit as st
-import pandas as pd
+import re
+
 from google.cloud import bigquery
+import pandas as pd
 import plotly.express as px
-import re
+import streamlit as st
 
 # --- 1. Page Configuration ---
 st.set_page_config(page_title="Agent Analytics V2", layout="wide")
 
@@ -25,9 +25,9 @@
 from typing import Any
 
 from bigquery_agent_analytics import Client
-from bigquery_agent_analytics import SystemEvaluator
 from bigquery_agent_analytics import LLMAsJudge
 from bigquery_agent_analytics import serialize
+from bigquery_agent_analytics import SystemEvaluator
 from bigquery_agent_analytics import TraceFilter
 from bigquery_agent_analytics._deploy_runtime import resolve_client_options
 
 
@@ -19,7 +19,6 @@
 from flask import Flask
 from flask import jsonify
 from flask import request
-
 from worker import handle_scheduled_run
 
 app = Flask(__name__)
 
@@ -208,7 +208,7 @@ As demonstrated in the [e2e demo](../examples/e2e_demo.py):
    │ categorical_evaluator│  │ ontology_* (6 modules)│  │      cli         │
    │ categorical_views    │  │ (YAML → AI.GENERATE → │  │ (Typer commands) │
    │ (label evaluation)   │  │  tables → PG → GQL)   │  │                  │
-   └──────────────────┘  └──────────────────┘  └──────────────────┘
+   └──────────────────────┘  └──────────────────────┘  └──────────────────┘
 
    ┌──────────────────┐  ┌───────────────────┐
    │ udf_kernels      │  │ serialization     │
 
@@ -220,30 +220,13 @@ Dispatch logic:
 ```python
 # Map CLI --evaluator to SDK factory
 EVALUATOR_FACTORIES = {
-    "latency": (
-        lambda t: SystemEvaluator.latency(threshold_ms=t),
-        lambda: SystemEvaluator.latency(),
-    ),
-    "error_rate": (
-        lambda t: SystemEvaluator.error_rate(max_error_rate=t),
-        lambda: SystemEvaluator.error_rate(),
-    ),
-    "turn_count": (
-        lambda t: SystemEvaluator.turn_count(max_turns=int(t)),
-        lambda: SystemEvaluator.turn_count(),
-    ),
-    "token_efficiency": (
-        lambda t: SystemEvaluator.token_efficiency(max_tokens=int(t)),
-        lambda: SystemEvaluator.token_efficiency(),
-    ),
-    "ttft": (
-        lambda t: SystemEvaluator.ttft(threshold_ms=t),
-        lambda: SystemEvaluator.ttft(),
-    ),
-    "cost": (
-        lambda t: SystemEvaluator.cost_per_session(max_cost_usd=t),
-        lambda: SystemEvaluator.cost_per_session(),
-    ),
+    "latency": lambda t: SystemEvaluator.latency(threshold_ms=t),
+    "error_rate": lambda t: SystemEvaluator.error_rate(max_error_rate=t),
+    "turn_count": lambda t: SystemEvaluator.turn_count(max_turns=int(t)),
+    "token_efficiency": lambda t: SystemEvaluator.token_efficiency(max_tokens=int(t)),
+    "ttft": lambda t: SystemEvaluator.ttft(threshold_ms=t),
+    "cost": lambda t: SystemEvaluator.cost_per_session(max_cost_usd=t),
+    "llm-judge": None,  # special handling
 }
 # context_cache_hit_rate is special-cased so callers can pass
 # fail_on_missing_telemetry in addition to threshold/min_hit_rate.
 
@@ -217,5 +217,5 @@ By default, the script runs a single cycle and stops. The `--auto` flag enables
 ## [CLOSING]
 
 That's the agent improvement cycle. Capture sessions with the BigQuery Agent Analytics Plugin, evaluate quality with the SDK's LLM judge,
-check operational metrics with the SDK's SystemEvaluator, optimize prompts with Vertex AI, and measure the results — all automated, all repeatable. 
+check operational metrics with the SDK's SystemEvaluator, optimize prompts with Vertex AI, and measure the results — all automated, all repeatable.
 The golden eval set grows with every cycle, so failures you discover today become regression tests for tomorrow.