From 8e3569d8aa0fe39794dff1aea8a85804f0719be3 Mon Sep 17 00:00:00 2001 From: Gigi Stark Date: Sun, 3 May 2026 13:49:47 +0000 Subject: [PATCH 1/2] Refactor: Rename CodeEvaluator to SystemEvaluator Rename CodeEvaluator to SystemEvaluator to align with its focus on system-level metrics. A CodeEvaluator alias is kept in evaluators.py for backward-compatibility. --- CHANGELOG.md | 3 + README.md | 2 +- SDK.md | 42 +- deploy/remote_function/dispatch.py | 30 +- docs/design.md | 22 +- docs/hatteras_evaluation.md | 6 +- ...plementation_plan_concept_index_runtime.md | 2 +- docs/implementation_plan_remote_function.md | 77 +- docs/prd_unified_analytics_interface.md | 6 +- docs/python_udf_support_design.md | 8 +- .../agent_improvement_cycle/DEMO_NARRATION.md | 6 +- examples/agent_improvement_cycle/README.md | 8 +- .../eval/operational_metrics.py | 10 +- examples/context_graph_adcp_demo.ipynb | 6014 ++++++++--------- examples/e2e_demo.py | 10 +- examples/e2e_notebook_demo.ipynb | 818 ++- .../nba_agent_trace_analysis_notebook.ipynb | 1041 +-- src/bigquery_agent_analytics/__init__.py | 2 + .../_streaming_evaluation.py | 8 +- .../categorical_evaluator.py | 2 +- src/bigquery_agent_analytics/cli.py | 30 +- src/bigquery_agent_analytics/client.py | 16 +- src/bigquery_agent_analytics/evaluators.py | 58 +- .../grader_pipeline.py | 22 +- src/bigquery_agent_analytics/udf_kernels.py | 2 +- tests/test_grader_pipeline.py | 10 +- tests/test_pr16_fixes.py | 2 +- tests/test_pr17_fixes.py | 4 +- tests/test_sdk_client.py | 4 +- tests/test_sdk_evaluators.py | 122 +- tests/test_udf_kernels.py | 20 +- 31 files changed, 4410 insertions(+), 3997 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d29dd107..25e61025 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `SystemEvaluator` as the preferred name for deterministic/code-defined metrics. 
+- Kept `CodeEvaluator` as a backward-compatible alias (deprecated but supported). + - **``bqaa-revalidate-extractors`` CLI** in `bigquery_agent_analytics.extractor_compilation.cli_revalidate` and diff --git a/README.md b/README.md index 475649e4..0bf24793 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ src/bigquery_agent_analytics/ │ └── formatter.py # Output formatting (json/text/table) │ ├── Evaluation -│ ├── evaluators.py # CodeEvaluator + LLMAsJudge +│ ├── evaluators.py # SystemEvaluator + LLMAsJudge │ ├── trace_evaluator.py # Trajectory matching & replay │ ├── multi_trial.py # Multi-trial runner + pass@k │ ├── grader_pipeline.py # Grader composition pipeline diff --git a/SDK.md b/SDK.md index 1854dbe8..016c3478 100644 --- a/SDK.md +++ b/SDK.md @@ -112,32 +112,32 @@ traces = client.list_traces( ## 3. Code-Based Evaluation (Deterministic Metrics) -`CodeEvaluator` runs deterministic, code-defined metric functions against session summaries. Each metric returns a score between 0.0 and 1.0. +`SystemEvaluator` runs deterministic, code-defined metric functions against session summaries. Each metric returns a score between 0.0 and 1.0. 
### Pre-Built Evaluators The SDK ships with seven ready-to-use evaluators: ```python -from bigquery_agent_analytics import CodeEvaluator +from bigquery_agent_analytics import SystemEvaluator -# Latency: fails when average latency exceeds the budget -evaluator = CodeEvaluator.latency(threshold_ms=5000) +# Latency: fails when average latency exceeds the budget +evaluator = SystemEvaluator.latency(threshold_ms=5000) -# Turn count: fails when sessions use too many back-and-forth turns -evaluator = CodeEvaluator.turn_count(max_turns=10) +# Turn count: fails when sessions use too many back-and-forth turns +evaluator = SystemEvaluator.turn_count(max_turns=10) -# Error rate: fails on high tool error rates -evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) +# Error rate: fails on high tool error rates +evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) # Token efficiency: checks total token usage stays within budget -evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) +evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) # Context cache hit rate: checks repeated prompt-prefix reuse -evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) +evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) # Cost per session: checks estimated USD cost stays under budget -evaluator = CodeEvaluator.cost_per_session( +evaluator = SystemEvaluator.cost_per_session( max_cost_usd=1.0, input_cost_per_1k=0.00025, output_cost_per_1k=0.00125, @@ -173,7 +173,7 @@ Define your own metric functions and chain multiple metrics together: ```python evaluator = ( - CodeEvaluator(name="my_quality_check") + SystemEvaluator(name="my_quality_check") .add_metric( name="latency", fn=lambda s: 1.0 - min(s.get("avg_latency_ms", 0) / 5000, 1.0), @@ -216,7 +216,7 @@ Run evaluation across all sessions matching a filter: from bigquery_agent_analytics import TraceFilter report = client.evaluate( - 
evaluator=CodeEvaluator.latency(threshold_ms=3000), + evaluator=SystemEvaluator.latency(threshold_ms=3000), filters=TraceFilter(agent_id="my_agent"), ) @@ -561,7 +561,7 @@ pass_pow_k = compute_pass_pow_k(num_trials=10, num_passed=8) # ~0.107 ## 7. Grader Composition Pipeline -Combine multiple evaluators (`CodeEvaluator` + `LLMAsJudge` + custom functions) into a single aggregated verdict using configurable scoring strategies. +Combine multiple evaluators (`SystemEvaluator` + `LLMAsJudge` + custom functions) into a single aggregated verdict using configurable scoring strategies. ### Scoring Strategies @@ -575,7 +575,7 @@ Combine multiple evaluators (`CodeEvaluator` + `LLMAsJudge` + custom functions) ```python from bigquery_agent_analytics import ( - CodeEvaluator, GraderPipeline, LLMAsJudge, + SystemEvaluator, GraderPipeline, LLMAsJudge, WeightedStrategy, GraderResult, ) @@ -588,8 +588,8 @@ pipeline = ( }, threshold=0.6, )) - .add_code_grader(CodeEvaluator.latency(threshold_ms=5000), weight=0.2) - .add_code_grader(CodeEvaluator.cost_per_session(max_cost_usd=0.50), weight=0.1) + .add_code_grader(SystemEvaluator.latency(threshold_ms=5000), weight=0.2) + .add_code_grader(SystemEvaluator.cost_per_session(max_cost_usd=0.50), weight=0.1) .add_llm_grader(LLMAsJudge.correctness(threshold=0.7), weight=0.7) ) @@ -618,8 +618,8 @@ from bigquery_agent_analytics import BinaryStrategy pipeline = ( GraderPipeline(BinaryStrategy()) - .add_code_grader(CodeEvaluator.latency(threshold_ms=3000)) - .add_code_grader(CodeEvaluator.error_rate(max_error_rate=0.05)) + .add_code_grader(SystemEvaluator.latency(threshold_ms=3000)) + .add_code_grader(SystemEvaluator.error_rate(max_error_rate=0.05)) .add_llm_grader(LLMAsJudge.hallucination(threshold=0.8)) ) @@ -649,7 +649,7 @@ def business_rules_grader(context): pipeline = ( GraderPipeline(BinaryStrategy()) - .add_code_grader(CodeEvaluator.latency()) + .add_code_grader(SystemEvaluator.latency()) .add_custom_grader("business_rules", 
business_rules_grader) ) ``` @@ -2057,7 +2057,7 @@ bigquery_agent_analytics/ │ Core │ ├── client.py ← High-level SDK entry point │ ├── trace.py ← Trace/Span reconstruction & DAG rendering -│ └── evaluators.py ← CodeEvaluator + LLMAsJudge + SQL templates +│ └── evaluators.py ← SystemEvaluator + LLMAsJudge + SQL templates │ │ Evaluation Harness │ ├── trace_evaluator.py ← BigQueryTraceEvaluator, trajectory matching, replay diff --git a/deploy/remote_function/dispatch.py b/deploy/remote_function/dispatch.py index 7ccf45fb..2c986910 100644 --- a/deploy/remote_function/dispatch.py +++ b/deploy/remote_function/dispatch.py @@ -25,7 +25,7 @@ from typing import Any from bigquery_agent_analytics import Client -from bigquery_agent_analytics import CodeEvaluator +from bigquery_agent_analytics import SystemEvaluator from bigquery_agent_analytics import LLMAsJudge from bigquery_agent_analytics import serialize from bigquery_agent_analytics import TraceFilter @@ -137,7 +137,7 @@ def build_filters(params): def build_evaluator(params): - """Build CodeEvaluator from params dict.""" + """Build SystemEvaluator from params dict.""" metric = params.get("metric", "latency") threshold = params.get("threshold") fail_on_missing_telemetry = _bool_param( @@ -145,35 +145,35 @@ def build_evaluator(params): ) factories_with_t = { - "latency": lambda t: CodeEvaluator.latency(threshold_ms=t), - "error_rate": lambda t: CodeEvaluator.error_rate( + "latency": lambda t: SystemEvaluator.latency(threshold_ms=t), + "error_rate": lambda t: SystemEvaluator.error_rate( max_error_rate=t, ), - "turn_count": lambda t: CodeEvaluator.turn_count( + "turn_count": lambda t: SystemEvaluator.turn_count( max_turns=int(t), ), - "token_efficiency": lambda t: CodeEvaluator.token_efficiency( + "token_efficiency": lambda t: SystemEvaluator.token_efficiency( max_tokens=int(t), ), - "ttft": lambda t: CodeEvaluator.ttft(threshold_ms=t), - "cost": lambda t: CodeEvaluator.cost_per_session( + "ttft": lambda t: 
SystemEvaluator.ttft(threshold_ms=t), + "cost": lambda t: SystemEvaluator.cost_per_session( max_cost_usd=t, ), } factories_default = { - "latency": CodeEvaluator.latency, - "error_rate": CodeEvaluator.error_rate, - "turn_count": CodeEvaluator.turn_count, - "token_efficiency": CodeEvaluator.token_efficiency, - "ttft": CodeEvaluator.ttft, - "cost": CodeEvaluator.cost_per_session, + "latency": SystemEvaluator.latency, + "error_rate": SystemEvaluator.error_rate, + "turn_count": SystemEvaluator.turn_count, + "token_efficiency": SystemEvaluator.token_efficiency, + "ttft": SystemEvaluator.ttft, + "cost": SystemEvaluator.cost_per_session, } if metric == "context_cache_hit_rate": kwargs = {"fail_on_missing_telemetry": fail_on_missing_telemetry} if threshold is not None: kwargs["min_hit_rate"] = threshold - return CodeEvaluator.context_cache_hit_rate(**kwargs) + return SystemEvaluator.context_cache_hit_rate(**kwargs) if metric not in factories_with_t: raise ValueError(f"Unknown metric: {metric!r}") diff --git a/docs/design.md b/docs/design.md index 9b094210..a43942b2 100644 --- a/docs/design.md +++ b/docs/design.md @@ -150,7 +150,7 @@ As demonstrated in the [e2e demo](../examples/e2e_demo.py): **Phase 2 — Evaluation:** 1. `Client.get_trace()` retrieves all events for a session -2. `CodeEvaluator` preset factories assess latency, turn count, error rate, token efficiency +2. `SystemEvaluator` preset factories assess latency, turn count, error rate, token efficiency 3. `LLMAsJudge.correctness()` performs semantic evaluation via BigQuery `AI.GENERATE` 4. 
`BigQueryTraceEvaluator.evaluate_session()` performs trajectory matching against golden tool sequences @@ -208,7 +208,7 @@ As demonstrated in the [e2e demo](../examples/e2e_demo.py): │ categorical_evaluator│ │ ontology_* (6 modules)│ │ cli │ │ categorical_views │ │ (YAML → AI.GENERATE → │ │ (Typer commands) │ │ (label evaluation) │ │ tables → PG → GQL) │ │ │ - └──────────────────────┘ └──────────────────────┘ └──────────────────┘ + └──────────────────────┘ └──────────────────────┘ └──────────────────┘ ┌──────────────────┐ ┌───────────────────┐ │ udf_kernels │ │ serialization │ @@ -248,7 +248,7 @@ Aggregations, filtering, joins, and even LLM evaluation (via `AI.GENERATE`) are LLM-based evaluation can run via (1) BigQuery `AI.GENERATE`, (2) legacy BigQuery ML `ML.GENERATE_TEXT`, or (3) the Gemini API directly. This maximizes compatibility across different GCP configurations. **Decision 4: Composition over inheritance.** -The `GraderPipeline` composes `CodeEvaluator`, `LLMAsJudge`, and custom functions via a builder pattern rather than requiring them to share a common base class. The `BigQueryMemoryService` composes four internal services rather than extending a single monolithic class. +The `GraderPipeline` composes `SystemEvaluator`, `LLMAsJudge`, and custom functions via a builder pattern rather than requiring them to share a common base class. The `BigQueryMemoryService` composes four internal services rather than extending a single monolithic class. --- @@ -396,7 +396,7 @@ Each field generates a separate `AND` condition with a corresponding `bigquery.S This module contains two evaluator classes and the SQL templates that power batch evaluation. -#### 4.3.1 `CodeEvaluator` +#### 4.3.1 `SystemEvaluator` Deterministic evaluation using code-defined metric functions. 
@@ -641,7 +641,7 @@ Combines heterogeneous evaluators into a unified verdict using a strategy patter │ ┌──────────────┼──────────────┐ ▼ ▼ ▼ - CodeEvaluator LLMAsJudge Custom Fn + SystemEvaluator LLMAsJudge Custom Fn (sync) (async) (sync) │ │ │ ▼ ▼ ▼ @@ -1273,7 +1273,7 @@ results = client.query(formatted, job_config=job_config) ``` Evaluation -├── Deterministic (CodeEvaluator) +├── Deterministic (SystemEvaluator) │ ├── Latency │ ├── Turn count │ ├── Error rate @@ -1321,7 +1321,7 @@ All evaluation scores in the SDK are normalized to `[0.0, 1.0]`: | Mode | Evaluator | Where Computation Runs | |------|-----------|----------------------| -| Single session (sync) | `CodeEvaluator.evaluate_session()` | Python | +| Single session (sync) | `SystemEvaluator.evaluate_session()` | Python | | Single session (async) | `LLMAsJudge.evaluate_session()` | Gemini API | | Batch via Client | `Client.evaluate()` | BigQuery (SQL + AI.GENERATE) | | Trajectory matching | `BigQueryTraceEvaluator.evaluate_session()` | BigQuery (fetch) + Python (matching) | @@ -1420,7 +1420,7 @@ Synchronous (user-facing): ├── Client.drift_detection() ├── Client.insights() ├── Client.deep_analysis() -├── CodeEvaluator.evaluate_session() +├── SystemEvaluator.evaluate_session() ├── EvalSuite.* ├── EvalValidator.* └── BigFramesEvaluator.* @@ -1480,10 +1480,10 @@ results = await asyncio.gather(*[_run_one(t) for t in tasks]) ## 10. 
Extensibility & Plugin Points -### 10.1 Custom Metrics (CodeEvaluator) +### 10.1 Custom Metrics (SystemEvaluator) ```python -evaluator = CodeEvaluator(name="custom").add_metric( +evaluator = SystemEvaluator(name="custom").add_metric( name="business_metric", fn=lambda session: your_scoring_logic(session), threshold=0.7, @@ -1586,7 +1586,7 @@ All tests mock BigQuery — no GCP credentials or live BigQuery access is needed ``` tests/ ├── test_sdk_client.py # Client integration tests -├── test_sdk_evaluators.py # CodeEvaluator + LLMAsJudge +├── test_sdk_evaluators.py # SystemEvaluator + LLMAsJudge ├── test_sdk_trace.py # Trace/Span reconstruction ├── test_sdk_feedback.py # Drift detection ├── test_sdk_insights.py # Insights pipeline diff --git a/docs/hatteras_evaluation.md b/docs/hatteras_evaluation.md index 5d61b252..6fec2f3f 100644 --- a/docs/hatteras_evaluation.md +++ b/docs/hatteras_evaluation.md @@ -7,7 +7,7 @@ agent sessions into user-defined categories directly against traces stored in BigQuery, without relying on an external service. This should be implemented as a new categorical evaluation subsystem, not as -an overload of the existing numeric `CodeEvaluator` / `LLMAsJudge` report +an overload of the existing numeric `SystemEvaluator` / `LLMAsJudge` report path. 
The goal is to support Hatteras-like functionality inside the SDK: @@ -22,7 +22,7 @@ The goal is to support Hatteras-like functionality inside the SDK: Today the SDK supports two major evaluation modes: -- deterministic numeric scoring via `CodeEvaluator` +- deterministic numeric scoring via `SystemEvaluator` - semantic numeric scoring via `LLMAsJudge` What is missing is a first-class way to answer questions like: @@ -60,7 +60,7 @@ That capability is useful for: This design is not proposing: - a full clone of an external Hatteras service -- a replacement for `CodeEvaluator` +- a replacement for `SystemEvaluator` - a replacement for `LLMAsJudge` - a new remote function or Python UDF surface in the first phase - real-time ingestion-time classification in phase 1 diff --git a/docs/implementation_plan_concept_index_runtime.md b/docs/implementation_plan_concept_index_runtime.md index af0971a9..ad535960 100644 --- a/docs/implementation_plan_concept_index_runtime.md +++ b/docs/implementation_plan_concept_index_runtime.md @@ -165,7 +165,7 @@ Work: `bigquery_ontology/contrib/advertising/` stub with Yahoo's resolver (if co - `src/bigquery_ontology/graph_ddl_compiler.py` — add `compile_concept_index(ontology, binding, *, output_table) -> str`. Preserve `compile_graph()` contract byte-identically. No changes to existing function bodies. - `src/bigquery_ontology/cli.py:299` — `compile` command gains `--emit-concept-index` and `--concept-index-table` flags. When absent, behavior is byte-identical to today. - `src/bigquery_ontology/__init__.py` — add `from .graph_ddl_compiler import compile_concept_index` so the new public function is importable as `from bigquery_ontology import compile_concept_index`, matching the existing pattern for `compile_graph` (`__init__.py:50` today). 
-- `src/bigquery_agent_analytics/__init__.py` — add the new public surface to the try/except re-export block (same pattern as `Client`, `CodeEvaluator`, etc.): +- `src/bigquery_agent_analytics/__init__.py` — add the new public surface to the try/except re-export block (same pattern as `Client`, `SystemEvaluator`, etc.): - `OntologyRuntime` from `.ontology_runtime` - `EntityResolver`, `ExactMatchResolver`, `SynonymResolver`, `Candidate`, `ResolveResult` from `.entity_resolver` - `ConceptIndexMismatchError`, `ConceptIndexProvenanceMissing`, `ConceptIndexInconsistentPair`, `ConceptIndexRefreshed` from `.ontology_runtime` diff --git a/docs/implementation_plan_remote_function.md b/docs/implementation_plan_remote_function.md index ff619456..6427dd0c 100644 --- a/docs/implementation_plan_remote_function.md +++ b/docs/implementation_plan_remote_function.md @@ -220,13 +220,30 @@ Dispatch logic: ```python # Map CLI --evaluator to SDK factory EVALUATOR_FACTORIES = { - "latency": lambda t: CodeEvaluator.latency(threshold_ms=t), - "error_rate": lambda t: CodeEvaluator.error_rate(max_error_rate=t), - "turn_count": lambda t: CodeEvaluator.turn_count(max_turns=int(t)), - "token_efficiency": lambda t: CodeEvaluator.token_efficiency(max_tokens=int(t)), - "ttft": lambda t: CodeEvaluator.ttft(threshold_ms=t), - "cost": lambda t: CodeEvaluator.cost_per_session(max_cost_usd=t), - "llm-judge": None, # special handling + "latency": ( + lambda t: SystemEvaluator.latency(threshold_ms=t), + lambda: SystemEvaluator.latency(), + ), + "error_rate": ( + lambda t: SystemEvaluator.error_rate(max_error_rate=t), + lambda: SystemEvaluator.error_rate(), + ), + "turn_count": ( + lambda t: SystemEvaluator.turn_count(max_turns=int(t)), + lambda: SystemEvaluator.turn_count(), + ), + "token_efficiency": ( + lambda t: SystemEvaluator.token_efficiency(max_tokens=int(t)), + lambda: SystemEvaluator.token_efficiency(), + ), + "ttft": ( + lambda t: SystemEvaluator.ttft(threshold_ms=t), + lambda: 
SystemEvaluator.ttft(), + ), + "cost": ( + lambda t: SystemEvaluator.cost_per_session(max_cost_usd=t), + lambda: SystemEvaluator.cost_per_session(), + ), } # context_cache_hit_rate is special-cased so callers can pass # fail_on_missing_telemetry in addition to threshold/min_hit_rate. @@ -292,7 +309,7 @@ import functions_framework from flask import jsonify from bigquery_agent_analytics import Client, serialize -from bigquery_agent_analytics import CodeEvaluator, LLMAsJudge +from bigquery_agent_analytics import SystemEvaluator, LLMAsJudge from bigquery_agent_analytics import TraceFilter @@ -396,35 +413,35 @@ def _bool_param(value): def _build_evaluator(params): - """Build CodeEvaluator from params dict.""" + """Build SystemEvaluator from params dict.""" metric = params.get("metric", "latency") threshold = params.get("threshold") fail_on_missing_telemetry = _bool_param( params.get("fail_on_missing_telemetry", False) ) factories = { - "latency": lambda t: CodeEvaluator.latency(threshold_ms=t), - "error_rate": lambda t: CodeEvaluator.error_rate(max_error_rate=t), - "turn_count": lambda t: CodeEvaluator.turn_count(max_turns=int(t)), - "token_efficiency": lambda t: CodeEvaluator.token_efficiency( + "latency": lambda t: SystemEvaluator.latency(threshold_ms=t), + "error_rate": lambda t: SystemEvaluator.error_rate(max_error_rate=t), + "turn_count": lambda t: SystemEvaluator.turn_count(max_turns=int(t)), + "token_efficiency": lambda t: SystemEvaluator.token_efficiency( max_tokens=int(t) ), - "ttft": lambda t: CodeEvaluator.ttft(threshold_ms=t), - "cost": lambda t: CodeEvaluator.cost_per_session(max_cost_usd=t), + "ttft": lambda t: SystemEvaluator.ttft(threshold_ms=t), + "cost": lambda t: SystemEvaluator.cost_per_session(max_cost_usd=t), } factories_default = { - "latency": CodeEvaluator.latency, - "error_rate": CodeEvaluator.error_rate, - "turn_count": CodeEvaluator.turn_count, - "token_efficiency": CodeEvaluator.token_efficiency, - "ttft": CodeEvaluator.ttft, - "cost": 
CodeEvaluator.cost_per_session, + "latency": SystemEvaluator.latency, + "error_rate": SystemEvaluator.error_rate, + "turn_count": SystemEvaluator.turn_count, + "token_efficiency": SystemEvaluator.token_efficiency, + "ttft": SystemEvaluator.ttft, + "cost": SystemEvaluator.cost_per_session, } if metric == "context_cache_hit_rate": kwargs = {"fail_on_missing_telemetry": fail_on_missing_telemetry} if threshold is not None: kwargs["min_hit_rate"] = threshold - return CodeEvaluator.context_cache_hit_rate(**kwargs) + return SystemEvaluator.context_cache_hit_rate(**kwargs) if metric not in factories: raise ValueError(f"Unknown metric: {metric}") if threshold is not None: @@ -612,7 +629,7 @@ Complete mapping from interface operations to current SDK code: | Operation | SDK Method | File:Line | Return Type | Serialization Strategy | |-----------|-----------|-----------|-------------|----------------------| | `analyze` | `Client.get_session_trace()` | `client.py` | `Trace` (dataclass) | `serialize()` → recursive `.to_dict()` | -| `evaluate` | `Client.evaluate(CodeEvaluator)` | `client.py` | `EvaluationReport` (Pydantic) | `.model_dump(mode="json")` | +| `evaluate` | `Client.evaluate(SystemEvaluator)` | `client.py` | `EvaluationReport` (Pydantic) | `.model_dump(mode="json")` | | `judge` | `Client.evaluate(LLMAsJudge)` | `client.py` | `EvaluationReport` (Pydantic) | `.model_dump(mode="json")` | | `insights` | `Client.insights()` | `client.py` | `InsightsReport` (Pydantic) | `.model_dump(mode="json")` | | `drift` | `Client.drift_detection()` | `client.py` | `DriftReport` (Pydantic) | `.model_dump(mode="json")` | @@ -625,13 +642,13 @@ Complete mapping from interface operations to current SDK code: | CLI `--evaluator` | SDK Factory | File | |-------------------|------------|------| -| `latency` | `CodeEvaluator.latency(threshold_ms)` | `evaluators.py` | -| `error_rate` | `CodeEvaluator.error_rate(max_error_rate)` | `evaluators.py` | -| `turn_count` | 
`CodeEvaluator.turn_count(max_turns)` | `evaluators.py` | -| `token_efficiency` | `CodeEvaluator.token_efficiency(max_tokens)` | `evaluators.py` | -| `context_cache_hit_rate` | `CodeEvaluator.context_cache_hit_rate(min_hit_rate)` | `evaluators.py` | -| `ttft` | `CodeEvaluator.ttft(threshold_ms)` | `evaluators.py` | -| `cost` | `CodeEvaluator.cost_per_session(max_cost_usd)` | `evaluators.py` | +| `latency` | `SystemEvaluator.latency(threshold_ms)` | `evaluators.py` | +| `error_rate` | `SystemEvaluator.error_rate(max_error_rate)` | `evaluators.py` | +| `turn_count` | `SystemEvaluator.turn_count(max_turns)` | `evaluators.py` | +| `token_efficiency` | `SystemEvaluator.token_efficiency(max_tokens)` | `evaluators.py` | +| `context_cache_hit_rate` | `SystemEvaluator.context_cache_hit_rate(min_hit_rate)` | `evaluators.py` | +| `ttft` | `SystemEvaluator.ttft(threshold_ms)` | `evaluators.py` | +| `cost` | `SystemEvaluator.cost_per_session(max_cost_usd)` | `evaluators.py` | | `llm-judge` | `LLMAsJudge.correctness/hallucination/sentiment(threshold)` | `evaluators.py` | ### SDK Capabilities NOT Exposed (v1.2+ candidates) diff --git a/docs/prd_unified_analytics_interface.md b/docs/prd_unified_analytics_interface.md index fb12fa03..d9a87dca 100644 --- a/docs/prd_unified_analytics_interface.md +++ b/docs/prd_unified_analytics_interface.md @@ -109,7 +109,7 @@ All operations go through a single multiplexed function: | Operation | SDK Method | Params (JSON keys) | Output | |-----------|-----------|---------------------|--------| | `analyze` | `Client.get_session_trace()` + metrics | `session_id` | JSON with span count, error count, latency, tool calls | -| `evaluate` | `CodeEvaluator` | `session_id`, `metric`, `threshold` | JSON with passed, score, details | +| `evaluate` | `SystemEvaluator` | `session_id`, `metric`, `threshold` | JSON with passed, score, details | | `judge` | `LLMAsJudge` | `session_id`, `criterion` | JSON with score, feedback | | `insights` | Facet extraction | 
`session_id` | JSON with intent, outcome, friction | | `drift` | Drift detection | `golden_dataset`, `agent_filter`, `start_date`, `end_date` | JSON with coverage, gaps | @@ -443,7 +443,7 @@ import functions_framework import json import os from flask import jsonify -from bigquery_agent_analytics import Client, CodeEvaluator, LLMAsJudge, TraceFilter +from bigquery_agent_analytics import Client, SystemEvaluator, LLMAsJudge, TraceFilter # Initialized once per cold start. Config comes from userDefinedContext # (forwarded by BigQuery) or environment variables as fallback. @@ -490,7 +490,7 @@ def _dispatch(client, operation, params): "final_response": trace.final_response, } elif operation == "evaluate": - evaluator = CodeEvaluator.latency(threshold_ms=params["threshold"]) + evaluator = SystemEvaluator.latency(threshold_ms=params["threshold"]) report = client.evaluate(evaluator=evaluator, filters=TraceFilter(session_ids=[params["session_id"]])) return report.details[0] if report.details else {} diff --git a/docs/python_udf_support_design.md b/docs/python_udf_support_design.md index deba9eee..e3abe6aa 100644 --- a/docs/python_udf_support_design.md +++ b/docs/python_udf_support_design.md @@ -172,7 +172,7 @@ primitive: | SDK area | Python UDF fit | Required redesign | |----------|----------------|-------------------| -| `Client.evaluate(CodeEvaluator, filters)` | Partial | SQL builds per-session summaries first; UDF computes scores from summary fields | +| `Client.evaluate(SystemEvaluator, filters)` | Partial | SQL builds per-session summaries first; UDF computes scores from summary fields | | `Client.deep_analysis()` / question distribution | Partial | SQL does grouping / embeddings / top-k; UDF can help with categorization or normalization | | `Client.drift_detection()` | Partial | SQL computes set logic; UDF may help with text normalization or thresholding | | `Client.insights()` | Partial | Best split into SQL extraction + optional UDF post-processing; not a direct port 
| @@ -224,7 +224,7 @@ That is maintainable. Reusing the entire client inside a Python UDF is not. The current evaluator score math is not implemented as standalone top-level functions today. It lives inside factory-method closures such as -`CodeEvaluator.latency()` and `CodeEvaluator.error_rate()` in +`SystemEvaluator.latency()` and `SystemEvaluator.error_rate()` in [evaluators.py](/Users/haiyuancao/BigQuery-Agent-Analytics-SDK/src/bigquery_agent_analytics/evaluators.py). That means the first implementation step is a deliberate refactor: @@ -281,7 +281,7 @@ the shared extraction helper. ### 7.2 Tier 2: code-evaluator score kernels -These should map directly to the existing `CodeEvaluator` math: +These should map directly to the existing `SystemEvaluator` math: ```sql CREATE FUNCTION `PROJECT.UDF_DATASET.bqaa_score_latency`( @@ -497,7 +497,7 @@ Remote Function should still be described as: - Add `udf_kernels.py` - Move reusable evaluator math into standalone pure functions - Move reusable event semantic helpers into a UDF-safe layer -- Add unit tests proving parity with existing `CodeEvaluator` behavior +- Add unit tests proving parity with existing `SystemEvaluator` behavior ### Phase U2: Tier 1 and Tier 2 UDFs diff --git a/examples/agent_improvement_cycle/DEMO_NARRATION.md b/examples/agent_improvement_cycle/DEMO_NARRATION.md index 9c3d02c1..ca019711 100644 --- a/examples/agent_improvement_cycle/DEMO_NARRATION.md +++ b/examples/agent_improvement_cycle/DEMO_NARRATION.md @@ -98,7 +98,7 @@ Step two sends those ten questions to the agent. Every session is logged to BigQ Step three is where the SDK earns its keep. The quality report script reads those sessions back from BigQuery and an LLM judge scores each one. Four sessions are marked unhelpful — the agent deflected instead of using its tools. One is partial. Five are meaningful. The baseline score: fifty percent meaningful. That's our starting point. 
-Right below the quality score, the SDK's deterministic CodeEvaluator runs on the same sessions — average latency, total tokens per session, turn count and error_rate. These are the operational baselines. No LLM needed, just math on the data already in BigQuery. We'll compare against these numbers after the improvement to make sure the new prompt didn't trade quality for cost. +Right below the quality score, the SDK's deterministic SystemEvaluator runs on the same sessions — average latency, total tokens per session, turn count and error_rate. These are the operational baselines. No LLM needed, just math on the data already in BigQuery. We'll compare against these numbers after the improvement to make sure the new prompt didn't trade quality for cost. --- @@ -167,7 +167,7 @@ Let's recap what just happened: 3. We **generated ten synthetic questions** covering all six policy topics and ran them through the agent. The agent deflected on expenses, benefits, and holidays — topics it could answer but the prompt told it not to try. -4. The **SDK's quality report** read those sessions from BigQuery and an LLM judge scored them. Baseline: roughly fifty percent meaningful. Right below, the **SDK's CodeEvaluator** established operational baselines — latency, tokens, turns, tool error rate — all from the same BigQuery data, no extra LLM calls. +4. The **SDK's quality report** read those sessions from BigQuery and an LLM judge scored them. Baseline: roughly fifty percent meaningful. Right below, the **SDK's SystemEvaluator** established operational baselines — latency, tokens, turns, tool error rate — all from the same BigQuery data, no extra LLM calls. 5. We **extracted the failures** into the golden eval set — growing it from three to about eight cases. A **teacher agent** — same model, same tools, different prompt — generated ground truth for each failed question. 
The **Vertex AI Prompt Optimizer** used those triples to generate an improved prompt, and the **regression gate** validated it against all golden cases before promoting it to V2. @@ -217,5 +217,5 @@ By default, the script runs a single cycle and stops. The `--auto` flag enables ## [CLOSING] That's the agent improvement cycle. Capture sessions with the BigQuery Agent Analytics Plugin, evaluate quality with the SDK's LLM judge, -check operational metrics with the SDK's CodeEvaluator, optimize prompts with Vertex AI, and measure the results — all automated, all repeatable. +check operational metrics with the SDK's SystemEvaluator, optimize prompts with Vertex AI, and measure the results — all automated, all repeatable. The golden eval set grows with every cycle, so failures you discover today become regression tests for tomorrow. diff --git a/examples/agent_improvement_cycle/README.md b/examples/agent_improvement_cycle/README.md index 02fceba8..ad7e304f 100644 --- a/examples/agent_improvement_cycle/README.md +++ b/examples/agent_improvement_cycle/README.md @@ -104,7 +104,7 @@ This demo shows how to close that gap using four components: logged sessions back from BigQuery, evaluates quality using an LLM judge, and produces structured reports that drive automated improvement. -3. **[`SDK CodeEvaluator`](../../bigquery_agent_analytics/evaluators.py)** (the SDK's deterministic evaluator) checks +3. **[`SDK SystemEvaluator`](../../src/bigquery_agent_analytics/evaluators.py)** (the SDK's deterministic evaluator) checks operational metrics — latency, token efficiency, and turn count — on the same sessions. No LLM calls needed, just math on the data already in BigQuery. This ensures the improved prompt doesn't trade @@ -130,7 +130,7 @@ The full cycle: 5. **MEASURE IMPROVEMENT:** Verify the improved prompt against fresh traffic to quantify the quality jump. 
At each evaluation step (3 and 5), the SDK's deterministic -`CodeEvaluator` also checks latency, token efficiency, and turn count. +`SystemEvaluator` also checks latency, token efficiency, and turn count. Step 3 establishes the operational baseline; Step 5 shows the before/after comparison to verify the improved prompt didn't regress on cost or performance. No extra agent runs — just math on the session @@ -429,7 +429,7 @@ using ADK's `InMemoryRunner`. Sessions are logged to BigQuery via the **Step 3: Evaluate Quality** -- The SDK's `quality_report.py` reads sessions from BigQuery and scores each one on response_usefulness (meaningful/partial/unhelpful) and task_grounding (grounded/ungrounded). -The SDK's `CodeEvaluator` also runs deterministic checks on the same +The SDK's `SystemEvaluator` also runs deterministic checks on the same sessions — latency, token efficiency, and turn count — to establish an operational baseline. @@ -721,7 +721,7 @@ deployments, consider periodically pruning redundant golden cases. ## Further Reading -- [Your Agent Events Table Is Also a Test Suite](https://medium.com/google-cloud/your-agent-events-table-is-also-a-test-suite-999fbef885ed) — Using the SDK's `CodeEvaluator` and `categorical-eval` CLI to gate PRs against production traces. Covers the same deterministic evaluators (latency, token efficiency, turn count, error rate) this demo uses in Steps 3 and 5. +- [Your Agent Events Table Is Also a Test Suite](https://medium.com/google-cloud/your-agent-events-table-is-also-a-test-suite-999fbef885ed) — Using the SDK's `SystemEvaluator` and `categorical-eval` CLI to gate PRs against production traces. Covers the same deterministic evaluators (latency, token efficiency, turn count, error rate) this demo uses in Steps 3 and 5. - [BigQuery Agent Analytics: From Logs to Graphs](https://medium.com/google-cloud/bigquery-agent-analytics-from-logs-to-graphs-ab0bc34e1418) — Visualizing agent session traces as interactive graphs. 
Shows how the `BigQueryAgentAnalyticsPlugin` captures the data that powers this improvement cycle. ## Future / Next Steps diff --git a/examples/agent_improvement_cycle/eval/operational_metrics.py b/examples/agent_improvement_cycle/eval/operational_metrics.py index 29726dec..d5040525 100644 --- a/examples/agent_improvement_cycle/eval/operational_metrics.py +++ b/examples/agent_improvement_cycle/eval/operational_metrics.py @@ -106,7 +106,7 @@ def run_metrics(session_ids: list[str]) -> dict: passed, failed, pass_rate, avg_observed}. """ from bigquery_agent_analytics import Client - from bigquery_agent_analytics.evaluators import CodeEvaluator + from bigquery_agent_analytics.evaluators import SystemEvaluator from bigquery_agent_analytics.trace import TraceFilter client = Client( @@ -119,13 +119,13 @@ def run_metrics(session_ids: list[str]) -> dict: results = {} for metric_name, cfg in METRICS.items(): if metric_name == "latency": - evaluator = CodeEvaluator.latency(threshold_ms=cfg["threshold"]) + evaluator = SystemEvaluator.latency(threshold_ms=cfg["threshold"]) elif metric_name == "token_efficiency": - evaluator = CodeEvaluator.token_efficiency(max_tokens=cfg["threshold"]) + evaluator = SystemEvaluator.token_efficiency(max_tokens=cfg["threshold"]) elif metric_name == "turn_count": - evaluator = CodeEvaluator.turn_count(max_turns=cfg["threshold"]) + evaluator = SystemEvaluator.turn_count(max_turns=cfg["threshold"]) elif metric_name == "error_rate": - evaluator = CodeEvaluator.error_rate(max_error_rate=cfg["threshold"]) + evaluator = SystemEvaluator.error_rate(max_error_rate=cfg["threshold"]) else: continue diff --git a/examples/context_graph_adcp_demo.ipynb b/examples/context_graph_adcp_demo.ipynb index 95f96a9f..59d4b0f7 100644 --- a/examples/context_graph_adcp_demo.ipynb +++ b/examples/context_graph_adcp_demo.ipynb @@ -1,3093 +1,3093 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "7b4dd2b6", - "metadata": { - "execution": { - 
"iopub.execute_input": "2026-03-05T09:26:40.608829Z", - "iopub.status.busy": "2026-03-05T09:26:40.608720Z", - "iopub.status.idle": "2026-03-05T09:26:40.611594Z", - "shell.execute_reply": "2026-03-05T09:26:40.611030Z" - } - }, - "outputs": [], - "source": [ - "# Copyright 2025 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "id": "0713f88b", - "metadata": {}, - "source": [ - "# Context Graph Demo V2: System of Reasoning for Agentic Ads\n", - "\n", - "**Integrating BigQuery Agent Analytics SDK with Native Property Graphs**\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"Vertex Open in Vertex AI Workbench\n", - " \n", - " \n", - " \n", - " \"BQ Open in BQ Studio\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "4188cacb", - "metadata": {}, - "source": [ - "## Demo Scenario: Yahoo ADCP \"ELF Cosmetics\" Media Buy\n", - "\n", - "This notebook simulates a realistic **multi-agent media buying workflow** based on the Yahoo Ad Context Protocol (ADCP):\n", - "\n", - "```mermaid\n", - "flowchart LR\n", - " A[Buyer Agent
ELF Cosmetics] -->|ADCP Brief| B[Sales Agent
Yahoo DSP]\n", - " B -->|Inventory Query| C[Inventory Tool]\n", - " B -->|Audience Match| D[Audience Tool]\n", - " B -->|Budget Split| E[Budget Tool]\n", - " B -->|HITL Pause| F[Ad Ops Manager
Slack Approval]\n", - " F -->|Approved| G[Provision Campaign
Google Ad Manager]\n", - " G -->|Artifact| H[GCS Line Item JSON]\n", - "```\n", - "\n", - "We then build a **4-Pillar Context Graph** that cross-links:\n", - "1. **Technical Graph** (execution lineage from ADK plugin)\n", - "2. **Biz Graph** (Products, Targeting, Campaigns extracted via AI.GENERATE)\n", - "3. **Cross-Links** (the \"Missing Why\" connecting decisions to entities)\n", - "4. **Persisted Artifacts** (GCS object references for campaign JSON)\n", - "\n", - "Finally, we demonstrate **World Change detection** for long-running A2A tasks." - ] - }, - { - "cell_type": "markdown", - "id": "fb54dbbe", - "metadata": {}, - "source": [ - "## Install Dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ef0c7b17", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:26:40.613838Z", - "iopub.status.busy": "2026-03-05T09:26:40.613715Z", - "iopub.status.idle": "2026-03-05T09:26:41.651620Z", - "shell.execute_reply": "2026-03-05T09:26:41.651018Z" - } - }, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. 
If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621\u001b[0m\u001b[33m\r\n", - "\u001b[0m" - ] + "cell_type": "code", + "execution_count": 1, + "id": "7b4dd2b6", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:40.608829Z", + "iopub.status.busy": "2026-03-05T09:26:40.608720Z", + "iopub.status.idle": "2026-03-05T09:26:40.611594Z", + "shell.execute_reply": "2026-03-05T09:26:40.611030Z" + } + }, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[31mERROR: Ignored the following versions that require a different python version: 1.19.0 Requires-Python >=3.10; 1.20.0 Requires-Python >=3.10; 1.21.0 Requires-Python >=3.10; 1.22.0 Requires-Python >=3.10; 1.22.1 Requires-Python >=3.10; 1.23.0 Requires-Python >=3.10; 1.24.0 Requires-Python >=3.10; 1.24.1 Requires-Python >=3.10; 1.25.0 Requires-Python >=3.10; 1.25.1 Requires-Python >=3.10; 1.26.0 Requires-Python >=3.10\u001b[0m\u001b[31m\r\n", - "\u001b[0m\u001b[31mERROR: Could not find a version that satisfies the requirement bigquery-agent-analytics (from versions: none)\u001b[0m\u001b[31m\r\n", - "\u001b[0m\r\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\r\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip\u001b[0m\r\n", - "\u001b[31mERROR: No matching distribution found for bigquery-agent-analytics\u001b[0m\u001b[31m\r\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install -q google-adk bigquery-agent-analytics google-cloud-bigquery nest-asyncio" - ] - }, - { - "cell_type": "markdown", - "id": "e654c10e", - "metadata": {}, - "source": [ - "## Authenticate & Configure" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a8a11844", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:26:41.653979Z", - "iopub.status.busy": "2026-03-05T09:26:41.653848Z", - "iopub.status.idle": "2026-03-05T09:26:41.659483Z", - "shell.execute_reply": "2026-03-05T09:26:41.658883Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Not running in Colab -- using default credentials.\n", - "Project : 
test-project-0728-467323\n", - "Dataset : agent_analytics\n", - "Table : agent_events\n", - "Model : gemini-3-flash-preview\n", - "Vertex AI: enabled\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "try:\n", - " from google.colab import auth\n", - " auth.authenticate_user()\n", - " print(\"Colab authentication successful.\")\n", - "except ImportError:\n", - " print(\"Not running in Colab -- using default credentials.\")\n", - "\n", - "# ---------- Configuration ----------\n", - "PROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"your-project-id\")\n", - "DATASET_ID = os.environ.get(\"BQ_DATASET\", \"agent_analytics\")\n", - "TABLE_ID = os.environ.get(\"BQ_TABLE\", \"agent_events\")\n", - "MODEL_NAME = os.environ.get(\"MODEL_NAME\", \"gemini-2.5-flash\")\n", - "LOCATION = \"US\"\n", - "APP_NAME = \"adcp_context_graph_demo\"\n", - "USER_ID = \"adcp_demo_user\"\n", - "\n", - "os.environ[\"GOOGLE_GENAI_USE_VERTEXAI\"] = \"true\"\n", - "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", - "os.environ[\"GOOGLE_CLOUD_LOCATION\"] = \"global\"\n", - "\n", - "import nest_asyncio\n", - "nest_asyncio.apply()\n", - "\n", - "print(f\"Project : {PROJECT_ID}\")\n", - "print(f\"Dataset : {DATASET_ID}\")\n", - "print(f\"Table : {TABLE_ID}\")\n", - "print(f\"Model : {MODEL_NAME}\")\n", - "print(f\"Vertex AI: enabled\")" - ] - }, - { - "cell_type": "markdown", - "id": "55ecb639", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 1: Define ADCP Domain Tools\n", - "\n", - "We create **deterministic tools** that simulate Yahoo's advertising platform. Each tool uses seeded randomness for reproducible demo output." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "4aef1ce5", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:26:41.660859Z", - "iopub.status.busy": "2026-03-05T09:26:41.660786Z", - "iopub.status.idle": "2026-03-05T09:26:41.670682Z", - "shell.execute_reply": "2026-03-05T09:26:41.670195Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ADCP tools defined: query_ad_inventory, match_target_audience, allocate_media_budget, provision_campaign_in_gam\n" - ] - } - ], - "source": [ - "import hashlib\n", - "import json\n", - "import random\n", - "from typing import Any\n", - "\n", - "\n", - "def _rng_from(*parts: str) -> random.Random:\n", - " seed = int(hashlib.md5(\"|\".join(parts).encode()).hexdigest()[:8], 16)\n", - " return random.Random(seed)\n", - "\n", - "\n", - "async def query_ad_inventory(\n", - " product_name: str,\n", - " format_type: str = \"display\",\n", - " date_range: str = \"2025-Q2\",\n", - ") -> dict[str, Any]:\n", - " \"\"\"Query available ad inventory for a Yahoo product.\n", - "\n", - " Args:\n", - " product_name: Yahoo ad product (e.g. 
'Yahoo Homepage', 'Yahoo Mail').\n", - " format_type: Ad format ('display', 'native', 'video').\n", - " date_range: Date range for availability.\n", - "\n", - " Returns:\n", - " Inventory availability with pricing.\n", - " \"\"\"\n", - " rng = _rng_from(product_name, format_type, date_range)\n", - " available_impressions = rng.randint(500_000, 50_000_000)\n", - " cpm = round(rng.uniform(3.50, 45.00), 2)\n", - " return {\n", - " \"product\": product_name,\n", - " \"format\": format_type,\n", - " \"date_range\": date_range,\n", - " \"available_impressions\": available_impressions,\n", - " \"cpm_usd\": cpm,\n", - " \"estimated_reach\": rng.randint(100_000, 10_000_000),\n", - " \"viewability_rate\": round(rng.uniform(0.60, 0.95), 2),\n", - " \"status\": rng.choice([\"available\", \"available\", \"available\", \"limited\"]),\n", - " \"premium_placement\": product_name in (\"Yahoo Homepage\", \"Yahoo Finance\"),\n", - " }\n", - "\n", - "\n", - "async def match_target_audience(\n", - " brand: str,\n", - " target_demographics: str,\n", - " campaign_goal: str = \"brand_awareness\",\n", - ") -> dict[str, Any]:\n", - " \"\"\"Match target audience segments against Yahoo's audience graph.\n", - "\n", - " Args:\n", - " brand: Advertiser brand name.\n", - " target_demographics: Target audience description.\n", - " campaign_goal: Campaign objective.\n", - "\n", - " Returns:\n", - " Matched audience segments with reach estimates.\n", - " \"\"\"\n", - " rng = _rng_from(brand, target_demographics)\n", - " segments = [\n", - " {\"segment\": \"Beauty Enthusiasts\", \"match_score\": 0.95},\n", - " {\"segment\": \"Millennials 25-34\", \"match_score\": 0.92},\n", - " {\"segment\": \"Female Shoppers\", \"match_score\": 0.88},\n", - " {\"segment\": \"Health & Wellness\", \"match_score\": 0.76},\n", - " {\"segment\": \"Premium Consumers\", \"match_score\": 0.71},\n", - " ]\n", - " rng.shuffle(segments)\n", - " selected = segments[:rng.randint(3, 5)]\n", - " return {\n", - " \"brand\": 
brand,\n", - " \"target_demographics\": target_demographics,\n", - " \"campaign_goal\": campaign_goal,\n", - " \"matched_segments\": sorted(selected, key=lambda s: -s[\"match_score\"]),\n", - " \"total_addressable_audience\": rng.randint(2_000_000, 15_000_000),\n", - " \"overlap_with_yahoo_users_pct\": round(rng.uniform(0.45, 0.85), 2),\n", - " }\n", - "\n", - "\n", - "async def allocate_media_budget(\n", - " total_budget_usd: float,\n", - " products: str,\n", - " campaign_duration_days: int = 30,\n", - ") -> dict[str, Any]:\n", - " \"\"\"Allocate media budget across Yahoo ad products.\n", - "\n", - " Args:\n", - " total_budget_usd: Total campaign budget in USD.\n", - " products: Comma-separated list of Yahoo products.\n", - " campaign_duration_days: Campaign length in days.\n", - "\n", - " Returns:\n", - " Budget allocation with ROI projections.\n", - " \"\"\"\n", - " product_list = [p.strip() for p in products.split(\",\")]\n", - " rng = _rng_from(str(total_budget_usd), products)\n", - " allocations = []\n", - " remaining = total_budget_usd\n", - "\n", - " for i, product in enumerate(product_list):\n", - " if i == len(product_list) - 1:\n", - " amount = round(remaining, 2)\n", - " else:\n", - " pct = rng.uniform(0.15, 0.50)\n", - " amount = round(total_budget_usd * pct, 2)\n", - " remaining -= amount\n", - "\n", - " allocations.append({\n", - " \"product\": product,\n", - " \"allocated_usd\": amount,\n", - " \"pct_of_total\": round(amount / total_budget_usd * 100, 1),\n", - " \"estimated_impressions\": int(amount / rng.uniform(5, 25) * 1000),\n", - " \"projected_ctr_pct\": round(rng.uniform(0.8, 3.5), 2),\n", - " })\n", - "\n", - " return {\n", - " \"total_budget_usd\": total_budget_usd,\n", - " \"campaign_duration_days\": campaign_duration_days,\n", - " \"allocations\": allocations,\n", - " \"projected_total_impressions\": sum(\n", - " a[\"estimated_impressions\"] for a in allocations\n", - " ),\n", - " \"projected_roas\": round(rng.uniform(2.5, 8.0), 2),\n", - 
" }\n", - "\n", - "\n", - "async def provision_campaign_in_gam(\n", - " campaign_name: str,\n", - " advertiser: str,\n", - " budget_usd: float,\n", - " start_date: str,\n", - " end_date: str,\n", - " targeting_segments: str,\n", - ") -> dict[str, Any]:\n", - " \"\"\"Provision a campaign in Google Ad Manager (GAM).\n", - "\n", - " Args:\n", - " campaign_name: Name for the campaign.\n", - " advertiser: Advertiser name.\n", - " budget_usd: Total budget.\n", - " start_date: Campaign start date.\n", - " end_date: Campaign end date.\n", - " targeting_segments: Comma-separated targeting segments.\n", - "\n", - " Returns:\n", - " Provisioned campaign details with line item IDs.\n", - " \"\"\"\n", - " rng = _rng_from(campaign_name, advertiser)\n", - " line_item_id = f\"LI-{rng.randint(100000, 999999)}\"\n", - " order_id = f\"ORD-{rng.randint(10000, 99999)}\"\n", - "\n", - " artifact = {\n", - " \"campaign_name\": campaign_name,\n", - " \"advertiser\": advertiser,\n", - " \"order_id\": order_id,\n", - " \"line_item_id\": line_item_id,\n", - " \"budget_usd\": budget_usd,\n", - " \"start_date\": start_date,\n", - " \"end_date\": end_date,\n", - " \"targeting\": [s.strip() for s in targeting_segments.split(\",\")],\n", - " \"status\": \"provisioned\",\n", - " \"gcs_artifact_uri\": (\n", - " f\"gs://adcp-artifacts/{advertiser.lower().replace(' ', '_')}/\"\n", - " f\"{line_item_id}.json\"\n", - " ),\n", - " }\n", - " return artifact\n", - "\n", - "\n", - "print(\"ADCP tools defined: query_ad_inventory, match_target_audience,\"\n", - " \" allocate_media_budget, provision_campaign_in_gam\")" - ] - }, - { - "cell_type": "markdown", - "id": "912fd1fc", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 2: Build Multi-Agent System with ADK\n", - "\n", - "We create a **Yahoo Sales Agent** that orchestrates the ADCP workflow. It processes the buyer's brief, queries inventory, matches audiences, allocates budget, and pauses for HITL approval before provisioning." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "25290b99", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:26:41.672179Z", - "iopub.status.busy": "2026-03-05T09:26:41.672085Z", - "iopub.status.idle": "2026-03-05T09:26:43.629409Z", - "shell.execute_reply": "2026-03-05T09:26:43.628925Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "0713f88b", + "metadata": {}, + "source": [ + "# Context Graph Demo V2: System of Reasoning for Agentic Ads\n", + "\n", + "**Integrating BigQuery Agent Analytics SDK with Native Property Graphs**\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"Vertex Open in Vertex AI Workbench\n", + " \n", + " \n", + " \n", + " \"BQ Open in BQ Studio\n", + " \n", + "
" + ] + }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.13/site-packages/requests/__init__.py:113: RequestsDependencyWarning: urllib3 (2.6.3) or chardet (6.0.0.post1)/charset_normalizer (3.4.4) doesn't match a supported version!\n", - " warnings.warn(\n" - ] + "cell_type": "markdown", + "id": "4188cacb", + "metadata": {}, + "source": [ + "## Demo Scenario: Yahoo ADCP \"ELF Cosmetics\" Media Buy\n", + "\n", + "This notebook simulates a realistic **multi-agent media buying workflow** based on the Yahoo Ad Context Protocol (ADCP):\n", + "\n", + "```mermaid\n", + "flowchart LR\n", + " A[Buyer Agent
ELF Cosmetics] -->|ADCP Brief| B[Sales Agent
Yahoo DSP]\n", + " B -->|Inventory Query| C[Inventory Tool]\n", + " B -->|Audience Match| D[Audience Tool]\n", + " B -->|Budget Split| E[Budget Tool]\n", + " B -->|HITL Pause| F[Ad Ops Manager
Slack Approval]\n", + " F -->|Approved| G[Provision Campaign
Google Ad Manager]\n", + " G -->|Artifact| H[GCS Line Item JSON]\n", + "```\n", + "\n", + "We then build a **4-Pillar Context Graph** that cross-links:\n", + "1. **Technical Graph** (execution lineage from ADK plugin)\n", + "2. **Biz Graph** (Products, Targeting, Campaigns extracted via AI.GENERATE)\n", + "3. **Cross-Links** (the \"Missing Why\" connecting decisions to entities)\n", + "4. **Persisted Artifacts** (GCS object references for campaign JSON)\n", + "\n", + "Finally, we demonstrate **World Change detection** for long-running A2A tasks." + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "ADCP agent builder ready.\n" - ] - } - ], - "source": [ - "from google.adk.agents import LlmAgent\n", - "from google.genai import types\n", - "\n", - "YAHOO_SALES_AGENT_INSTRUCTION = \"\"\"\\\n", - "You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", - "You process media buying requests from buyer agents and human ad planners.\n", - "\n", - "Workflow for processing a media buying brief:\n", - "1. Parse the buyer's brief to identify: brand, target demographics, budget, and campaign goals.\n", - "2. Query ad inventory for relevant Yahoo products (Yahoo Homepage, Yahoo Mail, Yahoo Finance, Yahoo Sports).\n", - "3. Match the target audience against Yahoo's audience graph.\n", - "4. Allocate the media budget across recommended products.\n", - "5. Present the media plan for human review.\n", - "6. 
Once approved, provision the campaign in Google Ad Manager (GAM).\n", - "\n", - "Guidelines:\n", - "- Always query at least 2 Yahoo products for inventory.\n", - "- Recommend budget allocation based on audience match scores.\n", - "- Flag any premium placements (Yahoo Homepage, Yahoo Finance).\n", - "- Present clear reasoning for each recommendation.\n", - "- Include projected ROI and impression estimates.\n", - "\"\"\"\n", - "\n", - "\n", - "def build_adcp_agent() -> LlmAgent:\n", - " \"\"\"Build the Yahoo ADCP Sales Agent.\"\"\"\n", - " return LlmAgent(\n", - " name=\"yahoo_sales_agent\",\n", - " model=MODEL_NAME,\n", - " instruction=YAHOO_SALES_AGENT_INSTRUCTION,\n", - " tools=[\n", - " query_ad_inventory,\n", - " match_target_audience,\n", - " allocate_media_budget,\n", - " provision_campaign_in_gam,\n", - " ],\n", - " generate_content_config=types.GenerateContentConfig(\n", - " temperature=1.0,\n", - " ),\n", - " )\n", - "\n", - "\n", - "print(\"ADCP agent builder ready.\")" - ] - }, - { - "cell_type": "markdown", - "id": "0bb7021f", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 3: Run ADCP Workflows & Log Traces to BigQuery\n", - "\n", - "We run **three simulated ADCP conversations** representing different media buying scenarios. The `BigQueryAgentAnalyticsPlugin` captures every event." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b0290a9f", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:26:43.631033Z", - "iopub.status.busy": "2026-03-05T09:26:43.630822Z", - "iopub.status.idle": "2026-03-05T09:27:54.876567Z", - "shell.execute_reply": "2026-03-05T09:27:54.875318Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "fb54dbbe", + "metadata": {}, + "source": [ + "## Install Dependencies" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - " Session: adcp-a20d176b82af [ELF Cosmetics -- $50K Brand Awareness Campaign]\n", - "======================================================================\n", - "\n", - "[Turn 1] Buyer: ADCP Media Buying Brief:\n", - "Brand: ELF Cosmetics\n", - "Budget: $50,000\n", - "Campaign Goal: Brand awareness for new skincare line\n", - "Targe...\n", - "------------------------------------------------\n" - ] + "cell_type": "code", + "execution_count": 2, + "id": "ef0c7b17", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:40.613838Z", + "iopub.status.busy": "2026-03-05T09:26:40.613715Z", + "iopub.status.idle": "2026-03-05T09:26:41.651620Z", + "shell.execute_reply": "2026-03-05T09:26:41.651018Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. 
If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621\u001b[0m\u001b[33m\r\n", + "\u001b[0m" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: Ignored the following versions that require a different python version: 1.19.0 Requires-Python >=3.10; 1.20.0 Requires-Python >=3.10; 1.21.0 Requires-Python >=3.10; 1.22.0 Requires-Python >=3.10; 1.22.1 Requires-Python >=3.10; 1.23.0 Requires-Python >=3.10; 1.24.0 Requires-Python >=3.10; 1.24.1 Requires-Python >=3.10; 1.25.0 Requires-Python >=3.10; 1.25.1 Requires-Python >=3.10; 1.26.0 Requires-Python >=3.10\u001b[0m\u001b[31m\r\n", + "\u001b[0m\u001b[31mERROR: Could not find a version that satisfies the requirement bigquery-agent-analytics (from versions: none)\u001b[0m\u001b[31m\r\n", + "\u001b[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\r\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip\u001b[0m\r\n", + "\u001b[31mERROR: No matching distribution found for bigquery-agent-analytics\u001b[0m\u001b[31m\r\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q google-adk bigquery-agent-analytics google-cloud-bigquery nest-asyncio" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning: there are non-text parts in the response: ['function_call', 'function_call'], returning concatenated text result from text parts. 
Check the full candidates.content.parts accessor to get the full model response.\n" - ] + "cell_type": "markdown", + "id": "e654c10e", + "metadata": {}, + "source": [ + "## Authenticate & Configure" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: query_ad_inventory\n", - " -> Tool call: query_ad_inventory\n" - ] + "cell_type": "code", + "execution_count": 3, + "id": "a8a11844", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:41.653979Z", + "iopub.status.busy": "2026-03-05T09:26:41.653848Z", + "iopub.status.idle": "2026-03-05T09:26:41.659483Z", + "shell.execute_reply": "2026-03-05T09:26:41.658883Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not running in Colab -- using default credentials.\n", + "Project : test-project-0728-467323\n", + "Dataset : agent_analytics\n", + "Table : agent_events\n", + "Model : gemini-3-flash-preview\n", + "Vertex AI: enabled\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "try:\n", + " from google.colab import auth\n", + " auth.authenticate_user()\n", + " print(\"Colab authentication successful.\")\n", + "except ImportError:\n", + " print(\"Not running in Colab -- using default credentials.\")\n", + "\n", + "# ---------- Configuration ----------\n", + "PROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"your-project-id\")\n", + "DATASET_ID = os.environ.get(\"BQ_DATASET\", \"agent_analytics\")\n", + "TABLE_ID = os.environ.get(\"BQ_TABLE\", \"agent_events\")\n", + "MODEL_NAME = os.environ.get(\"MODEL_NAME\", \"gemini-2.5-flash\")\n", + "LOCATION = \"US\"\n", + "APP_NAME = \"adcp_context_graph_demo\"\n", + "USER_ID = \"adcp_demo_user\"\n", + "\n", + "os.environ[\"GOOGLE_GENAI_USE_VERTEXAI\"] = \"true\"\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", + "os.environ[\"GOOGLE_CLOUD_LOCATION\"] = \"global\"\n", + "\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "print(f\"Project : 
{PROJECT_ID}\")\n", + "print(f\"Dataset : {DATASET_ID}\")\n", + "print(f\"Table : {TABLE_ID}\")\n", + "print(f\"Model : {MODEL_NAME}\")\n", + "print(f\"Vertex AI: enabled\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: match_target_audience\n" - ] + "cell_type": "markdown", + "id": "55ecb639", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 1: Define ADCP Domain Tools\n", + "\n", + "We create **deterministic tools** that simulate Yahoo's advertising platform. Each tool uses seeded randomness for reproducible demo output." + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: allocate_media_budget\n" - ] + "cell_type": "code", + "execution_count": 4, + "id": "4aef1ce5", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:41.660859Z", + "iopub.status.busy": "2026-03-05T09:26:41.660786Z", + "iopub.status.idle": "2026-03-05T09:26:41.670682Z", + "shell.execute_reply": "2026-03-05T09:26:41.670195Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ADCP tools defined: query_ad_inventory, match_target_audience, allocate_media_budget, provision_campaign_in_gam\n" + ] + } + ], + "source": [ + "import hashlib\n", + "import json\n", + "import random\n", + "from typing import Any\n", + "\n", + "\n", + "def _rng_from(*parts: str) -> random.Random:\n", + " seed = int(hashlib.md5(\"|\".join(parts).encode()).hexdigest()[:8], 16)\n", + " return random.Random(seed)\n", + "\n", + "\n", + "async def query_ad_inventory(\n", + " product_name: str,\n", + " format_type: str = \"display\",\n", + " date_range: str = \"2025-Q2\",\n", + ") -> dict[str, Any]:\n", + " \"\"\"Query available ad inventory for a Yahoo product.\n", + "\n", + " Args:\n", + " product_name: Yahoo ad product (e.g. 
'Yahoo Homepage', 'Yahoo Mail').\n", + " format_type: Ad format ('display', 'native', 'video').\n", + " date_range: Date range for availability.\n", + "\n", + " Returns:\n", + " Inventory availability with pricing.\n", + " \"\"\"\n", + " rng = _rng_from(product_name, format_type, date_range)\n", + " available_impressions = rng.randint(500_000, 50_000_000)\n", + " cpm = round(rng.uniform(3.50, 45.00), 2)\n", + " return {\n", + " \"product\": product_name,\n", + " \"format\": format_type,\n", + " \"date_range\": date_range,\n", + " \"available_impressions\": available_impressions,\n", + " \"cpm_usd\": cpm,\n", + " \"estimated_reach\": rng.randint(100_000, 10_000_000),\n", + " \"viewability_rate\": round(rng.uniform(0.60, 0.95), 2),\n", + " \"status\": rng.choice([\"available\", \"available\", \"available\", \"limited\"]),\n", + " \"premium_placement\": product_name in (\"Yahoo Homepage\", \"Yahoo Finance\"),\n", + " }\n", + "\n", + "\n", + "async def match_target_audience(\n", + " brand: str,\n", + " target_demographics: str,\n", + " campaign_goal: str = \"brand_awareness\",\n", + ") -> dict[str, Any]:\n", + " \"\"\"Match target audience segments against Yahoo's audience graph.\n", + "\n", + " Args:\n", + " brand: Advertiser brand name.\n", + " target_demographics: Target audience description.\n", + " campaign_goal: Campaign objective.\n", + "\n", + " Returns:\n", + " Matched audience segments with reach estimates.\n", + " \"\"\"\n", + " rng = _rng_from(brand, target_demographics)\n", + " segments = [\n", + " {\"segment\": \"Beauty Enthusiasts\", \"match_score\": 0.95},\n", + " {\"segment\": \"Millennials 25-34\", \"match_score\": 0.92},\n", + " {\"segment\": \"Female Shoppers\", \"match_score\": 0.88},\n", + " {\"segment\": \"Health & Wellness\", \"match_score\": 0.76},\n", + " {\"segment\": \"Premium Consumers\", \"match_score\": 0.71},\n", + " ]\n", + " rng.shuffle(segments)\n", + " selected = segments[:rng.randint(3, 5)]\n", + " return {\n", + " \"brand\": 
brand,\n", + " \"target_demographics\": target_demographics,\n", + " \"campaign_goal\": campaign_goal,\n", + " \"matched_segments\": sorted(selected, key=lambda s: -s[\"match_score\"]),\n", + " \"total_addressable_audience\": rng.randint(2_000_000, 15_000_000),\n", + " \"overlap_with_yahoo_users_pct\": round(rng.uniform(0.45, 0.85), 2),\n", + " }\n", + "\n", + "\n", + "async def allocate_media_budget(\n", + " total_budget_usd: float,\n", + " products: str,\n", + " campaign_duration_days: int = 30,\n", + ") -> dict[str, Any]:\n", + " \"\"\"Allocate media budget across Yahoo ad products.\n", + "\n", + " Args:\n", + " total_budget_usd: Total campaign budget in USD.\n", + " products: Comma-separated list of Yahoo products.\n", + " campaign_duration_days: Campaign length in days.\n", + "\n", + " Returns:\n", + " Budget allocation with ROI projections.\n", + " \"\"\"\n", + " product_list = [p.strip() for p in products.split(\",\")]\n", + " rng = _rng_from(str(total_budget_usd), products)\n", + " allocations = []\n", + " remaining = total_budget_usd\n", + "\n", + " for i, product in enumerate(product_list):\n", + " if i == len(product_list) - 1:\n", + " amount = round(remaining, 2)\n", + " else:\n", + " pct = rng.uniform(0.15, 0.50)\n", + " amount = round(total_budget_usd * pct, 2)\n", + " remaining -= amount\n", + "\n", + " allocations.append({\n", + " \"product\": product,\n", + " \"allocated_usd\": amount,\n", + " \"pct_of_total\": round(amount / total_budget_usd * 100, 1),\n", + " \"estimated_impressions\": int(amount / rng.uniform(5, 25) * 1000),\n", + " \"projected_ctr_pct\": round(rng.uniform(0.8, 3.5), 2),\n", + " })\n", + "\n", + " return {\n", + " \"total_budget_usd\": total_budget_usd,\n", + " \"campaign_duration_days\": campaign_duration_days,\n", + " \"allocations\": allocations,\n", + " \"projected_total_impressions\": sum(\n", + " a[\"estimated_impressions\"] for a in allocations\n", + " ),\n", + " \"projected_roas\": round(rng.uniform(2.5, 8.0), 2),\n", + 
" }\n", + "\n", + "\n", + "async def provision_campaign_in_gam(\n", + " campaign_name: str,\n", + " advertiser: str,\n", + " budget_usd: float,\n", + " start_date: str,\n", + " end_date: str,\n", + " targeting_segments: str,\n", + ") -> dict[str, Any]:\n", + " \"\"\"Provision a campaign in Google Ad Manager (GAM).\n", + "\n", + " Args:\n", + " campaign_name: Name for the campaign.\n", + " advertiser: Advertiser name.\n", + " budget_usd: Total budget.\n", + " start_date: Campaign start date.\n", + " end_date: Campaign end date.\n", + " targeting_segments: Comma-separated targeting segments.\n", + "\n", + " Returns:\n", + " Provisioned campaign details with line item IDs.\n", + " \"\"\"\n", + " rng = _rng_from(campaign_name, advertiser)\n", + " line_item_id = f\"LI-{rng.randint(100000, 999999)}\"\n", + " order_id = f\"ORD-{rng.randint(10000, 99999)}\"\n", + "\n", + " artifact = {\n", + " \"campaign_name\": campaign_name,\n", + " \"advertiser\": advertiser,\n", + " \"order_id\": order_id,\n", + " \"line_item_id\": line_item_id,\n", + " \"budget_usd\": budget_usd,\n", + " \"start_date\": start_date,\n", + " \"end_date\": end_date,\n", + " \"targeting\": [s.strip() for s in targeting_segments.split(\",\")],\n", + " \"status\": \"provisioned\",\n", + " \"gcs_artifact_uri\": (\n", + " f\"gs://adcp-artifacts/{advertiser.lower().replace(' ', '_')}/\"\n", + " f\"{line_item_id}.json\"\n", + " ),\n", + " }\n", + " return artifact\n", + "\n", + "\n", + "print(\"ADCP tools defined: query_ad_inventory, match_target_audience,\"\n", + " \" allocate_media_budget, provision_campaign_in_gam\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[Sales Agent]: Based on your brief for **ELF Cosmetics**, I have developed a comprehensive media plan to drive brand awareness for your new skincare line among Millennials and beauty enthusiasts.\n", - "\n", - "### **Campaign Overview**\n", - "* **Brand:** ELF Cosmetics\n", - "* **Campaign Goal:** Brand Awareness\n", 
- "* **Total Budget:** $50,000\n", - "* **Flight Dates:** 2025-05-01 to 2025-05-31\n", - "* **Total Projected Impressions:** ~8,166,935\n", - "* **Projected ROAS:** 3.55\n", - "\n", - "---\n", - "\n", - "### **Audience Strategy**\n", - "We matched your requirements against Yahoo’s audience graph, identifying a total addressable audience of **7.28M users**. \n", - "* **Primary Segment:** Beauty Enthusiasts (Match Score: 0.95)\n", - "* **Secondary Segment:** Millennials 25-34 (Match Score: 0.92)\n", - "* **Supporting Segment:** Female Shoppers (Match Score: 0.88)\n", - "\n", - "---\n", - "\n", - "### **Media Allo\n", - " ... (truncated, 2212 chars total)\n", - "\n", - "[Turn 2] Buyer: The media plan is approved by the ad-ops manager. Please provision the campaign in Google Ad Manager with campaign name ...\n", - "------------------------------------------------\n" - ] + "cell_type": "markdown", + "id": "912fd1fc", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 2: Build Multi-Agent System with ADK\n", + "\n", + "We create a **Yahoo Sales Agent** that orchestrates the ADCP workflow. It processes the buyer's brief, queries inventory, matches audiences, allocates budget, and pauses for HITL approval before provisioning." 
+ ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: provision_campaign_in_gam\n" - ] + "cell_type": "code", + "execution_count": 5, + "id": "25290b99", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:41.672179Z", + "iopub.status.busy": "2026-03-05T09:26:41.672085Z", + "iopub.status.idle": "2026-03-05T09:26:43.629409Z", + "shell.execute_reply": "2026-03-05T09:26:43.628925Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.13/site-packages/requests/__init__.py:113: RequestsDependencyWarning: urllib3 (2.6.3) or chardet (6.0.0.post1)/charset_normalizer (3.4.4) doesn't match a supported version!\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ADCP agent builder ready.\n" + ] + } + ], + "source": [ + "from google.adk.agents import LlmAgent\n", + "from google.genai import types\n", + "\n", + "YAHOO_SALES_AGENT_INSTRUCTION = \"\"\"\\\n", + "You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", + "You process media buying requests from buyer agents and human ad planners.\n", + "\n", + "Workflow for processing a media buying brief:\n", + "1. Parse the buyer's brief to identify: brand, target demographics, budget, and campaign goals.\n", + "2. Query ad inventory for relevant Yahoo products (Yahoo Homepage, Yahoo Mail, Yahoo Finance, Yahoo Sports).\n", + "3. Match the target audience against Yahoo's audience graph.\n", + "4. Allocate the media budget across recommended products.\n", + "5. Present the media plan for human review.\n", + "6. 
Once approved, provision the campaign in Google Ad Manager (GAM).\n", + "\n", + "Guidelines:\n", + "- Always query at least 2 Yahoo products for inventory.\n", + "- Recommend budget allocation based on audience match scores.\n", + "- Flag any premium placements (Yahoo Homepage, Yahoo Finance).\n", + "- Present clear reasoning for each recommendation.\n", + "- Include projected ROI and impression estimates.\n", + "\"\"\"\n", + "\n", + "\n", + "def build_adcp_agent() -> LlmAgent:\n", + " \"\"\"Build the Yahoo ADCP Sales Agent.\"\"\"\n", + " return LlmAgent(\n", + " name=\"yahoo_sales_agent\",\n", + " model=MODEL_NAME,\n", + " instruction=YAHOO_SALES_AGENT_INSTRUCTION,\n", + " tools=[\n", + " query_ad_inventory,\n", + " match_target_audience,\n", + " allocate_media_budget,\n", + " provision_campaign_in_gam,\n", + " ],\n", + " generate_content_config=types.GenerateContentConfig(\n", + " temperature=1.0,\n", + " ),\n", + " )\n", + "\n", + "\n", + "print(\"ADCP agent builder ready.\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[Sales Agent]: The campaign has been successfully provisioned in Google Ad Manager.\n", - "\n", - "### **Campaign Provisioning Summary**\n", - "* **Campaign Name:** ELF_Skincare_May2025\n", - "* **Advertiser:** ELF Cosmetics\n", - "* **Order ID:** `ORD-70678`\n", - "* **Line Item ID:** `LI-756565`\n", - "* **Budget:** $50,000.00\n", - "* **Flight Dates:** 2025-05-01 to 2025-05-31\n", - "* **Targeting:** Beauty Enthusiasts, Millennials 25-34\n", - "* **Status:** Provisioned\n", - "\n", - "The campaign is now ready for creative upload and final activation by your ad-ops team. 
You can find the full configuration details in the GCS artifact: `gs://adcp-artifacts/elf_cosmetics/LI-756565.json`.\n", - "\n", - "Is there anything else you need assistance with today?\n", - "\n", - "======================================================================\n", - " Session: adcp-7d9855e7a71b [Nike -- $200K Multi-Product Performance Campaign]\n", - "======================================================================\n", - "\n", - "[Turn 1] Buyer: ADCP Media Buying Brief:\n", - "Brand: Nike\n", - "Budget: $200,000\n", - "Campaign Goal: Product launch for Air Max 2025\n", - "Target Demographics...\n", - "------------------------------------------------\n" - ] + "cell_type": "markdown", + "id": "0bb7021f", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 3: Run ADCP Workflows & Log Traces to BigQuery\n", + "\n", + "We run **three simulated ADCP conversations** representing different media buying scenarios. The `BigQueryAgentAnalyticsPlugin` captures every event." 
+ ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: query_ad_inventory\n", - " -> Tool call: query_ad_inventory\n", - " -> Tool call: query_ad_inventory\n", - " -> Tool call: match_target_audience\n" - ] + "cell_type": "code", + "execution_count": 6, + "id": "b0290a9f", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:26:43.631033Z", + "iopub.status.busy": "2026-03-05T09:26:43.630822Z", + "iopub.status.idle": "2026-03-05T09:27:54.876567Z", + "shell.execute_reply": "2026-03-05T09:27:54.875318Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + " Session: adcp-a20d176b82af [ELF Cosmetics -- $50K Brand Awareness Campaign]\n", + "======================================================================\n", + "\n", + "[Turn 1] Buyer: ADCP Media Buying Brief:\n", + "Brand: ELF Cosmetics\n", + "Budget: $50,000\n", + "Campaign Goal: Brand awareness for new skincare line\n", + "Targe...\n", + "------------------------------------------------\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: there are non-text parts in the response: ['function_call', 'function_call'], returning concatenated text result from text parts. 
Check the full candidates.content.parts accessor to get the full model response.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> Tool call: query_ad_inventory\n", + " -> Tool call: query_ad_inventory\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> Tool call: match_target_audience\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> Tool call: allocate_media_budget\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Sales Agent]: Based on your brief for **ELF Cosmetics**, I have developed a comprehensive media plan to drive brand awareness for your new skincare line among Millennials and beauty enthusiasts.\n", + "\n", + "### **Campaign Overview**\n", + "* **Brand:** ELF Cosmetics\n", + "* **Campaign Goal:** Brand Awareness\n", + "* **Total Budget:** $50,000\n", + "* **Flight Dates:** 2025-05-01 to 2025-05-31\n", + "* **Total Projected Impressions:** ~8,166,935\n", + "* **Projected ROAS:** 3.55\n", + "\n", + "---\n", + "\n", + "### **Audience Strategy**\n", + "We matched your requirements against Yahoo’s audience graph, identifying a total addressable audience of **7.28M users**. \n", + "* **Primary Segment:** Beauty Enthusiasts (Match Score: 0.95)\n", + "* **Secondary Segment:** Millennials 25-34 (Match Score: 0.92)\n", + "* **Supporting Segment:** Female Shoppers (Match Score: 0.88)\n", + "\n", + "---\n", + "\n", + "### **Media Allo\n", + " ... (truncated, 2212 chars total)\n", + "\n", + "[Turn 2] Buyer: The media plan is approved by the ad-ops manager. 
Please provision the campaign in Google Ad Manager with campaign name ...\n", + "------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> Tool call: provision_campaign_in_gam\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Sales Agent]: The campaign has been successfully provisioned in Google Ad Manager.\n", + "\n", + "### **Campaign Provisioning Summary**\n", + "* **Campaign Name:** ELF_Skincare_May2025\n", + "* **Advertiser:** ELF Cosmetics\n", + "* **Order ID:** `ORD-70678`\n", + "* **Line Item ID:** `LI-756565`\n", + "* **Budget:** $50,000.00\n", + "* **Flight Dates:** 2025-05-01 to 2025-05-31\n", + "* **Targeting:** Beauty Enthusiasts, Millennials 25-34\n", + "* **Status:** Provisioned\n", + "\n", + "The campaign is now ready for creative upload and final activation by your ad-ops team. You can find the full configuration details in the GCS artifact: `gs://adcp-artifacts/elf_cosmetics/LI-756565.json`.\n", + "\n", + "Is there anything else you need assistance with today?\n", + "\n", + "======================================================================\n", + " Session: adcp-7d9855e7a71b [Nike -- $200K Multi-Product Performance Campaign]\n", + "======================================================================\n", + "\n", + "[Turn 1] Buyer: ADCP Media Buying Brief:\n", + "Brand: Nike\n", + "Budget: $200,000\n", + "Campaign Goal: Product launch for Air Max 2025\n", + "Target Demographics...\n", + "------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> Tool call: query_ad_inventory\n", + " -> Tool call: query_ad_inventory\n", + " -> Tool call: query_ad_inventory\n", + " -> Tool call: match_target_audience\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> Tool call: allocate_media_budget\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "\n", + "[Sales Agent]: Based on your brief for the **Nike Air Max 2025** product launch, I have analyzed Yahoo's inventory and audience graph to develop a recommended media plan.\n", + "\n", + "### **Campaign Overview**\n", + "* **Brand:** Nike\n", + "* **Budget:** $200,000\n", + "* **Flight Dates:** June 1, 2025 – June 30, 2025 (30 Days)\n", + "* **Goal:** High-impact product launch & reach among sports/sneaker enthusiasts.\n", + "\n", + "---\n", + "\n", + "### **Media Plan Recommendation**\n", + "\n", + "| Product | Budget Allocation | Estimated Impressions | CPM (Avg) | Projected CTR |\n", + "| :--- | :--- | :--- | :--- | :--- |\n", + "| **Yahoo Finance** | $121,327.80 (60.7%) | 5,135,264 | $19.05 | 1.21% |\n", + "| **Yahoo Homepage**| $46,134.57 (23.1%) | 2,784,630 | $21.06 | 2.35% |\n", + "| **Yahoo Sports** | $32,537.63 (16.3%) | 1,475,740 | $5.81 | 2.25% |\n", + "| **TOTAL** | **$200,000.00** | **9,39\n", + " ... (truncated, 1984 chars total)\n", + "\n", + "======================================================================\n", + " Session: adcp-2c401a645c40 [Tesla -- $100K Targeted EV Campaign]\n", + "======================================================================\n", + "\n", + "[Turn 1] Buyer: ADCP Media Buying Brief:\n", + "Brand: Tesla\n", + "Budget: $100,000\n", + "Campaign Goal: Lead generation for Model Y test drives\n", + "Target Dem...\n", + "------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> Tool call: query_ad_inventory\n", + " -> Tool call: query_ad_inventory\n", + " -> Tool call: match_target_audience\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> Tool call: allocate_media_budget\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> Tool call: provision_campaign_in_gam\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Sales Agent]: I 
have successfully processed your media buying brief for the **Tesla Model Y Lead Generation Campaign**. \n", + "\n", + "### **1. Media Plan Overview**\n", + "Based on your budget of **$100,000** and target audience of **Tech-savvy professionals and EV intenders**, I have optimized the allocation across your preferred Yahoo products.\n", + "\n", + "| Yahoo Product | Format | Budget Allocation | Est. Impressions | Key Reason |\n", + "| :--- | :--- | :--- | :--- | :--- |\n", + "| **Yahoo Finance** | Display | $29,465.15 (29.5%) | 1,432,055 | **Premium Placement:** High affinity with high-net-worth tech professionals. |\n", + "| **Yahoo Mail** | Native | $70,534.85 (70.5%) | 5,808,834 | **High Engagement:** Native ads in inbox provide superior CTR (1.19%) for lead gen. |\n", + "\n", + "**Campaign Performance Projections:**\n", + "* **Total Projected Impressions:** \n", + " ... (truncated, 1746 chars total)\n", + "\n", + "\n", + "Session IDs: ['adcp-a20d176b82af', 'adcp-7d9855e7a71b', 'adcp-2c401a645c40']\n" + ] + } + ], + "source": [ + "import asyncio\n", + "import uuid\n", + "\n", + "from google.adk.plugins.bigquery_agent_analytics_plugin import (\n", + " BigQueryAgentAnalyticsPlugin,\n", + " BigQueryLoggerConfig,\n", + ")\n", + "from google.adk.runners import Runner\n", + "from google.adk.sessions import InMemorySessionService\n", + "\n", + "agent = build_adcp_agent()\n", + "session_service = InMemorySessionService()\n", + "\n", + "plugin = BigQueryAgentAnalyticsPlugin(\n", + " project_id=PROJECT_ID,\n", + " dataset_id=DATASET_ID,\n", + " config=BigQueryLoggerConfig(\n", + " table_id=TABLE_ID,\n", + " batch_size=1,\n", + " batch_flush_interval=1.0,\n", + " ),\n", + " location=LOCATION,\n", + ")\n", + "\n", + "runner = Runner(\n", + " agent=agent,\n", + " app_name=APP_NAME,\n", + " session_service=session_service,\n", + " plugins=[plugin],\n", + ")\n", + "\n", + "# ---------- Three ADCP Conversations ----------\n", + "conversations = [\n", + " {\n", + " \"label\": 
\"ELF Cosmetics -- $50K Brand Awareness Campaign\",\n", + " \"messages\": [\n", + " (\n", + " \"ADCP Media Buying Brief:\\n\"\n", + " \"Brand: ELF Cosmetics\\n\"\n", + " \"Budget: $50,000\\n\"\n", + " \"Campaign Goal: Brand awareness for new skincare line\\n\"\n", + " \"Target Demographics: Millennials 25-34, beauty enthusiasts\\n\"\n", + " \"Flight Dates: 2025-05-01 to 2025-05-31\\n\"\n", + " \"Preferred Products: Yahoo Homepage, Yahoo Mail\\n\\n\"\n", + " \"Please process this brief: query inventory for Yahoo Homepage \"\n", + " \"and Yahoo Mail, match the target audience, allocate the $50,000 \"\n", + " \"budget across the recommended products, and present the media plan.\"\n", + " ),\n", + " (\n", + " \"The media plan is approved by the ad-ops manager. \"\n", + " \"Please provision the campaign in Google Ad Manager with \"\n", + " \"campaign name 'ELF_Skincare_May2025', advertiser 'ELF Cosmetics', \"\n", + " \"budget $50000, start date 2025-05-01, end date 2025-05-31, \"\n", + " \"and targeting segments 'Beauty Enthusiasts, Millennials 25-34'.\"\n", + " ),\n", + " ],\n", + " },\n", + " {\n", + " \"label\": \"Nike -- $200K Multi-Product Performance Campaign\",\n", + " \"messages\": [\n", + " (\n", + " \"ADCP Media Buying Brief:\\n\"\n", + " \"Brand: Nike\\n\"\n", + " \"Budget: $200,000\\n\"\n", + " \"Campaign Goal: Product launch for Air Max 2025\\n\"\n", + " \"Target Demographics: Sports enthusiasts 18-45, sneakerheads\\n\"\n", + " \"Flight Dates: 2025-06-01 to 2025-06-30\\n\"\n", + " \"Preferred Products: Yahoo Sports, Yahoo Homepage, Yahoo Finance\\n\\n\"\n", + " \"Process this brief: check inventory for Yahoo Sports, \"\n", + " \"Yahoo Homepage, and Yahoo Finance. 
Match target audiences \"\n", + " \"and recommend a budget split across all three products.\"\n", + " ),\n", + " ],\n", + " },\n", + " {\n", + " \"label\": \"Tesla -- $100K Targeted EV Campaign\",\n", + " \"messages\": [\n", + " (\n", + " \"ADCP Media Buying Brief:\\n\"\n", + " \"Brand: Tesla\\n\"\n", + " \"Budget: $100,000\\n\"\n", + " \"Campaign Goal: Lead generation for Model Y test drives\\n\"\n", + " \"Target Demographics: Tech-savvy professionals 30-55, EV intenders\\n\"\n", + " \"Flight Dates: 2025-07-01 to 2025-07-31\\n\"\n", + " \"Preferred Products: Yahoo Finance, Yahoo Mail\\n\\n\"\n", + " \"Query inventory for Yahoo Finance and Yahoo Mail, \"\n", + " \"match the target audience, and allocate the budget. \"\n", + " \"Then provision the campaign in GAM with name 'Tesla_ModelY_Jul2025', \"\n", + " \"advertiser 'Tesla', budget $100000, start date 2025-07-01, \"\n", + " \"end date 2025-07-31, targeting 'Tech Professionals, EV Intenders'.\"\n", + " ),\n", + " ],\n", + " },\n", + "]\n", + "\n", + "\n", + "async def run_conversation(messages, label=\"\"):\n", + " \"\"\"Run a multi-turn ADCP conversation.\"\"\"\n", + " session_id = f\"adcp-{uuid.uuid4().hex[:12]}\"\n", + " await session_service.create_session(\n", + " app_name=APP_NAME,\n", + " user_id=USER_ID,\n", + " session_id=session_id,\n", + " )\n", + " print(f\"\\n{'=' * 70}\")\n", + " print(f\" Session: {session_id} [{label}]\")\n", + " print(f\"{'=' * 70}\")\n", + "\n", + " for i, message in enumerate(messages, 1):\n", + " print(f\"\\n[Turn {i}] Buyer: {message[:120]}...\")\n", + " print(\"-\" * 48)\n", + " user_content = types.Content(\n", + " role=\"user\",\n", + " parts=[types.Part(text=message)],\n", + " )\n", + " response_parts = []\n", + " async for event in runner.run_async(\n", + " user_id=USER_ID,\n", + " session_id=session_id,\n", + " new_message=user_content,\n", + " ):\n", + " if event.content and event.content.parts:\n", + " for part in event.content.parts:\n", + " if hasattr(part, 
\"text\") and part.text:\n", + " response_parts.append(part.text)\n", + " elif hasattr(part, \"function_call\") and part.function_call:\n", + " print(f\" -> Tool call: {part.function_call.name}\")\n", + " if response_parts:\n", + " text = \"\\n\".join(response_parts)\n", + " print(f\"\\n[Sales Agent]: {text[:800]}\")\n", + " if len(text) > 800:\n", + " print(f\" ... (truncated, {len(text)} chars total)\")\n", + " return session_id\n", + "\n", + "\n", + "# Run all conversations\n", + "session_ids = []\n", + "for conv in conversations:\n", + " sid = asyncio.get_event_loop().run_until_complete(\n", + " run_conversation(conv[\"messages\"], label=conv[\"label\"])\n", + " )\n", + " session_ids.append(sid)\n", + "\n", + "print(f\"\\n\\nSession IDs: {session_ids}\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: allocate_media_budget\n" - ] + "cell_type": "code", + "execution_count": 7, + "id": "f9548cc1", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:27:54.879040Z", + "iopub.status.busy": "2026-03-05T09:27:54.878625Z", + "iopub.status.idle": "2026-03-05T09:28:09.889215Z", + "shell.execute_reply": "2026-03-05T09:28:09.888134Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Flushing traces to BigQuery ...\n", + "Waiting 15s for BigQuery data to settle ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "print(\"Flushing traces to BigQuery ...\")\n", + "try:\n", + " asyncio.get_event_loop().run_until_complete(plugin.flush())\n", + "except Exception as exc:\n", + " print(f\"Flush warning: {exc}\")\n", + "\n", + "settle_seconds = 15\n", + "print(f\"Waiting {settle_seconds}s for BigQuery data to settle ...\")\n", + "time.sleep(settle_seconds)\n", + "print(\"Done.\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[Sales Agent]: Based on your 
brief for the **Nike Air Max 2025** product launch, I have analyzed Yahoo's inventory and audience graph to develop a recommended media plan.\n", - "\n", - "### **Campaign Overview**\n", - "* **Brand:** Nike\n", - "* **Budget:** $200,000\n", - "* **Flight Dates:** June 1, 2025 – June 30, 2025 (30 Days)\n", - "* **Goal:** High-impact product launch & reach among sports/sneaker enthusiasts.\n", - "\n", - "---\n", - "\n", - "### **Media Plan Recommendation**\n", - "\n", - "| Product | Budget Allocation | Estimated Impressions | CPM (Avg) | Projected CTR |\n", - "| :--- | :--- | :--- | :--- | :--- |\n", - "| **Yahoo Finance** | $121,327.80 (60.7%) | 5,135,264 | $19.05 | 1.21% |\n", - "| **Yahoo Homepage**| $46,134.57 (23.1%) | 2,784,630 | $21.06 | 2.35% |\n", - "| **Yahoo Sports** | $32,537.63 (16.3%) | 1,475,740 | $5.81 | 2.25% |\n", - "| **TOTAL** | **$200,000.00** | **9,39\n", - " ... (truncated, 1984 chars total)\n", - "\n", - "======================================================================\n", - " Session: adcp-2c401a645c40 [Tesla -- $100K Targeted EV Campaign]\n", - "======================================================================\n", - "\n", - "[Turn 1] Buyer: ADCP Media Buying Brief:\n", - "Brand: Tesla\n", - "Budget: $100,000\n", - "Campaign Goal: Lead generation for Model Y test drives\n", - "Target Dem...\n", - "------------------------------------------------\n" - ] + "cell_type": "markdown", + "id": "34e76dd7", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 4: Trace Retrieval & Visualization\n", + "\n", + "Use the SDK Client to fetch traces and render the hierarchical execution DAG." 
+ ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: query_ad_inventory\n", - " -> Tool call: query_ad_inventory\n", - " -> Tool call: match_target_audience\n" - ] + "cell_type": "code", + "execution_count": 8, + "id": "fade6aa2", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:28:09.890911Z", + "iopub.status.busy": "2026-03-05T09:28:09.890803Z", + "iopub.status.idle": "2026-03-05T09:28:11.701148Z", + "shell.execute_reply": "2026-03-05T09:28:11.699926Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SDK Client initialised.\n" + ] + } + ], + "source": [ + "from bigquery_agent_analytics import Client, TraceFilter\n", + "\n", + "client = Client(\n", + " project_id=PROJECT_ID,\n", + " dataset_id=DATASET_ID,\n", + " table_id=TABLE_ID,\n", + " location=LOCATION,\n", + " endpoint=MODEL_NAME,\n", + ")\n", + "print(\"SDK Client initialised.\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: allocate_media_budget\n" - ] + "cell_type": "code", + "execution_count": 9, + "id": "b703e431", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:28:11.710446Z", + "iopub.status.busy": "2026-03-05T09:28:11.710247Z", + "iopub.status.idle": "2026-03-05T09:28:14.652298Z", + "shell.execute_reply": "2026-03-05T09:28:14.651317Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + " Trace for session: adcp-a20d176b82af\n", + "======================================================================\n", + "Trace: e-7c76dd0c-74c0-4b77-80cc-074a0b546c97 | Session: adcp-a20d176b82af | 24545ms\n", + "====================================================================================\n", + "└─ [✓] USER_MESSAGE_RECEIVED [yahoo_sales_agent] - ADCP Media Buying Brief:\n", + "Brand: ELF Cosmetics\n", + "Budget: $50,000\n", + 
"Campaign Goal: Brand awareness for new skincare line\n", + "Ta...\n", + "└─ [✓] INVOCATION_STARTING [yahoo_sales_agent]\n", + "└─ [✓] INVOCATION_COMPLETED [yahoo_sales_agent] (32135ms)\n", + " ├─ [✓] AGENT_STARTING [yahoo_sales_agent] - You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", + "You process media buying requests f...\n", + " └─ [✓] AGENT_COMPLETED [yahoo_sales_agent] (18226ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: ELF Cosmetics\n", + "Budget: $50,000\n", + "Campaign Goal: Brand awareness for new skincare line\n", + "Ta...\n", + " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3938ms) - call: query_ad_inventory | call: query_ad_inventory\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: ELF Cosmetics\n", + "Budget: $50,000\n", + "Campaign Goal: Brand awareness for new skincare line\n", + "Ta...\n", + " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (2437ms) - call: match_target_audience\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (match_target_audience)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (match_target_audience) (0ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: ELF Cosmetics\n", + "Budget: $50,000\n", + "Campaign Goal: Brand awareness for new skincare line\n", + "Ta...\n", + " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (2457ms) - call: allocate_media_budget\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (allocate_media_budget)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (allocate_media_budget) (0ms)\n", + " ├─ [✓] 
LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: ELF Cosmetics\n", + "Budget: $50,000\n", + "Campaign Goal: Brand awareness for new skincare line\n", + "Ta...\n", + " └─ [✓] LLM_RESPONSE [yahoo_sales_agent] (9298ms) - text: 'Based on your brief for **ELF Cosmetics**, I have developed a comprehensive media plan to drive brand awarenes...\n", + "└─ [✓] USER_MESSAGE_RECEIVED [yahoo_sales_agent] - The media plan is approved by the ad-ops manager. Please provision the campaign in Google Ad Manager with campaign na...\n", + "└─ [✓] INVOCATION_STARTING [yahoo_sales_agent]\n", + "└─ [✓] INVOCATION_COMPLETED [yahoo_sales_agent] (5859ms)\n", + " ├─ [✓] AGENT_STARTING [yahoo_sales_agent] - You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", + "You process media buying requests f...\n", + " └─ [✓] AGENT_COMPLETED [yahoo_sales_agent] (5858ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: ELF Cosmetics\n", + "Budget: $50,000\n", + "Campaign Goal: Brand awareness for new skincare line\n", + "Ta...\n", + " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3038ms) - call: provision_campaign_in_gam\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (provision_campaign_in_gam)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (provision_campaign_in_gam) (0ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: ELF Cosmetics\n", + "Budget: $50,000\n", + "Campaign Goal: Brand awareness for new skincare line\n", + "Ta...\n", + " └─ [✓] LLM_RESPONSE [yahoo_sales_agent] (2813ms) - text: 'The campaign has been successfully provisioned in Google Ad Manager.\n", + "\n", + "### **Campaign Provisioning Summary**\n", + "* ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + " Trace for 
session: adcp-7d9855e7a71b\n", + "======================================================================\n", + "Trace: e-3f6807e1-d3b9-4eb9-a0e3-f23dd72cee57 | Session: adcp-7d9855e7a71b | 14565ms\n", + "====================================================================================\n", + "└─ [✓] USER_MESSAGE_RECEIVED [yahoo_sales_agent] - ADCP Media Buying Brief:\n", + "Brand: Nike\n", + "Budget: $200,000\n", + "Campaign Goal: Product launch for Air Max 2025\n", + "Target Demograph...\n", + "└─ [✓] INVOCATION_STARTING [yahoo_sales_agent]\n", + "└─ [✓] INVOCATION_COMPLETED [yahoo_sales_agent] (14564ms)\n", + " ├─ [✓] AGENT_STARTING [yahoo_sales_agent] - You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", + "You process media buying requests f...\n", + " └─ [✓] AGENT_COMPLETED [yahoo_sales_agent] (14564ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: Nike\n", + "Budget: $200,000\n", + "Campaign Goal: Product launch for Air Max 2025\n", + "Target Demograph...\n", + " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3732ms) - call: query_ad_inventory | call: query_ad_inventory | call: query_ad_inventory | call: match_target_audience\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (match_target_audience)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (match_target_audience) (0ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: Nike\n", + "Budget: $200,000\n", + "Campaign 
Goal: Product launch for Air Max 2025\n", + "Target Demograph...\n", + " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3063ms) - call: allocate_media_budget\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (allocate_media_budget)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (allocate_media_budget) (0ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: Nike\n", + "Budget: $200,000\n", + "Campaign Goal: Product launch for Air Max 2025\n", + "Target Demograph...\n", + " └─ [✓] LLM_RESPONSE [yahoo_sales_agent] (7752ms) - text: 'Based on your brief for the **Nike Air Max 2025** product launch, I have analyzed Yahoo's inventory and audien...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + " Trace for session: adcp-2c401a645c40\n", + "======================================================================\n", + "Trace: e-27133476-0092-41cb-ba67-1e99d226cae5 | Session: adcp-2c401a645c40 | 15069ms\n", + "====================================================================================\n", + "└─ [✓] USER_MESSAGE_RECEIVED [yahoo_sales_agent] - ADCP Media Buying Brief:\n", + "Brand: Tesla\n", + "Budget: $100,000\n", + "Campaign Goal: Lead generation for Model Y test drives\n", + "Target ...\n", + "└─ [✓] INVOCATION_STARTING [yahoo_sales_agent]\n", + "└─ [✓] INVOCATION_COMPLETED [yahoo_sales_agent] (15068ms)\n", + " ├─ [✓] AGENT_STARTING [yahoo_sales_agent] - You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", + "You process media buying requests f...\n", + " └─ [✓] AGENT_COMPLETED [yahoo_sales_agent] (15068ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: Tesla\n", + "Budget: $100,000\n", + "Campaign Goal: Lead generation for Model Y test drives\n", + "Target ...\n", + " ├─ [✓] LLM_RESPONSE 
[yahoo_sales_agent] (3617ms) - call: query_ad_inventory | call: query_ad_inventory | call: match_target_audience\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (match_target_audience)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (match_target_audience) (0ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: Tesla\n", + "Budget: $100,000\n", + "Campaign Goal: Lead generation for Model Y test drives\n", + "Target ...\n", + " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (2579ms) - call: allocate_media_budget\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (allocate_media_budget)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (allocate_media_budget) (0ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: Tesla\n", + "Budget: $100,000\n", + "Campaign Goal: Lead generation for Model Y test drives\n", + "Target ...\n", + " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3151ms) - call: provision_campaign_in_gam\n", + " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (provision_campaign_in_gam)\n", + " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (provision_campaign_in_gam) (0ms)\n", + " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", + "Brand: Tesla\n", + "Budget: $100,000\n", + "Campaign Goal: Lead generation for Model Y test drives\n", + "Target ...\n", + " └─ [✓] LLM_RESPONSE [yahoo_sales_agent] (5700ms) - text: 'I have successfully processed your media buying brief for the **Tesla Model Y Lead Generation Campaign**. 
\n", + "\n", + "##...\n" + ] + } + ], + "source": [ + "# Retrieve and render each trace\n", + "traces = []\n", + "for sid in session_ids:\n", + " try:\n", + " trace = client.get_session_trace(sid)\n", + " traces.append(trace)\n", + " print(f\"\\n{'=' * 70}\")\n", + " print(f\" Trace for session: {sid}\")\n", + " print(f\"{'=' * 70}\")\n", + " _ = trace.render()\n", + " except Exception as exc:\n", + " print(f\"Could not retrieve trace {sid}: {exc}\")\n", + " traces.append(None)" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " -> Tool call: provision_campaign_in_gam\n" - ] + "cell_type": "code", + "execution_count": 10, + "id": "d39c6cf4", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:28:14.653806Z", + "iopub.status.busy": "2026-03-05T09:28:14.653707Z", + "iopub.status.idle": "2026-03-05T09:28:14.657544Z", + "shell.execute_reply": "2026-03-05T09:28:14.657021Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- Session 1: adcp-a20d176b82af ---\n", + " Total spans : 32\n", + " Tool calls : 5\n", + " - query_ad_inventory\n", + " - query_ad_inventory\n", + " - match_target_audience\n", + " - allocate_media_budget\n", + " - provision_campaign_in_gam\n", + " Final response : text: 'The campaign has been successfully provisioned in Google Ad Manager.\n", + "\n", + "### **Campaign Provisioning Summary**\n", + "* **Campaign Name:** ELF_Skincare_May2025\n", + "* **Advertiser:** ELF Cosmetics\n", + "* **Order ID:** `ORD-70678`\n", + "* **Line Item ID:** `LI-756565`\n", + "* **Budget:** $50,000.00\n", + "* **Flight Dat\n", + " Error spans : 0\n", + " Total latency : 24545ms\n", + "\n", + "--- Session 2: adcp-7d9855e7a71b ---\n", + " Total spans : 21\n", + " Tool calls : 5\n", + " - query_ad_inventory\n", + " - query_ad_inventory\n", + " - query_ad_inventory\n", + " - match_target_audience\n", + " - allocate_media_budget\n", + " Final response : text: 'Based on your brief for 
the **Nike Air Max 2025** product launch, I have analyzed Yahoo's inventory and audience graph to develop a recommended media plan.\n", + "\n", + "### **Campaign Overview**\n", + "* **Brand:** Nike\n", + "* **Budget:** $200,000\n", + "* **Flight Dates:** June 1, 2025 – June 30, 2025 (30 Days)\n", + "* \n", + " Error spans : 0\n", + " Total latency : 14565ms\n", + "\n", + "--- Session 3: adcp-2c401a645c40 ---\n", + " Total spans : 23\n", + " Tool calls : 5\n", + " - query_ad_inventory\n", + " - query_ad_inventory\n", + " - match_target_audience\n", + " - allocate_media_budget\n", + " - provision_campaign_in_gam\n", + " Final response : text: 'I have successfully processed your media buying brief for the **Tesla Model Y Lead Generation Campaign**. \n", + "\n", + "### **1. Media Plan Overview**\n", + "Based on your budget of **$100,000** and target audience of **Tech-savvy professionals and EV intenders**, I have optimized the allocation across your pre\n", + " Error spans : 0\n", + " Total latency : 15069ms\n" + ] + } + ], + "source": [ + "# Inspect ADCP-specific trace properties\n", + "for i, trace in enumerate(traces):\n", + " if trace is None:\n", + " continue\n", + " print(f\"\\n--- Session {i+1}: {trace.session_id} ---\")\n", + " print(f\" Total spans : {len(trace.spans)}\")\n", + " print(f\" Tool calls : {len(trace.tool_calls)}\")\n", + " for tc in trace.tool_calls:\n", + " print(f\" - {tc.get('tool_name', '?')}\")\n", + " final = trace.final_response or \"(none)\"\n", + " print(f\" Final response : {final[:300]}\")\n", + " errors = trace.error_spans\n", + " print(f\" Error spans : {len(errors)}\")\n", + " if trace.total_latency_ms:\n", + " print(f\" Total latency : {trace.total_latency_ms:.0f}ms\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[Sales Agent]: I have successfully processed your media buying brief for the **Tesla Model Y Lead Generation Campaign**. \n", - "\n", - "### **1. 
Media Plan Overview**\n", - "Based on your budget of **$100,000** and target audience of **Tech-savvy professionals and EV intenders**, I have optimized the allocation across your preferred Yahoo products.\n", - "\n", - "| Yahoo Product | Format | Budget Allocation | Est. Impressions | Key Reason |\n", - "| :--- | :--- | :--- | :--- | :--- |\n", - "| **Yahoo Finance** | Display | $29,465.15 (29.5%) | 1,432,055 | **Premium Placement:** High affinity with high-net-worth tech professionals. |\n", - "| **Yahoo Mail** | Native | $70,534.85 (70.5%) | 5,808,834 | **High Engagement:** Native ads in inbox provide superior CTR (1.19%) for lead gen. |\n", - "\n", - "**Campaign Performance Projections:**\n", - "* **Total Projected Impressions:** \n", - " ... (truncated, 1746 chars total)\n", - "\n", - "\n", - "Session IDs: ['adcp-a20d176b82af', 'adcp-7d9855e7a71b', 'adcp-2c401a645c40']\n" - ] - } - ], - "source": [ - "import asyncio\n", - "import uuid\n", - "\n", - "from google.adk.plugins.bigquery_agent_analytics_plugin import (\n", - " BigQueryAgentAnalyticsPlugin,\n", - " BigQueryLoggerConfig,\n", - ")\n", - "from google.adk.runners import Runner\n", - "from google.adk.sessions import InMemorySessionService\n", - "\n", - "agent = build_adcp_agent()\n", - "session_service = InMemorySessionService()\n", - "\n", - "plugin = BigQueryAgentAnalyticsPlugin(\n", - " project_id=PROJECT_ID,\n", - " dataset_id=DATASET_ID,\n", - " config=BigQueryLoggerConfig(\n", - " table_id=TABLE_ID,\n", - " batch_size=1,\n", - " batch_flush_interval=1.0,\n", - " ),\n", - " location=LOCATION,\n", - ")\n", - "\n", - "runner = Runner(\n", - " agent=agent,\n", - " app_name=APP_NAME,\n", - " session_service=session_service,\n", - " plugins=[plugin],\n", - ")\n", - "\n", - "# ---------- Three ADCP Conversations ----------\n", - "conversations = [\n", - " {\n", - " \"label\": \"ELF Cosmetics -- $50K Brand Awareness Campaign\",\n", - " \"messages\": [\n", - " (\n", - " \"ADCP Media Buying Brief:\\n\"\n", - 
" \"Brand: ELF Cosmetics\\n\"\n", - " \"Budget: $50,000\\n\"\n", - " \"Campaign Goal: Brand awareness for new skincare line\\n\"\n", - " \"Target Demographics: Millennials 25-34, beauty enthusiasts\\n\"\n", - " \"Flight Dates: 2025-05-01 to 2025-05-31\\n\"\n", - " \"Preferred Products: Yahoo Homepage, Yahoo Mail\\n\\n\"\n", - " \"Please process this brief: query inventory for Yahoo Homepage \"\n", - " \"and Yahoo Mail, match the target audience, allocate the $50,000 \"\n", - " \"budget across the recommended products, and present the media plan.\"\n", - " ),\n", - " (\n", - " \"The media plan is approved by the ad-ops manager. \"\n", - " \"Please provision the campaign in Google Ad Manager with \"\n", - " \"campaign name 'ELF_Skincare_May2025', advertiser 'ELF Cosmetics', \"\n", - " \"budget $50000, start date 2025-05-01, end date 2025-05-31, \"\n", - " \"and targeting segments 'Beauty Enthusiasts, Millennials 25-34'.\"\n", - " ),\n", - " ],\n", - " },\n", - " {\n", - " \"label\": \"Nike -- $200K Multi-Product Performance Campaign\",\n", - " \"messages\": [\n", - " (\n", - " \"ADCP Media Buying Brief:\\n\"\n", - " \"Brand: Nike\\n\"\n", - " \"Budget: $200,000\\n\"\n", - " \"Campaign Goal: Product launch for Air Max 2025\\n\"\n", - " \"Target Demographics: Sports enthusiasts 18-45, sneakerheads\\n\"\n", - " \"Flight Dates: 2025-06-01 to 2025-06-30\\n\"\n", - " \"Preferred Products: Yahoo Sports, Yahoo Homepage, Yahoo Finance\\n\\n\"\n", - " \"Process this brief: check inventory for Yahoo Sports, \"\n", - " \"Yahoo Homepage, and Yahoo Finance. 
Match target audiences \"\n", - " \"and recommend a budget split across all three products.\"\n", - " ),\n", - " ],\n", - " },\n", - " {\n", - " \"label\": \"Tesla -- $100K Targeted EV Campaign\",\n", - " \"messages\": [\n", - " (\n", - " \"ADCP Media Buying Brief:\\n\"\n", - " \"Brand: Tesla\\n\"\n", - " \"Budget: $100,000\\n\"\n", - " \"Campaign Goal: Lead generation for Model Y test drives\\n\"\n", - " \"Target Demographics: Tech-savvy professionals 30-55, EV intenders\\n\"\n", - " \"Flight Dates: 2025-07-01 to 2025-07-31\\n\"\n", - " \"Preferred Products: Yahoo Finance, Yahoo Mail\\n\\n\"\n", - " \"Query inventory for Yahoo Finance and Yahoo Mail, \"\n", - " \"match the target audience, and allocate the budget. \"\n", - " \"Then provision the campaign in GAM with name 'Tesla_ModelY_Jul2025', \"\n", - " \"advertiser 'Tesla', budget $100000, start date 2025-07-01, \"\n", - " \"end date 2025-07-31, targeting 'Tech Professionals, EV Intenders'.\"\n", - " ),\n", - " ],\n", - " },\n", - "]\n", - "\n", - "\n", - "async def run_conversation(messages, label=\"\"):\n", - " \"\"\"Run a multi-turn ADCP conversation.\"\"\"\n", - " session_id = f\"adcp-{uuid.uuid4().hex[:12]}\"\n", - " await session_service.create_session(\n", - " app_name=APP_NAME,\n", - " user_id=USER_ID,\n", - " session_id=session_id,\n", - " )\n", - " print(f\"\\n{'=' * 70}\")\n", - " print(f\" Session: {session_id} [{label}]\")\n", - " print(f\"{'=' * 70}\")\n", - "\n", - " for i, message in enumerate(messages, 1):\n", - " print(f\"\\n[Turn {i}] Buyer: {message[:120]}...\")\n", - " print(\"-\" * 48)\n", - " user_content = types.Content(\n", - " role=\"user\",\n", - " parts=[types.Part(text=message)],\n", - " )\n", - " response_parts = []\n", - " async for event in runner.run_async(\n", - " user_id=USER_ID,\n", - " session_id=session_id,\n", - " new_message=user_content,\n", - " ):\n", - " if event.content and event.content.parts:\n", - " for part in event.content.parts:\n", - " if hasattr(part, 
\"text\") and part.text:\n", - " response_parts.append(part.text)\n", - " elif hasattr(part, \"function_call\") and part.function_call:\n", - " print(f\" -> Tool call: {part.function_call.name}\")\n", - " if response_parts:\n", - " text = \"\\n\".join(response_parts)\n", - " print(f\"\\n[Sales Agent]: {text[:800]}\")\n", - " if len(text) > 800:\n", - " print(f\" ... (truncated, {len(text)} chars total)\")\n", - " return session_id\n", - "\n", - "\n", - "# Run all conversations\n", - "session_ids = []\n", - "for conv in conversations:\n", - " sid = asyncio.get_event_loop().run_until_complete(\n", - " run_conversation(conv[\"messages\"], label=conv[\"label\"])\n", - " )\n", - " session_ids.append(sid)\n", - "\n", - "print(f\"\\n\\nSession IDs: {session_ids}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "f9548cc1", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:27:54.879040Z", - "iopub.status.busy": "2026-03-05T09:27:54.878625Z", - "iopub.status.idle": "2026-03-05T09:28:09.889215Z", - "shell.execute_reply": "2026-03-05T09:28:09.888134Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "50a56a07", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 5: Context Graph -- Business Entity Extraction\n", + "\n", + "We use BigQuery's `AI.GENERATE` with `output_schema` to extract structured business entities from the unstructured agent payloads. 
This creates the **Biz Graph** layer of our 4-pillar context graph.\n", + "\n", + "Entity types: `Product`, `Targeting`, `Campaign`, `Budget`, `Audience`" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Flushing traces to BigQuery ...\n", - "Waiting 15s for BigQuery data to settle ...\n" - ] + "cell_type": "code", + "execution_count": 11, + "id": "604322a5", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:28:14.658976Z", + "iopub.status.busy": "2026-03-05T09:28:14.658878Z", + "iopub.status.idle": "2026-03-05T09:28:14.662047Z", + "shell.execute_reply": "2026-03-05T09:28:14.661541Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ContextGraphManager ready.\n", + " Biz nodes table : adcp_biz_nodes\n", + " Cross-links table: adcp_cross_links\n", + " Graph name : adcp_context_graph\n", + " Entity types : ['Product', 'Targeting', 'Campaign', 'Budget', 'Audience', 'Advertiser']\n", + " AI.GENERATE endpoint: https://aiplatform.googleapis.com/v1/projects/test-project-0728-467323/locations...\n" + ] + } + ], + "source": [ + "from bigquery_agent_analytics import ContextGraphManager, ContextGraphConfig\n", + "\n", + "# Configure the context graph for ADCP domain\n", + "# Note: For Gemini 3.x+ models, AI.GENERATE requires a full Vertex AI\n", + "# endpoint URL. 
The SDK handles this automatically via _resolve_endpoint().\n", + "cg_config = ContextGraphConfig(\n", + " biz_nodes_table=\"adcp_biz_nodes\",\n", + " cross_links_table=\"adcp_cross_links\",\n", + " graph_name=\"adcp_context_graph\",\n", + " endpoint=MODEL_NAME,\n", + " entity_types=[\n", + " \"Product\",\n", + " \"Targeting\",\n", + " \"Campaign\",\n", + " \"Budget\",\n", + " \"Audience\",\n", + " \"Advertiser\",\n", + " ],\n", + " max_hops=20,\n", + ")\n", + "\n", + "# Option 1: Use client.context_graph() factory\n", + "cgm = client.context_graph(config=cg_config)\n", + "\n", + "# Option 2: Direct instantiation\n", + "# cgm = ContextGraphManager(\n", + "# project_id=PROJECT_ID,\n", + "# dataset_id=DATASET_ID,\n", + "# table_id=TABLE_ID,\n", + "# config=cg_config,\n", + "# location=LOCATION,\n", + "# )\n", + "\n", + "print(f\"ContextGraphManager ready.\")\n", + "print(f\" Biz nodes table : {cg_config.biz_nodes_table}\")\n", + "print(f\" Cross-links table: {cg_config.cross_links_table}\")\n", + "print(f\" Graph name : {cg_config.graph_name}\")\n", + "print(f\" Entity types : {cg_config.entity_types}\")\n", + "print(f\" AI.GENERATE endpoint: {cgm._resolve_endpoint()[:80]}...\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n" - ] - } - ], - "source": [ - "import time\n", - "\n", - "print(\"Flushing traces to BigQuery ...\")\n", - "try:\n", - " asyncio.get_event_loop().run_until_complete(plugin.flush())\n", - "except Exception as exc:\n", - " print(f\"Flush warning: {exc}\")\n", - "\n", - "settle_seconds = 15\n", - "print(f\"Waiting {settle_seconds}s for BigQuery data to settle ...\")\n", - "time.sleep(settle_seconds)\n", - "print(\"Done.\")" - ] - }, - { - "cell_type": "markdown", - "id": "34e76dd7", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 4: Trace Retrieval & Visualization\n", - "\n", - "Use the SDK Client to fetch traces and render the hierarchical execution DAG." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "fade6aa2", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:28:09.890911Z", - "iopub.status.busy": "2026-03-05T09:28:09.890803Z", - "iopub.status.idle": "2026-03-05T09:28:11.701148Z", - "shell.execute_reply": "2026-03-05T09:28:11.699926Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 12, + "id": "0d7f19df", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:28:14.663303Z", + "iopub.status.busy": "2026-03-05T09:28:14.663220Z", + "iopub.status.idle": "2026-03-05T09:29:01.878768Z", + "shell.execute_reply": "2026-03-05T09:29:01.878077Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted 180 business entities:\n", + " [Product] Yahoo Sports (confidence=1.00)\n", + " [Advertiser] Nike (confidence=1.00)\n", + " [Product] Yahoo Finance (confidence=1.00)\n", + " [Advertiser] Nike (confidence=1.00)\n", + " [Budget] 50000 (confidence=1.00)\n", + " [Budget] 50000 (confidence=1.00)\n", + " [Product] Yahoo Homepage (confidence=1.00)\n", + " [Budget] 200000 (confidence=1.00)\n", + " [Product] Yahoo Finance (confidence=1.00)\n", + " [Product] Yahoo Mail (confidence=1.00)\n", + " [Product] Yahoo Sports (confidence=1.00)\n", + " [Audience] Millennials 25-34 (confidence=1.00)\n", + " [Product] Yahoo Homepage (confidence=1.00)\n", + " [Campaign] Tesla_ModelY_Jul2025 (confidence=1.00)\n", + " [Budget] $50,000 (confidence=1.00)\n", + " ... 
(165 more)\n" + ] + } + ], + "source": [ + "# Extract business entities using AI.GENERATE (server-side)\n", + "# Falls back to client-side extraction if AI.GENERATE is not available\n", + "try:\n", + " biz_nodes = cgm.extract_biz_nodes(\n", + " session_ids=session_ids,\n", + " use_ai_generate=True,\n", + " )\n", + " print(f\"Extracted {len(biz_nodes)} business entities:\")\n", + " for node in biz_nodes[:15]:\n", + " print(f\" [{node.node_type}] {node.node_value} \"\n", + " f\"(confidence={node.confidence:.2f})\")\n", + " if len(biz_nodes) > 15:\n", + " print(f\" ... ({len(biz_nodes) - 15} more)\")\n", + "except Exception as exc:\n", + " print(f\"AI.GENERATE extraction not available: {exc}\")\n", + " print(\"Falling back to client-side extraction ...\")\n", + " biz_nodes = cgm.extract_biz_nodes(\n", + " session_ids=session_ids,\n", + " use_ai_generate=False,\n", + " )\n", + " print(f\"Fetched {len(biz_nodes)} raw payloads for manual review.\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "SDK Client initialised.\n" - ] - } - ], - "source": [ - "from bigquery_agent_analytics import Client, TraceFilter\n", - "\n", - "client = Client(\n", - " project_id=PROJECT_ID,\n", - " dataset_id=DATASET_ID,\n", - " table_id=TABLE_ID,\n", - " location=LOCATION,\n", - " endpoint=MODEL_NAME,\n", - ")\n", - "print(\"SDK Client initialised.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "b703e431", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:28:11.710446Z", - "iopub.status.busy": "2026-03-05T09:28:11.710247Z", - "iopub.status.idle": "2026-03-05T09:28:14.652298Z", - "shell.execute_reply": "2026-03-05T09:28:14.651317Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "5a903ce8", + "metadata": {}, + "source": [ + "### Alternative: Manual Entity Extraction & Storage\n", + "\n", + "If `AI.GENERATE` is not available, you can extract entities client-side and store them via `store_biz_nodes()`." 
+ ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - " Trace for session: adcp-a20d176b82af\n", - "======================================================================\n", - "Trace: e-7c76dd0c-74c0-4b77-80cc-074a0b546c97 | Session: adcp-a20d176b82af | 24545ms\n", - "====================================================================================\n", - "└─ [✓] USER_MESSAGE_RECEIVED [yahoo_sales_agent] - ADCP Media Buying Brief:\n", - "Brand: ELF Cosmetics\n", - "Budget: $50,000\n", - "Campaign Goal: Brand awareness for new skincare line\n", - "Ta...\n", - "└─ [✓] INVOCATION_STARTING [yahoo_sales_agent]\n", - "└─ [✓] INVOCATION_COMPLETED [yahoo_sales_agent] (32135ms)\n", - " ├─ [✓] AGENT_STARTING [yahoo_sales_agent] - You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", - "You process media buying requests f...\n", - " └─ [✓] AGENT_COMPLETED [yahoo_sales_agent] (18226ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: ELF Cosmetics\n", - "Budget: $50,000\n", - "Campaign Goal: Brand awareness for new skincare line\n", - "Ta...\n", - " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3938ms) - call: query_ad_inventory | call: query_ad_inventory\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: ELF Cosmetics\n", - "Budget: $50,000\n", - "Campaign Goal: Brand awareness for new skincare line\n", - "Ta...\n", - " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (2437ms) - call: match_target_audience\n", - " ├─ [✓] TOOL_STARTING 
[yahoo_sales_agent] (match_target_audience)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (match_target_audience) (0ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: ELF Cosmetics\n", - "Budget: $50,000\n", - "Campaign Goal: Brand awareness for new skincare line\n", - "Ta...\n", - " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (2457ms) - call: allocate_media_budget\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (allocate_media_budget)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (allocate_media_budget) (0ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: ELF Cosmetics\n", - "Budget: $50,000\n", - "Campaign Goal: Brand awareness for new skincare line\n", - "Ta...\n", - " └─ [✓] LLM_RESPONSE [yahoo_sales_agent] (9298ms) - text: 'Based on your brief for **ELF Cosmetics**, I have developed a comprehensive media plan to drive brand awarenes...\n", - "└─ [✓] USER_MESSAGE_RECEIVED [yahoo_sales_agent] - The media plan is approved by the ad-ops manager. 
Please provision the campaign in Google Ad Manager with campaign na...\n", - "└─ [✓] INVOCATION_STARTING [yahoo_sales_agent]\n", - "└─ [✓] INVOCATION_COMPLETED [yahoo_sales_agent] (5859ms)\n", - " ├─ [✓] AGENT_STARTING [yahoo_sales_agent] - You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", - "You process media buying requests f...\n", - " └─ [✓] AGENT_COMPLETED [yahoo_sales_agent] (5858ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: ELF Cosmetics\n", - "Budget: $50,000\n", - "Campaign Goal: Brand awareness for new skincare line\n", - "Ta...\n", - " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3038ms) - call: provision_campaign_in_gam\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (provision_campaign_in_gam)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (provision_campaign_in_gam) (0ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: ELF Cosmetics\n", - "Budget: $50,000\n", - "Campaign Goal: Brand awareness for new skincare line\n", - "Ta...\n", - " └─ [✓] LLM_RESPONSE [yahoo_sales_agent] (2813ms) - text: 'The campaign has been successfully provisioned in Google Ad Manager.\n", - "\n", - "### **Campaign Provisioning Summary**\n", - "* ...\n" - ] + "cell_type": "code", + "execution_count": 13, + "id": "3e744c32", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:01.880329Z", + "iopub.status.busy": "2026-03-05T09:29:01.880213Z", + "iopub.status.idle": "2026-03-05T09:29:01.883830Z", + "shell.execute_reply": "2026-03-05T09:29:01.883278Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Manual biz nodes prepared: 7\n", + " [Advertiser] ELF Cosmetics\n", + " [Product] Yahoo Homepage\n", + " [Product] Yahoo Mail\n", + " [Targeting] Millennials 25-34\n", + " [Targeting] Beauty Enthusiasts\n", + " [Budget] $50,000\n", + " [Campaign] 
ELF_Skincare_May2025\n" + ] + } + ], + "source": [ + "from bigquery_agent_analytics import BizNode\n", + "\n", + "# Example: manually define business entities from the ELF campaign\n", + "manual_biz_nodes = [\n", + " BizNode(\n", + " span_id=\"manual-1\",\n", + " session_id=session_ids[0],\n", + " node_type=\"Advertiser\",\n", + " node_value=\"ELF Cosmetics\",\n", + " confidence=1.0,\n", + " ),\n", + " BizNode(\n", + " span_id=\"manual-2\",\n", + " session_id=session_ids[0],\n", + " node_type=\"Product\",\n", + " node_value=\"Yahoo Homepage\",\n", + " confidence=0.95,\n", + " ),\n", + " BizNode(\n", + " span_id=\"manual-3\",\n", + " session_id=session_ids[0],\n", + " node_type=\"Product\",\n", + " node_value=\"Yahoo Mail\",\n", + " confidence=0.90,\n", + " ),\n", + " BizNode(\n", + " span_id=\"manual-4\",\n", + " session_id=session_ids[0],\n", + " node_type=\"Targeting\",\n", + " node_value=\"Millennials 25-34\",\n", + " confidence=0.92,\n", + " ),\n", + " BizNode(\n", + " span_id=\"manual-5\",\n", + " session_id=session_ids[0],\n", + " node_type=\"Targeting\",\n", + " node_value=\"Beauty Enthusiasts\",\n", + " confidence=0.95,\n", + " ),\n", + " BizNode(\n", + " span_id=\"manual-6\",\n", + " session_id=session_ids[0],\n", + " node_type=\"Budget\",\n", + " node_value=\"$50,000\",\n", + " confidence=1.0,\n", + " ),\n", + " BizNode(\n", + " span_id=\"manual-7\",\n", + " session_id=session_ids[0],\n", + " node_type=\"Campaign\",\n", + " node_value=\"ELF_Skincare_May2025\",\n", + " confidence=1.0,\n", + " ),\n", + "]\n", + "\n", + "print(f\"Manual biz nodes prepared: {len(manual_biz_nodes)}\")\n", + "for node in manual_biz_nodes:\n", + " print(f\" [{node.node_type}] {node.node_value}\")\n", + "\n", + "# Uncomment to store in BigQuery:\n", + "# success = cgm.store_biz_nodes(manual_biz_nodes)\n", + "# print(f\"Stored: {success}\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - 
"======================================================================\n", - " Trace for session: adcp-7d9855e7a71b\n", - "======================================================================\n", - "Trace: e-3f6807e1-d3b9-4eb9-a0e3-f23dd72cee57 | Session: adcp-7d9855e7a71b | 14565ms\n", - "====================================================================================\n", - "└─ [✓] USER_MESSAGE_RECEIVED [yahoo_sales_agent] - ADCP Media Buying Brief:\n", - "Brand: Nike\n", - "Budget: $200,000\n", - "Campaign Goal: Product launch for Air Max 2025\n", - "Target Demograph...\n", - "└─ [✓] INVOCATION_STARTING [yahoo_sales_agent]\n", - "└─ [✓] INVOCATION_COMPLETED [yahoo_sales_agent] (14564ms)\n", - " ├─ [✓] AGENT_STARTING [yahoo_sales_agent] - You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", - "You process media buying requests f...\n", - " └─ [✓] AGENT_COMPLETED [yahoo_sales_agent] (14564ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: Nike\n", - "Budget: $200,000\n", - "Campaign Goal: Product launch for Air Max 2025\n", - "Target Demograph...\n", - " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3732ms) - call: query_ad_inventory | call: query_ad_inventory | call: query_ad_inventory | call: match_target_audience\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (match_target_audience)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (match_target_audience) (0ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] 
(gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: Nike\n", - "Budget: $200,000\n", - "Campaign Goal: Product launch for Air Max 2025\n", - "Target Demograph...\n", - " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3063ms) - call: allocate_media_budget\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (allocate_media_budget)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (allocate_media_budget) (0ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: Nike\n", - "Budget: $200,000\n", - "Campaign Goal: Product launch for Air Max 2025\n", - "Target Demograph...\n", - " └─ [✓] LLM_RESPONSE [yahoo_sales_agent] (7752ms) - text: 'Based on your brief for the **Nike Air Max 2025** product launch, I have analyzed Yahoo's inventory and audien...\n" - ] + "cell_type": "markdown", + "id": "19f70d11", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 6: Property Graph DDL\n", + "\n", + "We generate and inspect the `CREATE PROPERTY GRAPH` DDL that formalizes the 4-pillar Context Graph in BigQuery. 
This creates:\n", + "- **TechNode** — spans from `agent_events` (execution lineage)\n", + "- **BizNode** — entities from `extracted_biz_nodes` (domain entities)\n", + "- **Caused** edges — parent→child span linkage (decision lineage)\n", + "- **Evaluated** edges — tech event → business entity cross-links" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - " Trace for session: adcp-2c401a645c40\n", - "======================================================================\n", - "Trace: e-27133476-0092-41cb-ba67-1e99d226cae5 | Session: adcp-2c401a645c40 | 15069ms\n", - "====================================================================================\n", - "└─ [✓] USER_MESSAGE_RECEIVED [yahoo_sales_agent] - ADCP Media Buying Brief:\n", - "Brand: Tesla\n", - "Budget: $100,000\n", - "Campaign Goal: Lead generation for Model Y test drives\n", - "Target ...\n", - "└─ [✓] INVOCATION_STARTING [yahoo_sales_agent]\n", - "└─ [✓] INVOCATION_COMPLETED [yahoo_sales_agent] (15068ms)\n", - " ├─ [✓] AGENT_STARTING [yahoo_sales_agent] - You are the Yahoo DSP Sales Agent operating under the Ad Context Protocol (ADCP).\n", - "You process media buying requests f...\n", - " └─ [✓] AGENT_COMPLETED [yahoo_sales_agent] (15068ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: Tesla\n", - "Budget: $100,000\n", - "Campaign Goal: Lead generation for Model Y test drives\n", - "Target ...\n", - " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3617ms) - call: query_ad_inventory | call: query_ad_inventory | call: match_target_audience\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (query_ad_inventory)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (query_ad_inventory) (0ms)\n", - " ├─ 
[✓] TOOL_STARTING [yahoo_sales_agent] (match_target_audience)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (match_target_audience) (0ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: Tesla\n", - "Budget: $100,000\n", - "Campaign Goal: Lead generation for Model Y test drives\n", - "Target ...\n", - " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (2579ms) - call: allocate_media_budget\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (allocate_media_budget)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (allocate_media_budget) (0ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: Tesla\n", - "Budget: $100,000\n", - "Campaign Goal: Lead generation for Model Y test drives\n", - "Target ...\n", - " ├─ [✓] LLM_RESPONSE [yahoo_sales_agent] (3151ms) - call: provision_campaign_in_gam\n", - " ├─ [✓] TOOL_STARTING [yahoo_sales_agent] (provision_campaign_in_gam)\n", - " ├─ [✓] TOOL_COMPLETED [yahoo_sales_agent] (provision_campaign_in_gam) (0ms)\n", - " ├─ [✓] LLM_REQUEST [yahoo_sales_agent] (gemini-3-flash-preview) - ADCP Media Buying Brief:\n", - "Brand: Tesla\n", - "Budget: $100,000\n", - "Campaign Goal: Lead generation for Model Y test drives\n", - "Target ...\n", - " └─ [✓] LLM_RESPONSE [yahoo_sales_agent] (5700ms) - text: 'I have successfully processed your media buying brief for the **Tesla Model Y Lead Generation Campaign**. 
\n", - "\n", - "##...\n" - ] - } - ], - "source": [ - "# Retrieve and render each trace\n", - "traces = []\n", - "for sid in session_ids:\n", - " try:\n", - " trace = client.get_session_trace(sid)\n", - " traces.append(trace)\n", - " print(f\"\\n{'=' * 70}\")\n", - " print(f\" Trace for session: {sid}\")\n", - " print(f\"{'=' * 70}\")\n", - " _ = trace.render()\n", - " except Exception as exc:\n", - " print(f\"Could not retrieve trace {sid}: {exc}\")\n", - " traces.append(None)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "d39c6cf4", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:28:14.653806Z", - "iopub.status.busy": "2026-03-05T09:28:14.653707Z", - "iopub.status.idle": "2026-03-05T09:28:14.657544Z", - "shell.execute_reply": "2026-03-05T09:28:14.657021Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 14, + "id": "3625ad9f", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:01.885148Z", + "iopub.status.busy": "2026-03-05T09:29:01.885079Z", + "iopub.status.idle": "2026-03-05T09:29:01.887228Z", + "shell.execute_reply": "2026-03-05T09:29:01.886715Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + " CREATE PROPERTY GRAPH DDL\n", + "======================================================================\n", + "CREATE OR REPLACE PROPERTY GRAPH `test-project-0728-467323.agent_analytics.adcp_context_graph`\n", + " NODE TABLES (\n", + " -- Technical execution nodes (spans from ADK plugin)\n", + " `test-project-0728-467323.agent_analytics.agent_events` AS TechNode\n", + " KEY (span_id)\n", + " LABEL TechNode\n", + " PROPERTIES (\n", + " event_type,\n", + " agent,\n", + " timestamp,\n", + " session_id,\n", + " invocation_id,\n", + " content,\n", + " latency_ms,\n", + " status,\n", + " error_message\n", + " ),\n", + " -- Business domain nodes (extracted 
entities)\n", + " `test-project-0728-467323.agent_analytics.adcp_biz_nodes` AS BizNode\n", + " KEY (node_value)\n", + " LABEL BizNode\n", + " PROPERTIES (\n", + " node_type,\n", + " node_value,\n", + " confidence,\n", + " session_id\n", + " )\n", + " )\n", + " EDGE TABLES (\n", + " -- Causal lineage: parent span -> child span\n", + " `test-project-0728-467323.agent_analytics.agent_events` AS Caused\n", + " KEY (span_id)\n", + " SOURCE KEY (parent_span_id) REFERENCES TechNode (span_id)\n", + " DESTINATION KEY (span_id) REFERENCES TechNode (span_id)\n", + " LABEL Caused,\n", + "\n", + " -- Cross-link: technical event -> business entity it evaluated\n", + " `test-project-0728-467323.agent_analytics.adcp_cross_links` AS Evaluated\n", + " KEY (span_id)\n", + " SOURCE KEY (span_id) REFERENCES TechNode (span_id)\n", + " DESTINATION KEY (node_value) REFERENCES BizNode (node_value)\n", + " LABEL Evaluated\n", + " )\n", + "\n" + ] + } + ], + "source": [ + "# Generate and display the Property Graph DDL\n", + "ddl = cgm.get_property_graph_ddl()\n", + "\n", + "print(\"=\" * 70)\n", + "print(\" CREATE PROPERTY GRAPH DDL\")\n", + "print(\"=\" * 70)\n", + "print(ddl)" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "--- Session 1: adcp-a20d176b82af ---\n", - " Total spans : 32\n", - " Tool calls : 5\n", - " - query_ad_inventory\n", - " - query_ad_inventory\n", - " - match_target_audience\n", - " - allocate_media_budget\n", - " - provision_campaign_in_gam\n", - " Final response : text: 'The campaign has been successfully provisioned in Google Ad Manager.\n", - "\n", - "### **Campaign Provisioning Summary**\n", - "* **Campaign Name:** ELF_Skincare_May2025\n", - "* **Advertiser:** ELF Cosmetics\n", - "* **Order ID:** `ORD-70678`\n", - "* **Line Item ID:** `LI-756565`\n", - "* **Budget:** $50,000.00\n", - "* **Flight Dat\n", - " Error spans : 0\n", - " Total latency : 24545ms\n", - "\n", - "--- Session 2: adcp-7d9855e7a71b ---\n", - " Total spans : 
21\n", - " Tool calls : 5\n", - " - query_ad_inventory\n", - " - query_ad_inventory\n", - " - query_ad_inventory\n", - " - match_target_audience\n", - " - allocate_media_budget\n", - " Final response : text: 'Based on your brief for the **Nike Air Max 2025** product launch, I have analyzed Yahoo's inventory and audience graph to develop a recommended media plan.\n", - "\n", - "### **Campaign Overview**\n", - "* **Brand:** Nike\n", - "* **Budget:** $200,000\n", - "* **Flight Dates:** June 1, 2025 – June 30, 2025 (30 Days)\n", - "* \n", - " Error spans : 0\n", - " Total latency : 14565ms\n", - "\n", - "--- Session 3: adcp-2c401a645c40 ---\n", - " Total spans : 23\n", - " Tool calls : 5\n", - " - query_ad_inventory\n", - " - query_ad_inventory\n", - " - match_target_audience\n", - " - allocate_media_budget\n", - " - provision_campaign_in_gam\n", - " Final response : text: 'I have successfully processed your media buying brief for the **Tesla Model Y Lead Generation Campaign**. \n", - "\n", - "### **1. 
Media Plan Overview**\n", - "Based on your budget of **$100,000** and target audience of **Tech-savvy professionals and EV intenders**, I have optimized the allocation across your pre\n", - " Error spans : 0\n", - " Total latency : 15069ms\n" - ] - } - ], - "source": [ - "# Inspect ADCP-specific trace properties\n", - "for i, trace in enumerate(traces):\n", - " if trace is None:\n", - " continue\n", - " print(f\"\\n--- Session {i+1}: {trace.session_id} ---\")\n", - " print(f\" Total spans : {len(trace.spans)}\")\n", - " print(f\" Tool calls : {len(trace.tool_calls)}\")\n", - " for tc in trace.tool_calls:\n", - " print(f\" - {tc.get('tool_name', '?')}\")\n", - " final = trace.final_response or \"(none)\"\n", - " print(f\" Final response : {final[:300]}\")\n", - " errors = trace.error_spans\n", - " print(f\" Error spans : {len(errors)}\")\n", - " if trace.total_latency_ms:\n", - " print(f\" Total latency : {trace.total_latency_ms:.0f}ms\")" - ] - }, - { - "cell_type": "markdown", - "id": "50a56a07", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 5: Context Graph -- Business Entity Extraction\n", - "\n", - "We use BigQuery's `AI.GENERATE` with `output_schema` to extract structured business entities from the unstructured agent payloads. 
This creates the **Biz Graph** layer of our 4-pillar context graph.\n", - "\n", - "Entity types: `Product`, `Targeting`, `Campaign`, `Budget`, `Audience`" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "604322a5", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:28:14.658976Z", - "iopub.status.busy": "2026-03-05T09:28:14.658878Z", - "iopub.status.idle": "2026-03-05T09:28:14.662047Z", - "shell.execute_reply": "2026-03-05T09:28:14.661541Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 15, + "id": "08c600a1", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:01.888414Z", + "iopub.status.busy": "2026-03-05T09:29:01.888327Z", + "iopub.status.idle": "2026-03-05T09:29:07.898328Z", + "shell.execute_reply": "2026-03-05T09:29:07.897303Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cross-links created: True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to create Property Graph: 400 Unsupported statement CREATE PROPERTY GRAPH.; reason: invalidQuery, message: Unsupported statement CREATE PROPERTY GRAPH.\n", + "\n", + "Location: US\n", + "Job ID: 89c29192-bb8a-4b3a-821c-0e1d34a2ceed\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Property Graph created: False\n" + ] + } + ], + "source": [ + "# Create the Property Graph in BigQuery\n", + "# NOTE: Property Graphs require BigQuery Studio (BQ Notebooks).\n", + "# If running outside BQ Studio, the DDL and GQL are generated for you\n", + "# to copy-paste into a BigQuery Studio notebook cell with %%bigquery magic.\n", + "\n", + "# Step 1: Create cross-links\n", + "try:\n", + " cross_links_ok = cgm.create_cross_links(session_ids)\n", + " print(f\"Cross-links created: {cross_links_ok}\")\n", + "except Exception as exc:\n", + " print(f\"Cross-links creation: {exc}\")\n", + "\n", + "# Step 2: Create the Property Graph\n", + 
"try:\n", + " graph_ok = cgm.create_property_graph()\n", + " print(f\"Property Graph created: {graph_ok}\")\n", + "except Exception as exc:\n", + " print(f\"Property Graph creation: {exc}\")\n", + " print(\"\\n--- To create the Property Graph, run the DDL above in ---\")\n", + " print(\"--- a BigQuery Studio notebook cell: ---\")\n", + " print(\"--- %%bigquery ---\")\n", + " print(\"--- CREATE OR REPLACE PROPERTY GRAPH ... ---\")\n", + " print(\"--- ---\")\n", + " print(\"--- Reference: https://github.com/GoogleCloudPlatform/ ---\")\n", + " print(\"--- devrel-demos/blob/main/data-analytics/ ---\")\n", + " print(\"--- knowledge_graph_demo/kg_demo_template.ipynb ---\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "ContextGraphManager ready.\n", - " Biz nodes table : adcp_biz_nodes\n", - " Cross-links table: adcp_cross_links\n", - " Graph name : adcp_context_graph\n", - " Entity types : ['Product', 'Targeting', 'Campaign', 'Budget', 'Audience', 'Advertiser']\n", - " AI.GENERATE endpoint: https://aiplatform.googleapis.com/v1/projects/test-project-0728-467323/locations...\n" - ] - } - ], - "source": [ - "from bigquery_agent_analytics import ContextGraphManager, ContextGraphConfig\n", - "\n", - "# Configure the context graph for ADCP domain\n", - "# Note: For Gemini 3.x+ models, AI.GENERATE requires a full Vertex AI\n", - "# endpoint URL. 
The SDK handles this automatically via _resolve_endpoint().\n", - "cg_config = ContextGraphConfig(\n", - " biz_nodes_table=\"adcp_biz_nodes\",\n", - " cross_links_table=\"adcp_cross_links\",\n", - " graph_name=\"adcp_context_graph\",\n", - " endpoint=MODEL_NAME,\n", - " entity_types=[\n", - " \"Product\",\n", - " \"Targeting\",\n", - " \"Campaign\",\n", - " \"Budget\",\n", - " \"Audience\",\n", - " \"Advertiser\",\n", - " ],\n", - " max_hops=20,\n", - ")\n", - "\n", - "# Option 1: Use client.context_graph() factory\n", - "cgm = client.context_graph(config=cg_config)\n", - "\n", - "# Option 2: Direct instantiation\n", - "# cgm = ContextGraphManager(\n", - "# project_id=PROJECT_ID,\n", - "# dataset_id=DATASET_ID,\n", - "# table_id=TABLE_ID,\n", - "# config=cg_config,\n", - "# location=LOCATION,\n", - "# )\n", - "\n", - "print(f\"ContextGraphManager ready.\")\n", - "print(f\" Biz nodes table : {cg_config.biz_nodes_table}\")\n", - "print(f\" Cross-links table: {cg_config.cross_links_table}\")\n", - "print(f\" Graph name : {cg_config.graph_name}\")\n", - "print(f\" Entity types : {cg_config.entity_types}\")\n", - "print(f\" AI.GENERATE endpoint: {cgm._resolve_endpoint()[:80]}...\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "0d7f19df", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:28:14.663303Z", - "iopub.status.busy": "2026-03-05T09:28:14.663220Z", - "iopub.status.idle": "2026-03-05T09:29:01.878768Z", - "shell.execute_reply": "2026-03-05T09:29:01.878077Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "q9v6s6cftur", + "metadata": {}, + "source": [ + "### Run in BigQuery Studio (BQ Notebooks)\n", + "\n", + "Property Graphs and GQL visualization require **BigQuery Studio**. When running this notebook in BQ Studio, uncomment and execute the `%%bigquery` cells below. 
The graph will render as an interactive visualization.\n", + "\n", + "> **Reference**: [Knowledge Graph Demo by Google Cloud](https://github.com/GoogleCloudPlatform/devrel-demos/blob/main/data-analytics/knowledge_graph_demo/kg_demo_template.ipynb)" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extracted 180 business entities:\n", - " [Product] Yahoo Sports (confidence=1.00)\n", - " [Advertiser] Nike (confidence=1.00)\n", - " [Product] Yahoo Finance (confidence=1.00)\n", - " [Advertiser] Nike (confidence=1.00)\n", - " [Budget] 50000 (confidence=1.00)\n", - " [Budget] 50000 (confidence=1.00)\n", - " [Product] Yahoo Homepage (confidence=1.00)\n", - " [Budget] 200000 (confidence=1.00)\n", - " [Product] Yahoo Finance (confidence=1.00)\n", - " [Product] Yahoo Mail (confidence=1.00)\n", - " [Product] Yahoo Sports (confidence=1.00)\n", - " [Audience] Millennials 25-34 (confidence=1.00)\n", - " [Product] Yahoo Homepage (confidence=1.00)\n", - " [Campaign] Tesla_ModelY_Jul2025 (confidence=1.00)\n", - " [Budget] $50,000 (confidence=1.00)\n", - " ... (165 more)\n" - ] - } - ], - "source": [ - "# Extract business entities using AI.GENERATE (server-side)\n", - "# Falls back to client-side extraction if AI.GENERATE is not available\n", - "try:\n", - " biz_nodes = cgm.extract_biz_nodes(\n", - " session_ids=session_ids,\n", - " use_ai_generate=True,\n", - " )\n", - " print(f\"Extracted {len(biz_nodes)} business entities:\")\n", - " for node in biz_nodes[:15]:\n", - " print(f\" [{node.node_type}] {node.node_value} \"\n", - " f\"(confidence={node.confidence:.2f})\")\n", - " if len(biz_nodes) > 15:\n", - " print(f\" ... 
({len(biz_nodes) - 15} more)\")\n", - "except Exception as exc:\n", - " print(f\"AI.GENERATE extraction not available: {exc}\")\n", - " print(\"Falling back to client-side extraction ...\")\n", - " biz_nodes = cgm.extract_biz_nodes(\n", - " session_ids=session_ids,\n", - " use_ai_generate=False,\n", - " )\n", - " print(f\"Fetched {len(biz_nodes)} raw payloads for manual review.\")" - ] - }, - { - "cell_type": "markdown", - "id": "5a903ce8", - "metadata": {}, - "source": [ - "### Alternative: Manual Entity Extraction & Storage\n", - "\n", - "If `AI.GENERATE` is not available, you can extract entities client-side and store them via `store_biz_nodes()`." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "3e744c32", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:01.880329Z", - "iopub.status.busy": "2026-03-05T09:29:01.880213Z", - "iopub.status.idle": "2026-03-05T09:29:01.883830Z", - "shell.execute_reply": "2026-03-05T09:29:01.883278Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 16, + "id": "66lobz9hhp4", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:07.901006Z", + "iopub.status.busy": "2026-03-05T09:29:07.900843Z", + "iopub.status.idle": "2026-03-05T09:29:07.905103Z", + "shell.execute_reply": "2026-03-05T09:29:07.904010Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Copy the DDL from the cell above into a %%bigquery cell in BQ Studio.\n", + "The Property Graph DDL was generated by the SDK in the previous cell.\n" + ] + } + ], + "source": [ + "# ============================================================\n", + "# BQ Studio: Create Property Graph\n", + "# Uncomment the %%bigquery magic below when running in BQ Studio\n", + "# ============================================================\n", + "\n", + "# %%bigquery\n", + "# CREATE OR REPLACE PROPERTY GRAPH `{DATASET_ID}.adcp_context_graph`\n", + "# NODE TABLES (\n", + "# 
`{DATASET_ID}.{TABLE_ID}` AS TechNode\n", + "# KEY (span_id)\n", + "# LABEL TechNode\n", + "# PROPERTIES (event_type, agent, timestamp, session_id, invocation_id,\n", + "# content, latency_ms, status, error_message),\n", + "# `{DATASET_ID}.adcp_biz_nodes` AS BizNode\n", + "# KEY (node_value)\n", + "# LABEL BizNode\n", + "# PROPERTIES (node_type, node_value, confidence, session_id)\n", + "# )\n", + "# EDGE TABLES (\n", + "# `{DATASET_ID}.{TABLE_ID}` AS Caused\n", + "# KEY (span_id)\n", + "# SOURCE KEY (parent_span_id) REFERENCES TechNode (span_id)\n", + "# DESTINATION KEY (span_id) REFERENCES TechNode (span_id)\n", + "# LABEL Caused,\n", + "# `{DATASET_ID}.adcp_cross_links` AS Evaluated\n", + "# KEY (span_id)\n", + "# SOURCE KEY (span_id) REFERENCES TechNode (span_id)\n", + "# DESTINATION KEY (node_value) REFERENCES BizNode (node_value)\n", + "# LABEL Evaluated\n", + "# )\n", + "\n", + "# Print the DDL for copy-paste into BQ Studio\n", + "print(\"Copy the DDL from the cell above into a %%bigquery cell in BQ Studio.\")\n", + "print(\"The Property Graph DDL was generated by the SDK in the previous cell.\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Manual biz nodes prepared: 7\n", - " [Advertiser] ELF Cosmetics\n", - " [Product] Yahoo Homepage\n", - " [Product] Yahoo Mail\n", - " [Targeting] Millennials 25-34\n", - " [Targeting] Beauty Enthusiasts\n", - " [Budget] $50,000\n", - " [Campaign] ELF_Skincare_May2025\n" - ] - } - ], - "source": [ - "from bigquery_agent_analytics import BizNode\n", - "\n", - "# Example: manually define business entities from the ELF campaign\n", - "manual_biz_nodes = [\n", - " BizNode(\n", - " span_id=\"manual-1\",\n", - " session_id=session_ids[0],\n", - " node_type=\"Advertiser\",\n", - " node_value=\"ELF Cosmetics\",\n", - " confidence=1.0,\n", - " ),\n", - " BizNode(\n", - " span_id=\"manual-2\",\n", - " session_id=session_ids[0],\n", - " node_type=\"Product\",\n", - " node_value=\"Yahoo Homepage\",\n", - " 
confidence=0.95,\n", - " ),\n", - " BizNode(\n", - " span_id=\"manual-3\",\n", - " session_id=session_ids[0],\n", - " node_type=\"Product\",\n", - " node_value=\"Yahoo Mail\",\n", - " confidence=0.90,\n", - " ),\n", - " BizNode(\n", - " span_id=\"manual-4\",\n", - " session_id=session_ids[0],\n", - " node_type=\"Targeting\",\n", - " node_value=\"Millennials 25-34\",\n", - " confidence=0.92,\n", - " ),\n", - " BizNode(\n", - " span_id=\"manual-5\",\n", - " session_id=session_ids[0],\n", - " node_type=\"Targeting\",\n", - " node_value=\"Beauty Enthusiasts\",\n", - " confidence=0.95,\n", - " ),\n", - " BizNode(\n", - " span_id=\"manual-6\",\n", - " session_id=session_ids[0],\n", - " node_type=\"Budget\",\n", - " node_value=\"$50,000\",\n", - " confidence=1.0,\n", - " ),\n", - " BizNode(\n", - " span_id=\"manual-7\",\n", - " session_id=session_ids[0],\n", - " node_type=\"Campaign\",\n", - " node_value=\"ELF_Skincare_May2025\",\n", - " confidence=1.0,\n", - " ),\n", - "]\n", - "\n", - "print(f\"Manual biz nodes prepared: {len(manual_biz_nodes)}\")\n", - "for node in manual_biz_nodes:\n", - " print(f\" [{node.node_type}] {node.node_value}\")\n", - "\n", - "# Uncomment to store in BigQuery:\n", - "# success = cgm.store_biz_nodes(manual_biz_nodes)\n", - "# print(f\"Stored: {success}\")" - ] - }, - { - "cell_type": "markdown", - "id": "19f70d11", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 6: Property Graph DDL\n", - "\n", - "We generate and inspect the `CREATE PROPERTY GRAPH` DDL that formalizes the 4-pillar Context Graph in BigQuery. 
This creates:\n", - "- **TechNode** — spans from `agent_events` (execution lineage)\n", - "- **BizNode** — entities from `extracted_biz_nodes` (domain entities)\n", - "- **Caused** edges — parent→child span linkage (decision lineage)\n", - "- **Evaluated** edges — tech event → business entity cross-links" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "3625ad9f", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:01.885148Z", - "iopub.status.busy": "2026-03-05T09:29:01.885079Z", - "iopub.status.idle": "2026-03-05T09:29:01.887228Z", - "shell.execute_reply": "2026-03-05T09:29:01.886715Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "3981ee96", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 7: GQL Reasoning Chain Traversal\n", + "\n", + "With the Property Graph in place, we use **Graph Query Language (GQL)** to answer the question:\n", + "\n", + "> _\"Why was the Yahoo Homepage selected for the $50K ELF campaign?\"_\n", + "\n", + "The GQL query traces causal hops from the final decision back to the business inputs that informed it." 
+ ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "======================================================================\n", - " CREATE PROPERTY GRAPH DDL\n", - "======================================================================\n", - "CREATE OR REPLACE PROPERTY GRAPH `test-project-0728-467323.agent_analytics.adcp_context_graph`\n", - " NODE TABLES (\n", - " -- Technical execution nodes (spans from ADK plugin)\n", - " `test-project-0728-467323.agent_analytics.agent_events` AS TechNode\n", - " KEY (span_id)\n", - " LABEL TechNode\n", - " PROPERTIES (\n", - " event_type,\n", - " agent,\n", - " timestamp,\n", - " session_id,\n", - " invocation_id,\n", - " content,\n", - " latency_ms,\n", - " status,\n", - " error_message\n", - " ),\n", - " -- Business domain nodes (extracted entities)\n", - " `test-project-0728-467323.agent_analytics.adcp_biz_nodes` AS BizNode\n", - " KEY (node_value)\n", - " LABEL BizNode\n", - " PROPERTIES (\n", - " node_type,\n", - " node_value,\n", - " confidence,\n", - " session_id\n", - " )\n", - " )\n", - " EDGE TABLES (\n", - " -- Causal lineage: parent span -> child span\n", - " `test-project-0728-467323.agent_analytics.agent_events` AS Caused\n", - " KEY (span_id)\n", - " SOURCE KEY (parent_span_id) REFERENCES TechNode (span_id)\n", - " DESTINATION KEY (span_id) REFERENCES TechNode (span_id)\n", - " LABEL Caused,\n", - "\n", - " -- Cross-link: technical event -> business entity it evaluated\n", - " `test-project-0728-467323.agent_analytics.adcp_cross_links` AS Evaluated\n", - " KEY (span_id)\n", - " SOURCE KEY (span_id) REFERENCES TechNode (span_id)\n", - " DESTINATION KEY (node_value) REFERENCES BizNode (node_value)\n", - " LABEL Evaluated\n", - " )\n", - "\n" - ] - } - ], - "source": [ - "# Generate and display the Property Graph DDL\n", - "ddl = cgm.get_property_graph_ddl()\n", - "\n", - "print(\"=\" * 70)\n", - "print(\" CREATE PROPERTY GRAPH DDL\")\n", - "print(\"=\" * 70)\n", - "print(ddl)" - ] - }, - { - 
"cell_type": "code", - "execution_count": 15, - "id": "08c600a1", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:01.888414Z", - "iopub.status.busy": "2026-03-05T09:29:01.888327Z", - "iopub.status.idle": "2026-03-05T09:29:07.898328Z", - "shell.execute_reply": "2026-03-05T09:29:07.897303Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 17, + "id": "cf8343eb", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:07.907407Z", + "iopub.status.busy": "2026-03-05T09:29:07.907269Z", + "iopub.status.idle": "2026-03-05T09:29:07.910486Z", + "shell.execute_reply": "2026-03-05T09:29:07.909808Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + " GQL: Why was Yahoo Homepage selected?\n", + "======================================================================\n", + "GRAPH `test-project-0728-467323.agent_analytics.adcp_context_graph`\n", + "MATCH\n", + " (decision:TechNode)-[c:Caused]->{1,15}(step:TechNode)\n", + " -[e:Evaluated]->(biz:BizNode)\n", + "WHERE decision.event_type = @decision_event_type\n", + " AND biz.node_value = 'Yahoo Homepage'\n", + "RETURN\n", + " TO_JSON(decision) AS decision_node,\n", + " decision.span_id AS decision_span_id,\n", + " decision.event_type AS decision_type,\n", + " step.span_id AS reasoning_span_id,\n", + " step.event_type AS step_type,\n", + " step.agent AS step_agent,\n", + " COALESCE(\n", + " JSON_EXTRACT_SCALAR(step.content, '$.text_summary'),\n", + " JSON_EXTRACT_SCALAR(step.content, '$.response'),\n", + " ''\n", + " ) AS reasoning_text,\n", + " step.latency_ms AS step_latency_ms,\n", + " biz.node_type AS entity_type,\n", + " biz.node_value AS entity_value,\n", + " biz.confidence AS entity_confidence,\n", + " TO_JSON(step) AS step_node,\n", + " TO_JSON(biz) AS biz_node\n", + "ORDER BY step.timestamp ASC\n", + "LIMIT @result_limit\n", + "\n" + ] + } + ], 
+ "source": [ + "# Generate the GQL reasoning chain query\n", + "gql_query = cgm.get_reasoning_chain_gql(\n", + " decision_event_type=\"AGENT_COMPLETED\",\n", + " biz_entity=\"Yahoo Homepage\",\n", + " max_hops=15,\n", + ")\n", + "\n", + "print(\"=\" * 70)\n", + "print(\" GQL: Why was Yahoo Homepage selected?\")\n", + "print(\"=\" * 70)\n", + "print(gql_query)" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cross-links created: True\n" - ] + "cell_type": "code", + "execution_count": 18, + "id": "6bc59a25", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:07.912716Z", + "iopub.status.busy": "2026-03-05T09:29:07.912595Z", + "iopub.status.idle": "2026-03-05T09:29:08.134377Z", + "shell.execute_reply": "2026-03-05T09:29:08.132739Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GQL reasoning chain query failed: 400 Property graphs are not supported.; reason: invalid, message: Property graphs are not supported.\n", + "\n", + "Location: US\n", + "Job ID: 87c077bb-8160-44e3-87f1-82adbee95d1d\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reasoning chain: 0 steps\n" + ] + } + ], + "source": [ + "# Execute the GQL query (requires Property Graph to be created)\n", + "try:\n", + " chain = cgm.explain_decision(\n", + " decision_event_type=\"AGENT_COMPLETED\",\n", + " biz_entity=\"Yahoo Homepage\",\n", + " )\n", + " print(f\"Reasoning chain: {len(chain)} steps\")\n", + " for step in chain:\n", + " print(f\" [{step.get('step_type', '?')}] \"\n", + " f\"{step.get('step_agent', '?')}: \"\n", + " f\"{step.get('reasoning_text', '')[:150]}\")\n", + " print(f\" -> Entity: {step.get('entity_type', '?')}: \"\n", + " f\"{step.get('entity_value', '?')}\")\n", + "except Exception as exc:\n", + " print(f\"GQL traversal: {exc}\")\n", + " print(\"\\nThe GQL query above can be run in BigQuery Console\")\n", + " print(\"once the Property Graph is created.\")" + ] 
}, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Failed to create Property Graph: 400 Unsupported statement CREATE PROPERTY GRAPH.; reason: invalidQuery, message: Unsupported statement CREATE PROPERTY GRAPH.\n", - "\n", - "Location: US\n", - "Job ID: 89c29192-bb8a-4b3a-821c-0e1d34a2ceed\n", - "\n" - ] + "cell_type": "code", + "execution_count": 19, + "id": "335bc8db", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:08.136944Z", + "iopub.status.busy": "2026-03-05T09:29:08.136822Z", + "iopub.status.idle": "2026-03-05T09:29:08.139362Z", + "shell.execute_reply": "2026-03-05T09:29:08.138893Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + " GQL: Full Causal Chain for ELF Campaign\n", + "======================================================================\n", + "GRAPH `test-project-0728-467323.agent_analytics.adcp_context_graph`\n", + "MATCH\n", + " (root:TechNode)-[c:Caused]->{1,20}(leaf:TechNode)\n", + "WHERE root.session_id = @session_id\n", + " AND root.event_type = 'USER_MESSAGE_RECEIVED'\n", + "RETURN\n", + " TO_JSON(root) AS root_node,\n", + " root.span_id AS root_span_id,\n", + " leaf.span_id AS leaf_span_id,\n", + " leaf.event_type AS leaf_event_type,\n", + " leaf.agent AS leaf_agent,\n", + " COALESCE(\n", + " JSON_EXTRACT_SCALAR(leaf.content, '$.text_summary'),\n", + " JSON_EXTRACT_SCALAR(leaf.content, '$.response'),\n", + " ''\n", + " ) AS leaf_content,\n", + " leaf.latency_ms AS leaf_latency_ms,\n", + " TO_JSON(leaf) AS leaf_node,\n", + " TO_JSON(c) AS edge\n", + "ORDER BY leaf.timestamp ASC\n", + "LIMIT @result_limit\n", + "\n" + ] + } + ], + "source": [ + "# Full causal chain for the ELF campaign session\n", + "causal_gql = cgm.get_causal_chain_gql(\n", + " session_id=session_ids[0],\n", + " max_hops=20,\n", + ")\n", + "\n", + "print(\"=\" * 70)\n", + "print(\" GQL: Full Causal Chain for ELF 
Campaign\")\n", + "print(\"=\" * 70)\n", + "print(causal_gql)" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Property Graph created: False\n" - ] - } - ], - "source": [ - "# Create the Property Graph in BigQuery\n", - "# NOTE: Property Graphs require BigQuery Studio (BQ Notebooks).\n", - "# If running outside BQ Studio, the DDL and GQL are generated for you\n", - "# to copy-paste into a BigQuery Studio notebook cell with %%bigquery magic.\n", - "\n", - "# Step 1: Create cross-links\n", - "try:\n", - " cross_links_ok = cgm.create_cross_links(session_ids)\n", - " print(f\"Cross-links created: {cross_links_ok}\")\n", - "except Exception as exc:\n", - " print(f\"Cross-links creation: {exc}\")\n", - "\n", - "# Step 2: Create the Property Graph\n", - "try:\n", - " graph_ok = cgm.create_property_graph()\n", - " print(f\"Property Graph created: {graph_ok}\")\n", - "except Exception as exc:\n", - " print(f\"Property Graph creation: {exc}\")\n", - " print(\"\\n--- To create the Property Graph, run the DDL above in ---\")\n", - " print(\"--- a BigQuery Studio notebook cell: ---\")\n", - " print(\"--- %%bigquery ---\")\n", - " print(\"--- CREATE OR REPLACE PROPERTY GRAPH ... ---\")\n", - " print(\"--- ---\")\n", - " print(\"--- Reference: https://github.com/GoogleCloudPlatform/ ---\")\n", - " print(\"--- devrel-demos/blob/main/data-analytics/ ---\")\n", - " print(\"--- knowledge_graph_demo/kg_demo_template.ipynb ---\")" - ] - }, - { - "cell_type": "markdown", - "id": "q9v6s6cftur", - "metadata": {}, - "source": [ - "### Run in BigQuery Studio (BQ Notebooks)\n", - "\n", - "Property Graphs and GQL visualization require **BigQuery Studio**. When running this notebook in BQ Studio, uncomment and execute the `%%bigquery` cells below. 
The graph will render as an interactive visualization.\n", - "\n", - "> **Reference**: [Knowledge Graph Demo by Google Cloud](https://github.com/GoogleCloudPlatform/devrel-demos/blob/main/data-analytics/knowledge_graph_demo/kg_demo_template.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "66lobz9hhp4", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:07.901006Z", - "iopub.status.busy": "2026-03-05T09:29:07.900843Z", - "iopub.status.idle": "2026-03-05T09:29:07.905103Z", - "shell.execute_reply": "2026-03-05T09:29:07.904010Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 20, + "id": "xif3qepf5ng", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:08.140628Z", + "iopub.status.busy": "2026-03-05T09:29:08.140553Z", + "iopub.status.idle": "2026-03-05T09:29:08.143016Z", + "shell.execute_reply": "2026-03-05T09:29:08.142610Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GQL visualization cells are ready for BQ Studio.\n", + "Uncomment the %%bigquery --graph cells above when running in BQ Studio.\n" + ] + } + ], + "source": [ + "# ============================================================\n", + "# BQ Studio: Visualize the Context Graph\n", + "# Uncomment the %%bigquery magic below when running in BQ Studio.\n", + "# These cells render interactive graph visualizations.\n", + "# ============================================================\n", + "\n", + "# --- Visualize ALL relationships in the Context Graph ---\n", + "# %%bigquery --graph display_only\n", + "# GRAPH `agent_analytics.adcp_context_graph`\n", + "# MATCH (source)-[r]->(target)\n", + "# RETURN\n", + "# TO_JSON(source) AS Source_Node,\n", + "# TO_JSON(r) AS Edge,\n", + "# TO_JSON(target) AS Target_Node\n", + "\n", + "# --- Reasoning Chain: Why was Yahoo Homepage selected? 
---\n", + "# %%bigquery --graph display_only\n", + "# GRAPH `agent_analytics.adcp_context_graph`\n", + "# MATCH (decision:TechNode)-[c:Caused]->{1,15}(step:TechNode)\n", + "# -[e:Evaluated]->(biz:BizNode)\n", + "# WHERE decision.event_type = 'AGENT_COMPLETED'\n", + "# AND biz.node_value = 'Yahoo Homepage'\n", + "# RETURN\n", + "# TO_JSON(decision) AS decision_node,\n", + "# TO_JSON(c) AS caused_edge,\n", + "# TO_JSON(step) AS reasoning_step,\n", + "# TO_JSON(e) AS evaluated_edge,\n", + "# TO_JSON(biz) AS business_entity\n", + "\n", + "# --- Full Causal Chain for a session ---\n", + "# %%bigquery --graph display_only\n", + "# GRAPH `agent_analytics.adcp_context_graph`\n", + "# MATCH (root:TechNode)-[c:Caused]->{1,20}(leaf:TechNode)\n", + "# WHERE root.event_type = 'USER_MESSAGE_RECEIVED'\n", + "# RETURN\n", + "# TO_JSON(root) AS root_node,\n", + "# TO_JSON(c) AS caused_edge,\n", + "# TO_JSON(leaf) AS leaf_node\n", + "\n", + "print(\"GQL visualization cells are ready for BQ Studio.\")\n", + "print(\"Uncomment the %%bigquery --graph cells above when running in BQ Studio.\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Copy the DDL from the cell above into a %%bigquery cell in BQ Studio.\n", - "The Property Graph DDL was generated by the SDK in the previous cell.\n" - ] - } - ], - "source": [ - "# ============================================================\n", - "# BQ Studio: Create Property Graph\n", - "# Uncomment the %%bigquery magic below when running in BQ Studio\n", - "# ============================================================\n", - "\n", - "# %%bigquery\n", - "# CREATE OR REPLACE PROPERTY GRAPH `{DATASET_ID}.adcp_context_graph`\n", - "# NODE TABLES (\n", - "# `{DATASET_ID}.{TABLE_ID}` AS TechNode\n", - "# KEY (span_id)\n", - "# LABEL TechNode\n", - "# PROPERTIES (event_type, agent, timestamp, session_id, invocation_id,\n", - "# content, latency_ms, status, error_message),\n", - "# `{DATASET_ID}.adcp_biz_nodes` AS BizNode\n", - 
"# KEY (node_value)\n", - "# LABEL BizNode\n", - "# PROPERTIES (node_type, node_value, confidence, session_id)\n", - "# )\n", - "# EDGE TABLES (\n", - "# `{DATASET_ID}.{TABLE_ID}` AS Caused\n", - "# KEY (span_id)\n", - "# SOURCE KEY (parent_span_id) REFERENCES TechNode (span_id)\n", - "# DESTINATION KEY (span_id) REFERENCES TechNode (span_id)\n", - "# LABEL Caused,\n", - "# `{DATASET_ID}.adcp_cross_links` AS Evaluated\n", - "# KEY (span_id)\n", - "# SOURCE KEY (span_id) REFERENCES TechNode (span_id)\n", - "# DESTINATION KEY (node_value) REFERENCES BizNode (node_value)\n", - "# LABEL Evaluated\n", - "# )\n", - "\n", - "# Print the DDL for copy-paste into BQ Studio\n", - "print(\"Copy the DDL from the cell above into a %%bigquery cell in BQ Studio.\")\n", - "print(\"The Property Graph DDL was generated by the SDK in the previous cell.\")" - ] - }, - { - "cell_type": "markdown", - "id": "3981ee96", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 7: GQL Reasoning Chain Traversal\n", - "\n", - "With the Property Graph in place, we use **Graph Query Language (GQL)** to answer the question:\n", - "\n", - "> _\"Why was the Yahoo Homepage selected for the $50K ELF campaign?\"_\n", - "\n", - "The GQL query traces causal hops from the final decision back to the business inputs that informed it." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "cf8343eb", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:07.907407Z", - "iopub.status.busy": "2026-03-05T09:29:07.907269Z", - "iopub.status.idle": "2026-03-05T09:29:07.910486Z", - "shell.execute_reply": "2026-03-05T09:29:07.909808Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "f4db3f75", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 8: World Change Detection\n", + "\n", + "A2A direct deal workflows can pause for **days or weeks** waiting for human approval. 
The Context Graph might become stale:\n", + "- Yahoo Homepage inventory is sold out\n", + "- CPM prices have changed\n", + "- Target audience segments have shifted\n", + "\n", + "We run a **\"diff check\"** before the HITL approval is finalized. The SDK traverses the graph to find the original BizNodes, queries current availability, and alerts the manager if the \"World State\" has drifted." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "======================================================================\n", - " GQL: Why was Yahoo Homepage selected?\n", - "======================================================================\n", - "GRAPH `test-project-0728-467323.agent_analytics.adcp_context_graph`\n", - "MATCH\n", - " (decision:TechNode)-[c:Caused]->{1,15}(step:TechNode)\n", - " -[e:Evaluated]->(biz:BizNode)\n", - "WHERE decision.event_type = @decision_event_type\n", - " AND biz.node_value = 'Yahoo Homepage'\n", - "RETURN\n", - " TO_JSON(decision) AS decision_node,\n", - " decision.span_id AS decision_span_id,\n", - " decision.event_type AS decision_type,\n", - " step.span_id AS reasoning_span_id,\n", - " step.event_type AS step_type,\n", - " step.agent AS step_agent,\n", - " COALESCE(\n", - " JSON_EXTRACT_SCALAR(step.content, '$.text_summary'),\n", - " JSON_EXTRACT_SCALAR(step.content, '$.response'),\n", - " ''\n", - " ) AS reasoning_text,\n", - " step.latency_ms AS step_latency_ms,\n", - " biz.node_type AS entity_type,\n", - " biz.node_value AS entity_value,\n", - " biz.confidence AS entity_confidence,\n", - " TO_JSON(step) AS step_node,\n", - " TO_JSON(biz) AS biz_node\n", - "ORDER BY step.timestamp ASC\n", - "LIMIT @result_limit\n", - "\n" - ] - } - ], - "source": [ - "# Generate the GQL reasoning chain query\n", - "gql_query = cgm.get_reasoning_chain_gql(\n", - " decision_event_type=\"AGENT_COMPLETED\",\n", - " biz_entity=\"Yahoo Homepage\",\n", - " max_hops=15,\n", - ")\n", - "\n", - "print(\"=\" * 70)\n", - "print(\" GQL: Why 
was Yahoo Homepage selected?\")\n", - "print(\"=\" * 70)\n", - "print(gql_query)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "6bc59a25", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:07.912716Z", - "iopub.status.busy": "2026-03-05T09:29:07.912595Z", - "iopub.status.idle": "2026-03-05T09:29:08.134377Z", - "shell.execute_reply": "2026-03-05T09:29:08.132739Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 21, + "id": "2079e426", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:08.144314Z", + "iopub.status.busy": "2026-03-05T09:29:08.144248Z", + "iopub.status.idle": "2026-03-05T09:29:09.150800Z", + "shell.execute_reply": "2026-03-05T09:29:09.149816Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "World Change Report — Session: adcp-a20d176b82af\n", + " Entities checked : 67\n", + " Stale entities : 4\n", + " Safe to approve : False\n", + " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", + " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", + " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", + " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n" + ] + } + ], + "source": [ + "from bigquery_agent_analytics import WorldChangeReport\n", + "\n", + "\n", + "def check_current_inventory(biz_node):\n", + " \"\"\"Simulate checking current inventory state.\n", + "\n", + " In production, this would call the actual Yahoo inventory API.\n", + " Here we simulate that Yahoo Homepage is now sold out (world changed!)\n", + " but Yahoo Mail is still available.\n", + " \"\"\"\n", + " if biz_node.node_type != \"Product\":\n", + " return {\"available\": True, \"current_value\": 
biz_node.node_value}\n", + "\n", + " if biz_node.node_value == \"Yahoo Homepage\":\n", + " return {\n", + " \"available\": False,\n", + " \"current_value\": \"SOLD OUT -- Q2 inventory depleted\",\n", + " \"drift_type\": \"inventory_depleted\",\n", + " \"severity\": 0.95,\n", + " \"recommendation\": (\n", + " \"Yahoo Homepage inventory is sold out for Q2. \"\n", + " \"Consider reallocating budget to Yahoo Finance \"\n", + " \"(similar premium placement, 2.3M impressions available).\"\n", + " ),\n", + " }\n", + "\n", + " return {\"available\": True, \"current_value\": biz_node.node_value}\n", + "\n", + "\n", + "# Run world change detection on the ELF campaign\n", + "# First, store some manual biz nodes for the session to detect against\n", + "try:\n", + " report = cgm.detect_world_changes(\n", + " session_id=session_ids[0],\n", + " current_state_fn=check_current_inventory,\n", + " )\n", + " print(report.summary())\n", + "except Exception as exc:\n", + " print(f\"Note: World change detection requires biz_nodes table: {exc}\")\n", + " print(\"\\nDemonstrating with manual nodes instead...\")\n", + "\n", + " # Demonstrate with manual nodes directly\n", + " from bigquery_agent_analytics.context_graph import WorldChangeAlert\n", + "\n", + " manual_report = WorldChangeReport(\n", + " session_id=session_ids[0],\n", + " alerts=[\n", + " WorldChangeAlert(\n", + " biz_node=\"Yahoo Homepage\",\n", + " original_state=\"Product: Yahoo Homepage (available, CPM $12.50)\",\n", + " current_state=\"SOLD OUT -- Q2 inventory depleted\",\n", + " drift_type=\"inventory_depleted\",\n", + " severity=0.95,\n", + " recommendation=(\n", + " \"Yahoo Homepage inventory is sold out for Q2. 
\"\n", + " \"Consider reallocating $8,000 budget to Yahoo Finance \"\n", + " \"(similar premium placement, 2.3M impressions available).\"\n", + " ),\n", + " ),\n", + " ],\n", + " total_entities_checked=7,\n", + " stale_entities=1,\n", + " is_safe_to_approve=False,\n", + " )\n", + " print(manual_report.summary())\n", + " print(f\"\\nRecommendation: {manual_report.alerts[0].recommendation}\")" + ] + }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "GQL reasoning chain query failed: 400 Property graphs are not supported.; reason: invalid, message: Property graphs are not supported.\n", - "\n", - "Location: US\n", - "Job ID: 87c077bb-8160-44e3-87f1-82adbee95d1d\n", - "\n" - ] + "cell_type": "markdown", + "id": "f0fed921", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 9: SDK Evaluation Pipeline\n", + "\n", + "Now we evaluate the ADCP agent's performance using the full SDK evaluation stack." + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reasoning chain: 0 steps\n" - ] - } - ], - "source": [ - "# Execute the GQL query (requires Property Graph to be created)\n", - "try:\n", - " chain = cgm.explain_decision(\n", - " decision_event_type=\"AGENT_COMPLETED\",\n", - " biz_entity=\"Yahoo Homepage\",\n", - " )\n", - " print(f\"Reasoning chain: {len(chain)} steps\")\n", - " for step in chain:\n", - " print(f\" [{step.get('step_type', '?')}] \"\n", - " f\"{step.get('step_agent', '?')}: \"\n", - " f\"{step.get('reasoning_text', '')[:150]}\")\n", - " print(f\" -> Entity: {step.get('entity_type', '?')}: \"\n", - " f\"{step.get('entity_value', '?')}\")\n", - "except Exception as exc:\n", - " print(f\"GQL traversal: {exc}\")\n", - " print(\"\\nThe GQL query above can be run in BigQuery Console\")\n", - " print(\"once the Property Graph is created.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "335bc8db", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:08.136944Z", - 
"iopub.status.busy": "2026-03-05T09:29:08.136822Z", - "iopub.status.idle": "2026-03-05T09:29:08.139362Z", - "shell.execute_reply": "2026-03-05T09:29:08.138893Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "8cc77228", + "metadata": {}, + "source": [ + "### 9a. Code-Based Evaluation" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "======================================================================\n", - " GQL: Full Causal Chain for ELF Campaign\n", - "======================================================================\n", - "GRAPH `test-project-0728-467323.agent_analytics.adcp_context_graph`\n", - "MATCH\n", - " (root:TechNode)-[c:Caused]->{1,20}(leaf:TechNode)\n", - "WHERE root.session_id = @session_id\n", - " AND root.event_type = 'USER_MESSAGE_RECEIVED'\n", - "RETURN\n", - " TO_JSON(root) AS root_node,\n", - " root.span_id AS root_span_id,\n", - " leaf.span_id AS leaf_span_id,\n", - " leaf.event_type AS leaf_event_type,\n", - " leaf.agent AS leaf_agent,\n", - " COALESCE(\n", - " JSON_EXTRACT_SCALAR(leaf.content, '$.text_summary'),\n", - " JSON_EXTRACT_SCALAR(leaf.content, '$.response'),\n", - " ''\n", - " ) AS leaf_content,\n", - " leaf.latency_ms AS leaf_latency_ms,\n", - " TO_JSON(leaf) AS leaf_node,\n", - " TO_JSON(c) AS edge\n", - "ORDER BY leaf.timestamp ASC\n", - "LIMIT @result_limit\n", - "\n" - ] - } - ], - "source": [ - "# Full causal chain for the ELF campaign session\n", - "causal_gql = cgm.get_causal_chain_gql(\n", - " session_id=session_ids[0],\n", - " max_hops=20,\n", - ")\n", - "\n", - "print(\"=\" * 70)\n", - "print(\" GQL: Full Causal Chain for ELF Campaign\")\n", - "print(\"=\" * 70)\n", - "print(causal_gql)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "xif3qepf5ng", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:08.140628Z", - "iopub.status.busy": "2026-03-05T09:29:08.140553Z", - "iopub.status.idle": "2026-03-05T09:29:08.143016Z", - 
"shell.execute_reply": "2026-03-05T09:29:08.142610Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 22, + "id": "b2a911a2", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:09.153081Z", + "iopub.status.busy": "2026-03-05T09:29:09.152945Z", + "iopub.status.idle": "2026-03-05T09:29:13.675561Z", + "shell.execute_reply": "2026-03-05T09:29:13.674462Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[latency]\n", + "Evaluation Report: latency_evaluator\n", + " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", + " Sessions: 3\n", + " Passed: 3 (100%)\n", + " Failed: 0\n", + " Aggregate Scores:\n", + " latency: 0.842\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[turn_count]\n", + "Evaluation Report: turn_count_evaluator\n", + " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", + " Sessions: 3\n", + " Passed: 3 (100%)\n", + " Failed: 0\n", + " Aggregate Scores:\n", + " turn_count: 0.867\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[error_rate]\n", + "Evaluation Report: error_rate_evaluator\n", + " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", + " Sessions: 3\n", + " Passed: 3 (100%)\n", + " Failed: 0\n", + " Aggregate Scores:\n", + " error_rate: 1.000\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[token_efficiency]\n", + "Evaluation Report: token_efficiency_evaluator\n", + " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", + " Sessions: 3\n", + " Passed: 3 (100%)\n", + " Failed: 0\n", + " Aggregate Scores:\n", + " token_efficiency: 0.932\n" + ] + } + ], + "source": [ + "from bigquery_agent_analytics import SystemEvaluator\n", 
+ "\n", + "trace_filter = TraceFilter(session_ids=session_ids)\n", + "\n", + "presets = [\n", + " (\"latency\", SystemEvaluator.latency(threshold_ms=30000)),\n", + " (\"turn_count\", SystemEvaluator.turn_count(max_turns=10)),\n", + " (\"error_rate\", SystemEvaluator.error_rate(max_error_rate=0.1)),\n", + " (\"token_efficiency\", SystemEvaluator.token_efficiency(max_tokens=100000)),\n", + "]\n", + "\n", + "for name, evaluator in presets:\n", + " try:\n", + " report = asyncio.get_event_loop().run_until_complete(\n", + " asyncio.to_thread(\n", + " client.evaluate,\n", + " evaluator=evaluator,\n", + " filters=trace_filter,\n", + " )\n", + " )\n", + " print(f\"\\n[{name}]\")\n", + " print(report.summary())\n", + " except Exception as exc:\n", + " print(f\"\\n[{name}] Failed: {exc}\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "GQL visualization cells are ready for BQ Studio.\n", - "Uncomment the %%bigquery --graph cells above when running in BQ Studio.\n" - ] - } - ], - "source": [ - "# ============================================================\n", - "# BQ Studio: Visualize the Context Graph\n", - "# Uncomment the %%bigquery magic below when running in BQ Studio.\n", - "# These cells render interactive graph visualizations.\n", - "# ============================================================\n", - "\n", - "# --- Visualize ALL relationships in the Context Graph ---\n", - "# %%bigquery --graph display_only\n", - "# GRAPH `agent_analytics.adcp_context_graph`\n", - "# MATCH (source)-[r]->(target)\n", - "# RETURN\n", - "# TO_JSON(source) AS Source_Node,\n", - "# TO_JSON(r) AS Edge,\n", - "# TO_JSON(target) AS Target_Node\n", - "\n", - "# --- Reasoning Chain: Why was Yahoo Homepage selected? 
---\n", - "# %%bigquery --graph display_only\n", - "# GRAPH `agent_analytics.adcp_context_graph`\n", - "# MATCH (decision:TechNode)-[c:Caused]->{1,15}(step:TechNode)\n", - "# -[e:Evaluated]->(biz:BizNode)\n", - "# WHERE decision.event_type = 'AGENT_COMPLETED'\n", - "# AND biz.node_value = 'Yahoo Homepage'\n", - "# RETURN\n", - "# TO_JSON(decision) AS decision_node,\n", - "# TO_JSON(c) AS caused_edge,\n", - "# TO_JSON(step) AS reasoning_step,\n", - "# TO_JSON(e) AS evaluated_edge,\n", - "# TO_JSON(biz) AS business_entity\n", - "\n", - "# --- Full Causal Chain for a session ---\n", - "# %%bigquery --graph display_only\n", - "# GRAPH `agent_analytics.adcp_context_graph`\n", - "# MATCH (root:TechNode)-[c:Caused]->{1,20}(leaf:TechNode)\n", - "# WHERE root.event_type = 'USER_MESSAGE_RECEIVED'\n", - "# RETURN\n", - "# TO_JSON(root) AS root_node,\n", - "# TO_JSON(c) AS caused_edge,\n", - "# TO_JSON(leaf) AS leaf_node\n", - "\n", - "print(\"GQL visualization cells are ready for BQ Studio.\")\n", - "print(\"Uncomment the %%bigquery --graph cells above when running in BQ Studio.\")" - ] - }, - { - "cell_type": "markdown", - "id": "f4db3f75", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 8: World Change Detection\n", - "\n", - "A2A direct deal workflows can pause for **days or weeks** waiting for human approval. The Context Graph might become stale:\n", - "- Yahoo Homepage inventory is sold out\n", - "- CPM prices have changed\n", - "- Target audience segments have shifted\n", - "\n", - "We run a **\"diff check\"** before the HITL approval is finalized. The SDK traverses the graph to find the original BizNodes, queries current availability, and alerts the manager if the \"World State\" has drifted." 
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "2079e426", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:08.144314Z", - "iopub.status.busy": "2026-03-05T09:29:08.144248Z", - "iopub.status.idle": "2026-03-05T09:29:09.150800Z", - "shell.execute_reply": "2026-03-05T09:29:09.149816Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "724698e1", + "metadata": {}, + "source": [ + "### 9b. LLM-as-Judge Evaluation" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "World Change Report — Session: adcp-a20d176b82af\n", - " Entities checked : 67\n", - " Stale entities : 4\n", - " Safe to approve : False\n", - " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", - " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", - " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n", - " [inventory_depleted] Yahoo Homepage: Product: Yahoo Homepage -> SOLD OUT -- Q2 inventory depleted (severity=0.95)\n" - ] - } - ], - "source": [ - "from bigquery_agent_analytics import WorldChangeReport\n", - "\n", - "\n", - "def check_current_inventory(biz_node):\n", - " \"\"\"Simulate checking current inventory state.\n", - "\n", - " In production, this would call the actual Yahoo inventory API.\n", - " Here we simulate that Yahoo Homepage is now sold out (world changed!)\n", - " but Yahoo Mail is still available.\n", - " \"\"\"\n", - " if biz_node.node_type != \"Product\":\n", - " return {\"available\": True, \"current_value\": biz_node.node_value}\n", - "\n", - " if biz_node.node_value == \"Yahoo Homepage\":\n", - " return {\n", - " \"available\": False,\n", - " \"current_value\": \"SOLD OUT -- Q2 inventory depleted\",\n", - " \"drift_type\": \"inventory_depleted\",\n", - " \"severity\": 0.95,\n", - " \"recommendation\": 
(\n", - " \"Yahoo Homepage inventory is sold out for Q2. \"\n", - " \"Consider reallocating budget to Yahoo Finance \"\n", - " \"(similar premium placement, 2.3M impressions available).\"\n", - " ),\n", - " }\n", - "\n", - " return {\"available\": True, \"current_value\": biz_node.node_value}\n", - "\n", - "\n", - "# Run world change detection on the ELF campaign\n", - "# First, store some manual biz nodes for the session to detect against\n", - "try:\n", - " report = cgm.detect_world_changes(\n", - " session_id=session_ids[0],\n", - " current_state_fn=check_current_inventory,\n", - " )\n", - " print(report.summary())\n", - "except Exception as exc:\n", - " print(f\"Note: World change detection requires biz_nodes table: {exc}\")\n", - " print(\"\\nDemonstrating with manual nodes instead...\")\n", - "\n", - " # Demonstrate with manual nodes directly\n", - " from bigquery_agent_analytics.context_graph import WorldChangeAlert\n", - "\n", - " manual_report = WorldChangeReport(\n", - " session_id=session_ids[0],\n", - " alerts=[\n", - " WorldChangeAlert(\n", - " biz_node=\"Yahoo Homepage\",\n", - " original_state=\"Product: Yahoo Homepage (available, CPM $12.50)\",\n", - " current_state=\"SOLD OUT -- Q2 inventory depleted\",\n", - " drift_type=\"inventory_depleted\",\n", - " severity=0.95,\n", - " recommendation=(\n", - " \"Yahoo Homepage inventory is sold out for Q2. 
\"\n", - " \"Consider reallocating $8,000 budget to Yahoo Finance \"\n", - " \"(similar premium placement, 2.3M impressions available).\"\n", - " ),\n", - " ),\n", - " ],\n", - " total_entities_checked=7,\n", - " stale_entities=1,\n", - " is_safe_to_approve=False,\n", - " )\n", - " print(manual_report.summary())\n", - " print(f\"\\nRecommendation: {manual_report.alerts[0].recommendation}\")" - ] - }, - { - "cell_type": "markdown", - "id": "f0fed921", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 9: SDK Evaluation Pipeline\n", - "\n", - "Now we evaluate the ADCP agent's performance using the full SDK evaluation stack." - ] - }, - { - "cell_type": "markdown", - "id": "8cc77228", - "metadata": {}, - "source": [ - "### 9a. Code-Based Evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "b2a911a2", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:09.153081Z", - "iopub.status.busy": "2026-03-05T09:29:09.152945Z", - "iopub.status.idle": "2026-03-05T09:29:13.675561Z", - "shell.execute_reply": "2026-03-05T09:29:13.674462Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 23, + "id": "056c9080", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:13.677450Z", + "iopub.status.busy": "2026-03-05T09:29:13.677335Z", + "iopub.status.idle": "2026-03-05T09:29:38.674573Z", + "shell.execute_reply": "2026-03-05T09:29:38.673907Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LLM Judge: Correctness]\n", + "Evaluation Report: correctness_judge\n", + " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", + " Sessions: 3\n", + " Passed: 3 (100%)\n", + " Failed: 0\n", + " Aggregate Scores:\n", + " correctness: 0.900\n", + "\n", + "Per-session details:\n", + " adcp-2c401a645c40: scores={'correctness': 1.0} passed=True\n", + " Feedback: correctness: The agent provided a 
comprehensive and accurate response that directly addresses the user's media buying brief. It correctly allocated the budget, provided plausible projections for impre\n", + " adcp-7d9855e7a71b: scores={'correctness': 0.7} passed=True\n", + " Feedback: correctness: The agent provides a comprehensive and well-structured media plan with correct budget calculations. The strategic reasoning is logical. However, the agent assumes flight dates (June 1, 20\n", + " adcp-a20d176b82af: scores={'correctness': 1.0} passed=True\n", + " Feedback: correctness: The agent correctly identified the need to provision the campaign in Google Ad Manager, executed the relevant tool, and provided a comprehensive summary of the provisioned campaign detail\n" + ] + } + ], + "source": [ + "from bigquery_agent_analytics import LLMAsJudge\n", + "\n", + "# Correctness: Does the agent follow ADCP protocol correctly?\n", + "judge_correctness = LLMAsJudge.correctness(threshold=0.6)\n", + "try:\n", + " report = asyncio.get_event_loop().run_until_complete(\n", + " asyncio.to_thread(\n", + " client.evaluate,\n", + " evaluator=judge_correctness,\n", + " filters=trace_filter,\n", + " )\n", + " )\n", + " print(\"[LLM Judge: Correctness]\")\n", + " print(report.summary())\n", + " print(\"\\nPer-session details:\")\n", + " for ss in report.session_scores:\n", + " print(f\" {ss.session_id}: scores={ss.scores} \"\n", + " f\"passed={ss.passed}\")\n", + " if ss.llm_feedback:\n", + " print(f\" Feedback: {ss.llm_feedback[:200]}\")\n", + "except Exception as exc:\n", + " print(f\"Correctness judge failed: {exc}\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[latency]\n", - "Evaluation Report: latency_evaluator\n", - " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", - " Sessions: 3\n", - " Passed: 3 (100%)\n", - " Failed: 0\n", - " Aggregate Scores:\n", - " latency: 0.842\n" - ] + "cell_type": "markdown", + "id": 
"55bca58f", + "metadata": {}, + "source": [ + "### 9c. Trajectory Matching -- ADCP Workflow Compliance" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[turn_count]\n", - "Evaluation Report: turn_count_evaluator\n", - " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", - " Sessions: 3\n", - " Passed: 3 (100%)\n", - " Failed: 0\n", - " Aggregate Scores:\n", - " turn_count: 0.867\n" - ] + "cell_type": "code", + "execution_count": 24, + "id": "59ad9161", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:38.676374Z", + "iopub.status.busy": "2026-03-05T09:29:38.676256Z", + "iopub.status.idle": "2026-03-05T09:29:40.307629Z", + "shell.execute_reply": "2026-03-05T09:29:40.307149Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Trajectory: ELF Campaign -- Full ADCP Workflow (IN_ORDER)]\n", + " Session : adcp-a20d176b82af\n", + " Status : EvalStatus.PASSED\n", + " Scores : {'trajectory_in_order': 1.0, 'step_efficiency': 0.8}\n" + ] + } + ], + "source": [ + "from bigquery_agent_analytics import BigQueryTraceEvaluator\n", + "from bigquery_agent_analytics.trace_evaluator import MatchType\n", + "\n", + "trace_evaluator = BigQueryTraceEvaluator(\n", + " project_id=PROJECT_ID,\n", + " dataset_id=DATASET_ID,\n", + " table_id=TABLE_ID,\n", + ")\n", + "\n", + "# Golden trajectory for the full ADCP workflow\n", + "golden_adcp_full = [\n", + " {\"tool_name\": \"query_ad_inventory\"},\n", + " {\"tool_name\": \"match_target_audience\"},\n", + " {\"tool_name\": \"allocate_media_budget\"},\n", + " {\"tool_name\": \"provision_campaign_in_gam\"},\n", + "]\n", + "\n", + "# Golden trajectory for brief processing only (no provisioning)\n", + "golden_adcp_brief = [\n", + " {\"tool_name\": \"query_ad_inventory\"},\n", + " {\"tool_name\": \"match_target_audience\"},\n", + " {\"tool_name\": \"allocate_media_budget\"},\n", + "]\n", + "\n", + 
"# Evaluate ELF campaign (full workflow with provisioning)\n", + "try:\n", + " result = asyncio.get_event_loop().run_until_complete(\n", + " trace_evaluator.evaluate_session(\n", + " session_id=session_ids[0],\n", + " golden_trajectory=golden_adcp_full,\n", + " match_type=MatchType.IN_ORDER,\n", + " )\n", + " )\n", + " print(\"[Trajectory: ELF Campaign -- Full ADCP Workflow (IN_ORDER)]\")\n", + " print(f\" Session : {result.session_id}\")\n", + " print(f\" Status : {result.eval_status}\")\n", + " print(f\" Scores : {result.scores}\")\n", + "except Exception as exc:\n", + " print(f\"Trajectory evaluation failed: {exc}\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[error_rate]\n", - "Evaluation Report: error_rate_evaluator\n", - " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", - " Sessions: 3\n", - " Passed: 3 (100%)\n", - " Failed: 0\n", - " Aggregate Scores:\n", - " error_rate: 1.000\n" - ] + "cell_type": "code", + "execution_count": 25, + "id": "316bcb43", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:40.309212Z", + "iopub.status.busy": "2026-03-05T09:29:40.309103Z", + "iopub.status.idle": "2026-03-05T09:29:41.393364Z", + "shell.execute_reply": "2026-03-05T09:29:41.392888Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Batch Trajectory Evaluation -- ANY_ORDER]\n", + " adcp-a20d176b82af: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.8}\n", + " adcp-7d9855e7a71b: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.6}\n", + " adcp-2c401a645c40: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.8}\n" + ] + } + ], + "source": [ + "# Batch trajectory evaluation across all sessions\n", + "eval_dataset = [\n", + " {\n", + " \"session_id\": session_ids[0],\n", + " \"expected_trajectory\": golden_adcp_full,\n", + " },\n", + " 
{\n", + " \"session_id\": session_ids[1],\n", + " \"expected_trajectory\": golden_adcp_brief,\n", + " },\n", + " {\n", + " \"session_id\": session_ids[2],\n", + " \"expected_trajectory\": golden_adcp_full,\n", + " },\n", + "]\n", + "\n", + "try:\n", + " batch_results = asyncio.get_event_loop().run_until_complete(\n", + " trace_evaluator.evaluate_batch(\n", + " eval_dataset=eval_dataset,\n", + " match_type=MatchType.ANY_ORDER,\n", + " )\n", + " )\n", + " print(\"[Batch Trajectory Evaluation -- ANY_ORDER]\")\n", + " for r in batch_results:\n", + " print(f\" {r.session_id}: {r.eval_status} \"\n", + " f\"scores={r.scores}\")\n", + "except Exception as exc:\n", + " print(f\"Batch evaluation failed: {exc}\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[token_efficiency]\n", - "Evaluation Report: token_efficiency_evaluator\n", - " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", - " Sessions: 3\n", - " Passed: 3 (100%)\n", - " Failed: 0\n", - " Aggregate Scores:\n", - " token_efficiency: 0.932\n" - ] - } - ], - "source": [ - "from bigquery_agent_analytics import CodeEvaluator\n", - "\n", - "trace_filter = TraceFilter(session_ids=session_ids)\n", - "\n", - "presets = [\n", - " (\"latency\", CodeEvaluator.latency(threshold_ms=30000)),\n", - " (\"turn_count\", CodeEvaluator.turn_count(max_turns=10)),\n", - " (\"error_rate\", CodeEvaluator.error_rate(max_error_rate=0.1)),\n", - " (\"token_efficiency\", CodeEvaluator.token_efficiency(max_tokens=100000)),\n", - "]\n", - "\n", - "for name, evaluator in presets:\n", - " try:\n", - " report = asyncio.get_event_loop().run_until_complete(\n", - " asyncio.to_thread(\n", - " client.evaluate,\n", - " evaluator=evaluator,\n", - " filters=trace_filter,\n", - " )\n", - " )\n", - " print(f\"\\n[{name}]\")\n", - " print(report.summary())\n", - " except Exception as exc:\n", - " print(f\"\\n[{name}] Failed: {exc}\")" - ] - }, - { - "cell_type": 
"markdown", - "id": "724698e1", - "metadata": {}, - "source": [ - "### 9b. LLM-as-Judge Evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "056c9080", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:13.677450Z", - "iopub.status.busy": "2026-03-05T09:29:13.677335Z", - "iopub.status.idle": "2026-03-05T09:29:38.674573Z", - "shell.execute_reply": "2026-03-05T09:29:38.673907Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "511fd003", + "metadata": {}, + "source": [ + "### 9d. Grader Pipeline -- Composite ADCP Quality Score" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[LLM Judge: Correctness]\n", - "Evaluation Report: correctness_judge\n", - " Dataset: test-project-0728-467323.agent_analytics.agent_events WHERE session_id IN UNNEST(@session_ids)\n", - " Sessions: 3\n", - " Passed: 3 (100%)\n", - " Failed: 0\n", - " Aggregate Scores:\n", - " correctness: 0.900\n", - "\n", - "Per-session details:\n", - " adcp-2c401a645c40: scores={'correctness': 1.0} passed=True\n", - " Feedback: correctness: The agent provided a comprehensive and accurate response that directly addresses the user's media buying brief. It correctly allocated the budget, provided plausible projections for impre\n", - " adcp-7d9855e7a71b: scores={'correctness': 0.7} passed=True\n", - " Feedback: correctness: The agent provides a comprehensive and well-structured media plan with correct budget calculations. The strategic reasoning is logical. 
However, the agent assumes flight dates (June 1, 20\n", - " adcp-a20d176b82af: scores={'correctness': 1.0} passed=True\n", - " Feedback: correctness: The agent correctly identified the need to provision the campaign in Google Ad Manager, executed the relevant tool, and provided a comprehensive summary of the provisioned campaign detail\n" - ] - } - ], - "source": [ - "from bigquery_agent_analytics import LLMAsJudge\n", - "\n", - "# Correctness: Does the agent follow ADCP protocol correctly?\n", - "judge_correctness = LLMAsJudge.correctness(threshold=0.6)\n", - "try:\n", - " report = asyncio.get_event_loop().run_until_complete(\n", - " asyncio.to_thread(\n", - " client.evaluate,\n", - " evaluator=judge_correctness,\n", - " filters=trace_filter,\n", - " )\n", - " )\n", - " print(\"[LLM Judge: Correctness]\")\n", - " print(report.summary())\n", - " print(\"\\nPer-session details:\")\n", - " for ss in report.session_scores:\n", - " print(f\" {ss.session_id}: scores={ss.scores} \"\n", - " f\"passed={ss.passed}\")\n", - " if ss.llm_feedback:\n", - " print(f\" Feedback: {ss.llm_feedback[:200]}\")\n", - "except Exception as exc:\n", - " print(f\"Correctness judge failed: {exc}\")" - ] - }, - { - "cell_type": "markdown", - "id": "55bca58f", - "metadata": {}, - "source": [ - "### 9c. 
Trajectory Matching -- ADCP Workflow Compliance" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "59ad9161", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:38.676374Z", - "iopub.status.busy": "2026-03-05T09:29:38.676256Z", - "iopub.status.idle": "2026-03-05T09:29:40.307629Z", - "shell.execute_reply": "2026-03-05T09:29:40.307149Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 26, + "id": "5461103a", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:41.395279Z", + "iopub.status.busy": "2026-03-05T09:29:41.395161Z", + "iopub.status.idle": "2026-03-05T09:29:46.400507Z", + "shell.execute_reply": "2026-03-05T09:29:46.399815Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[GraderPipeline -- ADCP Quality Score]\n", + " Final score : 0.991\n", + " Passed : True\n", + " Strategy : weighted\n", + " Grader breakdown:\n", + " - latency_evaluator: scores={'latency': 0.97443249375} passed=True\n", + " - error_rate_evaluator: scores={'error_rate': 1.0} passed=True\n", + " - correctness_judge: scores={'correctness': 1.0} passed=True\n" + ] + } + ], + "source": [ + "import contextlib\n", + "import io\n", + "\n", + "from bigquery_agent_analytics import (\n", + " GraderPipeline,\n", + " WeightedStrategy,\n", + ")\n", + "\n", + "pipeline = (\n", + " GraderPipeline(WeightedStrategy(threshold=0.6))\n", + " .add_code_grader(\n", + " SystemEvaluator.latency(threshold_ms=30000),\n", + " weight=1.0,\n", + " )\n", + " .add_code_grader(\n", + " SystemEvaluator.error_rate(max_error_rate=0.1),\n", + " weight=1.0,\n", + " )\n", + " .add_llm_grader(\n", + " LLMAsJudge.correctness(threshold=0.6),\n", + " weight=2.0,\n", + " )\n", + ")\n", + "\n", + "# Evaluate the ELF campaign trace\n", + "trace_idx = 0\n", + "if traces[trace_idx] is not None:\n", + " trace = traces[trace_idx]\n", + " session_summary = {\n", + " \"session_id\": trace.session_id,\n", + " 
\"total_events\": len(trace.spans),\n", + " \"tool_calls\": len(trace.tool_calls),\n", + " \"tool_errors\": len(trace.error_spans),\n", + " \"llm_calls\": sum(\n", + " 1 for s in trace.spans\n", + " if s.event_type in (\"llm_request\", \"llm_response\")\n", + " ),\n", + " \"avg_latency_ms\": (\n", + " trace.total_latency_ms / max(len(trace.spans), 1)\n", + " if trace.total_latency_ms\n", + " else 0.0\n", + " ),\n", + " \"max_latency_ms\": max(\n", + " (s.latency_ms or 0 for s in trace.spans), default=0\n", + " ),\n", + " \"total_latency_ms\": trace.total_latency_ms or 0.0,\n", + " \"turn_count\": sum(\n", + " 1 for s in trace.spans if s.event_type == \"user_message\"\n", + " ),\n", + " \"has_error\": len(trace.error_spans) > 0,\n", + " \"input_tokens\": sum(\n", + " s.attributes.get(\"input_tokens\", 0) or 0\n", + " for s in trace.spans\n", + " ),\n", + " \"output_tokens\": sum(\n", + " s.attributes.get(\"output_tokens\", 0) or 0\n", + " for s in trace.spans\n", + " ),\n", + " \"total_tokens\": sum(\n", + " s.attributes.get(\"total_tokens\", 0) or 0\n", + " for s in trace.spans\n", + " ),\n", + " }\n", + "\n", + " buf = io.StringIO()\n", + " with contextlib.redirect_stdout(buf):\n", + " trace_text = trace.render(format=\"tree\")\n", + " if not isinstance(trace_text, str):\n", + " trace_text = buf.getvalue()\n", + " final_response = trace.final_response or \"\"\n", + "\n", + " verdict = asyncio.get_event_loop().run_until_complete(\n", + " pipeline.evaluate(\n", + " session_summary=session_summary,\n", + " trace_text=trace_text,\n", + " final_response=final_response,\n", + " )\n", + " )\n", + "\n", + " print(\"[GraderPipeline -- ADCP Quality Score]\")\n", + " print(f\" Final score : {verdict.final_score:.3f}\")\n", + " print(f\" Passed : {verdict.passed}\")\n", + " print(f\" Strategy : {verdict.strategy_name}\")\n", + " print(f\" Grader breakdown:\")\n", + " for gr in verdict.grader_results:\n", + " print(f\" - {gr.grader_name}: scores={gr.scores} \"\n", + " 
f\"passed={gr.passed}\")\n", + "else:\n", + " print(\"Trace not available -- skipping pipeline evaluation.\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Trajectory: ELF Campaign -- Full ADCP Workflow (IN_ORDER)]\n", - " Session : adcp-a20d176b82af\n", - " Status : EvalStatus.PASSED\n", - " Scores : {'trajectory_in_order': 1.0, 'step_efficiency': 0.8}\n" - ] - } - ], - "source": [ - "from bigquery_agent_analytics import BigQueryTraceEvaluator\n", - "from bigquery_agent_analytics.trace_evaluator import MatchType\n", - "\n", - "trace_evaluator = BigQueryTraceEvaluator(\n", - " project_id=PROJECT_ID,\n", - " dataset_id=DATASET_ID,\n", - " table_id=TABLE_ID,\n", - ")\n", - "\n", - "# Golden trajectory for the full ADCP workflow\n", - "golden_adcp_full = [\n", - " {\"tool_name\": \"query_ad_inventory\"},\n", - " {\"tool_name\": \"match_target_audience\"},\n", - " {\"tool_name\": \"allocate_media_budget\"},\n", - " {\"tool_name\": \"provision_campaign_in_gam\"},\n", - "]\n", - "\n", - "# Golden trajectory for brief processing only (no provisioning)\n", - "golden_adcp_brief = [\n", - " {\"tool_name\": \"query_ad_inventory\"},\n", - " {\"tool_name\": \"match_target_audience\"},\n", - " {\"tool_name\": \"allocate_media_budget\"},\n", - "]\n", - "\n", - "# Evaluate ELF campaign (full workflow with provisioning)\n", - "try:\n", - " result = asyncio.get_event_loop().run_until_complete(\n", - " trace_evaluator.evaluate_session(\n", - " session_id=session_ids[0],\n", - " golden_trajectory=golden_adcp_full,\n", - " match_type=MatchType.IN_ORDER,\n", - " )\n", - " )\n", - " print(\"[Trajectory: ELF Campaign -- Full ADCP Workflow (IN_ORDER)]\")\n", - " print(f\" Session : {result.session_id}\")\n", - " print(f\" Status : {result.eval_status}\")\n", - " print(f\" Scores : {result.scores}\")\n", - "except Exception as exc:\n", - " print(f\"Trajectory evaluation failed: {exc}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": 
"316bcb43", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:40.309212Z", - "iopub.status.busy": "2026-03-05T09:29:40.309103Z", - "iopub.status.idle": "2026-03-05T09:29:41.393364Z", - "shell.execute_reply": "2026-03-05T09:29:41.392888Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "bd1591ff", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 10: Eval Suite & Multi-Trial\n", + "\n", + "Define a reusable **EvalSuite** for ADCP workflow compliance and run multi-trial evaluation." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Batch Trajectory Evaluation -- ANY_ORDER]\n", - " adcp-a20d176b82af: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.8}\n", - " adcp-7d9855e7a71b: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.6}\n", - " adcp-2c401a645c40: EvalStatus.PASSED scores={'trajectory_any_order': 1.0, 'step_efficiency': 0.8}\n" - ] - } - ], - "source": [ - "# Batch trajectory evaluation across all sessions\n", - "eval_dataset = [\n", - " {\n", - " \"session_id\": session_ids[0],\n", - " \"expected_trajectory\": golden_adcp_full,\n", - " },\n", - " {\n", - " \"session_id\": session_ids[1],\n", - " \"expected_trajectory\": golden_adcp_brief,\n", - " },\n", - " {\n", - " \"session_id\": session_ids[2],\n", - " \"expected_trajectory\": golden_adcp_full,\n", - " },\n", - "]\n", - "\n", - "try:\n", - " batch_results = asyncio.get_event_loop().run_until_complete(\n", - " trace_evaluator.evaluate_batch(\n", - " eval_dataset=eval_dataset,\n", - " match_type=MatchType.ANY_ORDER,\n", - " )\n", - " )\n", - " print(\"[Batch Trajectory Evaluation -- ANY_ORDER]\")\n", - " for r in batch_results:\n", - " print(f\" {r.session_id}: {r.eval_status} \"\n", - " f\"scores={r.scores}\")\n", - "except Exception as exc:\n", - " print(f\"Batch evaluation failed: {exc}\")" - ] - }, - { - "cell_type": "markdown", - "id": "511fd003", - "metadata": {}, - "source": 
[ - "### 9d. Grader Pipeline -- Composite ADCP Quality Score" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "5461103a", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:41.395279Z", - "iopub.status.busy": "2026-03-05T09:29:41.395161Z", - "iopub.status.idle": "2026-03-05T09:29:46.400507Z", - "shell.execute_reply": "2026-03-05T09:29:46.399815Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 27, + "id": "32336f6b", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:46.402586Z", + "iopub.status.busy": "2026-03-05T09:29:46.402436Z", + "iopub.status.idle": "2026-03-05T09:29:46.407601Z", + "shell.execute_reply": "2026-03-05T09:29:46.406977Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EvalSuite 'adcp_workflow_evals' -- 3 tasks:\n", + " [capability] elf_full_workflow: ELF Cosmetics full ADCP workflow: brief -> plan -> provision.\n", + " [capability] nike_brief_processing: Nike brief processing: inventory + audience + budget.\n", + " [regression] tesla_end_to_end: Tesla end-to-end: brief -> provision in single turn.\n", + "\n", + "Validation: 1 warnings\n", + " [warning] balance: High positive case ratio (100%). 
Consider adding more negative test cases.\n" + ] + } + ], + "source": [ + "from bigquery_agent_analytics import (\n", + " EvalSuite,\n", + " EvalTaskDef,\n", + " EvalCategory,\n", + " EvalValidator,\n", + ")\n", + "\n", + "suite = EvalSuite(name=\"adcp_workflow_evals\")\n", + "\n", + "suite.add_task(EvalTaskDef(\n", + " task_id=\"elf_full_workflow\",\n", + " session_id=session_ids[0],\n", + " description=\"ELF Cosmetics full ADCP workflow: brief -> plan -> provision.\",\n", + " category=EvalCategory.CAPABILITY,\n", + " expected_trajectory=golden_adcp_full,\n", + " thresholds={\"trajectory_match\": 0.8, \"latency\": 0.7},\n", + " tags=[\"full_workflow\", \"brand_awareness\"],\n", + "))\n", + "\n", + "suite.add_task(EvalTaskDef(\n", + " task_id=\"nike_brief_processing\",\n", + " session_id=session_ids[1],\n", + " description=\"Nike brief processing: inventory + audience + budget.\",\n", + " category=EvalCategory.CAPABILITY,\n", + " expected_trajectory=golden_adcp_brief,\n", + " thresholds={\"trajectory_match\": 0.9},\n", + " tags=[\"brief_only\", \"product_launch\"],\n", + "))\n", + "\n", + "suite.add_task(EvalTaskDef(\n", + " task_id=\"tesla_end_to_end\",\n", + " session_id=session_ids[2],\n", + " description=\"Tesla end-to-end: brief -> provision in single turn.\",\n", + " category=EvalCategory.REGRESSION,\n", + " expected_trajectory=golden_adcp_full,\n", + " thresholds={\"trajectory_match\": 0.8},\n", + " tags=[\"full_workflow\", \"lead_gen\"],\n", + "))\n", + "\n", + "print(f\"EvalSuite '{suite.name}' -- {len(suite.get_tasks())} tasks:\")\n", + "for t in suite.get_tasks():\n", + " print(f\" [{t.category.value}] {t.task_id}: {t.description}\")\n", + "\n", + "# Validate\n", + "warnings = EvalValidator.validate_suite(suite)\n", + "print(f\"\\nValidation: {len(warnings)} warnings\")\n", + "for w in warnings:\n", + " print(f\" [{w.severity}] {w.check_name}: {w.message}\")\n", + "if not warnings:\n", + " print(\" Suite looks healthy!\")" + ] + }, { - "name": "stdout", 
- "output_type": "stream", - "text": [ - "[GraderPipeline -- ADCP Quality Score]\n", - " Final score : 0.991\n", - " Passed : True\n", - " Strategy : weighted\n", - " Grader breakdown:\n", - " - latency_evaluator: scores={'latency': 0.97443249375} passed=True\n", - " - error_rate_evaluator: scores={'error_rate': 1.0} passed=True\n", - " - correctness_judge: scores={'correctness': 1.0} passed=True\n" - ] - } - ], - "source": [ - "import contextlib\n", - "import io\n", - "\n", - "from bigquery_agent_analytics import (\n", - " GraderPipeline,\n", - " WeightedStrategy,\n", - ")\n", - "\n", - "pipeline = (\n", - " GraderPipeline(WeightedStrategy(threshold=0.6))\n", - " .add_code_grader(\n", - " CodeEvaluator.latency(threshold_ms=30000),\n", - " weight=1.0,\n", - " )\n", - " .add_code_grader(\n", - " CodeEvaluator.error_rate(max_error_rate=0.1),\n", - " weight=1.0,\n", - " )\n", - " .add_llm_grader(\n", - " LLMAsJudge.correctness(threshold=0.6),\n", - " weight=2.0,\n", - " )\n", - ")\n", - "\n", - "# Evaluate the ELF campaign trace\n", - "trace_idx = 0\n", - "if traces[trace_idx] is not None:\n", - " trace = traces[trace_idx]\n", - " session_summary = {\n", - " \"session_id\": trace.session_id,\n", - " \"total_events\": len(trace.spans),\n", - " \"tool_calls\": len(trace.tool_calls),\n", - " \"tool_errors\": len(trace.error_spans),\n", - " \"llm_calls\": sum(\n", - " 1 for s in trace.spans\n", - " if s.event_type in (\"llm_request\", \"llm_response\")\n", - " ),\n", - " \"avg_latency_ms\": (\n", - " trace.total_latency_ms / max(len(trace.spans), 1)\n", - " if trace.total_latency_ms\n", - " else 0.0\n", - " ),\n", - " \"max_latency_ms\": max(\n", - " (s.latency_ms or 0 for s in trace.spans), default=0\n", - " ),\n", - " \"total_latency_ms\": trace.total_latency_ms or 0.0,\n", - " \"turn_count\": sum(\n", - " 1 for s in trace.spans if s.event_type == \"user_message\"\n", - " ),\n", - " \"has_error\": len(trace.error_spans) > 0,\n", - " \"input_tokens\": sum(\n", - " 
s.attributes.get(\"input_tokens\", 0) or 0\n", - " for s in trace.spans\n", - " ),\n", - " \"output_tokens\": sum(\n", - " s.attributes.get(\"output_tokens\", 0) or 0\n", - " for s in trace.spans\n", - " ),\n", - " \"total_tokens\": sum(\n", - " s.attributes.get(\"total_tokens\", 0) or 0\n", - " for s in trace.spans\n", - " ),\n", - " }\n", - "\n", - " buf = io.StringIO()\n", - " with contextlib.redirect_stdout(buf):\n", - " trace_text = trace.render(format=\"tree\")\n", - " if not isinstance(trace_text, str):\n", - " trace_text = buf.getvalue()\n", - " final_response = trace.final_response or \"\"\n", - "\n", - " verdict = asyncio.get_event_loop().run_until_complete(\n", - " pipeline.evaluate(\n", - " session_summary=session_summary,\n", - " trace_text=trace_text,\n", - " final_response=final_response,\n", - " )\n", - " )\n", - "\n", - " print(\"[GraderPipeline -- ADCP Quality Score]\")\n", - " print(f\" Final score : {verdict.final_score:.3f}\")\n", - " print(f\" Passed : {verdict.passed}\")\n", - " print(f\" Strategy : {verdict.strategy_name}\")\n", - " print(f\" Grader breakdown:\")\n", - " for gr in verdict.grader_results:\n", - " print(f\" - {gr.grader_name}: scores={gr.scores} \"\n", - " f\"passed={gr.passed}\")\n", - "else:\n", - " print(\"Trace not available -- skipping pipeline evaluation.\")" - ] - }, - { - "cell_type": "markdown", - "id": "bd1591ff", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 10: Eval Suite & Multi-Trial\n", - "\n", - "Define a reusable **EvalSuite** for ADCP workflow compliance and run multi-trial evaluation." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "32336f6b", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:46.402586Z", - "iopub.status.busy": "2026-03-05T09:29:46.402436Z", - "iopub.status.idle": "2026-03-05T09:29:46.407601Z", - "shell.execute_reply": "2026-03-05T09:29:46.406977Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 28, + "id": "5c8d9c5a", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:46.409275Z", + "iopub.status.busy": "2026-03-05T09:29:46.409161Z", + "iopub.status.idle": "2026-03-05T09:29:55.312224Z", + "shell.execute_reply": "2026-03-05T09:29:55.311350Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Multi-Trial -- ELF Campaign, 3 trials]\n", + " pass@k : 1.000\n", + " pass^k : 1.000\n", + " per_trial_pass_rate: 1.000\n", + " mean_scores : {'llm_judge_efficiency': 0.8, 'llm_judge_reasoning': 0.26666666666666666, 'llm_judge_task_completion': 0.9666666666666667, 'llm_judge_tool_usage': 0.9333333333333333, 'step_efficiency': 0.8, 'trajectory_in_order': 1.0}\n", + "\n", + " Per-trial results:\n", + " Trial 0: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 1.0, 'llm_judge_efficiency': 0.8, 'llm_judge_tool_usage': 0.9}\n", + " Trial 1: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 1.0, 'llm_judge_efficiency': 0.9, 'llm_judge_tool_usage': 1.0}\n", + " Trial 2: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 0.9, 'llm_judge_efficiency': 0.7, 'llm_judge_tool_usage': 0.9, 'llm_judge_reasoning': 0.8}\n" + ] + } + ], + "source": [ + "from bigquery_agent_analytics import TrialRunner\n", + "\n", + "trial_runner = TrialRunner(\n", + " evaluator=trace_evaluator,\n", + " num_trials=3,\n", + " concurrency=3,\n", + ")\n", + "\n", + "try:\n", + " trial_report = 
asyncio.get_event_loop().run_until_complete(\n", + " trial_runner.run_trials(\n", + " session_id=session_ids[0],\n", + " golden_trajectory=golden_adcp_full,\n", + " match_type=MatchType.IN_ORDER,\n", + " use_llm_judge=True,\n", + " )\n", + " )\n", + " print(\"[Multi-Trial -- ELF Campaign, 3 trials]\")\n", + " print(f\" pass@k : {trial_report.pass_at_k:.3f}\")\n", + " print(f\" pass^k : {trial_report.pass_pow_k:.3f}\")\n", + " print(f\" per_trial_pass_rate: {trial_report.per_trial_pass_rate:.3f}\")\n", + " print(f\" mean_scores : {trial_report.mean_scores}\")\n", + " print(f\"\\n Per-trial results:\")\n", + " for tr in trial_report.trial_results:\n", + " print(f\" Trial {tr.trial_index}: passed={tr.passed} \"\n", + " f\"scores={tr.scores}\")\n", + "except Exception as exc:\n", + " print(f\"Multi-trial evaluation failed: {exc}\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "EvalSuite 'adcp_workflow_evals' -- 3 tasks:\n", - " [capability] elf_full_workflow: ELF Cosmetics full ADCP workflow: brief -> plan -> provision.\n", - " [capability] nike_brief_processing: Nike brief processing: inventory + audience + budget.\n", - " [regression] tesla_end_to_end: Tesla end-to-end: brief -> provision in single turn.\n", - "\n", - "Validation: 1 warnings\n", - " [warning] balance: High positive case ratio (100%). 
Consider adding more negative test cases.\n" - ] - } - ], - "source": [ - "from bigquery_agent_analytics import (\n", - " EvalSuite,\n", - " EvalTaskDef,\n", - " EvalCategory,\n", - " EvalValidator,\n", - ")\n", - "\n", - "suite = EvalSuite(name=\"adcp_workflow_evals\")\n", - "\n", - "suite.add_task(EvalTaskDef(\n", - " task_id=\"elf_full_workflow\",\n", - " session_id=session_ids[0],\n", - " description=\"ELF Cosmetics full ADCP workflow: brief -> plan -> provision.\",\n", - " category=EvalCategory.CAPABILITY,\n", - " expected_trajectory=golden_adcp_full,\n", - " thresholds={\"trajectory_match\": 0.8, \"latency\": 0.7},\n", - " tags=[\"full_workflow\", \"brand_awareness\"],\n", - "))\n", - "\n", - "suite.add_task(EvalTaskDef(\n", - " task_id=\"nike_brief_processing\",\n", - " session_id=session_ids[1],\n", - " description=\"Nike brief processing: inventory + audience + budget.\",\n", - " category=EvalCategory.CAPABILITY,\n", - " expected_trajectory=golden_adcp_brief,\n", - " thresholds={\"trajectory_match\": 0.9},\n", - " tags=[\"brief_only\", \"product_launch\"],\n", - "))\n", - "\n", - "suite.add_task(EvalTaskDef(\n", - " task_id=\"tesla_end_to_end\",\n", - " session_id=session_ids[2],\n", - " description=\"Tesla end-to-end: brief -> provision in single turn.\",\n", - " category=EvalCategory.REGRESSION,\n", - " expected_trajectory=golden_adcp_full,\n", - " thresholds={\"trajectory_match\": 0.8},\n", - " tags=[\"full_workflow\", \"lead_gen\"],\n", - "))\n", - "\n", - "print(f\"EvalSuite '{suite.name}' -- {len(suite.get_tasks())} tasks:\")\n", - "for t in suite.get_tasks():\n", - " print(f\" [{t.category.value}] {t.task_id}: {t.description}\")\n", - "\n", - "# Validate\n", - "warnings = EvalValidator.validate_suite(suite)\n", - "print(f\"\\nValidation: {len(warnings)} warnings\")\n", - "for w in warnings:\n", - " print(f\" [{w.severity}] {w.check_name}: {w.message}\")\n", - "if not warnings:\n", - " print(\" Suite looks healthy!\")" - ] - }, - { - "cell_type": 
"code", - "execution_count": 28, - "id": "5c8d9c5a", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:46.409275Z", - "iopub.status.busy": "2026-03-05T09:29:46.409161Z", - "iopub.status.idle": "2026-03-05T09:29:55.312224Z", - "shell.execute_reply": "2026-03-05T09:29:55.311350Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "94ad7bbb", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 11: AI-Powered Insights Report\n", + "\n", + "Generate a comprehensive insights report analyzing all ADCP sessions." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Multi-Trial -- ELF Campaign, 3 trials]\n", - " pass@k : 1.000\n", - " pass^k : 1.000\n", - " per_trial_pass_rate: 1.000\n", - " mean_scores : {'llm_judge_efficiency': 0.8, 'llm_judge_reasoning': 0.26666666666666666, 'llm_judge_task_completion': 0.9666666666666667, 'llm_judge_tool_usage': 0.9333333333333333, 'step_efficiency': 0.8, 'trajectory_in_order': 1.0}\n", - "\n", - " Per-trial results:\n", - " Trial 0: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 1.0, 'llm_judge_efficiency': 0.8, 'llm_judge_tool_usage': 0.9}\n", - " Trial 1: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 1.0, 'llm_judge_efficiency': 0.9, 'llm_judge_tool_usage': 1.0}\n", - " Trial 2: passed=True scores={'trajectory_in_order': 1.0, 'step_efficiency': 0.8, 'llm_judge_task_completion': 0.9, 'llm_judge_efficiency': 0.7, 'llm_judge_tool_usage': 0.9, 'llm_judge_reasoning': 0.8}\n" - ] - } - ], - "source": [ - "from bigquery_agent_analytics import TrialRunner\n", - "\n", - "trial_runner = TrialRunner(\n", - " evaluator=trace_evaluator,\n", - " num_trials=3,\n", - " concurrency=3,\n", - ")\n", - "\n", - "try:\n", - " trial_report = asyncio.get_event_loop().run_until_complete(\n", - " trial_runner.run_trials(\n", - " session_id=session_ids[0],\n", - " 
golden_trajectory=golden_adcp_full,\n", - " match_type=MatchType.IN_ORDER,\n", - " use_llm_judge=True,\n", - " )\n", - " )\n", - " print(\"[Multi-Trial -- ELF Campaign, 3 trials]\")\n", - " print(f\" pass@k : {trial_report.pass_at_k:.3f}\")\n", - " print(f\" pass^k : {trial_report.pass_pow_k:.3f}\")\n", - " print(f\" per_trial_pass_rate: {trial_report.per_trial_pass_rate:.3f}\")\n", - " print(f\" mean_scores : {trial_report.mean_scores}\")\n", - " print(f\"\\n Per-trial results:\")\n", - " for tr in trial_report.trial_results:\n", - " print(f\" Trial {tr.trial_index}: passed={tr.passed} \"\n", - " f\"scores={tr.scores}\")\n", - "except Exception as exc:\n", - " print(f\"Multi-trial evaluation failed: {exc}\")" - ] - }, - { - "cell_type": "markdown", - "id": "94ad7bbb", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 11: AI-Powered Insights Report\n", - "\n", - "Generate a comprehensive insights report analyzing all ADCP sessions." - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "57d59966", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:29:55.315628Z", - "iopub.status.busy": "2026-03-05T09:29:55.315444Z", - "iopub.status.idle": "2026-03-05T09:31:21.970015Z", - "shell.execute_reply": "2026-03-05T09:31:21.969378Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 29, + "id": "57d59966", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:29:55.315628Z", + "iopub.status.busy": "2026-03-05T09:29:55.315444Z", + "iopub.status.idle": "2026-03-05T09:31:21.970015Z", + "shell.execute_reply": "2026-03-05T09:31:21.969378Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ADCP Insights Report]\n", + "Agent Insights Report\n", + " Generated: 2026-03-05 09:31 UTC\n", + " Sessions analyzed: 3\n", + " Success rate: 100%\n", + " Avg effectiveness: 10.0/10\n", + " Avg latency: 4737ms\n", + " Avg turns: 1.3\n", + " Error rate: 0.0%\n", + "\n", + 
" Top Goals:\n", + " data_retrieval: 3\n", + " planning: 3\n", + " analysis: 2\n", + " task_automation: 2\n", + " Outcomes:\n", + " success: 3\n", + "\n", + " Analysis Sections:\n", + " - Task Areas\n", + " - Interaction Patterns\n", + " - What Works Well\n", + " - Friction Analysis\n", + " - Tool Usage Patterns\n", + " - Improvement Suggestions\n", + " - Trends & Anomalies\n" + ] + } + ], + "source": [ + "import time as _time\n", + "from bigquery_agent_analytics import InsightsConfig\n", + "\n", + "\n", + "def _run_with_retry(fn, max_retries=3, base_delay=5.0):\n", + " \"\"\"Run a function with exponential backoff on 429/RESOURCE_EXHAUSTED.\"\"\"\n", + " for attempt in range(max_retries + 1):\n", + " try:\n", + " return fn()\n", + " except Exception as exc:\n", + " is_rate_limit = \"429\" in str(exc) or \"RESOURCE_EXHAUSTED\" in str(exc)\n", + " if is_rate_limit and attempt < max_retries:\n", + " delay = base_delay * (2 ** attempt)\n", + " print(f\" Rate limited (attempt {attempt + 1}/{max_retries + 1}),\"\n", + " f\" retrying in {delay:.0f}s ...\")\n", + " _time.sleep(delay)\n", + " else:\n", + " raise\n", + "\n", + "\n", + "try:\n", + " insights_report = _run_with_retry(\n", + " lambda: client.insights(\n", + " filters=TraceFilter(session_ids=session_ids),\n", + " config=InsightsConfig(\n", + " max_sessions=10,\n", + " min_events_per_session=3,\n", + " min_turns_per_session=1,\n", + " ),\n", + " ),\n", + " max_retries=3,\n", + " base_delay=5.0,\n", + " )\n", + " print(\"[ADCP Insights Report]\")\n", + " print(insights_report.summary())\n", + "except Exception as exc:\n", + " print(f\"Insights generation failed after retries: {exc}\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ADCP Insights Report]\n", - "Agent Insights Report\n", - " Generated: 2026-03-05 09:31 UTC\n", - " Sessions analyzed: 3\n", - " Success rate: 100%\n", - " Avg effectiveness: 10.0/10\n", - " Avg latency: 4737ms\n", - " Avg turns: 1.3\n", - " Error rate: 
0.0%\n", - "\n", - " Top Goals:\n", - " data_retrieval: 3\n", - " planning: 3\n", - " analysis: 2\n", - " task_automation: 2\n", - " Outcomes:\n", - " success: 3\n", - "\n", - " Analysis Sections:\n", - " - Task Areas\n", - " - Interaction Patterns\n", - " - What Works Well\n", - " - Friction Analysis\n", - " - Tool Usage Patterns\n", - " - Improvement Suggestions\n", - " - Trends & Anomalies\n" - ] - } - ], - "source": [ - "import time as _time\n", - "from bigquery_agent_analytics import InsightsConfig\n", - "\n", - "\n", - "def _run_with_retry(fn, max_retries=3, base_delay=5.0):\n", - " \"\"\"Run a function with exponential backoff on 429/RESOURCE_EXHAUSTED.\"\"\"\n", - " for attempt in range(max_retries + 1):\n", - " try:\n", - " return fn()\n", - " except Exception as exc:\n", - " is_rate_limit = \"429\" in str(exc) or \"RESOURCE_EXHAUSTED\" in str(exc)\n", - " if is_rate_limit and attempt < max_retries:\n", - " delay = base_delay * (2 ** attempt)\n", - " print(f\" Rate limited (attempt {attempt + 1}/{max_retries + 1}),\"\n", - " f\" retrying in {delay:.0f}s ...\")\n", - " _time.sleep(delay)\n", - " else:\n", - " raise\n", - "\n", - "\n", - "try:\n", - " insights_report = _run_with_retry(\n", - " lambda: client.insights(\n", - " filters=TraceFilter(session_ids=session_ids),\n", - " config=InsightsConfig(\n", - " max_sessions=10,\n", - " min_events_per_session=3,\n", - " min_turns_per_session=1,\n", - " ),\n", - " ),\n", - " max_retries=3,\n", - " base_delay=5.0,\n", - " )\n", - " print(\"[ADCP Insights Report]\")\n", - " print(insights_report.summary())\n", - "except Exception as exc:\n", - " print(f\"Insights generation failed after retries: {exc}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "9f00660f", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:31:21.975014Z", - "iopub.status.busy": "2026-03-05T09:31:21.974876Z", - "iopub.status.idle": "2026-03-05T09:31:21.978303Z", - "shell.execute_reply": 
"2026-03-05T09:31:21.977766Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 30, + "id": "9f00660f", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:31:21.975014Z", + "iopub.status.busy": "2026-03-05T09:31:21.974876Z", + "iopub.status.idle": "2026-03-05T09:31:21.978303Z", + "shell.execute_reply": "2026-03-05T09:31:21.977766Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Executive Summary]\n", + "The agent demonstrates exceptional performance, achieving a 100% success rate and perfect effectiveness with high user satisfaction across all interactions. Users primarily leverage it for critical campaign management tasks, including media planning, data retrieval, and provisioning, often through short, task-oriented sessions. The most notable friction point is a significant average latency of 4.7 seconds per interaction, which could impact user perception despite successful task completion. While functionally flawless, implementing advanced user feedback and sentiment analysis is crucial to uncover subtle user experience issues. 
Prioritizing latency reduction and gaining deeper qualitative insights will further enhance this highly reliable agent's value and user delight.\n" + ] + } + ], + "source": [ + "# Executive summary (with retry backoff)\n", + "try:\n", + " if insights_report.executive_summary:\n", + " print(\"[Executive Summary]\")\n", + " print(insights_report.executive_summary)\n", + " else:\n", + " # Generate if not yet available\n", + " exec_summary = _run_with_retry(\n", + " lambda: insights_report.generate_executive_summary(),\n", + " max_retries=3,\n", + " base_delay=5.0,\n", + " )\n", + " print(\"[Executive Summary]\")\n", + " print(exec_summary)\n", + "except NameError:\n", + " print(\"Insights report not available -- run previous cell first.\")\n", + "except Exception as exc:\n", + " if \"429\" in str(exc) or \"RESOURCE_EXHAUSTED\" in str(exc):\n", + " print(f\"Executive summary failed: {exc}\")\n", + " print(\"The insights report was generated successfully above.\")\n", + " print(\"Executive summary can be retried after the rate limit resets.\")\n", + " else:\n", + " print(f\"Executive summary failed: {exc}\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Executive Summary]\n", - "The agent demonstrates exceptional performance, achieving a 100% success rate and perfect effectiveness with high user satisfaction across all interactions. Users primarily leverage it for critical campaign management tasks, including media planning, data retrieval, and provisioning, often through short, task-oriented sessions. The most notable friction point is a significant average latency of 4.7 seconds per interaction, which could impact user perception despite successful task completion. While functionally flawless, implementing advanced user feedback and sentiment analysis is crucial to uncover subtle user experience issues. 
Prioritizing latency reduction and gaining deeper qualitative insights will further enhance this highly reliable agent's value and user delight.\n" - ] - } - ], - "source": [ - "# Executive summary (with retry backoff)\n", - "try:\n", - " if insights_report.executive_summary:\n", - " print(\"[Executive Summary]\")\n", - " print(insights_report.executive_summary)\n", - " else:\n", - " # Generate if not yet available\n", - " exec_summary = _run_with_retry(\n", - " lambda: insights_report.generate_executive_summary(),\n", - " max_retries=3,\n", - " base_delay=5.0,\n", - " )\n", - " print(\"[Executive Summary]\")\n", - " print(exec_summary)\n", - "except NameError:\n", - " print(\"Insights report not available -- run previous cell first.\")\n", - "except Exception as exc:\n", - " if \"429\" in str(exc) or \"RESOURCE_EXHAUSTED\" in str(exc):\n", - " print(f\"Executive summary failed: {exc}\")\n", - " print(\"The insights report was generated successfully above.\")\n", - " print(\"Executive summary can be retried after the rate limit resets.\")\n", - " else:\n", - " print(f\"Executive summary failed: {exc}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "10125977", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:31:21.979684Z", - "iopub.status.busy": "2026-03-05T09:31:21.979576Z", - "iopub.status.idle": "2026-03-05T09:31:21.982117Z", - "shell.execute_reply": "2026-03-05T09:31:21.981660Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 31, + "id": "10125977", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:31:21.979684Z", + "iopub.status.busy": "2026-03-05T09:31:21.979576Z", + "iopub.status.idle": "2026-03-05T09:31:21.982117Z", + "shell.execute_reply": "2026-03-05T09:31:21.981660Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Session Facets]\n", + "\n", + " Session: adcp-a20d176b82af\n", + " Goal categories : ['data_retrieval', 
'analysis', 'planning', 'task_automation']\n", + " Outcome : success\n", + " Key topics : ['Media planning', 'Campaign provisioning', 'Ad inventory']\n", + " Effectiveness : 10.0\n", + "\n", + " Session: adcp-7d9855e7a71b\n", + " Goal categories : ['data_retrieval', 'analysis', 'planning']\n", + " Outcome : success\n", + " Key topics : ['media buying', 'campaign planning', 'budget allocation']\n", + " Effectiveness : 10.0\n", + "\n", + " Session: adcp-2c401a645c40\n", + " Goal categories : ['data_retrieval', 'planning', 'task_automation']\n", + " Outcome : success\n", + " Key topics : ['media buying', 'campaign provisioning', 'ad inventory']\n", + " Effectiveness : 10.0\n" + ] + } + ], + "source": [ + "# Per-session facets\n", + "try:\n", + " print(\"[Session Facets]\")\n", + " for facet in insights_report.session_facets:\n", + " print(f\"\\n Session: {facet.session_id}\")\n", + " if facet.goal_categories:\n", + " print(f\" Goal categories : {facet.goal_categories}\")\n", + " if facet.outcome:\n", + " print(f\" Outcome : {facet.outcome}\")\n", + " if facet.key_topics:\n", + " print(f\" Key topics : {facet.key_topics}\")\n", + " print(f\" Effectiveness : {facet.agent_effectiveness}\")\n", + "except NameError:\n", + " print(\"Insights report not available -- run previous cells first.\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Session Facets]\n", - "\n", - " Session: adcp-a20d176b82af\n", - " Goal categories : ['data_retrieval', 'analysis', 'planning', 'task_automation']\n", - " Outcome : success\n", - " Key topics : ['Media planning', 'Campaign provisioning', 'Ad inventory']\n", - " Effectiveness : 10.0\n", - "\n", - " Session: adcp-7d9855e7a71b\n", - " Goal categories : ['data_retrieval', 'analysis', 'planning']\n", - " Outcome : success\n", - " Key topics : ['media buying', 'campaign planning', 'budget allocation']\n", - " Effectiveness : 10.0\n", - "\n", - " Session: adcp-2c401a645c40\n", - " Goal categories : ['data_retrieval', 
'planning', 'task_automation']\n", - " Outcome : success\n", - " Key topics : ['media buying', 'campaign provisioning', 'ad inventory']\n", - " Effectiveness : 10.0\n" - ] - } - ], - "source": [ - "# Per-session facets\n", - "try:\n", - " print(\"[Session Facets]\")\n", - " for facet in insights_report.session_facets:\n", - " print(f\"\\n Session: {facet.session_id}\")\n", - " if facet.goal_categories:\n", - " print(f\" Goal categories : {facet.goal_categories}\")\n", - " if facet.outcome:\n", - " print(f\" Outcome : {facet.outcome}\")\n", - " if facet.key_topics:\n", - " print(f\" Key topics : {facet.key_topics}\")\n", - " print(f\" Effectiveness : {facet.agent_effectiveness}\")\n", - "except NameError:\n", - " print(\"Insights report not available -- run previous cells first.\")" - ] - }, - { - "cell_type": "markdown", - "id": "686343b9", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Phase 12: End-to-End Pipeline (One-Shot)\n", - "\n", - "The `build_context_graph()` method runs the full pipeline in a single call: extract entities, create cross-links, and create the Property Graph." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "6551b064", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:31:21.983566Z", - "iopub.status.busy": "2026-03-05T09:31:21.983495Z", - "iopub.status.idle": "2026-03-05T09:32:04.285052Z", - "shell.execute_reply": "2026-03-05T09:32:04.282978Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "686343b9", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Phase 12: End-to-End Pipeline (One-Shot)\n", + "\n", + "The `build_context_graph()` method runs the full pipeline in a single call: extract entities, create cross-links, and create the Property Graph." 
+ ] + }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Failed to create Property Graph: 400 Unsupported statement CREATE PROPERTY GRAPH.; reason: invalidQuery, message: Unsupported statement CREATE PROPERTY GRAPH.\n", - "\n", - "Location: US\n", - "Job ID: 7dbcf5d1-12f1-4d97-927d-0af5ed51b2a4\n", - "\n" - ] + "cell_type": "code", + "execution_count": 32, + "id": "6551b064", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:31:21.983566Z", + "iopub.status.busy": "2026-03-05T09:31:21.983495Z", + "iopub.status.idle": "2026-03-05T09:32:04.285052Z", + "shell.execute_reply": "2026-03-05T09:32:04.282978Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to create Property Graph: 400 Unsupported statement CREATE PROPERTY GRAPH.; reason: invalidQuery, message: Unsupported statement CREATE PROPERTY GRAPH.\n", + "\n", + "Location: US\n", + "Job ID: 7dbcf5d1-12f1-4d97-927d-0af5ed51b2a4\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Context Graph Pipeline Results]\n", + " Biz nodes extracted : 179\n", + " Cross-links created : True\n", + " Property Graph created : False\n" + ] + } + ], + "source": [ + "# One-shot pipeline: extract + cross-link + create graph\n", + "try:\n", + " results = cgm.build_context_graph(\n", + " session_ids=session_ids,\n", + " use_ai_generate=True,\n", + " )\n", + " print(\"[Context Graph Pipeline Results]\")\n", + " print(f\" Biz nodes extracted : {results['biz_nodes_count']}\")\n", + " print(f\" Cross-links created : {results['cross_links_created']}\")\n", + " print(f\" Property Graph created : {results['property_graph_created']}\")\n", + "except Exception as exc:\n", + " print(f\"Pipeline: {exc}\")\n", + " print(\"\\nThe individual steps (extract, cross-link, create graph)\")\n", + " print(\"can be run separately as shown in Phases 5-7.\")" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Context 
Graph Pipeline Results]\n", - " Biz nodes extracted : 179\n", - " Cross-links created : True\n", - " Property Graph created : False\n" - ] - } - ], - "source": [ - "# One-shot pipeline: extract + cross-link + create graph\n", - "try:\n", - " results = cgm.build_context_graph(\n", - " session_ids=session_ids,\n", - " use_ai_generate=True,\n", - " )\n", - " print(\"[Context Graph Pipeline Results]\")\n", - " print(f\" Biz nodes extracted : {results['biz_nodes_count']}\")\n", - " print(f\" Cross-links created : {results['cross_links_created']}\")\n", - " print(f\" Property Graph created : {results['property_graph_created']}\")\n", - "except Exception as exc:\n", - " print(f\"Pipeline: {exc}\")\n", - " print(\"\\nThe individual steps (extract, cross-link, create graph)\")\n", - " print(\"can be run separately as shown in Phases 5-7.\")" - ] - }, - { - "cell_type": "markdown", - "id": "88d77f19", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Summary\n", - "\n", - "This notebook demonstrated the **Context Graph V2: System of Reasoning** for agentic advertising:\n", - "\n", - "| Phase | Feature | Description |\n", - "|---|---|---|\n", - "| 1 | **ADCP Tools** | Deterministic Yahoo inventory, audience, budget, and GAM provisioning tools |\n", - "| 2 | **Multi-Agent System** | Yahoo Sales Agent with ADCP workflow orchestration |\n", - "| 3 | **Trace Logging** | BigQueryAgentAnalyticsPlugin captures every event |\n", - "| 4 | **Trace Visualization** | SDK Client retrieves and renders hierarchical execution DAGs |\n", - "| 5 | **Biz Entity Extraction** | AI.GENERATE extracts Products, Targeting, Campaigns from payloads |\n", - "| 6 | **Property Graph DDL** | CREATE PROPERTY GRAPH with TechNode, BizNode, Caused, Evaluated |\n", - "| 7 | **GQL Traversal** | Quantified-path queries: \"Why was Yahoo Homepage selected?\" |\n", - "| 8 | **World Change Detection** | Temporal drift check before HITL approval |\n", - "| 9 | **Full Evaluation** | Code metrics, LLM judge, 
trajectory matching, grader pipeline |\n", - "| 10 | **Eval Suite** | Reusable ADCP compliance tests with multi-trial pass@k |\n", - "| 11 | **AI Insights** | Multi-stage analysis with executive summary |\n", - "| 12 | **One-Shot Pipeline** | `build_context_graph()` runs extract -> cross-link -> create |\n", - "\n", - "### Key Takeaways\n", - "\n", - "- **System of Reasoning**: The Context Graph cross-links \"what the agent did\" (TechGraph) with \"why it did it\" (BizGraph), providing the automated explainability required for HITL ad planners to trust multi-thousand-dollar media decisions.\n", - "- **Native Property Graphs**: BigQuery's `CREATE PROPERTY GRAPH` + GQL replaces cumbersome recursive CTEs with elegant graph traversal.\n", - "- **World Change Detection**: Long-running A2A tasks (days/weeks) need temporal intelligence to detect stale context before final HITL approval.\n", - "- **End-to-End Observability**: From agent execution to business entity extraction to evaluation -- all powered by the BigQuery Agent Analytics SDK." 
- ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "184fa6c6", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-05T09:32:04.309566Z", - "iopub.status.busy": "2026-03-05T09:32:04.309371Z", - "iopub.status.idle": "2026-03-05T09:32:04.316042Z", - "shell.execute_reply": "2026-03-05T09:32:04.315383Z" - } - }, - "outputs": [ + "cell_type": "markdown", + "id": "88d77f19", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Summary\n", + "\n", + "This notebook demonstrated the **Context Graph V2: System of Reasoning** for agentic advertising:\n", + "\n", + "| Phase | Feature | Description |\n", + "|---|---|---|\n", + "| 1 | **ADCP Tools** | Deterministic Yahoo inventory, audience, budget, and GAM provisioning tools |\n", + "| 2 | **Multi-Agent System** | Yahoo Sales Agent with ADCP workflow orchestration |\n", + "| 3 | **Trace Logging** | BigQueryAgentAnalyticsPlugin captures every event |\n", + "| 4 | **Trace Visualization** | SDK Client retrieves and renders hierarchical execution DAGs |\n", + "| 5 | **Biz Entity Extraction** | AI.GENERATE extracts Products, Targeting, Campaigns from payloads |\n", + "| 6 | **Property Graph DDL** | CREATE PROPERTY GRAPH with TechNode, BizNode, Caused, Evaluated |\n", + "| 7 | **GQL Traversal** | Quantified-path queries: \"Why was Yahoo Homepage selected?\" |\n", + "| 8 | **World Change Detection** | Temporal drift check before HITL approval |\n", + "| 9 | **Full Evaluation** | Code metrics, LLM judge, trajectory matching, grader pipeline |\n", + "| 10 | **Eval Suite** | Reusable ADCP compliance tests with multi-trial pass@k |\n", + "| 11 | **AI Insights** | Multi-stage analysis with executive summary |\n", + "| 12 | **One-Shot Pipeline** | `build_context_graph()` runs extract -> cross-link -> create |\n", + "\n", + "### Key Takeaways\n", + "\n", + "- **System of Reasoning**: The Context Graph cross-links \"what the agent did\" (TechGraph) with \"why it did it\" (BizGraph), providing the automated 
explainability required for HITL ad planners to trust multi-thousand-dollar media decisions.\n", + "- **Native Property Graphs**: BigQuery's `CREATE PROPERTY GRAPH` + GQL replaces cumbersome recursive CTEs with elegant graph traversal.\n", + "- **World Change Detection**: Long-running A2A tasks (days/weeks) need temporal intelligence to detect stale context before final HITL approval.\n", + "- **End-to-End Observability**: From agent execution to business entity extraction to evaluation -- all powered by the BigQuery Agent Analytics SDK." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Demo complete!\n", - "Sessions: ['adcp-a20d176b82af', 'adcp-7d9855e7a71b', 'adcp-2c401a645c40']\n", - "Traces logged to: test-project-0728-467323.agent_analytics.agent_events\n", - "Context Graph: test-project-0728-467323.agent_analytics.adcp_context_graph\n" - ] + "cell_type": "code", + "execution_count": 33, + "id": "184fa6c6", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-05T09:32:04.309566Z", + "iopub.status.busy": "2026-03-05T09:32:04.309371Z", + "iopub.status.idle": "2026-03-05T09:32:04.316042Z", + "shell.execute_reply": "2026-03-05T09:32:04.315383Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Demo complete!\n", + "Sessions: ['adcp-a20d176b82af', 'adcp-7d9855e7a71b', 'adcp-2c401a645c40']\n", + "Traces logged to: test-project-0728-467323.agent_analytics.agent_events\n", + "Context Graph: test-project-0728-467323.agent_analytics.adcp_context_graph\n" + ] + } + ], + "source": [ + "# Cleanup\n", + "try:\n", + " asyncio.get_event_loop().run_until_complete(\n", + " plugin.shutdown(timeout=10.0)\n", + " )\n", + "except Exception:\n", + " pass\n", + "\n", + "print(\"\\nDemo complete!\")\n", + "print(f\"Sessions: {session_ids}\")\n", + "print(f\"Traces logged to: {PROJECT_ID}.{DATASET_ID}.{TABLE_ID}\")\n", + "print(f\"Context Graph: 
{PROJECT_ID}.{DATASET_ID}.{cg_config.graph_name}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" } - ], - "source": [ - "# Cleanup\n", - "try:\n", - " asyncio.get_event_loop().run_until_complete(\n", - " plugin.shutdown(timeout=10.0)\n", - " )\n", - "except Exception:\n", - " pass\n", - "\n", - "print(\"\\nDemo complete!\")\n", - "print(f\"Sessions: {session_ids}\")\n", - "print(f\"Traces logged to: {PROJECT_ID}.{DATASET_ID}.{TABLE_ID}\")\n", - "print(f\"Context Graph: {PROJECT_ID}.{DATASET_ID}.{cg_config.graph_name}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/e2e_demo.py b/examples/e2e_demo.py index ddbee3ce..9f9447d2 100644 --- a/examples/e2e_demo.py +++ b/examples/e2e_demo.py @@ -59,7 +59,7 @@ # --------------------------------------------------------------------------- from bigquery_agent_analytics import BigQueryTraceEvaluator from bigquery_agent_analytics import Client -from bigquery_agent_analytics import CodeEvaluator +from bigquery_agent_analytics import SystemEvaluator from bigquery_agent_analytics import InsightsConfig from bigquery_agent_analytics import LLMAsJudge from bigquery_agent_analytics import TraceFilter @@ -459,12 +459,12 @@ async def phase2_evaluate( print("\n--- 2b. 
Code-Based Evaluation ---\n") trace_filter = TraceFilter(session_ids=session_ids) presets = [ - ("latency", CodeEvaluator.latency(threshold_ms=30000)), - ("turn_count", CodeEvaluator.turn_count(max_turns=10)), - ("error_rate", CodeEvaluator.error_rate(max_error_rate=0.1)), + ("latency", SystemEvaluator.latency(threshold_ms=30000)), + ("turn_count", SystemEvaluator.turn_count(max_turns=10)), + ("error_rate", SystemEvaluator.error_rate(max_error_rate=0.1)), ( "token_efficiency", - CodeEvaluator.token_efficiency(max_tokens=100000), + SystemEvaluator.token_efficiency(max_tokens=100000), ), ] for preset_name, evaluator in presets: diff --git a/examples/e2e_notebook_demo.ipynb b/examples/e2e_notebook_demo.ipynb index 3361ed2c..d1460a60 100644 --- a/examples/e2e_notebook_demo.ipynb +++ b/examples/e2e_notebook_demo.ipynb @@ -1,12 +1,8 @@ { "cells": [ { + "id": "5910b1c7", "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "copyright-header" - }, - "outputs": [], "source": [ "# Copyright 2025 Google LLC\n", "#\n", @@ -21,90 +17,144 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." - ] + ], + "metadata": { + "id": "copyright-header" + }, + "execution_count": null }, { + "id": "bd8b517b", "cell_type": "markdown", - "metadata": { - "id": "notebook-affordances" - }, "source": [ "# Demo Plan: BigQuery for Agent Ops - Unified Platform\n", "\n", - "\n", + "\u003ctable align=\"left\"\u003e\n", "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://colab.research.google.com/github/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/e2e_notebook_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/refs/heads/main/third_party/logo/colab-logo.png\" alt=\"Colab logo\"\u003e Run in Colab\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://github.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/e2e_notebook_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/refs/heads/main/third_party/logo/github-logo.png\" width=\"32\" alt=\"GitHub logo\"\u003e\n", " View on GitHub\n", - " \n", - " \n", - " \n", - " \"Vertex\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/main/examples/e2e_notebook_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.gstatic.com/images/branding/product/1x/google_cloud_48dp.png\" alt=\"Vertex AI logo\" width=\"32\"\u003e\n", " Open in Vertex AI Workbench\n", - " \n", - " \n", - " \n", - " \"BQ\n", + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://console.cloud.google.com/bigquery/import?url=https://github.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK/blob/main/examples/e2e_notebook_demo.ipynb\"\u003e\n", + " \u003cimg src=\"https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw\u0026s\" alt=\"BQ logo\" width=\"35\"\u003e\n", " Open in BQ Studio\n", - " \n", - "
" - ] + " \u003c/a\u003e\n", + " \u003c/td\u003e\n", + "\u003c/table\u003e" + ], + "metadata": { + "id": "notebook-affordances" + }, + "execution_count": null }, { + "id": "533dbdc3", "cell_type": "markdown", - "metadata": {}, "source": [ "## Install Dependencies" - ] + ], + "metadata": { + "id": "dlz6_lPXp3ss" + }, + "execution_count": null }, { + "id": "4ae4aff7", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "!pip install -q google-adk bigquery-agent-analytics google-cloud-bigquery nest-asyncio" - ] + ], + "metadata": { + "id": "Y1R4tOtqp3ss" + }, + "execution_count": null }, { + "id": "83997bbb", "cell_type": "markdown", - "metadata": {}, "source": [ - "## Authenticate & Configure" - ] + "## Authenticate \u0026 Configure" + ], + "metadata": { + "id": "XWXWXZFXp3ss" + }, + "execution_count": null }, { + "id": "096119e4", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "import os\n\n# Colab authentication\ntry:\n from google.colab import auth\n auth.authenticate_user()\n print(\"Colab authentication successful.\")\nexcept ImportError:\n print(\"Not running in Colab — using default credentials.\")\n\n# ---------- Configuration ----------\nPROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"test-project-0728-467323\")\nDATASET_ID = os.environ.get(\"BQ_DATASET\", \"agent_analytics\")\nTABLE_ID = os.environ.get(\"BQ_TABLE\", \"agent_events\")\nAGENT_MODEL = os.environ.get(\"AGENT_MODEL\", \"gemini-3-flash-preview\")\nSDK_ENDPOINT = os.environ.get(\"SDK_ENDPOINT\", \"gemini-2.5-flash\")\nLOCATION = \"US\"\nAPP_NAME = \"e2e_notebook_demo\"\nUSER_ID = \"demo_user\"\n\nos.environ[\"GOOGLE_GENAI_USE_VERTEXAI\"] = \"true\"\nos.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\nos.environ[\"GOOGLE_CLOUD_LOCATION\"] = \"global\"\n\n# Enable async in Jupyter\nimport nest_asyncio\nnest_asyncio.apply()\n\nprint(f\"Project : {PROJECT_ID}\")\nprint(f\"Dataset : 
{DATASET_ID}\")\nprint(f\"Table : {TABLE_ID}\")\nprint(f\"Agent model : {AGENT_MODEL}\")\nprint(f\"SDK endpoint : {SDK_ENDPOINT}\")" + "source": [ + "import os\n", + "\n", + "# Colab authentication\n", + "try:\n", + " from google.colab import auth\n", + " auth.authenticate_user()\n", + " print(\"Colab authentication successful.\")\n", + "except ImportError:\n", + " print(\"Not running in Colab — using default credentials.\")\n", + "\n", + "# ---------- Configuration ----------\n", + "PROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"test-project-0728-467323\")\n", + "DATASET_ID = os.environ.get(\"BQ_DATASET\", \"agent_analytics\")\n", + "TABLE_ID = os.environ.get(\"BQ_TABLE\", \"agent_events\")\n", + "AGENT_MODEL = os.environ.get(\"AGENT_MODEL\", \"gemini-3-flash-preview\")\n", + "SDK_ENDPOINT = os.environ.get(\"SDK_ENDPOINT\", \"gemini-2.5-flash\")\n", + "LOCATION = \"US\"\n", + "APP_NAME = \"e2e_notebook_demo\"\n", + "USER_ID = \"demo_user\"\n", + "\n", + "os.environ[\"GOOGLE_GENAI_USE_VERTEXAI\"] = \"true\"\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", + "os.environ[\"GOOGLE_CLOUD_LOCATION\"] = \"global\"\n", + "\n", + "# Enable async in Jupyter\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "print(f\"Project : {PROJECT_ID}\")\n", + "print(f\"Dataset : {DATASET_ID}\")\n", + "print(f\"Table : {TABLE_ID}\")\n", + "print(f\"Agent model : {AGENT_MODEL}\")\n", + "print(f\"SDK endpoint : {SDK_ENDPOINT}\")" + ], + "metadata": { + "id": "mp85Zf7op3ss" + }, + "execution_count": null }, { + "id": "33e6bab7", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", - "## Phase 1: Run Agent & Log Traces to BigQuery\n", + "## Phase 1: Run Agent \u0026 Log Traces to BigQuery\n", "\n", "We define a **travel planner agent** with four deterministic tools, run three conversations, and log every event to BigQuery via the `BigQueryAgentAnalyticsPlugin`." 
- ] + ], + "metadata": { + "id": "iRlQhXRNp3ss" + }, + "execution_count": null }, { + "id": "ea2fd325", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "import hashlib\n", "import random\n", @@ -116,7 +166,7 @@ " destination: str,\n", " date: str,\n", " max_results: int = 5,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Search for available flights between two cities.\n", "\n", " Args:\n", @@ -166,7 +216,7 @@ " check_in: str,\n", " check_out: str,\n", " max_results: int = 5,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Search for hotels in a given city.\n", "\n", " Args:\n", @@ -215,7 +265,7 @@ "async def get_weather_forecast(\n", " city: str,\n", " date: str,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Get weather forecast for a city on a specific date.\n", "\n", " Args:\n", @@ -249,7 +299,7 @@ " hotels: float,\n", " daily_expenses: float,\n", " num_days: int,\n", - ") -> dict[str, Any]:\n", + ") -\u003e dict[str, Any]:\n", " \"\"\"Calculate total trip budget from component costs.\n", "\n", " Args:\n", @@ -281,20 +331,63 @@ "\n", "\n", "print(\"Tool functions defined: search_flights, search_hotels, get_weather_forecast, calculate_trip_budget\")" - ] + ], + "metadata": { + "id": "N7D41PvXp3ss" + }, + "execution_count": null }, { + "id": "aa7e30d2", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "from google.adk.agents import LlmAgent\nfrom google.genai import types\n\nTRAVEL_PLANNER_INSTRUCTION = \"\"\"\\\nYou are a helpful travel planning assistant. 
You help users plan trips by\nsearching for flights, hotels, checking weather forecasts, and calculating\nbudgets.\n\nGuidelines:\n- Always search for flights and hotels when the user asks to plan a trip.\n- Check the weather at the destination when relevant.\n- Provide a budget estimate when enough cost information is available.\n- Be concise but informative in your responses.\n- Present results in a clear, organized format.\n- When multiple tools are needed, call them as appropriate and then\n synthesize the results into a cohesive plan.\n\"\"\"\n\n\ndef build_agent() -> LlmAgent:\n \"\"\"Build the travel planner agent.\"\"\"\n return LlmAgent(\n name=\"travel_planner\",\n model=AGENT_MODEL,\n instruction=TRAVEL_PLANNER_INSTRUCTION,\n tools=[\n search_flights,\n search_hotels,\n get_weather_forecast,\n calculate_trip_budget,\n ],\n generate_content_config=types.GenerateContentConfig(\n temperature=1.0,\n ),\n )\n\n\nprint(\"Agent builder ready.\")" + "source": [ + "from google.adk.agents import LlmAgent\n", + "from google.genai import types\n", + "\n", + "TRAVEL_PLANNER_INSTRUCTION = \"\"\"\\\n", + "You are a helpful travel planning assistant. 
You help users plan trips by\n", + "searching for flights, hotels, checking weather forecasts, and calculating\n", + "budgets.\n", + "\n", + "Guidelines:\n", + "- Always search for flights and hotels when the user asks to plan a trip.\n", + "- Check the weather at the destination when relevant.\n", + "- Provide a budget estimate when enough cost information is available.\n", + "- Be concise but informative in your responses.\n", + "- Present results in a clear, organized format.\n", + "- When multiple tools are needed, call them as appropriate and then\n", + " synthesize the results into a cohesive plan.\n", + "\"\"\"\n", + "\n", + "\n", + "def build_agent() -\u003e LlmAgent:\n", + " \"\"\"Build the travel planner agent.\"\"\"\n", + " return LlmAgent(\n", + " name=\"travel_planner\",\n", + " model=AGENT_MODEL,\n", + " instruction=TRAVEL_PLANNER_INSTRUCTION,\n", + " tools=[\n", + " search_flights,\n", + " search_hotels,\n", + " get_weather_forecast,\n", + " calculate_trip_budget,\n", + " ],\n", + " generate_content_config=types.GenerateContentConfig(\n", + " temperature=1.0,\n", + " ),\n", + " )\n", + "\n", + "\n", + "print(\"Agent builder ready.\")" + ], + "metadata": { + "id": "qixVihRTp3st" + }, + "execution_count": null }, { + "id": "78f8194f", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "import asyncio\n", "import uuid\n", @@ -331,7 +424,7 @@ "# Define three conversations\n", "conversations = [\n", " {\n", - " \"label\": \"Simple trip (SF -> NY)\",\n", + " \"label\": \"Simple trip (SF -\u003e NY)\",\n", " \"messages\": [\n", " (\n", " \"Plan a weekend trip from San Francisco to New York\"\n", @@ -342,7 +435,7 @@ " ],\n", " },\n", " {\n", - " \"label\": \"Complex trip (LA -> Tokyo)\",\n", + " \"label\": \"Complex trip (LA -\u003e Tokyo)\",\n", " \"messages\": [\n", " (\n", " \"I want to plan a 5-day vacation to Tokyo from\"\n", @@ -357,7 +450,7 @@ " ],\n", " },\n", " {\n", - " \"label\": \"Multi-turn (Chicago -> 
Paris)\",\n", + " \"label\": \"Multi-turn (Chicago -\u003e Paris)\",\n", " \"messages\": [\n", " \"What's the weather like in Paris on 2025-04-20?\",\n", " \"Find me flights from Chicago to Paris on 2025-04-20.\",\n", @@ -400,11 +493,11 @@ " if hasattr(part, \"text\") and part.text:\n", " response_parts.append(part.text)\n", " elif hasattr(part, \"function_call\") and part.function_call:\n", - " print(f\" -> Tool call: {part.function_call.name}\")\n", + " print(f\" -\u003e Tool call: {part.function_call.name}\")\n", " if response_parts:\n", " text = \"\\n\".join(response_parts)\n", " print(f\"\\n[Agent]: {text[:1000]}\")\n", - " if len(text) > 1000:\n", + " if len(text) \u003e 1000:\n", " print(f\" ... (truncated, {len(text)} chars total)\")\n", " return session_id\n", "\n", @@ -418,13 +511,15 @@ " session_ids.append(sid)\n", "\n", "print(f\"\\n\\nSession IDs: {session_ids}\")" - ] + ], + "metadata": { + "id": "zPa9pWbFp3st" + }, + "execution_count": null }, { + "id": "9c8c9555", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "import time\n", "\n", @@ -439,38 +534,73 @@ "print(f\"Waiting {settle_seconds}s for BigQuery data to settle ...\")\n", "time.sleep(settle_seconds)\n", "print(\"Done.\")" - ] + ], + "metadata": { + "id": "JoDsNsxYp3st" + }, + "execution_count": null }, { + "id": "8fe9df45", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", - "## Phase 2: Trace Retrieval & Visualization\n", + "## Phase 2: Trace Retrieval \u0026 Visualization\n", "\n", "Now that traces are in BigQuery, we use the **SDK Client** to fetch them. Each `Trace` contains a hierarchical span tree that can be rendered as a DAG. We can also inspect tool calls, the final response, and any error spans." 
- ] + ], + "metadata": { + "id": "xOtm5izep3st" + }, + "execution_count": null }, { + "id": "14731b45", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "from bigquery_agent_analytics import Client, TraceFilter\n\nclient = Client(\n project_id=PROJECT_ID,\n dataset_id=DATASET_ID,\n table_id=TABLE_ID,\n location=LOCATION,\n endpoint=SDK_ENDPOINT,\n)\nprint(\"SDK Client initialised.\")" + "source": [ + "from bigquery_agent_analytics import Client, TraceFilter\n", + "\n", + "client = Client(\n", + " project_id=PROJECT_ID,\n", + " dataset_id=DATASET_ID,\n", + " table_id=TABLE_ID,\n", + " location=LOCATION,\n", + " endpoint=SDK_ENDPOINT,\n", + ")\n", + "print(\"SDK Client initialised.\")" + ], + "metadata": { + "id": "ol6-WsOxp3st" + }, + "execution_count": null }, { + "id": "02347067", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Retrieve and render each trace\ntraces = []\nfor sid in session_ids:\n try:\n trace = client.get_session_trace(sid)\n traces.append(trace)\n print(f\"\\n{'=' * 60}\")\n print(f\" Trace for session: {sid}\")\n print(f\"{'=' * 60}\")\n _ = trace.render() # render() prints and returns the tree\n except Exception as exc:\n print(f\"Could not retrieve trace {sid}: {exc}\")\n traces.append(None)" + "source": [ + "# Retrieve and render each trace\n", + "traces = []\n", + "for sid in session_ids:\n", + " try:\n", + " trace = client.get_session_trace(sid)\n", + " traces.append(trace)\n", + " print(f\"\\n{'=' * 60}\")\n", + " print(f\" Trace for session: {sid}\")\n", + " print(f\"{'=' * 60}\")\n", + " _ = trace.render() # render() prints and returns the tree\n", + " except Exception as exc:\n", + " print(f\"Could not retrieve trace {sid}: {exc}\")\n", + " traces.append(None)" + ], + "metadata": { + "id": "lC4egf6Ap3st" + }, + "execution_count": null }, { + "id": "1875763a", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], 
"source": [ "# Inspect trace properties\n", "for i, trace in enumerate(traces):\n", @@ -486,13 +616,15 @@ " print(f\" Error spans: {len(errors)}\")\n", " for es in errors:\n", " print(f\" - {es.event_type}: {es.error_message}\")" - ] + ], + "metadata": { + "id": "zo1fEqhWp3st" + }, + "execution_count": null }, { + "id": "8a0148e4", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# List traces with filtering\n", "all_traces = client.list_traces(\n", @@ -502,35 +634,41 @@ "for t in all_traces:\n", " print(f\" - {t.session_id} spans={len(t.spans)} \"\n", " f\"tools={len(t.tool_calls)}\")" - ] + ], + "metadata": { + "id": "pEGDqFtrp3st" + }, + "execution_count": null }, { + "id": "dcced7e3", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", "## Phase 3: Code-Based Evaluation\n", "\n", - "The `CodeEvaluator` runs deterministic metrics over session aggregates — no LLM needed. Pre-built evaluators cover latency, turn count, error rate, token efficiency, and cost. You can also define custom metrics." - ] + "The `SystemEvaluator` runs deterministic metrics over session aggregates — no LLM needed. Pre-built evaluators cover latency, turn count, error rate, token efficiency, and cost. You can also define custom metrics." 
+ ], + "metadata": { + "id": "PSNQk0A3p3st" + }, + "execution_count": null }, { + "id": "ec6619a3", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ - "from bigquery_agent_analytics import CodeEvaluator\n", + "from bigquery_agent_analytics import SystemEvaluator\n", "\n", "trace_filter = TraceFilter(session_ids=session_ids)\n", "\n", "presets = [\n", - " (\"latency\", CodeEvaluator.latency(threshold_ms=30000)),\n", - " (\"turn_count\", CodeEvaluator.turn_count(max_turns=10)),\n", - " (\"error_rate\", CodeEvaluator.error_rate(max_error_rate=0.1)),\n", - " (\"token_efficiency\", CodeEvaluator.token_efficiency(max_tokens=100000)),\n", - " (\"cost_per_session\", CodeEvaluator.cost_per_session(max_cost_usd=1.0)),\n", + " (\"latency\", SystemEvaluator.latency(threshold_ms=30000)),\n", + " (\"turn_count\", SystemEvaluator.turn_count(max_turns=10)),\n", + " (\"error_rate\", SystemEvaluator.error_rate(max_error_rate=0.1)),\n", + " (\"token_efficiency\", SystemEvaluator.token_efficiency(max_tokens=100000)),\n", + " (\"cost_per_session\", SystemEvaluator.cost_per_session(max_cost_usd=1.0)),\n", "]\n", "\n", "for name, evaluator in presets:\n", @@ -546,31 +684,70 @@ " print(report.summary())\n", " except Exception as exc:\n", " print(f\"\\n[{name}] Failed: {exc}\")" - ] + ], + "metadata": { + "id": "HoYrzAbwp3su" + }, + "execution_count": null }, { + "id": "df2cf024", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Custom metric: response length scoring\ndef response_length_score(session_summary: dict) -> float:\n \"\"\"Score based on response token count — longer is better up to a point.\"\"\"\n tokens = session_summary.get(\"output_tokens\") or 0\n # Ideal range: 200-2000 tokens\n if 200 <= tokens <= 2000:\n return 1.0\n elif tokens < 200:\n return tokens / 200.0\n else:\n return max(0.0, 1.0 - (tokens - 2000) / 5000.0)\n\n\ncustom_eval = (\n CodeEvaluator(\"custom_metrics\")\n 
.add_metric(\"response_length\", response_length_score, threshold=0.5)\n)\n\ntry:\n report = asyncio.get_event_loop().run_until_complete(\n asyncio.to_thread(\n client.evaluate,\n evaluator=custom_eval,\n filters=trace_filter,\n )\n )\n print(\"[custom: response_length]\")\n print(report.summary())\nexcept Exception as exc:\n print(f\"Custom evaluator failed: {exc}\")" + "source": [ + "# Custom metric: response length scoring\n", + "def response_length_score(session_summary: dict) -\u003e float:\n", + " \"\"\"Score based on response token count — longer is better up to a point.\"\"\"\n", + " tokens = session_summary.get(\"output_tokens\") or 0\n", + " # Ideal range: 200-2000 tokens\n", + " if 200 \u003c= tokens \u003c= 2000:\n", + " return 1.0\n", + " elif tokens \u003c 200:\n", + " return tokens / 200.0\n", + " else:\n", + " return max(0.0, 1.0 - (tokens - 2000) / 5000.0)\n", + "\n", + "\n", + "custom_eval = (\n", + " SystemEvaluator(\"custom_metrics\")\n", + " .add_metric(\"response_length\", response_length_score, threshold=0.5)\n", + ")\n", + "\n", + "try:\n", + " report = asyncio.get_event_loop().run_until_complete(\n", + " asyncio.to_thread(\n", + " client.evaluate,\n", + " evaluator=custom_eval,\n", + " filters=trace_filter,\n", + " )\n", + " )\n", + " print(\"[custom: response_length]\")\n", + " print(report.summary())\n", + "except Exception as exc:\n", + " print(f\"Custom evaluator failed: {exc}\")" + ], + "metadata": { + "id": "IMhRTNwWp3su" + }, + "execution_count": null }, { + "id": "e2d74ab9", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", "## Phase 4: LLM-as-Judge Evaluation\n", "\n", "Semantic evaluation using an LLM to judge agent quality. The SDK supports a 3-tier fallback: BigQuery `AI.GENERATE` → `ML.GENERATE_TEXT` → Gemini API. Pre-built judges evaluate **correctness**, **hallucination** (faithfulness), and **sentiment**." 
- ] + ], + "metadata": { + "id": "o0FOaPmfp3su" + }, + "execution_count": null }, { + "id": "acefb4ef", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import LLMAsJudge\n", "\n", @@ -594,13 +771,15 @@ " print(f\" Feedback: {ss.llm_feedback[:200]}\")\n", "except Exception as exc:\n", " print(f\"Correctness judge failed: {exc}\")" - ] + ], + "metadata": { + "id": "lzZ_V592p3su" + }, + "execution_count": null }, { + "id": "4bd972cd", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Hallucination (faithfulness) evaluation\n", "judge_hallucination = LLMAsJudge.hallucination()\n", @@ -616,13 +795,15 @@ " print(report.summary())\n", "except Exception as exc:\n", " print(f\"Hallucination judge failed: {exc}\")" - ] + ], + "metadata": { + "id": "up5XD8Gup3su" + }, + "execution_count": null }, { + "id": "920c9d37", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Sentiment evaluation\n", "judge_sentiment = LLMAsJudge.sentiment()\n", @@ -638,11 +819,15 @@ " print(report.summary())\n", "except Exception as exc:\n", " print(f\"Sentiment judge failed: {exc}\")" - ] + ], + "metadata": { + "id": "jLMFHY0Hp3su" + }, + "execution_count": null }, { + "id": "24bef541", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", @@ -652,16 +837,18 @@ "\n", "| `MatchType` | Description |\n", "|---|---|\n", - "| `EXACT` | Tool calls must match exactly (order & count) |\n", + "| `EXACT` | Tool calls must match exactly (order \u0026 count) |\n", "| `IN_ORDER` | Expected tools appear in order, extra tools allowed between |\n", "| `ANY_ORDER` | All expected tools present, any order |\n" - ] + ], + "metadata": { + "id": "DUeF1Ynpp3su" + }, + "execution_count": null }, { + "id": "3d4ee664", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from 
bigquery_agent_analytics import BigQueryTraceEvaluator\n", "from bigquery_agent_analytics.trace_evaluator import MatchType\n", @@ -672,13 +859,15 @@ " table_id=TABLE_ID,\n", ")\n", "print(\"BigQueryTraceEvaluator ready.\")" - ] + ], + "metadata": { + "id": "ZqN2EG0Up3su" + }, + "execution_count": null }, { + "id": "f0a6e5b2", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "import json\n", "\n", @@ -707,13 +896,15 @@ " print(f\" Details : {json.dumps(result.details, indent=2)}\")\n", "except Exception as exc:\n", " print(f\"Trajectory evaluation failed: {exc}\")" - ] + ], + "metadata": { + "id": "B2q5to58p3su" + }, + "execution_count": null }, { + "id": "802977ea", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Compare match types: EXACT vs ANY_ORDER on the same session\n", "for match_type in [MatchType.EXACT, MatchType.ANY_ORDER]:\n", @@ -730,13 +921,15 @@ " f\"Scores: {result.scores}\")\n", " except Exception as exc:\n", " print(f\" {match_type.value} failed: {exc}\")" - ] + ], + "metadata": { + "id": "9AbSR-Cmp3su" + }, + "execution_count": null }, { + "id": "f79cb30f", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Batch evaluation across all sessions\n", "eval_dataset = [\n", @@ -774,24 +967,30 @@ " f\"scores={r.scores}\")\n", "except Exception as exc:\n", " print(f\"Batch evaluation failed: {exc}\")" - ] + ], + "metadata": { + "id": "gAfKngThp3sv" + }, + "execution_count": null }, { + "id": "94ddda5f", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", "## Phase 6: Grader Pipeline\n", "\n", "Compose multiple evaluators (code + LLM) into a single **GraderPipeline** with configurable voting strategies: `WeightedStrategy`, `BinaryStrategy`, or `MajorityStrategy`." 
- ] + ], + "metadata": { + "id": "z6F7hA1yp3sv" + }, + "execution_count": null }, { + "id": "274131b3", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import (\n", " GraderPipeline,\n", @@ -804,11 +1003,11 @@ "pipeline = (\n", " GraderPipeline(WeightedStrategy(threshold=0.6))\n", " .add_code_grader(\n", - " CodeEvaluator.latency(threshold_ms=30000),\n", + " SystemEvaluator.latency(threshold_ms=30000),\n", " weight=1.0,\n", " )\n", " .add_code_grader(\n", - " CodeEvaluator.error_rate(max_error_rate=0.1),\n", + " SystemEvaluator.error_rate(max_error_rate=0.1),\n", " weight=1.0,\n", " )\n", " .add_llm_grader(\n", @@ -817,38 +1016,147 @@ " )\n", ")\n", "print(\"GraderPipeline built (weighted: code=1.0 + code=1.0 + llm=2.0).\")" - ] + ], + "metadata": { + "id": "_ONYUGcnp3sv" + }, + "execution_count": null }, { + "id": "b315654e", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Construct session_summary from trace metadata and evaluate\n# We use the Tokyo trip trace (index 1) as an example.\nimport io, contextlib\n\ntrace_idx = 1\nif traces[trace_idx] is not None:\n trace = traces[trace_idx]\n session_summary = {\n \"session_id\": trace.session_id,\n \"total_events\": len(trace.spans),\n \"tool_calls\": len(trace.tool_calls),\n \"tool_errors\": len(trace.error_spans),\n \"llm_calls\": sum(\n 1 for s in trace.spans\n if s.event_type in (\"llm_request\", \"llm_response\")\n ),\n \"avg_latency_ms\": (\n trace.total_latency_ms / max(len(trace.spans), 1)\n if trace.total_latency_ms\n else 0.0\n ),\n \"max_latency_ms\": max(\n (s.latency_ms or 0 for s in trace.spans), default=0\n ),\n \"total_latency_ms\": trace.total_latency_ms or 0.0,\n \"turn_count\": sum(\n 1 for s in trace.spans if s.event_type == \"user_message\"\n ),\n \"has_error\": len(trace.error_spans) > 0,\n \"input_tokens\": sum(\n s.attributes.get(\"input_tokens\", 0) or 0\n for s in 
trace.spans\n ),\n \"output_tokens\": sum(\n s.attributes.get(\"output_tokens\", 0) or 0\n for s in trace.spans\n ),\n \"total_tokens\": sum(\n s.attributes.get(\"total_tokens\", 0) or 0\n for s in trace.spans\n ),\n }\n\n # Get trace text (suppress render's print) and final response\n buf = io.StringIO()\n with contextlib.redirect_stdout(buf):\n trace_text = trace.render(format=\"tree\")\n if not isinstance(trace_text, str):\n trace_text = buf.getvalue()\n final_response = trace.final_response or \"\"\n\n verdict = asyncio.get_event_loop().run_until_complete(\n pipeline.evaluate(\n session_summary=session_summary,\n trace_text=trace_text,\n final_response=final_response,\n )\n )\n\n print(f\"[GraderPipeline — Weighted]\")\n print(f\" Final score : {verdict.final_score:.3f}\")\n print(f\" Passed : {verdict.passed}\")\n print(f\" Strategy : {verdict.strategy_name}\")\n print(f\" Grader breakdown:\")\n for gr in verdict.grader_results:\n print(f\" - {gr.grader_name}: scores={gr.scores} \"\n f\"passed={gr.passed}\")\nelse:\n print(\"Trace not available — skipping pipeline evaluation.\")" + "source": [ + "# Construct session_summary from trace metadata and evaluate\n", + "# We use the Tokyo trip trace (index 1) as an example.\n", + "import io, contextlib\n", + "\n", + "trace_idx = 1\n", + "if traces[trace_idx] is not None:\n", + " trace = traces[trace_idx]\n", + " session_summary = {\n", + " \"session_id\": trace.session_id,\n", + " \"total_events\": len(trace.spans),\n", + " \"tool_calls\": len(trace.tool_calls),\n", + " \"tool_errors\": len(trace.error_spans),\n", + " \"llm_calls\": sum(\n", + " 1 for s in trace.spans\n", + " if s.event_type in (\"llm_request\", \"llm_response\")\n", + " ),\n", + " \"avg_latency_ms\": (\n", + " trace.total_latency_ms / max(len(trace.spans), 1)\n", + " if trace.total_latency_ms\n", + " else 0.0\n", + " ),\n", + " \"max_latency_ms\": max(\n", + " (s.latency_ms or 0 for s in trace.spans), default=0\n", + " ),\n", + " 
\"total_latency_ms\": trace.total_latency_ms or 0.0,\n", + " \"turn_count\": sum(\n", + " 1 for s in trace.spans if s.event_type == \"user_message\"\n", + " ),\n", + " \"has_error\": len(trace.error_spans) \u003e 0,\n", + " \"input_tokens\": sum(\n", + " s.attributes.get(\"input_tokens\", 0) or 0\n", + " for s in trace.spans\n", + " ),\n", + " \"output_tokens\": sum(\n", + " s.attributes.get(\"output_tokens\", 0) or 0\n", + " for s in trace.spans\n", + " ),\n", + " \"total_tokens\": sum(\n", + " s.attributes.get(\"total_tokens\", 0) or 0\n", + " for s in trace.spans\n", + " ),\n", + " }\n", + "\n", + " # Get trace text (suppress render's print) and final response\n", + " buf = io.StringIO()\n", + " with contextlib.redirect_stdout(buf):\n", + " trace_text = trace.render(format=\"tree\")\n", + " if not isinstance(trace_text, str):\n", + " trace_text = buf.getvalue()\n", + " final_response = trace.final_response or \"\"\n", + "\n", + " verdict = asyncio.get_event_loop().run_until_complete(\n", + " pipeline.evaluate(\n", + " session_summary=session_summary,\n", + " trace_text=trace_text,\n", + " final_response=final_response,\n", + " )\n", + " )\n", + "\n", + " print(f\"[GraderPipeline — Weighted]\")\n", + " print(f\" Final score : {verdict.final_score:.3f}\")\n", + " print(f\" Passed : {verdict.passed}\")\n", + " print(f\" Strategy : {verdict.strategy_name}\")\n", + " print(f\" Grader breakdown:\")\n", + " for gr in verdict.grader_results:\n", + " print(f\" - {gr.grader_name}: scores={gr.scores} \"\n", + " f\"passed={gr.passed}\")\n", + "else:\n", + " print(\"Trace not available — skipping pipeline evaluation.\")" + ], + "metadata": { + "id": "u8bk4AHSp3sv" + }, + "execution_count": null }, { + "id": "fdcac032", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Demo alternative strategies: Binary and Majority\nif traces[trace_idx] is not None:\n for strategy_cls, strategy_name in [\n (BinaryStrategy, \"Binary (all must 
pass)\"),\n (MajorityStrategy, \"Majority\"),\n ]:\n alt_pipeline = (\n GraderPipeline(strategy_cls())\n .add_code_grader(\n CodeEvaluator.latency(threshold_ms=30000),\n )\n .add_code_grader(\n CodeEvaluator.error_rate(max_error_rate=0.1),\n )\n .add_llm_grader(\n LLMAsJudge.correctness(threshold=0.6),\n )\n )\n v = asyncio.get_event_loop().run_until_complete(\n alt_pipeline.evaluate(\n session_summary=session_summary,\n trace_text=trace_text,\n final_response=final_response,\n )\n )\n print(f\"\\n[GraderPipeline — {strategy_name}]\")\n print(f\" Final score: {v.final_score:.3f} \"\n f\"Passed: {v.passed}\")" + "source": [ + "# Demo alternative strategies: Binary and Majority\n", + "if traces[trace_idx] is not None:\n", + " for strategy_cls, strategy_name in [\n", + " (BinaryStrategy, \"Binary (all must pass)\"),\n", + " (MajorityStrategy, \"Majority\"),\n", + " ]:\n", + " alt_pipeline = (\n", + " GraderPipeline(strategy_cls())\n", + " .add_code_grader(\n", + " SystemEvaluator.latency(threshold_ms=30000),\n", + " )\n", + " .add_code_grader(\n", + " SystemEvaluator.error_rate(max_error_rate=0.1),\n", + " )\n", + " .add_llm_grader(\n", + " LLMAsJudge.correctness(threshold=0.6),\n", + " )\n", + " )\n", + " v = asyncio.get_event_loop().run_until_complete(\n", + " alt_pipeline.evaluate(\n", + " session_summary=session_summary,\n", + " trace_text=trace_text,\n", + " final_response=final_response,\n", + " )\n", + " )\n", + " print(f\"\\n[GraderPipeline — {strategy_name}]\")\n", + " print(f\" Final score: {v.final_score:.3f} \"\n", + " f\"Passed: {v.passed}\")" + ], + "metadata": { + "id": "9d8w3k-ip3sv" + }, + "execution_count": null }, { + "id": "7b7413cc", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", - "## Phase 7: Eval Suite & Validator\n", + "## Phase 7: Eval Suite \u0026 Validator\n", "\n", "The **EvalSuite** manages evaluation task definitions, supports capability-to-regression graduation, and exports to eval datasets. 
The **EvalValidator** performs sanity checks (ambiguity, balance, threshold consistency, duplicates, saturation)." - ] + ], + "metadata": { + "id": "oG0oPAIsp3sv" + }, + "execution_count": null }, { + "id": "7976c89c", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import (\n", " EvalSuite,\n", @@ -863,7 +1171,7 @@ "suite.add_task(EvalTaskDef(\n", " task_id=\"simple_trip_sf_ny\",\n", " session_id=session_ids[0],\n", - " description=\"Simple SF->NY weekend trip — should call flights + hotels.\",\n", + " description=\"Simple SF-\u003eNY weekend trip — should call flights + hotels.\",\n", " category=EvalCategory.CAPABILITY,\n", " expected_trajectory=[\n", " {\"tool_name\": \"search_flights\"},\n", @@ -876,7 +1184,7 @@ "suite.add_task(EvalTaskDef(\n", " task_id=\"complex_trip_tokyo\",\n", " session_id=session_ids[1],\n", - " description=\"Complex LA->Tokyo 5-day trip — all 4 tools expected.\",\n", + " description=\"Complex LA-\u003eTokyo 5-day trip — all 4 tools expected.\",\n", " category=EvalCategory.CAPABILITY,\n", " expected_trajectory=golden_tokyo,\n", " thresholds={\"trajectory_match\": 0.9, \"latency\": 0.6},\n", @@ -886,7 +1194,7 @@ "suite.add_task(EvalTaskDef(\n", " task_id=\"multiturn_paris\",\n", " session_id=session_ids[2],\n", - " description=\"Multi-turn Chicago->Paris — weather, flights, hotels across 3 turns.\",\n", + " description=\"Multi-turn Chicago-\u003eParis — weather, flights, hotels across 3 turns.\",\n", " category=EvalCategory.REGRESSION,\n", " expected_trajectory=[\n", " {\"tool_name\": \"get_weather_forecast\"},\n", @@ -900,13 +1208,15 @@ "print(f\"EvalSuite '{suite.name}' — {len(suite.get_tasks())} tasks added.\")\n", "for t in suite.get_tasks():\n", " print(f\" [{t.category.value}] {t.task_id}: {t.description}\")" - ] + ], + "metadata": { + "id": "uxSPhjfbp3sv" + }, + "execution_count": null }, { + "id": "52c9f717", "cell_type": "code", - "execution_count": 
null, - "metadata": {}, - "outputs": [], "source": [ "# Suite health check\n", "pass_history = {\n", @@ -928,13 +1238,15 @@ " print(f\" Warnings:\")\n", " for w in health.warnings:\n", " print(f\" - {w}\")" - ] + ], + "metadata": { + "id": "hdjMktF1p3sw" + }, + "execution_count": null }, { + "id": "4fcadb6e", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Validate suite\n", "warnings = EvalValidator.validate_suite(\n", @@ -946,13 +1258,15 @@ " f\"(task={w.task_id}): {w.message}\")\n", "if not warnings:\n", " print(\" No warnings — suite looks healthy!\")" - ] + ], + "metadata": { + "id": "c40X0-Fjp3sw" + }, + "execution_count": null }, { + "id": "adcd8413", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Export to eval dataset\n", "eval_ds = suite.to_eval_dataset()\n", @@ -961,11 +1275,15 @@ " print(f\" session_id={entry['session_id']} \"\n", " f\"trajectory_len=\"\n", " f\"{len(entry.get('expected_trajectory', []))}\")" - ] + ], + "metadata": { + "id": "koUNcDQcp3sw" + }, + "execution_count": null }, { + "id": "7cbf3e5b", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", @@ -976,13 +1294,15 @@ "- **pass^k** — probability that all k trials pass\n", "- **per_trial_pass_rate** — fraction of trials that passed\n", "- **mean_scores** and **score_std_dev** — statistics across trials" - ] + ], + "metadata": { + "id": "FbwSbuuyp3sw" + }, + "execution_count": null }, { + "id": "7f009e2a", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import TrialRunner\n", "\n", @@ -1013,11 +1333,15 @@ " f\"scores={tr.scores}\")\n", "except Exception as exc:\n", " print(f\"Multi-trial evaluation failed: {exc}\")" - ] + ], + "metadata": { + "id": "LBnbsALjp3sw" + }, + "execution_count": null }, { + "id": "866d7a58", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", @@ 
-1029,13 +1353,15 @@ "3. Cross-session aggregation\n", "4. Multi-prompt analysis (7 specialised prompts)\n", "5. Executive summary generation" - ] + ], + "metadata": { + "id": "Nk6OnETAp3sw" + }, + "execution_count": null }, { + "id": "c3695f72", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import InsightsConfig\n", "\n", @@ -1055,13 +1381,15 @@ " print(insights_report.summary())\n", "except Exception as exc:\n", " print(f\"Insights generation failed: {exc}\")" - ] + ], + "metadata": { + "id": "2i-psQC1p3sw" + }, + "execution_count": null }, { + "id": "b79448f7", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Executive summary\n", "try:\n", @@ -1072,48 +1400,76 @@ " print(\"No executive summary generated.\")\n", "except NameError:\n", " print(\"Insights report not available — run previous cell first.\")" - ] + ], + "metadata": { + "id": "83z7Kjurp3sw" + }, + "execution_count": null }, { + "id": "7fbe2466", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Analysis sections\n", "try:\n", " for section in insights_report.analysis_sections:\n", " print(f\"\\n## {section.title}\")\n", " print(section.content[:2000])\n", - " if len(section.content) > 2000:\n", + " if len(section.content) \u003e 2000:\n", " print(\" ... 
(truncated)\")\n", "except NameError:\n", " print(\"Insights report not available — run previous cells first.\")" - ] + ], + "metadata": { + "id": "4u0384Qhp3sw" + }, + "execution_count": null }, { + "id": "1e4d2ed5", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Per-session facets\ntry:\n print(\"[Session Facets]\")\n for facet in insights_report.session_facets:\n print(f\"\\n Session: {facet.session_id}\")\n if facet.goal_categories:\n print(f\" Goal categories : {facet.goal_categories}\")\n if facet.outcome:\n print(f\" Outcome : {facet.outcome}\")\n if facet.satisfaction:\n print(f\" Satisfaction : {facet.satisfaction}\")\n if facet.key_topics:\n print(f\" Key topics : {facet.key_topics}\")\n print(f\" Effectiveness : {facet.agent_effectiveness}\")\n print(f\" Primary success : {facet.primary_success}\")\nexcept NameError:\n print(\"Insights report not available — run previous cells first.\")" + "source": [ + "# Per-session facets\n", + "try:\n", + " print(\"[Session Facets]\")\n", + " for facet in insights_report.session_facets:\n", + " print(f\"\\n Session: {facet.session_id}\")\n", + " if facet.goal_categories:\n", + " print(f\" Goal categories : {facet.goal_categories}\")\n", + " if facet.outcome:\n", + " print(f\" Outcome : {facet.outcome}\")\n", + " if facet.satisfaction:\n", + " print(f\" Satisfaction : {facet.satisfaction}\")\n", + " if facet.key_topics:\n", + " print(f\" Key topics : {facet.key_topics}\")\n", + " print(f\" Effectiveness : {facet.agent_effectiveness}\")\n", + " print(f\" Primary success : {facet.primary_success}\")\n", + "except NameError:\n", + " print(\"Insights report not available — run previous cells first.\")" + ], + "metadata": { + "id": "5hDmltKOp3sw" + }, + "execution_count": null }, { + "id": "34a1461b", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", - "## Phase 10: Deep Analysis & Drift Detection\n", + "## Phase 10: Deep Analysis \u0026 Drift 
Detection\n", "\n", "**Deep analysis** performs question distribution analysis — grouping user queries into semantic categories. **Drift detection** compares production questions against a golden dataset to measure coverage." - ] + ], + "metadata": { + "id": "GV-OHByNp3sw" + }, + "execution_count": null }, { + "id": "dd6cbd63", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bigquery_agent_analytics import AnalysisConfig\n", "\n", @@ -1137,13 +1493,15 @@ " print(f\" - {ex}\")\n", "except Exception as exc:\n", " print(f\"Deep analysis failed: {exc}\")" - ] + ], + "metadata": { + "id": "Ogza-ndNp3sw" + }, + "execution_count": null }, { + "id": "b5453b51", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Drift detection requires a golden dataset table in BigQuery.\n", "# Below shows the API pattern — uncomment and provide your golden table.\n", @@ -1164,11 +1522,15 @@ "# print(f\" New questions: {drift_report.new_questions}\")\n", "\n", "print(\"Drift detection requires a golden dataset table — see commented code above.\")" - ] + ], + "metadata": { + "id": "C6f1eZRup3sw" + }, + "execution_count": null }, { + "id": "4a9f07fb", "cell_type": "markdown", - "metadata": {}, "source": [ "---\n", "\n", @@ -1197,13 +1559,15 @@ "- **Suite management**: `EvalSuite` + `EvalValidator` support capability-to-regression graduation and health monitoring.\n", "- **Non-determinism handling**: `TrialRunner` repeats evaluations to compute robust pass@k/pass^k metrics.\n", "- **AI-powered insights**: The insights pipeline and deep analysis provide actionable intelligence about agent behavior at scale." 
- ] + ], + "metadata": { + "id": "xHuSa9xup3sw" + }, + "execution_count": null }, { + "id": "e7c2c9af", "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# Cleanup\n", "try:\n", @@ -1216,7 +1580,11 @@ "print(\"\\nDemo complete!\")\n", "print(f\"Sessions: {session_ids}\")\n", "print(f\"Traces logged to: {PROJECT_ID}.{DATASET_ID}.{TABLE_ID}\")" - ] + ], + "metadata": { + "id": "Lcf5ZXbIp3sw" + }, + "execution_count": null } ], "metadata": { @@ -1238,6 +1606,6 @@ "version": "3.10.0" } }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "nbformat_minor": 4, + "nbformat": 4 +} diff --git a/examples/nba_agent_trace_analysis_notebook.ipynb b/examples/nba_agent_trace_analysis_notebook.ipynb index b5d805c9..201948f4 100644 --- a/examples/nba_agent_trace_analysis_notebook.ipynb +++ b/examples/nba_agent_trace_analysis_notebook.ipynb @@ -1,514 +1,533 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Copyright 2026 Google LLC\n", - "\n", - "Licensed under the Apache License, Version 2.0.\n", - "\n", - "# NBA Agent Trace Analytics Notebook (ADK + BigQuery Agent Analytics SDK)\n", - "\n", - "This notebook demonstrates an end-to-end workflow:\n", - "\n", - "```mermaid\n", - "flowchart LR\n", - " A[Configure gcloud / gh / ADK env] --> B[Run NBA ADK agent]\n", - " B --> C[BigQuery Agent Analytics Plugin logs events]\n", - " C --> D[BigQuery dataset: agent_trace]\n", - " D --> E[SDK Client reconstructs traces]\n", - " E --> F[Code + trajectory evaluation]\n", - " F --> G[Insights for NBA conversations]\n", - "```\n", - "\n", - "Target environment defaults used in this notebook:\n", - "- **GCP project**: `your-project-id`\n", - "- **BigQuery dataset**: `agent_trace`\n", - "- **Repository**: `haiyuan-eng-google/BigQuery-Agent-Analytics-SDK`\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"Vertex Open in Vertex AI Workbench\n", - " \n", - " \n", - " \n", - " \"BQ Open in BQ Studio\n", - " \n", - "
\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Copyright 2026 Google LLC\n", + "\n", + "Licensed under the Apache License, Version 2.0.\n", + "\n", + "# NBA Agent Trace Analytics Notebook (ADK + BigQuery Agent Analytics SDK)\n", + "\n", + "This notebook demonstrates an end-to-end workflow:\n", + "\n", + "```mermaid\n", + "flowchart LR\n", + " A[Configure gcloud / gh / ADK env] --> B[Run NBA ADK agent]\n", + " B --> C[BigQuery Agent Analytics Plugin logs events]\n", + " C --> D[BigQuery dataset: agent_trace]\n", + " D --> E[SDK Client reconstructs traces]\n", + " E --> F[Code + trajectory evaluation]\n", + " F --> G[Insights for NBA conversations]\n", + "```\n", + "\n", + "Target environment defaults used in this notebook:\n", + "- **GCP project**: `your-project-id`\n", + "- **BigQuery dataset**: `agent_trace`\n", + "- **Repository**: `haiyuan-eng-google/BigQuery-Agent-Analytics-SDK`\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"Vertex Open in Vertex AI Workbench\n", + " \n", + " \n", + " \n", + " \"BQ Open in BQ Studio\n", + " \n", + "
\n" + ], + "id": "TirSFqa3qBVV" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1) Install dependencies" + ], + "id": "JgPD3sAHqBVV" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q google-adk bigquery-agent-analytics google-cloud-bigquery nest-asyncio pandas || pip install -q google-adk google-cloud-bigquery nest-asyncio pandas \"git+https://github.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK.git@main\"\n" + ], + "id": "lAkwAvnAqBVV" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2) Environment setup (gcloud / gh / ADK)" + ], + "id": "yO9HsIFcqBVV" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import os\n", + "import subprocess\n", + "\n", + "PROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"your-project-id\")\n", + "DATASET_ID = \"agent_trace\"\n", + "TABLE_ID = \"agent_events\"\n", + "BQ_LOCATION = \"US\"\n", + "MODEL_NAME = \"gemini-2.5-flash\"\n", + "\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", + "os.environ[\"BQ_DATASET\"] = DATASET_ID\n", + "os.environ[\"BQ_TABLE\"] = TABLE_ID\n", + "os.environ.setdefault(\"GOOGLE_CLOUD_LOCATION\", \"global\")\n", + "os.environ.setdefault(\"GOOGLE_GENAI_USE_VERTEXAI\", \"true\")\n", + "\n", + "print(\"Environment configured:\")\n", + "for k in [\"GOOGLE_CLOUD_PROJECT\", \"BQ_DATASET\", \"BQ_TABLE\", \"GOOGLE_CLOUD_LOCATION\", \"GOOGLE_GENAI_USE_VERTEXAI\"]:\n", + " print(f\" {k}={os.environ[k]}\")\n", + "\n", + "\n", + "def _run(cmd: str):\n", + " print(f\"\\n$ {cmd}\")\n", + " p = subprocess.run(cmd, shell=True, text=True, capture_output=True)\n", + " if p.stdout:\n", + " print(p.stdout.strip())\n", + " if p.returncode != 0 and p.stderr:\n", + " print(p.stderr.strip())\n", + "\n", + "# Quick health checks for local tools\n", + "_run(\"gcloud --version\")\n", + "_run(\"gh --version\")\n", + 
"\n", + "print(\"\"\"\n", + "If auth is needed, run manually in a terminal:\n", + " gcloud auth login\n", + " gcloud auth application-default login\n", + " gcloud config set project $GOOGLE_CLOUD_PROJECT\n", + " bq --location=US mk --dataset --if_not_exists ${GOOGLE_CLOUD_PROJECT}:agent_trace\n", + " gh auth login\n", + "\"\"\")\n" + ], + "id": "_1aybBruqBVV" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3) Define a deterministic NBA toolset for the test agent" + ], + "id": "8w5CspuXqBVV" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import hashlib\n", + "import random\n", + "from typing import Any\n", + "\n", + "\n", + "def _rng_from(*parts: str) -> random.Random:\n", + " seed = int(hashlib.md5(\"|\".join(parts).encode()).hexdigest()[:8], 16)\n", + " return random.Random(seed)\n", + "\n", + "\n", + "async def get_nba_team_snapshot(team_name: str, season: str = \"2024-25\") -> dict[str, Any]:\n", + " \"\"\"Return deterministic synthetic team stats for demo trace generation.\"\"\"\n", + " rng = _rng_from(team_name, season)\n", + " wins = rng.randint(28, 62)\n", + " losses = 82 - wins\n", + " pace = round(rng.uniform(95.0, 103.0), 1)\n", + " off = round(rng.uniform(108.0, 124.0), 1)\n", + " deff = round(rng.uniform(106.0, 121.0), 1)\n", + " return {\n", + " \"team\": team_name,\n", + " \"season\": season,\n", + " \"wins\": wins,\n", + " \"losses\": losses,\n", + " \"net_rating\": round(off - deff, 1),\n", + " \"off_rating\": off,\n", + " \"def_rating\": deff,\n", + " \"pace\": pace,\n", + " }\n", + "\n", + "\n", + "async def get_nba_player_snapshot(player_name: str, season: str = \"2024-25\") -> dict[str, Any]:\n", + " \"\"\"Return deterministic synthetic player box-score style metrics.\"\"\"\n", + " rng = _rng_from(player_name, season)\n", + " return {\n", + " \"player\": player_name,\n", + " \"season\": season,\n", + " \"games\": rng.randint(45, 82),\n", + " 
\"ppg\": round(rng.uniform(8.0, 34.0), 1),\n", + " \"rpg\": round(rng.uniform(1.5, 13.5), 1),\n", + " \"apg\": round(rng.uniform(1.0, 11.5), 1),\n", + " \"ts_pct\": round(rng.uniform(0.50, 0.69), 3),\n", + " }\n", + "\n", + "\n", + "async def compare_matchup(home_team: str, away_team: str, season: str = \"2024-25\") -> dict[str, Any]:\n", + " \"\"\"Create a deterministic matchup projection and rationale.\"\"\"\n", + " home = await get_nba_team_snapshot(home_team, season)\n", + " away = await get_nba_team_snapshot(away_team, season)\n", + " margin = round((home[\"net_rating\"] - away[\"net_rating\"]) + 2.1, 1)\n", + " favorite = home_team if margin >= 0 else away_team\n", + " return {\n", + " \"season\": season,\n", + " \"home_team\": home_team,\n", + " \"away_team\": away_team,\n", + " \"favorite\": favorite,\n", + " \"projected_margin\": abs(margin),\n", + " \"reason\": (\n", + " f\"{favorite} favored due to relative net rating edge and home-court adjustment.\"\n", + " ),\n", + " \"home_snapshot\": home,\n", + " \"away_snapshot\": away,\n", + " }\n" + ], + "id": "n4MUbvMHqBVW" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4) Build ADK NBA agent with BigQuery analytics plugin" + ], + "id": "BRUtKsQrqBVW" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import asyncio\n", + "import nest_asyncio\n", + "import time\n", + "import uuid\n", + "\n", + "from google.adk.agents import LlmAgent\n", + "from google.adk.plugins.bigquery_agent_analytics_plugin import BigQueryAgentAnalyticsPlugin\n", + "from google.adk.plugins.bigquery_agent_analytics_plugin import BigQueryLoggerConfig\n", + "from google.adk.runners import Runner\n", + "from google.adk.sessions import InMemorySessionService\n", + "from google.genai import types\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "plugin = BigQueryAgentAnalyticsPlugin(\n", + " project_id=PROJECT_ID,\n", + " dataset_id=DATASET_ID,\n", + " 
table_id=TABLE_ID,\n", + " location=BQ_LOCATION,\n", + " config=BigQueryLoggerConfig(),\n", + ")\n", + "\n", + "NBA_AGENT_INSTRUCTION = \"\"\"\n", + "You are an NBA analytics assistant.\n", + "- Always reason from available tool outputs.\n", + "- Be explicit when data is synthetic for demo purposes.\n", + "- Provide concise basketball insights in bullet points.\n", + "\"\"\"\n", + "\n", + "agent = LlmAgent(\n", + " name=\"nba_analytics_agent\",\n", + " model=MODEL_NAME,\n", + " instruction=NBA_AGENT_INSTRUCTION,\n", + " tools=[get_nba_team_snapshot, get_nba_player_snapshot, compare_matchup],\n", + ")\n", + "\n", + "session_service = InMemorySessionService()\n", + "runner = Runner(\n", + " app_name=\"nba_agent_trace_demo\",\n", + " agent=agent,\n", + " session_service=session_service,\n", + " plugins=[plugin],\n", + ")\n", + "\n", + "async def run_conversation(session_id: str, user_prompt: str) -> str:\n", + " await session_service.create_session(\n", + " app_name=\"nba_agent_trace_demo\",\n", + " user_id=\"demo_user\",\n", + " session_id=session_id,\n", + " )\n", + " final_text = \"\"\n", + " async for event in runner.run_async(\n", + " user_id=\"demo_user\",\n", + " session_id=session_id,\n", + " new_message=types.Content(role=\"user\", parts=[types.Part(text=user_prompt)]),\n", + " ):\n", + " if event.is_final_response() and event.content and event.content.parts:\n", + " final_text = event.content.parts[0].text or \"\"\n", + " return final_text\n", + "\n", + "prompts = [\n", + " \"Compare Lakers vs Celtics in a hypothetical Finals matchup and explain key edges.\",\n", + " \"Give me a scouting snapshot for Nikola Jokic, then compare Nuggets vs Bucks.\",\n", + " \"Who has the better 2024-25 profile: Knicks or 76ers? 
Include pace and ratings.\",\n", + " \"Break down Warriors vs Suns: expected favorite, projected margin, and why.\",\n", + " \"Create quick team snapshots for Heat and Cavaliers, then recommend who has a better playoff profile.\",\n", + "]\n", + "\n", + "session_ids = []\n", + "for prompt in prompts:\n", + " sid = f\"nba-{uuid.uuid4().hex[:10]}\"\n", + " session_ids.append(sid)\n", + " print(f\"\\n=== Running session {sid} ===\")\n", + " print(\"User:\", prompt)\n", + " response = asyncio.run(run_conversation(sid, prompt))\n", + " print(\"Agent:\", response[:600], \"...\" if len(response) > 600 else \"\")\n", + "\n", + "# Give plugin time to flush in notebook environments.\n", + "time.sleep(3)\n", + "print(\"\\nTrace sessions:\", session_ids)\n", + "\n", + "# Wait/poll until plugin writes become visible in BigQuery.\n", + "from google.cloud import bigquery\n", + "\n", + "bq_client = bigquery.Client(project=PROJECT_ID, location=BQ_LOCATION)\n", + "max_wait_s = 120\n", + "poll_s = 10\n", + "elapsed = 0\n", + "rows_visible = 0\n", + "\n", + "while elapsed <= max_wait_s:\n", + " query = f\"\"\"\n", + " SELECT COUNT(*) AS c\n", + " FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`\n", + " WHERE session_id IN UNNEST(@session_ids)\n", + " \"\"\"\n", + " job = bq_client.query(\n", + " query,\n", + " job_config=bigquery.QueryJobConfig(\n", + " query_parameters=[\n", + " bigquery.ArrayQueryParameter(\"session_ids\", \"STRING\", session_ids)\n", + " ]\n", + " ),\n", + " )\n", + " rows_visible = int(list(job.result())[0][\"c\"])\n", + " print(f\"Elapsed {elapsed:>3}s -> visible_rows={rows_visible}\")\n", + " if rows_visible > 0:\n", + " break\n", + " time.sleep(poll_s)\n", + " elapsed += poll_s\n", + "\n", + "print(\"Final visible rows for current notebook sessions:\", rows_visible)\n" + ], + "id": "tCtKdxSiqBVW" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5) Analyze traces with BigQuery Agent Analytics SDK APIs\n" + ], + "id": "TjnmAKbKqBVW" + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from bigquery_agent_analytics import BigQueryTraceEvaluator\n", + "from bigquery_agent_analytics import Client\n", + "from bigquery_agent_analytics import SystemEvaluator\n", + "from bigquery_agent_analytics.trace_evaluator import MatchType\n", + "\n", + "client = Client(\n", + " project_id=PROJECT_ID,\n", + " dataset_id=DATASET_ID,\n", + " table_id=TABLE_ID,\n", + " location=BQ_LOCATION,\n", + ")\n", + "\n", + "target_session = session_ids[0]\n", + "traces = client.list_traces()\n", + "trace = next(t for t in traces if t.session_id == target_session)\n", + "print(\"Loaded trace for session:\", target_session, \"trace_id:\", trace.trace_id)\n", + "\n", + "# In notebooks, Trace.render() returns a graphviz object if available.\n", + "rendered = trace.render()\n", + "rendered\n" + ], + "id": "Shgs3TxyqBVW" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Quick trace table instead of raw SQL\n", + "trace_rows = []\n", + "for t in client.list_traces():\n", + " trace_rows.append({\n", + " \"session_id\": t.session_id,\n", + " \"trace_id\": t.trace_id,\n", + " \"user_id\": t.user_id,\n", + " \"span_count\": len(t.spans),\n", + " \"total_latency_ms\": t.total_latency_ms,\n", + " })\n", + "\n", + "import pandas as pd\n", + "pd.DataFrame(trace_rows).head(10)\n" + ], + "id": "OvNS0hdmqBVW" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Deterministic code-based evaluation examples (SDK client-side)\n", + "latency_eval = SystemEvaluator.latency(threshold_ms=12000)\n", + "turn_eval = SystemEvaluator.turn_count(max_turns=16)\n", + "\n", + "report_latency = client.evaluate(latency_eval)\n", + "report_turns = client.evaluate(turn_eval)\n", + "print(\"latency pass_rate:\", report_latency.pass_rate)\n", + "print(\"turn_count pass_rate:\", 
report_turns.pass_rate)\n" + ], + "id": "pqXW4nsLqBVW" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.1) LLM-as-Judge example (factuality / tactical depth proxy)\n", + "This uses the SDK `LLMAsJudge` path over the same generated NBA sessions.\n" + ], + "id": "e2z1FOJKqBVW" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bigquery_agent_analytics import LLMAsJudge, TraceFilter\n", + "\n", + "judge = LLMAsJudge.correctness(threshold=0.6, model=MODEL_NAME)\n", + "judge_report = client.evaluate(\n", + " judge,\n", + " filters=TraceFilter(session_ids=session_ids),\n", + ")\n", + "\n", + "print(\"judge evaluator:\", judge_report.evaluator_name)\n", + "print(\"sessions evaluated:\", judge_report.total_sessions)\n", + "print(\"pass_rate:\", judge_report.pass_rate)\n", + "judge_report.session_scores[:3]\n" + ], + "id": "8Gr267jNqBVW" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Trajectory matching against expected tool behavior\n", + "trajectory_evaluator = BigQueryTraceEvaluator(\n", + " project_id=PROJECT_ID,\n", + " dataset_id=DATASET_ID,\n", + " table_id=TABLE_ID,\n", + ")\n", + "\n", + "expected = [\n", + " {\"tool_name\": \"get_nba_team_snapshot\"},\n", + " {\"tool_name\": \"compare_matchup\"},\n", + "]\n", + "\n", + "traj_result = asyncio.run(\n", + " trajectory_evaluator.evaluate_session(\n", + " session_id=target_session,\n", + " golden_trajectory=expected,\n", + " match_type=MatchType.ANY_ORDER,\n", + " )\n", + ")\n", + "\n", + "print(\"Eval status:\", traj_result.eval_status)\n", + "print(\"Overall score:\", traj_result.overall_score)\n", + "print(\"Scores:\", traj_result.scores)\n", + "print(\"Details:\", traj_result.details)\n" + ], + "id": "5TgySzVFqBVW" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6) Optional: Generate insights report for NBA sessions" + ], + 
"id": "7rg68eizqBVW" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from bigquery_agent_analytics import InsightsConfig\n", + "from bigquery_agent_analytics import TraceFilter\n", + "\n", + "config = InsightsConfig(max_sessions=len(session_ids))\n", + "\n", + "insights = client.insights(\n", + " config=config,\n", + " filters=TraceFilter(session_ids=session_ids),\n", + ")\n", + "insights\n" + ], + "id": "7AGevjfEqBVW" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## 7) Next steps\n", + "\n", + "- Swap synthetic tool outputs with a real NBA data source (e.g., trusted sports API).\n", + "- Track model versions in metadata to compare trace quality over time.\n" + ], + "id": "fOdUbljfqBVX" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1) Install dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -q google-adk bigquery-agent-analytics google-cloud-bigquery nest-asyncio pandas || pip install -q google-adk google-cloud-bigquery nest-asyncio pandas \"git+https://github.com/haiyuan-eng-google/BigQuery-Agent-Analytics-SDK.git@main\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2) Environment setup (gcloud / gh / ADK)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "import os\n", - "import subprocess\n", - "\n", - "PROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"your-project-id\")\n", - "DATASET_ID = \"agent_trace\"\n", - "TABLE_ID = \"agent_events\"\n", - "BQ_LOCATION = \"US\"\n", - "MODEL_NAME = \"gemini-2.5-flash\"\n", - "\n", - 
"os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", - "os.environ[\"BQ_DATASET\"] = DATASET_ID\n", - "os.environ[\"BQ_TABLE\"] = TABLE_ID\n", - "os.environ.setdefault(\"GOOGLE_CLOUD_LOCATION\", \"global\")\n", - "os.environ.setdefault(\"GOOGLE_GENAI_USE_VERTEXAI\", \"true\")\n", - "\n", - "print(\"Environment configured:\")\n", - "for k in [\"GOOGLE_CLOUD_PROJECT\", \"BQ_DATASET\", \"BQ_TABLE\", \"GOOGLE_CLOUD_LOCATION\", \"GOOGLE_GENAI_USE_VERTEXAI\"]:\n", - " print(f\" {k}={os.environ[k]}\")\n", - "\n", - "\n", - "def _run(cmd: str):\n", - " print(f\"\\n$ {cmd}\")\n", - " p = subprocess.run(cmd, shell=True, text=True, capture_output=True)\n", - " if p.stdout:\n", - " print(p.stdout.strip())\n", - " if p.returncode != 0 and p.stderr:\n", - " print(p.stderr.strip())\n", - "\n", - "# Quick health checks for local tools\n", - "_run(\"gcloud --version\")\n", - "_run(\"gh --version\")\n", - "\n", - "print(\"\"\"\n", - "If auth is needed, run manually in a terminal:\n", - " gcloud auth login\n", - " gcloud auth application-default login\n", - " gcloud config set project $GOOGLE_CLOUD_PROJECT\n", - " bq --location=US mk --dataset --if_not_exists ${GOOGLE_CLOUD_PROJECT}:agent_trace\n", - " gh auth login\n", - "\"\"\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3) Define a deterministic NBA toolset for the test agent" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "import hashlib\n", - "import random\n", - "from typing import Any\n", - "\n", - "\n", - "def _rng_from(*parts: str) -> random.Random:\n", - " seed = int(hashlib.md5(\"|\".join(parts).encode()).hexdigest()[:8], 16)\n", - " return random.Random(seed)\n", - "\n", - "\n", - "async def get_nba_team_snapshot(team_name: str, season: str = \"2024-25\") -> dict[str, Any]:\n", - " \"\"\"Return deterministic synthetic team stats for demo trace generation.\"\"\"\n", - " rng = _rng_from(team_name, season)\n", - 
" wins = rng.randint(28, 62)\n", - " losses = 82 - wins\n", - " pace = round(rng.uniform(95.0, 103.0), 1)\n", - " off = round(rng.uniform(108.0, 124.0), 1)\n", - " deff = round(rng.uniform(106.0, 121.0), 1)\n", - " return {\n", - " \"team\": team_name,\n", - " \"season\": season,\n", - " \"wins\": wins,\n", - " \"losses\": losses,\n", - " \"net_rating\": round(off - deff, 1),\n", - " \"off_rating\": off,\n", - " \"def_rating\": deff,\n", - " \"pace\": pace,\n", - " }\n", - "\n", - "\n", - "async def get_nba_player_snapshot(player_name: str, season: str = \"2024-25\") -> dict[str, Any]:\n", - " \"\"\"Return deterministic synthetic player box-score style metrics.\"\"\"\n", - " rng = _rng_from(player_name, season)\n", - " return {\n", - " \"player\": player_name,\n", - " \"season\": season,\n", - " \"games\": rng.randint(45, 82),\n", - " \"ppg\": round(rng.uniform(8.0, 34.0), 1),\n", - " \"rpg\": round(rng.uniform(1.5, 13.5), 1),\n", - " \"apg\": round(rng.uniform(1.0, 11.5), 1),\n", - " \"ts_pct\": round(rng.uniform(0.50, 0.69), 3),\n", - " }\n", - "\n", - "\n", - "async def compare_matchup(home_team: str, away_team: str, season: str = \"2024-25\") -> dict[str, Any]:\n", - " \"\"\"Create a deterministic matchup projection and rationale.\"\"\"\n", - " home = await get_nba_team_snapshot(home_team, season)\n", - " away = await get_nba_team_snapshot(away_team, season)\n", - " margin = round((home[\"net_rating\"] - away[\"net_rating\"]) + 2.1, 1)\n", - " favorite = home_team if margin >= 0 else away_team\n", - " return {\n", - " \"season\": season,\n", - " \"home_team\": home_team,\n", - " \"away_team\": away_team,\n", - " \"favorite\": favorite,\n", - " \"projected_margin\": abs(margin),\n", - " \"reason\": (\n", - " f\"{favorite} favored due to relative net rating edge and home-court adjustment.\"\n", - " ),\n", - " \"home_snapshot\": home,\n", - " \"away_snapshot\": away,\n", - " }\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4) 
Build ADK NBA agent with BigQuery analytics plugin" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "import asyncio\n", - "import nest_asyncio\n", - "import time\n", - "import uuid\n", - "\n", - "from google.adk.agents import LlmAgent\n", - "from google.adk.plugins.bigquery_agent_analytics_plugin import BigQueryAgentAnalyticsPlugin\n", - "from google.adk.plugins.bigquery_agent_analytics_plugin import BigQueryLoggerConfig\n", - "from google.adk.runners import Runner\n", - "from google.adk.sessions import InMemorySessionService\n", - "from google.genai import types\n", - "\n", - "nest_asyncio.apply()\n", - "\n", - "plugin = BigQueryAgentAnalyticsPlugin(\n", - " project_id=PROJECT_ID,\n", - " dataset_id=DATASET_ID,\n", - " table_id=TABLE_ID,\n", - " location=BQ_LOCATION,\n", - " config=BigQueryLoggerConfig(),\n", - ")\n", - "\n", - "NBA_AGENT_INSTRUCTION = \"\"\"\n", - "You are an NBA analytics assistant.\n", - "- Always reason from available tool outputs.\n", - "- Be explicit when data is synthetic for demo purposes.\n", - "- Provide concise basketball insights in bullet points.\n", - "\"\"\"\n", - "\n", - "agent = LlmAgent(\n", - " name=\"nba_analytics_agent\",\n", - " model=MODEL_NAME,\n", - " instruction=NBA_AGENT_INSTRUCTION,\n", - " tools=[get_nba_team_snapshot, get_nba_player_snapshot, compare_matchup],\n", - ")\n", - "\n", - "session_service = InMemorySessionService()\n", - "runner = Runner(\n", - " app_name=\"nba_agent_trace_demo\",\n", - " agent=agent,\n", - " session_service=session_service,\n", - " plugins=[plugin],\n", - ")\n", - "\n", - "async def run_conversation(session_id: str, user_prompt: str) -> str:\n", - " await session_service.create_session(\n", - " app_name=\"nba_agent_trace_demo\",\n", - " user_id=\"demo_user\",\n", - " session_id=session_id,\n", - " )\n", - " final_text = \"\"\n", - " async for event in runner.run_async(\n", - " user_id=\"demo_user\",\n", - " 
session_id=session_id,\n", - " new_message=types.Content(role=\"user\", parts=[types.Part(text=user_prompt)]),\n", - " ):\n", - " if event.is_final_response() and event.content and event.content.parts:\n", - " final_text = event.content.parts[0].text or \"\"\n", - " return final_text\n", - "\n", - "prompts = [\n", - " \"Compare Lakers vs Celtics in a hypothetical Finals matchup and explain key edges.\",\n", - " \"Give me a scouting snapshot for Nikola Jokic, then compare Nuggets vs Bucks.\",\n", - " \"Who has the better 2024-25 profile: Knicks or 76ers? Include pace and ratings.\",\n", - " \"Break down Warriors vs Suns: expected favorite, projected margin, and why.\",\n", - " \"Create quick team snapshots for Heat and Cavaliers, then recommend who has a better playoff profile.\",\n", - "]\n", - "\n", - "session_ids = []\n", - "for prompt in prompts:\n", - " sid = f\"nba-{uuid.uuid4().hex[:10]}\"\n", - " session_ids.append(sid)\n", - " print(f\"\\n=== Running session {sid} ===\")\n", - " print(\"User:\", prompt)\n", - " response = asyncio.run(run_conversation(sid, prompt))\n", - " print(\"Agent:\", response[:600], \"...\" if len(response) > 600 else \"\")\n", - "\n", - "# Give plugin time to flush in notebook environments.\n", - "time.sleep(3)\n", - "print(\"\\nTrace sessions:\", session_ids)\n", - "\n", - "# Wait/poll until plugin writes become visible in BigQuery.\n", - "from google.cloud import bigquery\n", - "\n", - "bq_client = bigquery.Client(project=PROJECT_ID, location=BQ_LOCATION)\n", - "max_wait_s = 120\n", - "poll_s = 10\n", - "elapsed = 0\n", - "rows_visible = 0\n", - "\n", - "while elapsed <= max_wait_s:\n", - " query = f\"\"\"\n", - " SELECT COUNT(*) AS c\n", - " FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`\n", - " WHERE session_id IN UNNEST(@session_ids)\n", - " \"\"\"\n", - " job = bq_client.query(\n", - " query,\n", - " job_config=bigquery.QueryJobConfig(\n", - " query_parameters=[\n", - " bigquery.ArrayQueryParameter(\"session_ids\", \"STRING\", 
session_ids)\n", - " ]\n", - " ),\n", - " )\n", - " rows_visible = int(list(job.result())[0][\"c\"])\n", - " print(f\"Elapsed {elapsed:>3}s -> visible_rows={rows_visible}\")\n", - " if rows_visible > 0:\n", - " break\n", - " time.sleep(poll_s)\n", - " elapsed += poll_s\n", - "\n", - "print(\"Final visible rows for current notebook sessions:\", rows_visible)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5) Analyze traces with BigQuery Agent Analytics SDK APIs\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "from bigquery_agent_analytics import BigQueryTraceEvaluator\n", - "from bigquery_agent_analytics import Client\n", - "from bigquery_agent_analytics import CodeEvaluator\n", - "from bigquery_agent_analytics.trace_evaluator import MatchType\n", - "\n", - "client = Client(\n", - " project_id=PROJECT_ID,\n", - " dataset_id=DATASET_ID,\n", - " table_id=TABLE_ID,\n", - " location=BQ_LOCATION,\n", - ")\n", - "\n", - "target_session = session_ids[0]\n", - "traces = client.list_traces()\n", - "trace = next(t for t in traces if t.session_id == target_session)\n", - "print(\"Loaded trace for session:\", target_session, \"trace_id:\", trace.trace_id)\n", - "\n", - "# In notebooks, Trace.render() returns a graphviz object if available.\n", - "rendered = trace.render()\n", - "rendered\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Quick trace table instead of raw SQL\n", - "trace_rows = []\n", - "for t in client.list_traces():\n", - " trace_rows.append({\n", - " \"session_id\": t.session_id,\n", - " \"trace_id\": t.trace_id,\n", - " \"user_id\": t.user_id,\n", - " \"span_count\": len(t.spans),\n", - " \"total_latency_ms\": t.total_latency_ms,\n", - " })\n", - "\n", - "import pandas as pd\n", - "pd.DataFrame(trace_rows).head(10)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "# Deterministic code-based evaluation examples (SDK client-side)\n", - "latency_eval = CodeEvaluator.latency(threshold_ms=12000)\n", - "turn_eval = CodeEvaluator.turn_count(max_turns=16)\n", - "\n", - "report_latency = client.evaluate(latency_eval)\n", - "report_turns = client.evaluate(turn_eval)\n", - "print(\"latency pass_rate:\", report_latency.pass_rate)\n", - "print(\"turn_count pass_rate:\", report_turns.pass_rate)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 5.1) LLM-as-Judge example (factuality / tactical depth proxy)\n", - "This uses the SDK `LLMAsJudge` path over the same generated NBA sessions.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from bigquery_agent_analytics import LLMAsJudge, TraceFilter\n", - "\n", - "judge = LLMAsJudge.correctness(threshold=0.6, model=MODEL_NAME)\n", - "judge_report = client.evaluate(\n", - " judge,\n", - " filters=TraceFilter(session_ids=session_ids),\n", - ")\n", - "\n", - "print(\"judge evaluator:\", judge_report.evaluator_name)\n", - "print(\"sessions evaluated:\", judge_report.total_sessions)\n", - "print(\"pass_rate:\", judge_report.pass_rate)\n", - "judge_report.session_scores[:3]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Trajectory matching against expected tool behavior\n", - "trajectory_evaluator = BigQueryTraceEvaluator(\n", - " project_id=PROJECT_ID,\n", - " dataset_id=DATASET_ID,\n", - " table_id=TABLE_ID,\n", - ")\n", - "\n", - "expected = [\n", - " {\"tool_name\": \"get_nba_team_snapshot\"},\n", - " {\"tool_name\": \"compare_matchup\"},\n", - "]\n", - "\n", - "traj_result = asyncio.run(\n", - " trajectory_evaluator.evaluate_session(\n", - " session_id=target_session,\n", - " golden_trajectory=expected,\n", - " match_type=MatchType.ANY_ORDER,\n", - " )\n", - ")\n", - 
"\n", - "print(\"Eval status:\", traj_result.eval_status)\n", - "print(\"Overall score:\", traj_result.overall_score)\n", - "print(\"Scores:\", traj_result.scores)\n", - "print(\"Details:\", traj_result.details)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6) Optional: Generate insights report for NBA sessions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "from bigquery_agent_analytics import InsightsConfig\n", - "from bigquery_agent_analytics import TraceFilter\n", - "\n", - "config = InsightsConfig(max_sessions=len(session_ids))\n", - "\n", - "insights = client.insights(\n", - " config=config,\n", - " filters=TraceFilter(session_ids=session_ids),\n", - ")\n", - "insights\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## 7) Next steps\n", - "\n", - "- Swap synthetic tool outputs with a real NBA data source (e.g., trusted sports API).\n", - "- Track model versions in metadata to compare trace quality over time.\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/src/bigquery_agent_analytics/__init__.py b/src/bigquery_agent_analytics/__init__.py index e16ceab7..6f0294b4 100644 --- a/src/bigquery_agent_analytics/__init__.py +++ b/src/bigquery_agent_analytics/__init__.py @@ -69,6 +69,7 @@ from .evaluators import EvaluationReport from .evaluators import LLMAsJudge from .evaluators import SessionScore + from .evaluators import SystemEvaluator from .feedback import AnalysisConfig from .feedback import DriftReport from .feedback import QuestionDistribution @@ -96,6 +97,7 @@ "TraceFilter", "ViewManager", "CodeEvaluator", + "SystemEvaluator", "LLMAsJudge", "EvaluationReport", 
"SessionScore", diff --git a/src/bigquery_agent_analytics/_streaming_evaluation.py b/src/bigquery_agent_analytics/_streaming_evaluation.py index 6b7e1c32..f338e3ca 100644 --- a/src/bigquery_agent_analytics/_streaming_evaluation.py +++ b/src/bigquery_agent_analytics/_streaming_evaluation.py @@ -25,9 +25,9 @@ import json from typing import Any -from bigquery_agent_analytics import CodeEvaluator from bigquery_agent_analytics import EvaluationReport from bigquery_agent_analytics import serialize +from bigquery_agent_analytics import SystemEvaluator from bigquery_agent_analytics import udf_kernels STREAMING_EVALUATOR_PROFILE = "streaming_observability_v1" @@ -69,12 +69,12 @@ def is_final(self) -> bool: return self.trigger_kind == TRIGGER_KIND_SESSION_TERMINAL -def build_streaming_observability_evaluator() -> CodeEvaluator: +def build_streaming_observability_evaluator() -> SystemEvaluator: """Build the fixed launch evaluator profile for streaming observability. Uses raw-budget gates (a session passes iff the observed metric is within the configured budget) for consistency with - ``CodeEvaluator.latency`` / ``.error_rate`` / ``.turn_count``. + ``SystemEvaluator.latency`` / ``.error_rate`` / ``.turn_count``. Prior implementation used normalized scores with a 0.5 pass cutoff, which caused gates to fire at roughly half the configured budget. 
""" @@ -102,7 +102,7 @@ def _score_turn_count(session_summary: dict[str, Any]) -> float: observed = session_summary.get("turn_count", 0) or 0 return 1.0 if observed <= _MAX_TURNS else 0.0 - evaluator = CodeEvaluator(name=STREAMING_EVALUATOR_PROFILE) + evaluator = SystemEvaluator(name=STREAMING_EVALUATOR_PROFILE) evaluator.add_metric( "latency", _score_latency, diff --git a/src/bigquery_agent_analytics/categorical_evaluator.py b/src/bigquery_agent_analytics/categorical_evaluator.py index 7d80603b..9ace3a18 100644 --- a/src/bigquery_agent_analytics/categorical_evaluator.py +++ b/src/bigquery_agent_analytics/categorical_evaluator.py @@ -16,7 +16,7 @@ Classifies agent sessions into user-defined categories using BigQuery's native ``AI.GENERATE``, with Gemini API fallback when BigQuery-native -execution is unavailable. Unlike the numeric ``CodeEvaluator`` and +execution is unavailable. Unlike the numeric ``SystemEvaluator`` and ``LLMAsJudge`` report paths, this module returns label-valued results with strict category validation. 
diff --git a/src/bigquery_agent_analytics/cli.py b/src/bigquery_agent_analytics/cli.py index 623b8bec..e8a599d6 100644 --- a/src/bigquery_agent_analytics/cli.py +++ b/src/bigquery_agent_analytics/cli.py @@ -39,9 +39,9 @@ import typer -from .evaluators import CodeEvaluator from .evaluators import EvaluationReport from .evaluators import LLMAsJudge +from .evaluators import SystemEvaluator from .formatter import format_output from .trace import TraceFilter @@ -215,28 +215,28 @@ def _load_spec_from_args( _CODE_EVALUATORS = { "latency": ( - lambda t: CodeEvaluator.latency(threshold_ms=t), - lambda: CodeEvaluator.latency(), + lambda t: SystemEvaluator.latency(threshold_ms=t), + lambda: SystemEvaluator.latency(), ), "error_rate": ( - lambda t: CodeEvaluator.error_rate(max_error_rate=t), - lambda: CodeEvaluator.error_rate(), + lambda t: SystemEvaluator.error_rate(max_error_rate=t), + lambda: SystemEvaluator.error_rate(), ), "turn_count": ( - lambda t: CodeEvaluator.turn_count(max_turns=int(t)), - lambda: CodeEvaluator.turn_count(), + lambda t: SystemEvaluator.turn_count(max_turns=int(t)), + lambda: SystemEvaluator.turn_count(), ), "token_efficiency": ( - lambda t: CodeEvaluator.token_efficiency(max_tokens=int(t)), - lambda: CodeEvaluator.token_efficiency(), + lambda t: SystemEvaluator.token_efficiency(max_tokens=int(t)), + lambda: SystemEvaluator.token_efficiency(), ), "ttft": ( - lambda t: CodeEvaluator.ttft(threshold_ms=t), - lambda: CodeEvaluator.ttft(), + lambda t: SystemEvaluator.ttft(threshold_ms=t), + lambda: SystemEvaluator.ttft(), ), "cost": ( - lambda t: CodeEvaluator.cost_per_session(max_cost_usd=t), - lambda: CodeEvaluator.cost_per_session(), + lambda t: SystemEvaluator.cost_per_session(max_cost_usd=t), + lambda: SystemEvaluator.cost_per_session(), ), } @@ -430,7 +430,7 @@ def evaluate( } if threshold is not None: kwargs["min_hit_rate"] = threshold - ev = CodeEvaluator.context_cache_hit_rate(**kwargs) + ev = SystemEvaluator.context_cache_hit_rate(**kwargs) 
else: entry = _CODE_EVALUATORS.get(evaluator) if not entry: @@ -494,7 +494,7 @@ def _emit_evaluate_failures( """Emit readable FAIL lines for failing sessions before --exit-code exits. One line per (session_id, metric_name) that failed its threshold. - Prefers the raw observed + budget pair (``CodeEvaluator`` prebuilts); + Prefers the raw observed + budget pair (``SystemEvaluator`` prebuilts); falls back to score + threshold when the metric didn't declare observed/budget (custom ``add_metric`` users, ``LLMAsJudge`` criteria). For LLM-judge failures the line also carries a bounded diff --git a/src/bigquery_agent_analytics/client.py b/src/bigquery_agent_analytics/client.py index 75614e80..24e3c584 100644 --- a/src/bigquery_agent_analytics/client.py +++ b/src/bigquery_agent_analytics/client.py @@ -33,11 +33,11 @@ # Run evaluation from bigquery_agent_analytics import ( - CodeEvaluator, LLMAsJudge, TraceFilter, + SystemEvaluator, LLMAsJudge, TraceFilter, ) report = client.evaluate( filters=TraceFilter(agent_id="my_agent"), - evaluator=CodeEvaluator.latency(threshold_ms=3000), + evaluator=SystemEvaluator.latency(threshold_ms=3000), ) print(report.summary()) """ @@ -73,7 +73,6 @@ from .categorical_evaluator import parse_classify_row from .evaluators import _parse_json_from_text from .evaluators import AI_GENERATE_JUDGE_BATCH_QUERY -from .evaluators import CodeEvaluator from .evaluators import DEFAULT_ENDPOINT from .evaluators import EvaluationReport from .evaluators import LLM_JUDGE_BATCH_QUERY @@ -82,6 +81,7 @@ from .evaluators import SESSION_SUMMARY_QUERY from .evaluators import SessionScore from .evaluators import split_judge_prompt_template +from .evaluators import SystemEvaluator from .feedback import AnalysisConfig from .feedback import compute_drift from .feedback import compute_question_distribution @@ -869,7 +869,7 @@ def list_traces( def evaluate( self, - evaluator: CodeEvaluator | LLMAsJudge, + evaluator: SystemEvaluator | LLMAsJudge, filters: 
Optional[TraceFilter] = None, dataset: Optional[str] = None, strict: bool = False, @@ -877,12 +877,12 @@ def evaluate( """Runs batch evaluation over traces. Uses BigQuery native execution for scalable assessment. - ``CodeEvaluator`` metrics are computed from session + ``SystemEvaluator`` metrics are computed from session aggregates. ``LLMAsJudge`` metrics use BQML's ``ML.GENERATE_TEXT`` for zero-ETL evaluation. Args: - evaluator: A CodeEvaluator or LLMAsJudge instance. + evaluator: A SystemEvaluator or LLMAsJudge instance. filters: Optional trace filters. dataset: Optional table name override. strict: When ``True``, sessions with unparseable or @@ -900,7 +900,7 @@ def evaluate( filt = filters or TraceFilter() where, params = filt.to_sql_conditions() - if isinstance(evaluator, CodeEvaluator): + if isinstance(evaluator, SystemEvaluator): return self._evaluate_code( evaluator, table, @@ -923,7 +923,7 @@ def evaluate( def _evaluate_code( self, - evaluator: CodeEvaluator, + evaluator: SystemEvaluator, table: str, where: str, params: list, diff --git a/src/bigquery_agent_analytics/evaluators.py b/src/bigquery_agent_analytics/evaluators.py index b891d177..498e862b 100644 --- a/src/bigquery_agent_analytics/evaluators.py +++ b/src/bigquery_agent_analytics/evaluators.py @@ -14,7 +14,7 @@ """Evaluation engine for BigQuery Agent Analytics SDK. -Provides ``CodeEvaluator`` for deterministic, code-based metrics and +Provides ``SystemEvaluator`` for deterministic, code-based metrics and ``LLMAsJudge`` for semantic evaluation using LLM-as-a-judge. The ``evaluate()`` function orchestrates batch evaluation using BigQuery's native AI functions for scalable, zero-ETL assessment. 
@@ -22,11 +22,11 @@ Example usage:: from bigquery_agent_analytics.evaluators import ( - CodeEvaluator, LLMAsJudge, + SystemEvaluator, LLMAsJudge, ) # Deterministic evaluation - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) # LLM-based semantic evaluation judge = LLMAsJudge.correctness() @@ -166,7 +166,7 @@ class _MetricDef: detail_fn: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None -class CodeEvaluator: +class SystemEvaluator: """Deterministic evaluator using code-based metric functions. Metrics operate on a session summary dict containing:: @@ -189,7 +189,7 @@ class CodeEvaluator: def __init__( self, - name: str = "code_evaluator", + name: str = "system_evaluator", metrics: Optional[list[_MetricDef]] = None, ) -> None: self.name = name @@ -204,7 +204,7 @@ def add_metric( budget: Optional[float] = None, observed_fn: Optional[Callable[[dict[str, Any]], Any]] = None, detail_fn: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None, - ) -> CodeEvaluator: + ) -> SystemEvaluator: """Adds a custom metric function. Args: @@ -322,7 +322,7 @@ def evaluate_session(self, session_summary: dict[str, Any]) -> SessionScore: @staticmethod def latency( threshold_ms: float = 5000.0, - ) -> CodeEvaluator: + ) -> SystemEvaluator: """Pre-built evaluator that fails when average latency exceeds the budget. Pass/fail is a raw comparison: ``avg_latency_ms <= threshold_ms`` @@ -333,14 +333,14 @@ def latency( threshold_ms: Maximum acceptable average latency in ms. Returns: - CodeEvaluator configured for latency checking. + SystemEvaluator configured for latency checking. 
""" def _score(s: dict[str, Any]) -> float: observed = s.get("avg_latency_ms", 0) or 0 return 1.0 if observed <= threshold_ms else 0.0 - evaluator = CodeEvaluator(name="latency_evaluator") + evaluator = SystemEvaluator(name="latency_evaluator") evaluator.add_metric( "latency", _score, @@ -351,7 +351,7 @@ def _score(s: dict[str, Any]) -> float: return evaluator @staticmethod - def turn_count(max_turns: int = 10) -> CodeEvaluator: + def turn_count(max_turns: int = 10) -> SystemEvaluator: """Pre-built evaluator that fails when turn count exceeds the budget. Pass/fail is a raw comparison: ``turn_count <= max_turns`` passes, @@ -361,14 +361,14 @@ def turn_count(max_turns: int = 10) -> CodeEvaluator: max_turns: Maximum acceptable number of turns. Returns: - CodeEvaluator configured for turn count checking. + SystemEvaluator configured for turn count checking. """ def _score(s: dict[str, Any]) -> float: observed = s.get("turn_count", 0) or 0 return 1.0 if observed <= max_turns else 0.0 - evaluator = CodeEvaluator(name="turn_count_evaluator") + evaluator = SystemEvaluator(name="turn_count_evaluator") evaluator.add_metric( "turn_count", _score, @@ -381,7 +381,7 @@ def _score(s: dict[str, Any]) -> float: @staticmethod def error_rate( max_error_rate: float = 0.1, - ) -> CodeEvaluator: + ) -> SystemEvaluator: """Pre-built evaluator that fails when tool error rate exceeds the budget. Pass/fail is a raw comparison: ``(tool_errors / tool_calls) <= max_error_rate`` @@ -392,7 +392,7 @@ def error_rate( max_error_rate: Maximum acceptable tool error fraction. Returns: - CodeEvaluator configured for error rate checking. + SystemEvaluator configured for error rate checking. 
""" def _observed(s: dict[str, Any]) -> float: @@ -408,7 +408,7 @@ def _score(s: dict[str, Any]) -> float: return 1.0 return 1.0 if _observed(s) <= max_error_rate else 0.0 - evaluator = CodeEvaluator(name="error_rate_evaluator") + evaluator = SystemEvaluator(name="error_rate_evaluator") evaluator.add_metric( "error_rate", _score, @@ -421,7 +421,7 @@ def _score(s: dict[str, Any]) -> float: @staticmethod def token_efficiency( max_tokens: int = 50000, - ) -> CodeEvaluator: + ) -> SystemEvaluator: """Pre-built evaluator that fails when total tokens exceed the budget. Pass/fail is a raw comparison: ``total_tokens <= max_tokens`` @@ -431,14 +431,14 @@ def token_efficiency( max_tokens: Maximum acceptable total token count. Returns: - CodeEvaluator configured for token efficiency. + SystemEvaluator configured for token efficiency. """ def _score(s: dict[str, Any]) -> float: observed = s.get("total_tokens", 0) or 0 return 1.0 if observed <= max_tokens else 0.0 - evaluator = CodeEvaluator(name="token_efficiency_evaluator") + evaluator = SystemEvaluator(name="token_efficiency_evaluator") evaluator.add_metric( "token_efficiency", _score, @@ -451,7 +451,7 @@ def _score(s: dict[str, Any]) -> float: @staticmethod def ttft( threshold_ms: float = 1000.0, - ) -> CodeEvaluator: + ) -> SystemEvaluator: """Pre-built evaluator that fails when TTFT exceeds the budget. Pass/fail is a raw comparison: ``avg_ttft_ms <= threshold_ms`` @@ -461,14 +461,14 @@ def ttft( threshold_ms: Maximum acceptable average TTFT in ms. Returns: - CodeEvaluator configured for TTFT checking. + SystemEvaluator configured for TTFT checking. 
""" def _score(s: dict[str, Any]) -> float: observed = s.get("avg_ttft_ms", 0) or 0 return 1.0 if observed <= threshold_ms else 0.0 - evaluator = CodeEvaluator(name="ttft_evaluator") + evaluator = SystemEvaluator(name="ttft_evaluator") evaluator.add_metric( "ttft", _score, @@ -483,7 +483,7 @@ def cost_per_session( max_cost_usd: float = 1.0, input_cost_per_1k: float = 0.00025, output_cost_per_1k: float = 0.00125, - ) -> CodeEvaluator: + ) -> SystemEvaluator: """Pre-built evaluator that fails when per-session cost exceeds the budget. Pass/fail is a raw comparison: ``estimated_cost_usd <= max_cost_usd`` @@ -495,7 +495,7 @@ def cost_per_session( output_cost_per_1k: Cost per 1K output tokens. Returns: - CodeEvaluator configured for cost checking. + SystemEvaluator configured for cost checking. """ def _observed(s: dict[str, Any]) -> float: @@ -508,7 +508,7 @@ def _observed(s: dict[str, Any]) -> float: def _score(s: dict[str, Any]) -> float: return 1.0 if _observed(s) <= max_cost_usd else 0.0 - evaluator = CodeEvaluator(name="cost_evaluator") + evaluator = SystemEvaluator(name="cost_evaluator") evaluator.add_metric( "cost", _score, @@ -524,7 +524,7 @@ def context_cache_hit_rate( fail_on_missing_telemetry: bool = False, cold_start_rate: float = 0.1, warm_rate: float = 0.9, - ) -> CodeEvaluator: + ) -> SystemEvaluator: """Pre-built evaluator for Gemini context cache prefix hit rate. The observed rate is ``cached_tokens / input_tokens``. The session @@ -545,7 +545,7 @@ def context_cache_hit_rate( ``"warm"``. Returns: - CodeEvaluator configured for context cache efficiency. + SystemEvaluator configured for context cache efficiency. 
""" try: min_hit_rate = float(min_hit_rate) @@ -617,7 +617,7 @@ def _details(s: dict[str, Any]) -> dict[str, Any]: "fail_on_missing_telemetry": fail_on_missing_telemetry, } - evaluator = CodeEvaluator(name="context_cache_hit_rate_evaluator") + evaluator = SystemEvaluator(name="context_cache_hit_rate_evaluator") evaluator.add_metric( "context_cache_hit_rate", _score, @@ -629,6 +629,10 @@ def _details(s: dict[str, Any]) -> dict[str, Any]: return evaluator +# Keep alias for backward compatibility +CodeEvaluator = SystemEvaluator + + # ------------------------------------------------------------------ # # LLM-as-Judge Evaluator # # ------------------------------------------------------------------ # diff --git a/src/bigquery_agent_analytics/grader_pipeline.py b/src/bigquery_agent_analytics/grader_pipeline.py index 181572d6..6c306ac9 100644 --- a/src/bigquery_agent_analytics/grader_pipeline.py +++ b/src/bigquery_agent_analytics/grader_pipeline.py @@ -14,21 +14,21 @@ """Grader composition pipeline for combining multiple evaluators. -Composes ``CodeEvaluator``, ``LLMAsJudge``, and custom graders into a +Composes ``SystemEvaluator``, ``LLMAsJudge``, and custom graders into a single verdict using configurable scoring strategies (weighted average, binary all-pass, or majority vote). Example usage:: from bigquery_agent_analytics import ( - CodeEvaluator, GraderPipeline, LLMAsJudge, WeightedStrategy, + SystemEvaluator, GraderPipeline, LLMAsJudge, WeightedStrategy, ) pipeline = ( GraderPipeline(WeightedStrategy( weights={"latency": 0.3, "correctness": 0.7}, )) - .add_code_grader(CodeEvaluator.latency(), weight=0.3) + .add_code_grader(SystemEvaluator.latency(), weight=0.3) .add_llm_grader(LLMAsJudge.correctness(), weight=0.7) ) @@ -48,8 +48,8 @@ from pydantic import BaseModel from pydantic import Field -from .evaluators import CodeEvaluator from .evaluators import LLMAsJudge +from .evaluators import SystemEvaluator logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) @@ -250,14 +250,14 @@ def __init__( class GraderPipeline: """Composes multiple graders into a single evaluation pipeline. - Supports ``CodeEvaluator``, ``LLMAsJudge``, and arbitrary custom + Supports ``SystemEvaluator``, ``LLMAsJudge``, and arbitrary custom grader functions combined via a configurable ``ScoringStrategy``. Example:: pipeline = ( GraderPipeline(WeightedStrategy(threshold=0.6)) - .add_code_grader(CodeEvaluator.latency()) + .add_code_grader(SystemEvaluator.latency()) .add_llm_grader(LLMAsJudge.correctness()) ) verdict = await pipeline.evaluate( @@ -278,13 +278,13 @@ def __init__(self, strategy: ScoringStrategy) -> None: def add_code_grader( self, - evaluator: CodeEvaluator, + evaluator: SystemEvaluator, weight: float = 1.0, ) -> GraderPipeline: - """Adds a CodeEvaluator grader to the pipeline. + """Adds a SystemEvaluator grader to the pipeline. Args: - evaluator: A CodeEvaluator instance. + evaluator: A SystemEvaluator instance. weight: Weight for weighted strategies. Returns: @@ -363,7 +363,7 @@ async def evaluate( Args: session_summary: Dict with session metrics (for - CodeEvaluator graders). + SystemEvaluator graders). trace_text: Formatted trace text (for LLMAsJudge graders). final_response: Final agent response. @@ -402,7 +402,7 @@ async def _run_grader( """Runs a single grader and returns its result.""" evaluator = entry.evaluate_fn - if isinstance(evaluator, CodeEvaluator): + if isinstance(evaluator, SystemEvaluator): score = evaluator.evaluate_session(session_summary) return GraderResult( grader_name=entry.name, diff --git a/src/bigquery_agent_analytics/udf_kernels.py b/src/bigquery_agent_analytics/udf_kernels.py index 86c65cc6..045d4d90 100644 --- a/src/bigquery_agent_analytics/udf_kernels.py +++ b/src/bigquery_agent_analytics/udf_kernels.py @@ -25,7 +25,7 @@ These kernels serve two purposes: 1. 
They are the single source of truth shared by both the Python SDK - (``CodeEvaluator`` factories in ``evaluators.py``) and the BigQuery + (``SystemEvaluator`` factories in ``evaluators.py``) and the BigQuery Python UDF registration SQL. 2. They can be tested in isolation with simple scalar assertions. diff --git a/tests/test_grader_pipeline.py b/tests/test_grader_pipeline.py index 36278e1e..1c6f30fb 100644 --- a/tests/test_grader_pipeline.py +++ b/tests/test_grader_pipeline.py @@ -20,9 +20,9 @@ import pytest -from bigquery_agent_analytics.evaluators import CodeEvaluator from bigquery_agent_analytics.evaluators import LLMAsJudge from bigquery_agent_analytics.evaluators import SessionScore +from bigquery_agent_analytics.evaluators import SystemEvaluator from bigquery_agent_analytics.grader_pipeline import AggregateVerdict from bigquery_agent_analytics.grader_pipeline import BinaryStrategy from bigquery_agent_analytics.grader_pipeline import GraderPipeline @@ -167,7 +167,7 @@ class TestGraderPipeline: async def test_code_grader(self): """Test pipeline with a code grader.""" pipeline = GraderPipeline(WeightedStrategy(threshold=0.5)).add_code_grader( - CodeEvaluator.latency(threshold_ms=5000) + SystemEvaluator.latency(threshold_ms=5000) ) verdict = await pipeline.evaluate( @@ -239,7 +239,7 @@ async def test_mixed_graders(self): pipeline = ( GraderPipeline(BinaryStrategy()) - .add_code_grader(CodeEvaluator.latency(threshold_ms=5000)) + .add_code_grader(SystemEvaluator.latency(threshold_ms=5000)) .add_llm_grader(judge) ) @@ -260,8 +260,8 @@ async def test_chaining_api(self): """Test fluent builder chaining.""" pipeline = ( GraderPipeline(WeightedStrategy()) - .add_code_grader(CodeEvaluator.latency()) - .add_code_grader(CodeEvaluator.error_rate()) + .add_code_grader(SystemEvaluator.latency()) + .add_code_grader(SystemEvaluator.error_rate()) ) # Verify chaining works assert len(pipeline._graders) == 2 diff --git a/tests/test_pr16_fixes.py b/tests/test_pr16_fixes.py index 
d985d9cc..8166ad37 100644 --- a/tests/test_pr16_fixes.py +++ b/tests/test_pr16_fixes.py @@ -45,9 +45,9 @@ from bigquery_agent_analytics.client import _merge_criterion_reports from bigquery_agent_analytics.client import _run_sync from bigquery_agent_analytics.client import Client -from bigquery_agent_analytics.evaluators import CodeEvaluator from bigquery_agent_analytics.evaluators import EvaluationReport from bigquery_agent_analytics.evaluators import SessionScore +from bigquery_agent_analytics.evaluators import SystemEvaluator from bigquery_agent_analytics.trace import Span from bigquery_agent_analytics.trace import Trace from bigquery_agent_analytics.trace import TraceFilter diff --git a/tests/test_pr17_fixes.py b/tests/test_pr17_fixes.py index 4f8f511d..30aa0674 100644 --- a/tests/test_pr17_fixes.py +++ b/tests/test_pr17_fixes.py @@ -224,7 +224,7 @@ def test_no_golden_dataset_param(self): def test_evaluate_still_works(self): """evaluate() should still work with remaining params.""" - from bigquery_agent_analytics import CodeEvaluator + from bigquery_agent_analytics import SystemEvaluator client = Client( project_id="p", @@ -246,7 +246,7 @@ def test_evaluate_still_works(self): ] ), ) - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) report = client.evaluate(evaluator) assert report.total_sessions == 1 diff --git a/tests/test_sdk_client.py b/tests/test_sdk_client.py index 74cf3da6..55fa2b36 100644 --- a/tests/test_sdk_client.py +++ b/tests/test_sdk_client.py @@ -25,8 +25,8 @@ from bigquery_agent_analytics.categorical_evaluator import CategoricalMetricCategory from bigquery_agent_analytics.categorical_evaluator import CategoricalMetricDefinition from bigquery_agent_analytics.client import Client -from bigquery_agent_analytics.evaluators import CodeEvaluator from bigquery_agent_analytics.evaluators import EvaluationReport +from bigquery_agent_analytics.evaluators import SystemEvaluator from 
bigquery_agent_analytics.trace import TraceFilter @@ -300,7 +300,7 @@ def test_evaluate_code_evaluator(self): bq_client=mock_bq, ) - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) report = client.evaluate(evaluator=evaluator) assert isinstance(report, EvaluationReport) diff --git a/tests/test_sdk_evaluators.py b/tests/test_sdk_evaluators.py index 867f0ba0..6cecc322 100644 --- a/tests/test_sdk_evaluators.py +++ b/tests/test_sdk_evaluators.py @@ -22,20 +22,20 @@ from bigquery_agent_analytics.evaluators import _parse_json_from_text from bigquery_agent_analytics.evaluators import AI_GENERATE_JUDGE_BATCH_QUERY -from bigquery_agent_analytics.evaluators import CodeEvaluator from bigquery_agent_analytics.evaluators import DEFAULT_ENDPOINT from bigquery_agent_analytics.evaluators import EvaluationReport from bigquery_agent_analytics.evaluators import LLM_JUDGE_BATCH_QUERY from bigquery_agent_analytics.evaluators import LLMAsJudge from bigquery_agent_analytics.evaluators import SESSION_SUMMARY_QUERY from bigquery_agent_analytics.evaluators import SessionScore +from bigquery_agent_analytics.evaluators import SystemEvaluator -class TestCodeEvaluator: - """Tests for CodeEvaluator class.""" +class TestSystemEvaluator: + """Tests for SystemEvaluator class.""" def test_custom_metric(self): - evaluator = CodeEvaluator(name="test") + evaluator = SystemEvaluator(name="test") evaluator.add_metric( name="custom", fn=lambda s: 0.8, @@ -50,7 +50,7 @@ def test_custom_metric(self): assert score.passed is True def test_custom_metric_fail(self): - evaluator = CodeEvaluator(name="test") + evaluator = SystemEvaluator(name="test") evaluator.add_metric( name="custom", fn=lambda s: 0.2, @@ -63,7 +63,7 @@ def test_custom_metric_fail(self): assert score.passed is False def test_metric_exception_handled(self): - evaluator = CodeEvaluator(name="test") + evaluator = SystemEvaluator(name="test") evaluator.add_metric( name="broken", 
fn=lambda s: 1 / 0, @@ -76,7 +76,7 @@ def test_metric_exception_handled(self): assert score.passed is False def test_metric_clamping(self): - evaluator = CodeEvaluator(name="test") + evaluator = SystemEvaluator(name="test") evaluator.add_metric( name="over", fn=lambda s: 1.5, @@ -95,7 +95,7 @@ def test_metric_clamping(self): def test_chaining(self): evaluator = ( - CodeEvaluator(name="chain") + SystemEvaluator(name="chain") .add_metric("a", lambda s: 0.9) .add_metric("b", lambda s: 0.7) ) @@ -104,11 +104,11 @@ def test_chaining(self): assert "b" in score.scores -class TestCodeEvaluatorPrebuilt: - """Tests for pre-built CodeEvaluator factories.""" +class TestSystemEvaluatorPrebuilt: + """Tests for pre-built SystemEvaluator factories.""" def test_latency_pass(self): - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -119,7 +119,7 @@ def test_latency_pass(self): assert score.scores["latency"] == 1.0 def test_latency_fail(self): - evaluator = CodeEvaluator.latency(threshold_ms=1000) + evaluator = SystemEvaluator.latency(threshold_ms=1000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -130,7 +130,7 @@ def test_latency_fail(self): assert score.scores["latency"] == 0.0 def test_latency_zero(self): - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -140,7 +140,7 @@ def test_latency_zero(self): assert score.scores["latency"] == 1.0 def test_turn_count_pass(self): - evaluator = CodeEvaluator.turn_count(max_turns=10) + evaluator = SystemEvaluator.turn_count(max_turns=10) score = evaluator.evaluate_session( { "session_id": "s1", @@ -151,7 +151,7 @@ def test_turn_count_pass(self): assert score.scores["turn_count"] == 1.0 def test_turn_count_fail(self): - evaluator = CodeEvaluator.turn_count(max_turns=5) + evaluator = 
SystemEvaluator.turn_count(max_turns=5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -161,7 +161,7 @@ def test_turn_count_fail(self): assert score.passed is False def test_error_rate_pass(self): - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) score = evaluator.evaluate_session( { "session_id": "s1", @@ -172,7 +172,7 @@ def test_error_rate_pass(self): assert score.passed is True def test_error_rate_fail(self): - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) score = evaluator.evaluate_session( { "session_id": "s1", @@ -183,7 +183,7 @@ def test_error_rate_fail(self): assert score.passed is False def test_error_rate_no_calls(self): - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) score = evaluator.evaluate_session( { "session_id": "s1", @@ -199,13 +199,13 @@ class TestPrebuiltRawBudgetBoundaries: Prior implementation used normalized scores with a 0.5 pass cutoff, which caused every gate to effectively fire at ``budget / 2`` (e.g. - ``CodeEvaluator.latency(threshold_ms=5000)`` failed at observed > + ``SystemEvaluator.latency(threshold_ms=5000)`` failed at observed > 2500 ms). These tests lock in the new raw-budget semantics and guard against regressions. """ def test_latency_boundary_inclusive(self): - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) at_budget = evaluator.evaluate_session( {"session_id": "s1", "avg_latency_ms": 5000} ) @@ -222,7 +222,7 @@ def test_latency_boundary_inclusive(self): def test_latency_old_midpoint_now_passes(self): # The old normalized impl failed at 2501ms with threshold=5000; under # the new impl this is nowhere near the budget and must pass. 
- evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) score = evaluator.evaluate_session( {"session_id": "s1", "avg_latency_ms": 2501} ) @@ -230,7 +230,7 @@ def test_latency_old_midpoint_now_passes(self): assert score.scores["latency"] == 1.0 def test_turn_count_boundary_inclusive(self): - evaluator = CodeEvaluator.turn_count(max_turns=10) + evaluator = SystemEvaluator.turn_count(max_turns=10) at_budget = evaluator.evaluate_session( {"session_id": "s1", "turn_count": 10} ) @@ -241,13 +241,13 @@ def test_turn_count_boundary_inclusive(self): assert just_over.passed is False def test_turn_count_old_midpoint_now_passes(self): - evaluator = CodeEvaluator.turn_count(max_turns=10) + evaluator = SystemEvaluator.turn_count(max_turns=10) score = evaluator.evaluate_session({"session_id": "s1", "turn_count": 6}) # Old impl: 1.0 - 6/10 = 0.4 -> fail. New: 6 <= 10 -> pass. assert score.passed is True def test_error_rate_boundary_inclusive(self): - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) at_budget = evaluator.evaluate_session( {"session_id": "s1", "tool_calls": 10, "tool_errors": 1} ) @@ -258,7 +258,7 @@ def test_error_rate_boundary_inclusive(self): assert just_over.passed is False def test_token_efficiency_boundary_inclusive(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) at_budget = evaluator.evaluate_session( {"session_id": "s1", "total_tokens": 50000} ) @@ -269,7 +269,7 @@ def test_token_efficiency_boundary_inclusive(self): assert just_over.passed is False def test_ttft_boundary_inclusive(self): - evaluator = CodeEvaluator.ttft(threshold_ms=1000) + evaluator = SystemEvaluator.ttft(threshold_ms=1000) at_budget = evaluator.evaluate_session( {"session_id": "s1", "avg_ttft_ms": 1000} ) @@ -280,7 +280,7 @@ def test_ttft_boundary_inclusive(self): assert 
just_over.passed is False def test_cost_per_session_boundary_inclusive(self): - evaluator = CodeEvaluator.cost_per_session( + evaluator = SystemEvaluator.cost_per_session( max_cost_usd=0.01, input_cost_per_1k=0.001, output_cost_per_1k=0.001, @@ -298,7 +298,7 @@ def test_cost_per_session_boundary_inclusive(self): def test_observed_key_and_budget_in_details(self): """Per-metric detail must expose observed/budget for CLI output.""" - evaluator = CodeEvaluator.latency(threshold_ms=5000) + evaluator = SystemEvaluator.latency(threshold_ms=5000) score = evaluator.evaluate_session( {"session_id": "s1", "avg_latency_ms": 6000} ) @@ -310,7 +310,7 @@ def test_observed_key_and_budget_in_details(self): def test_error_rate_observed_fn_in_details(self): """Computed observed (errors/calls) surfaces in details via observed_fn.""" - evaluator = CodeEvaluator.error_rate(max_error_rate=0.1) + evaluator = SystemEvaluator.error_rate(max_error_rate=0.1) score = evaluator.evaluate_session( {"session_id": "s1", "tool_calls": 10, "tool_errors": 5} ) @@ -322,7 +322,7 @@ def test_error_rate_observed_fn_in_details(self): def test_cost_observed_fn_in_details(self): """Computed cost surfaces in details via observed_fn.""" - evaluator = CodeEvaluator.cost_per_session( + evaluator = SystemEvaluator.cost_per_session( max_cost_usd=0.01, input_cost_per_1k=0.001, output_cost_per_1k=0.001, @@ -494,10 +494,10 @@ def test_contains_cache_telemetry_events(self): class TestTokenEfficiencyPrebuilt: - """Tests for CodeEvaluator.token_efficiency() preset.""" + """Tests for SystemEvaluator.token_efficiency() preset.""" def test_zero_tokens(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -508,7 +508,7 @@ def test_zero_tokens(self): assert score.passed is True def test_under_budget(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = 
SystemEvaluator.token_efficiency(max_tokens=50000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -520,7 +520,7 @@ def test_under_budget(self): assert score.passed is True def test_over_budget(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -531,7 +531,7 @@ def test_over_budget(self): assert score.passed is False def test_exactly_at_budget(self): - evaluator = CodeEvaluator.token_efficiency(max_tokens=50000) + evaluator = SystemEvaluator.token_efficiency(max_tokens=50000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -544,10 +544,10 @@ def test_exactly_at_budget(self): class TestContextCacheHitRatePrebuilt: - """Tests for CodeEvaluator.context_cache_hit_rate() preset.""" + """Tests for SystemEvaluator.context_cache_hit_rate() preset.""" def test_warm_cache_passes(self): - evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -567,7 +567,7 @@ def test_warm_cache_passes(self): assert detail["cache_state"] == "warm" def test_cold_cache_fails(self): - evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -583,7 +583,7 @@ def test_cold_cache_fails(self): assert detail["cache_state"] == "cold_start" def test_partial_cache_at_threshold_passes(self): - evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -598,7 +598,7 @@ def test_partial_cache_at_threshold_passes(self): assert detail["cache_state"] == "partial" def test_missing_cache_telemetry_passes_by_default(self): - 
evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -614,7 +614,7 @@ def test_missing_cache_telemetry_passes_by_default(self): assert detail["cache_state"] == "no_cache_telemetry" def test_missing_cache_telemetry_can_fail(self): - evaluator = CodeEvaluator.context_cache_hit_rate( + evaluator = SystemEvaluator.context_cache_hit_rate( min_hit_rate=0.5, fail_on_missing_telemetry=True, ) @@ -633,7 +633,7 @@ def test_missing_cache_telemetry_can_fail(self): assert detail["cache_state"] == "no_cache_telemetry" def test_true_zero_cached_tokens_is_cold_start(self): - evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -649,7 +649,7 @@ def test_true_zero_cached_tokens_is_cold_start(self): assert detail["cache_state"] == "cold_start" def test_no_llm_input_passes(self): - evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -665,7 +665,7 @@ def test_no_llm_input_passes(self): assert detail["cache_state"] == "no_llm_input" def test_cached_tokens_clamps_above_input_tokens(self): - evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -681,7 +681,7 @@ def test_cached_tokens_clamps_above_input_tokens(self): assert detail["cache_state"] == "warm" def test_non_numeric_cached_tokens_fall_back_to_zero(self): - evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) score = evaluator.evaluate_session( { "session_id": "s1", @@ 
-697,7 +697,7 @@ def test_non_numeric_cached_tokens_fall_back_to_zero(self): assert detail["cache_state"] == "cold_start" def test_legacy_cached_tokens_without_telemetry_count_is_observed(self): - evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate=0.5) + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate=0.5) score = evaluator.evaluate_session( { "session_id": "s1", @@ -713,21 +713,21 @@ def test_legacy_cached_tokens_without_telemetry_count_is_observed(self): def test_invalid_cache_state_thresholds_raise(self): with pytest.raises(ValueError, match="cold_start_rate"): - CodeEvaluator.context_cache_hit_rate( + SystemEvaluator.context_cache_hit_rate( cold_start_rate=0.9, warm_rate=0.1, ) def test_invalid_min_hit_rate_negative_raises(self): with pytest.raises(ValueError, match="min_hit_rate"): - CodeEvaluator.context_cache_hit_rate(min_hit_rate=-0.1) + SystemEvaluator.context_cache_hit_rate(min_hit_rate=-0.1) def test_invalid_min_hit_rate_above_one_raises(self): with pytest.raises(ValueError, match="min_hit_rate"): - CodeEvaluator.context_cache_hit_rate(min_hit_rate=1.1) + SystemEvaluator.context_cache_hit_rate(min_hit_rate=1.1) def test_string_min_hit_rate_is_coerced(self): - evaluator = CodeEvaluator.context_cache_hit_rate(min_hit_rate="0.5") + evaluator = SystemEvaluator.context_cache_hit_rate(min_hit_rate="0.5") score = evaluator.evaluate_session( { "session_id": "s1", @@ -743,10 +743,10 @@ def test_string_min_hit_rate_is_coerced(self): class TestCostPerSessionPrebuilt: - """Tests for CodeEvaluator.cost_per_session() preset.""" + """Tests for SystemEvaluator.cost_per_session() preset.""" def test_zero_tokens(self): - evaluator = CodeEvaluator.cost_per_session(max_cost_usd=1.0) + evaluator = SystemEvaluator.cost_per_session(max_cost_usd=1.0) score = evaluator.evaluate_session( { "session_id": "s1", @@ -758,7 +758,7 @@ def test_zero_tokens(self): assert score.passed is True def test_under_budget(self): - evaluator = 
CodeEvaluator.cost_per_session( + evaluator = SystemEvaluator.cost_per_session( max_cost_usd=1.0, input_cost_per_1k=0.001, output_cost_per_1k=0.002, @@ -775,7 +775,7 @@ def test_under_budget(self): assert score.passed is True def test_over_budget(self): - evaluator = CodeEvaluator.cost_per_session( + evaluator = SystemEvaluator.cost_per_session( max_cost_usd=0.01, input_cost_per_1k=1.0, output_cost_per_1k=1.0, @@ -792,7 +792,7 @@ def test_over_budget(self): assert score.passed is False def test_missing_tokens_defaults_to_zero(self): - evaluator = CodeEvaluator.cost_per_session(max_cost_usd=1.0) + evaluator = SystemEvaluator.cost_per_session(max_cost_usd=1.0) score = evaluator.evaluate_session( { "session_id": "s1", @@ -802,10 +802,10 @@ def test_missing_tokens_defaults_to_zero(self): class TestTTFTPrebuilt: - """Tests for CodeEvaluator.ttft() preset.""" + """Tests for SystemEvaluator.ttft() preset.""" def test_zero_ttft(self): - evaluator = CodeEvaluator.ttft(threshold_ms=1000) + evaluator = SystemEvaluator.ttft(threshold_ms=1000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -816,7 +816,7 @@ def test_zero_ttft(self): assert score.passed is True def test_under_threshold(self): - evaluator = CodeEvaluator.ttft(threshold_ms=1000) + evaluator = SystemEvaluator.ttft(threshold_ms=1000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -827,7 +827,7 @@ def test_under_threshold(self): assert score.passed is True def test_over_threshold(self): - evaluator = CodeEvaluator.ttft(threshold_ms=500) + evaluator = SystemEvaluator.ttft(threshold_ms=500) score = evaluator.evaluate_session( { "session_id": "s1", @@ -838,7 +838,7 @@ def test_over_threshold(self): assert score.passed is False def test_none_ttft_defaults_to_zero(self): - evaluator = CodeEvaluator.ttft(threshold_ms=1000) + evaluator = SystemEvaluator.ttft(threshold_ms=1000) score = evaluator.evaluate_session( { "session_id": "s1", @@ -848,7 +848,7 @@ def test_none_ttft_defaults_to_zero(self): 
assert score.scores["ttft"] == 1.0 def test_evaluator_name(self): - evaluator = CodeEvaluator.ttft() + evaluator = SystemEvaluator.ttft() assert evaluator.name == "ttft_evaluator" diff --git a/tests/test_udf_kernels.py b/tests/test_udf_kernels.py index 73d693bd..0d0e8e26 100644 --- a/tests/test_udf_kernels.py +++ b/tests/test_udf_kernels.py @@ -22,7 +22,7 @@ and intentionally keeps the normalized ``1.0 - (observed / budget)`` score for BigQuery SQL compatibility. 2. **Prebuilt divergence tests** — document that the Python - ``CodeEvaluator.{latency, error_rate, ...}`` prebuilts *no longer* + ``SystemEvaluator.{latency, error_rate, ...}`` prebuilts *no longer* mirror the SQL kernel scores. They return a binary 1.0/0.0 gate against the raw observed value instead, so the same input yields different numeric scores via the two paths while the pass/fail @@ -31,7 +31,7 @@ import pytest -from bigquery_agent_analytics.evaluators import CodeEvaluator +from bigquery_agent_analytics.evaluators import SystemEvaluator from bigquery_agent_analytics.udf_kernels import extract_response_text from bigquery_agent_analytics.udf_kernels import is_error_event from bigquery_agent_analytics.udf_kernels import score_cost @@ -264,7 +264,7 @@ def test_default_pricing(self): # ------------------------------------------------------------------ # -# Prebuilt divergence: CodeEvaluator prebuilts are binary gates, +# Prebuilt divergence: SystemEvaluator prebuilts are binary gates, # ``udf_kernels`` stays on the normalized ``1.0 - observed/budget``. # These two paths now intentionally disagree on the numeric score; # they still agree on the user-intent boundary (observed <= budget). 
@@ -272,7 +272,7 @@ def test_default_pricing(self): class TestPrebuiltBinaryLatency: - """CodeEvaluator.latency returns 1.0/0.0 against the raw budget.""" + """SystemEvaluator.latency returns 1.0/0.0 against the raw budget.""" @pytest.mark.parametrize( "avg,threshold,expected_score,expected_pass", @@ -285,7 +285,7 @@ class TestPrebuiltBinaryLatency: ], ) def test_binary(self, avg, threshold, expected_score, expected_pass): - ev = CodeEvaluator.latency(threshold_ms=threshold) + ev = SystemEvaluator.latency(threshold_ms=threshold) result = ev.evaluate_session({"session_id": "s1", "avg_latency_ms": avg}) assert result.scores["latency"] == pytest.approx(expected_score) assert result.passed is expected_pass @@ -310,7 +310,7 @@ class TestPrebuiltBinaryErrorRate: ], ) def test_binary(self, calls, errors, max_rate, expected_score, expected_pass): - ev = CodeEvaluator.error_rate(max_error_rate=max_rate) + ev = SystemEvaluator.error_rate(max_error_rate=max_rate) result = ev.evaluate_session( {"session_id": "s1", "tool_calls": calls, "tool_errors": errors} ) @@ -331,7 +331,7 @@ class TestPrebuiltBinaryTurnCount: ], ) def test_binary(self, turns, max_t, expected_score, expected_pass): - ev = CodeEvaluator.turn_count(max_turns=max_t) + ev = SystemEvaluator.turn_count(max_turns=max_t) result = ev.evaluate_session({"session_id": "s1", "turn_count": turns}) assert result.scores["turn_count"] == pytest.approx(expected_score) assert result.passed is expected_pass @@ -350,7 +350,7 @@ class TestPrebuiltBinaryTokenEfficiency: ], ) def test_binary(self, tokens, max_t, expected_score, expected_pass): - ev = CodeEvaluator.token_efficiency(max_tokens=max_t) + ev = SystemEvaluator.token_efficiency(max_tokens=max_t) result = ev.evaluate_session({"session_id": "s1", "total_tokens": tokens}) assert result.scores["token_efficiency"] == pytest.approx(expected_score) assert result.passed is expected_pass @@ -369,7 +369,7 @@ class TestPrebuiltBinaryTtft: ], ) def test_binary(self, avg, threshold, 
expected_score, expected_pass): - ev = CodeEvaluator.ttft(threshold_ms=threshold) + ev = SystemEvaluator.ttft(threshold_ms=threshold) result = ev.evaluate_session({"session_id": "s1", "avg_ttft_ms": avg}) assert result.scores["ttft"] == pytest.approx(expected_score) assert result.passed is expected_pass @@ -402,7 +402,7 @@ def test_binary( expected_score, expected_pass, ): - ev = CodeEvaluator.cost_per_session( + ev = SystemEvaluator.cost_per_session( max_cost_usd=max_c, input_cost_per_1k=inp_rate, output_cost_per_1k=out_rate, From e6d0c78e18fe94be5fa6e2d574e11e9871e65c89 Mon Sep 17 00:00:00 2001 From: Gigi Stark Date: Sun, 3 May 2026 17:57:46 +0000 Subject: [PATCH 2/2] Stage identical baseline copies and test renames for lineage tracking --- .../aggregate_grader.py | 430 ++++++ .../multi_trial_performance_evaluator.py | 336 ++++ .../performance_evaluator.py | 1073 +++++++++++++ .../system_evaluator.py | 1374 +++++++++++++++++ ...r_pipeline.py => test_aggregate_grader.py} | 0 ...test_multi_trial_performance_evaluator.py} | 0 ...uator.py => test_performance_evaluator.py} | 0 ...evaluators.py => test_system_evaluator.py} | 0 8 files changed, 3213 insertions(+) create mode 100644 src/bigquery_agent_analytics/aggregate_grader.py create mode 100644 src/bigquery_agent_analytics/multi_trial_performance_evaluator.py create mode 100644 src/bigquery_agent_analytics/performance_evaluator.py create mode 100644 src/bigquery_agent_analytics/system_evaluator.py rename tests/{test_grader_pipeline.py => test_aggregate_grader.py} (100%) rename tests/{test_multi_trial.py => test_multi_trial_performance_evaluator.py} (100%) rename tests/{test_trace_evaluator.py => test_performance_evaluator.py} (100%) rename tests/{test_sdk_evaluators.py => test_system_evaluator.py} (100%) diff --git a/src/bigquery_agent_analytics/aggregate_grader.py b/src/bigquery_agent_analytics/aggregate_grader.py new file mode 100644 index 00000000..6c306ac9 --- /dev/null +++ 
b/src/bigquery_agent_analytics/aggregate_grader.py @@ -0,0 +1,430 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Grader composition pipeline for combining multiple evaluators. + +Composes ``SystemEvaluator``, ``LLMAsJudge``, and custom graders into a +single verdict using configurable scoring strategies (weighted average, +binary all-pass, or majority vote). + +Example usage:: + + from bigquery_agent_analytics import ( + SystemEvaluator, GraderPipeline, LLMAsJudge, WeightedStrategy, + ) + + pipeline = ( + GraderPipeline(WeightedStrategy( + weights={"latency": 0.3, "correctness": 0.7}, + )) + .add_code_grader(SystemEvaluator.latency(), weight=0.3) + .add_llm_grader(LLMAsJudge.correctness(), weight=0.7) + ) + + verdict = await pipeline.evaluate( + session_summary={"session_id": "s1", "avg_latency_ms": 2000}, + trace_text="User: hello\\nAgent: hi", + final_response="hi", + ) +""" + +from __future__ import annotations + +import abc +import logging +from typing import Any, Callable + +from pydantic import BaseModel +from pydantic import Field + +from .evaluators import LLMAsJudge +from .evaluators import SystemEvaluator + +logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) + + +# ------------------------------------------------------------------ # +# Data Models # +# ------------------------------------------------------------------ # + + +class GraderResult(BaseModel): + """Result from a single grader.""" + + grader_name: str = Field(description="Name of the grader.") + scores: dict[str, float] = Field( + default_factory=dict, + description="Metric scores from this grader.", + ) + passed: bool = Field( + default=True, + description="Whether this grader passed.", + ) + + +class AggregateVerdict(BaseModel): + """Aggregated verdict from all graders in the pipeline.""" + + grader_results: list[GraderResult] = Field( + default_factory=list, + description="Individual grader results.", + ) + final_score: float = Field( + default=0.0, + description="Final aggregated score.", + ) + passed: bool = Field( + default=False, + description="Whether the overall evaluation passed.", + ) + strategy_name: str = Field( + default="", + description="Name of the scoring strategy used.", + ) + + +# ------------------------------------------------------------------ # +# Scoring Strategies # +# ------------------------------------------------------------------ # + + +class ScoringStrategy(abc.ABC): + """Abstract base class for scoring strategies.""" + + @abc.abstractmethod + def aggregate( + self, + grader_results: list[GraderResult], + ) -> AggregateVerdict: + """Aggregates grader results into a single verdict. + + Args: + grader_results: List of individual grader results. + + Returns: + AggregateVerdict with final score and pass/fail. + """ + + +class WeightedStrategy(ScoringStrategy): + """Weighted average of grader scores; pass if >= threshold.""" + + def __init__( + self, + weights: dict[str, float] | None = None, + threshold: float = 0.5, + ) -> None: + """Initializes the weighted strategy. + + Args: + weights: Mapping of grader name to weight. If None, + all graders are weighted equally. + threshold: Minimum weighted score to pass. 
+ """ + self.weights = weights or {} + self.threshold = threshold + + def aggregate( + self, + grader_results: list[GraderResult], + ) -> AggregateVerdict: + if not grader_results: + return AggregateVerdict(strategy_name="weighted") + + total_weight = 0.0 + weighted_sum = 0.0 + + for result in grader_results: + weight = self.weights.get(result.grader_name, 1.0) + # Average the grader's metric scores + if result.scores: + avg_score = sum(result.scores.values()) / len(result.scores) + else: + avg_score = 1.0 if result.passed else 0.0 + weighted_sum += avg_score * weight + total_weight += weight + + final_score = weighted_sum / total_weight if total_weight > 0 else 0.0 + + return AggregateVerdict( + grader_results=grader_results, + final_score=final_score, + passed=final_score >= self.threshold, + strategy_name="weighted", + ) + + +class BinaryStrategy(ScoringStrategy): + """All graders must pass independently.""" + + def aggregate( + self, + grader_results: list[GraderResult], + ) -> AggregateVerdict: + if not grader_results: + return AggregateVerdict(strategy_name="binary") + + all_passed = all(r.passed for r in grader_results) + + # Average of all scores + all_scores = [] + for r in grader_results: + all_scores.extend(r.scores.values()) + final_score = ( + sum(all_scores) / len(all_scores) + if all_scores + else (1.0 if all_passed else 0.0) + ) + + return AggregateVerdict( + grader_results=grader_results, + final_score=final_score, + passed=all_passed, + strategy_name="binary", + ) + + +class MajorityStrategy(ScoringStrategy): + """Majority of graders must pass.""" + + def aggregate( + self, + grader_results: list[GraderResult], + ) -> AggregateVerdict: + if not grader_results: + return AggregateVerdict(strategy_name="majority") + + num_passed = sum(1 for r in grader_results if r.passed) + majority = num_passed > len(grader_results) / 2 + + # Average of all scores + all_scores = [] + for r in grader_results: + all_scores.extend(r.scores.values()) + final_score = ( 
+ sum(all_scores) / len(all_scores) + if all_scores + else (1.0 if majority else 0.0) + ) + + return AggregateVerdict( + grader_results=grader_results, + final_score=final_score, + passed=majority, + strategy_name="majority", + ) + + +# ------------------------------------------------------------------ # +# Grader Pipeline # +# ------------------------------------------------------------------ # + + +class _GraderEntry: + """Internal wrapper for a grader in the pipeline.""" + + def __init__( + self, + name: str, + evaluate_fn: Any, + weight: float = 1.0, + is_async: bool = False, + ) -> None: + self.name = name + self.evaluate_fn = evaluate_fn + self.weight = weight + self.is_async = is_async + + +class GraderPipeline: + """Composes multiple graders into a single evaluation pipeline. + + Supports ``SystemEvaluator``, ``LLMAsJudge``, and arbitrary custom + grader functions combined via a configurable ``ScoringStrategy``. + + Example:: + + pipeline = ( + GraderPipeline(WeightedStrategy(threshold=0.6)) + .add_code_grader(SystemEvaluator.latency()) + .add_llm_grader(LLMAsJudge.correctness()) + ) + verdict = await pipeline.evaluate( + session_summary={...}, + trace_text="...", + final_response="...", + ) + """ + + def __init__(self, strategy: ScoringStrategy) -> None: + """Initializes the pipeline with a scoring strategy. + + Args: + strategy: The strategy used to aggregate grader results. + """ + self.strategy = strategy + self._graders: list[_GraderEntry] = [] + + def add_code_grader( + self, + evaluator: SystemEvaluator, + weight: float = 1.0, + ) -> GraderPipeline: + """Adds a SystemEvaluator grader to the pipeline. + + Args: + evaluator: A SystemEvaluator instance. + weight: Weight for weighted strategies. + + Returns: + Self for chaining. 
+ """ + self._graders.append( + _GraderEntry( + name=evaluator.name, + evaluate_fn=evaluator, + weight=weight, + is_async=False, + ) + ) + return self + + def add_llm_grader( + self, + judge: LLMAsJudge, + weight: float = 1.0, + ) -> GraderPipeline: + """Adds an LLMAsJudge grader to the pipeline. + + Args: + judge: An LLMAsJudge instance. + weight: Weight for weighted strategies. + + Returns: + Self for chaining. + """ + self._graders.append( + _GraderEntry( + name=judge.name, + evaluate_fn=judge, + weight=weight, + is_async=True, + ) + ) + return self + + def add_custom_grader( + self, + name: str, + fn: Callable[[dict[str, Any]], GraderResult], + weight: float = 1.0, + ) -> GraderPipeline: + """Adds a custom grader function to the pipeline. + + The function receives a dict with ``session_summary``, + ``trace_text``, and ``final_response`` keys. + + Args: + name: Name for the grader. + fn: Function returning a GraderResult. + weight: Weight for weighted strategies. + + Returns: + Self for chaining. + """ + self._graders.append( + _GraderEntry( + name=name, + evaluate_fn=fn, + weight=weight, + is_async=False, + ) + ) + return self + + async def evaluate( + self, + session_summary: dict[str, Any] | None = None, + trace_text: str = "", + final_response: str = "", + ) -> AggregateVerdict: + """Evaluates using all graders and aggregates results. + + Args: + session_summary: Dict with session metrics (for + SystemEvaluator graders). + trace_text: Formatted trace text (for LLMAsJudge + graders). + final_response: Final agent response. + + Returns: + AggregateVerdict with combined results. 
+ """ + session_summary = session_summary or {} + grader_results: list[GraderResult] = [] + + for entry in self._graders: + try: + result = await self._run_grader( + entry, session_summary, trace_text, final_response + ) + grader_results.append(result) + except Exception as e: + logger.warning("Grader %s failed: %s", entry.name, e) + grader_results.append( + GraderResult( + grader_name=entry.name, + scores={}, + passed=False, + ) + ) + + return self.strategy.aggregate(grader_results) + + async def _run_grader( + self, + entry: _GraderEntry, + session_summary: dict[str, Any], + trace_text: str, + final_response: str, + ) -> GraderResult: + """Runs a single grader and returns its result.""" + evaluator = entry.evaluate_fn + + if isinstance(evaluator, SystemEvaluator): + score = evaluator.evaluate_session(session_summary) + return GraderResult( + grader_name=entry.name, + scores=score.scores, + passed=score.passed, + ) + + if isinstance(evaluator, LLMAsJudge): + score = await evaluator.evaluate_session( + trace_text=trace_text, + final_response=final_response, + ) + return GraderResult( + grader_name=entry.name, + scores=score.scores, + passed=score.passed, + ) + + # Custom grader function + context = { + "session_summary": session_summary, + "trace_text": trace_text, + "final_response": final_response, + } + return evaluator(context) diff --git a/src/bigquery_agent_analytics/multi_trial_performance_evaluator.py b/src/bigquery_agent_analytics/multi_trial_performance_evaluator.py new file mode 100644 index 00000000..4c0f592f --- /dev/null +++ b/src/bigquery_agent_analytics/multi_trial_performance_evaluator.py @@ -0,0 +1,336 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multi-trial evaluation runner with pass@k / pass^k metrics. + +Wraps any ``BigQueryTraceEvaluator`` to run N trials per task and +compute probabilistic pass-rate metrics that account for agent +non-determinism. + +Example usage:: + + from bigquery_agent_analytics import ( + BigQueryTraceEvaluator, TrialRunner, + ) + + evaluator = BigQueryTraceEvaluator( + project_id="my-project", + dataset_id="analytics", + ) + runner = TrialRunner(evaluator, num_trials=5) + + report = await runner.run_trials( + session_id="sess-123", + golden_trajectory=[{"tool_name": "search", "args": {}}], + ) + print(report.pass_at_k, report.pass_pow_k) +""" + +from __future__ import annotations + +import asyncio +import logging +import math +import statistics +from typing import Any, Optional + +from pydantic import BaseModel +from pydantic import Field + +from .trace_evaluator import BigQueryTraceEvaluator +from .trace_evaluator import EvalStatus +from .trace_evaluator import MatchType + +logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) + + +# ------------------------------------------------------------------ # +# Data Models # +# ------------------------------------------------------------------ # + + +class TrialResult(BaseModel): + """Result of a single trial.""" + + trial_index: int = Field(description="Zero-based trial index.") + passed: bool = Field(description="Whether this trial passed.") + scores: dict[str, float] = Field( + default_factory=dict, + description="Metric scores for this trial.", + ) + details: dict[str, Any] = Field( + default_factory=dict, + description="Additional trial details.", + ) + + +class MultiTrialReport(BaseModel): + """Aggregate report across N trials of one task.""" + + session_id: str = Field(description="The session ID evaluated.") + num_trials: int = Field(description="Number of trials run.") + trial_results: list[TrialResult] = Field( + default_factory=list, + description="Individual trial results.", + ) + pass_at_k: float = Field( + default=0.0, + description="P(>=1 pass in k trials).", + ) + pass_pow_k: float = Field( + default=0.0, + description="P(all k trials pass).", + ) + per_trial_pass_rate: float = Field( + default=0.0, + description="Fraction of trials that passed.", + ) + mean_scores: dict[str, float] = Field( + default_factory=dict, + description="Mean score per metric across trials.", + ) + score_std_dev: dict[str, float] = Field( + default_factory=dict, + description="Standard deviation per metric across trials.", + ) + + +# ------------------------------------------------------------------ # +# Static Helpers # +# ------------------------------------------------------------------ # + + +def compute_pass_at_k( + num_trials: int, + num_passed: int, +) -> float: + """Computes pass@k: P(>=1 pass in k trials). + + Uses the formula: 1 - C(n-c, k) / C(n, k) + where n = num_trials, c = num_passed, k = num_trials. + + Args: + num_trials: Total number of trials (k). + num_passed: Number of trials that passed (c). 
+ + Returns: + Probability that at least one trial passes. + """ + if num_trials <= 0: + return 0.0 + if num_passed <= 0: + return 0.0 + if num_passed >= num_trials: + return 1.0 + + # 1 - C(n-c, k) / C(n, k) + n = num_trials + c = num_passed + k = num_trials + + # C(n-c, k) / C(n, k) -- if n-c < k then C(n-c,k)=0 => pass@k=1 + if n - c < k: + return 1.0 + + # Use log to avoid overflow for large values + log_numerator = sum(math.log(n - c - i) for i in range(k)) + log_denominator = sum(math.log(n - i) for i in range(k)) + + return 1.0 - math.exp(log_numerator - log_denominator) + + +def compute_pass_pow_k( + num_trials: int, + num_passed: int, +) -> float: + """Computes pass^k: P(all k trials pass). + + Uses the formula: (num_passed / num_trials) ** num_trials. + + Args: + num_trials: Total number of trials. + num_passed: Number of trials that passed. + + Returns: + Probability that all trials pass. + """ + if num_trials <= 0: + return 0.0 + if num_passed <= 0: + return 0.0 + rate = num_passed / num_trials + return rate**num_trials + + +# ------------------------------------------------------------------ # +# TrialRunner # +# ------------------------------------------------------------------ # + + +class TrialRunner: + """Runs multiple evaluation trials and computes aggregate metrics. + + Wraps a ``BigQueryTraceEvaluator`` and runs N trials per task, + computing pass@k and pass^k metrics that account for agent + non-determinism (e.g. LLM judges produce different scores each + call). + + Example:: + + runner = TrialRunner(evaluator, num_trials=5, concurrency=3) + report = await runner.run_trials( + session_id="sess-123", + golden_trajectory=[...], + ) + """ + + def __init__( + self, + evaluator: BigQueryTraceEvaluator, + num_trials: int = 5, + concurrency: int = 3, + ) -> None: + """Initializes the TrialRunner. + + Args: + evaluator: The trace evaluator to wrap. + num_trials: Number of trials to run per task. + concurrency: Maximum concurrent evaluations. 
+ """ + self.evaluator = evaluator + self.num_trials = num_trials + self.concurrency = concurrency + + async def run_trials( + self, + session_id: str, + golden_trajectory: Optional[list[dict]] = None, + golden_response: Optional[str] = None, + match_type: MatchType = MatchType.EXACT, + task_description: Optional[str] = None, + use_llm_judge: bool = False, + thresholds: Optional[dict[str, float]] = None, + ) -> MultiTrialReport: + """Runs N trials of evaluation for a single session. + + Args: + session_id: The session ID to evaluate. + golden_trajectory: Expected tool call sequence. + golden_response: Expected final response. + match_type: Type of trajectory matching. + task_description: Task description for LLM judge. + use_llm_judge: Whether to use LLM-as-judge. + thresholds: Metric thresholds for pass/fail. + + Returns: + MultiTrialReport with aggregate metrics. + """ + semaphore = asyncio.Semaphore(self.concurrency) + trial_results: list[TrialResult] = [] + + async def _run_one(trial_index: int) -> TrialResult: + async with semaphore: + result = await self.evaluator.evaluate_session( + session_id=session_id, + golden_trajectory=golden_trajectory, + golden_response=golden_response, + match_type=match_type, + task_description=task_description, + use_llm_judge=use_llm_judge, + thresholds=thresholds, + ) + return TrialResult( + trial_index=trial_index, + passed=result.eval_status == EvalStatus.PASSED, + scores=result.scores, + details=result.details, + ) + + tasks = [_run_one(i) for i in range(self.num_trials)] + trial_results = list(await asyncio.gather(*tasks)) + + return self._build_report(session_id, trial_results) + + async def run_trials_batch( + self, + eval_dataset: list[dict[str, Any]], + match_type: MatchType = MatchType.EXACT, + use_llm_judge: bool = False, + ) -> list[MultiTrialReport]: + """Runs multi-trial evaluation for a batch of tasks. + + Args: + eval_dataset: List of dicts with session_id, + expected_trajectory, etc. 
+ match_type: Type of trajectory matching. + use_llm_judge: Whether to use LLM-as-judge. + + Returns: + List of MultiTrialReport, one per task. + """ + reports = [] + for item in eval_dataset: + report = await self.run_trials( + session_id=item["session_id"], + golden_trajectory=item.get("expected_trajectory"), + golden_response=item.get("expected_response"), + match_type=match_type, + task_description=item.get("task_description"), + use_llm_judge=use_llm_judge, + thresholds=item.get("thresholds"), + ) + reports.append(report) + return reports + + def _build_report( + self, + session_id: str, + trial_results: list[TrialResult], + ) -> MultiTrialReport: + """Builds a MultiTrialReport from trial results.""" + num_trials = len(trial_results) + if num_trials == 0: + return MultiTrialReport( + session_id=session_id, + num_trials=0, + ) + + num_passed = sum(1 for t in trial_results if t.passed) + + # Aggregate scores + all_metric_names: set[str] = set() + for t in trial_results: + all_metric_names.update(t.scores.keys()) + + mean_scores: dict[str, float] = {} + score_std_dev: dict[str, float] = {} + + for metric in sorted(all_metric_names): + values = [t.scores.get(metric, 0.0) for t in trial_results] + mean_scores[metric] = statistics.mean(values) + if len(values) >= 2: + score_std_dev[metric] = statistics.stdev(values) + else: + score_std_dev[metric] = 0.0 + + return MultiTrialReport( + session_id=session_id, + num_trials=num_trials, + trial_results=trial_results, + pass_at_k=compute_pass_at_k(num_trials, num_passed), + pass_pow_k=compute_pass_pow_k(num_trials, num_passed), + per_trial_pass_rate=num_passed / num_trials, + mean_scores=mean_scores, + score_std_dev=score_std_dev, + ) diff --git a/src/bigquery_agent_analytics/performance_evaluator.py b/src/bigquery_agent_analytics/performance_evaluator.py new file mode 100644 index 00000000..cdb87536 --- /dev/null +++ b/src/bigquery_agent_analytics/performance_evaluator.py @@ -0,0 +1,1073 @@ +# Copyright 2026 Google LLC +# 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Trace-Based Evaluation Harness for ADK Agents. + +This module provides capabilities to evaluate agent behavior using stored +traces in BigQuery. It supports: + +- Trajectory matching (exact, in-order, any-order) +- LLM-as-judge evaluation +- Custom metric scoring +- Deterministic replay for debugging + +Example usage: + evaluator = BigQueryTraceEvaluator( + project_id="my-project", + dataset_id="agent_analytics", + ) + + results = await evaluator.evaluate_session( + session_id="session-123", + golden_trajectory=[ + {"tool_name": "search", "args": {"query": "weather"}}, + {"tool_name": "format_response", "args": {}}, + ], + golden_response="The weather is sunny.", + ) +""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass +from dataclasses import field +from datetime import datetime +from enum import Enum +import json +import logging +from typing import Any, Callable, Optional + +from google.cloud import bigquery +from pydantic import BaseModel +from pydantic import Field + +from bigquery_agent_analytics.evaluators import strip_markdown_fences + +from ._telemetry import LabeledBigQueryClient +from ._telemetry import make_bq_client +from ._telemetry import with_sdk_labels + +logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) + + +class MatchType(Enum): + """The type of trajectory matching to use.""" + + EXACT = "exact" + """Requires perfect match between actual and expected tool calls.""" + + IN_ORDER = "in_order" + """Requires tools in same order, allows extra tools between.""" + + ANY_ORDER = "any_order" + """Requires all expected tools present, any order allowed.""" + + +class EvalStatus(Enum): + """Status of an evaluation.""" + + PASSED = "passed" + FAILED = "failed" + NOT_EVALUATED = "not_evaluated" + + +@dataclass +class TraceEvent: + """Represents a single event from a trace.""" + + event_type: str + agent: Optional[str] + timestamp: datetime + content: dict[str, Any] + attributes: dict[str, Any] + span_id: Optional[str] = None + parent_span_id: Optional[str] = None + latency_ms: Optional[int] = None + status: str = "OK" + error_message: Optional[str] = None + + @classmethod + def from_bigquery_row(cls, row: dict[str, Any]) -> "TraceEvent": + """Creates a TraceEvent from a BigQuery row.""" + content = row.get("content") + if isinstance(content, str): + try: + content = json.loads(content) + except (json.JSONDecodeError, TypeError): + content = {"raw": content} + elif content is None: + content = {} + + attributes = row.get("attributes") + if isinstance(attributes, str): + try: + attributes = json.loads(attributes) + except (json.JSONDecodeError, TypeError): + attributes = {} + elif attributes is None: + attributes = {} + + latency_ms = row.get("latency_ms") + if isinstance(latency_ms, str): + try: + latency_data = json.loads(latency_ms) + latency_ms = latency_data.get("total_ms") + except (json.JSONDecodeError, TypeError): + latency_ms = None + elif isinstance(latency_ms, dict): + latency_ms = latency_ms.get("total_ms") + + return cls( + event_type=row.get("event_type", "UNKNOWN"), + agent=row.get("agent"), + timestamp=row.get("timestamp", datetime.now()), + content=content, + attributes=attributes, + span_id=row.get("span_id"), + 
parent_span_id=row.get("parent_span_id"), + latency_ms=latency_ms, + status=row.get("status", "OK"), + error_message=row.get("error_message"), + ) + + +@dataclass +class ToolCall: + """Represents a tool call extracted from a trace.""" + + tool_name: str + args: dict[str, Any] + result: Optional[dict[str, Any]] = None + status: str = "OK" + error_message: Optional[str] = None + latency_ms: Optional[int] = None + + +@dataclass +class SessionTrace: + """Complete trace for a session.""" + + session_id: str + user_id: Optional[str] + events: list[TraceEvent] + tool_calls: list[ToolCall] = field(default_factory=list) + final_response: Optional[str] = None + total_latency_ms: Optional[int] = None + + def extract_tool_trajectory(self) -> list[ToolCall]: + """Extracts the tool call trajectory from events.""" + tool_calls = [] + tool_starts: dict[str, TraceEvent] = {} + + for event in self.events: + if event.event_type == "TOOL_STARTING": + tool_name = event.content.get("tool", "unknown") + tool_starts[event.span_id or tool_name] = event + + elif event.event_type == "TOOL_COMPLETED": + tool_name = event.content.get("tool", "unknown") + start_event = tool_starts.pop(event.span_id or tool_name, None) + + args = {} + if start_event: + args = start_event.content.get("args", {}) + + tool_calls.append( + ToolCall( + tool_name=tool_name, + args=args, + result=event.content.get("result"), + status="OK", + latency_ms=event.latency_ms, + ) + ) + + elif event.event_type == "TOOL_ERROR": + tool_name = event.content.get("tool", "unknown") + start_event = tool_starts.pop(event.span_id or tool_name, None) + + args = {} + if start_event: + args = start_event.content.get("args", {}) + + tool_calls.append( + ToolCall( + tool_name=tool_name, + args=args, + status="ERROR", + error_message=event.error_message, + latency_ms=event.latency_ms, + ) + ) + + self.tool_calls = tool_calls + return tool_calls + + def extract_final_response(self) -> Optional[str]: + """Extracts the final agent response 
class TrajectoryMetrics:
  """Computes trajectory-based evaluation metrics.

  All matchers compare recorded tool calls (objects exposing
  ``tool_name`` and ``args``) against expected calls given as dicts with
  a ``tool_name`` key and an optional ``args`` key.
  """

  @staticmethod
  def _call_matches(act: Any, exp: dict[str, Any]) -> bool:
    """Checks one recorded call against one expected call.

    The tool names must be equal. Expected args are only checked when
    the expectation supplies a non-empty ``args`` mapping.
    """
    if act.tool_name != exp.get("tool_name"):
      return False
    exp_args = exp.get("args")
    return not exp_args or TrajectoryMetrics._args_match(act.args, exp_args)

  @staticmethod
  def compute_exact_match(
      actual: "list[ToolCall]",
      expected: list[dict[str, Any]],
  ) -> float:
    """Computes exact match score between trajectories.

    Args:
      actual: List of actual tool calls from trace.
      expected: List of expected tool calls with tool_name and args.

    Returns:
      Score between 0.0 and 1.0: the fraction of positions whose call
      matches. 0.0 when the lengths differ; with no expectations, 1.0
      only if no calls were made.
    """
    if not expected:
      return 1.0 if not actual else 0.0

    if len(actual) != len(expected):
      return 0.0

    matches = sum(
        1
        for act, exp in zip(actual, expected)
        if TrajectoryMetrics._call_matches(act, exp)
    )
    return matches / len(expected)

  @staticmethod
  def compute_in_order_match(
      actual: "list[ToolCall]",
      expected: list[dict[str, Any]],
  ) -> float:
    """Computes in-order match score.

    Checks if expected tools appear in order within actual calls; extra
    calls in between are ignored.

    Args:
      actual: List of actual tool calls.
      expected: List of expected tool calls.

    Returns:
      Score between 0.0 and 1.0: the fraction of expected calls found
      in order. 1.0 when nothing is expected.
    """
    if not expected:
      return 1.0

    expected_idx = 0
    for act in actual:
      if expected_idx >= len(expected):
        break
      if TrajectoryMetrics._call_matches(act, expected[expected_idx]):
        expected_idx += 1

    return expected_idx / len(expected)

  @staticmethod
  def compute_any_order_match(
      actual: "list[ToolCall]",
      expected: list[dict[str, Any]],
  ) -> float:
    """Computes any-order match score.

    Checks if all expected tools appear in actual calls (any order).
    Each actual call consumes at most one expectation: the first
    still-unmatched one it satisfies.

    Args:
      actual: List of actual tool calls.
      expected: List of expected tool calls.

    Returns:
      Score between 0.0 and 1.0: the fraction of expected calls that
      were matched. 1.0 when nothing is expected.
    """
    if not expected:
      return 1.0

    remaining = list(expected)
    for act in actual:
      for i, exp in enumerate(remaining):
        if TrajectoryMetrics._call_matches(act, exp):
          # Safe to mutate: the inner loop stops right after the pop.
          remaining.pop(i)
          break

    matched = len(expected) - len(remaining)
    return matched / len(expected)

  @staticmethod
  def _args_match(
      actual: Optional[dict[str, Any]], expected: dict[str, Any]
  ) -> bool:
    """Checks if actual args contain expected args.

    Every expected key must be present in ``actual``. An expected value
    of None acts as a wildcard (the key only has to exist). ``actual``
    may be None, which is treated as no arguments.
    """
    # A recorded call can carry args=None; treat it as empty instead of
    # failing on the membership test.
    actual = actual or {}
    for key, value in expected.items():
      if key not in actual:
        return False
      if value is not None and actual[key] != value:
        return False
    return True

  @staticmethod
  def compute_step_efficiency(
      actual_steps: int,
      optimal_steps: int,
  ) -> float:
    """Computes step efficiency score.

    Args:
      actual_steps: Number of steps taken by agent.
      optimal_steps: Optimal number of steps.

    Returns:
      Score between 0.0 and 1.0 (1.0 = optimal or better). When
      ``optimal_steps`` is not positive the score is 1.0 only if no
      steps were taken.
    """
    if optimal_steps <= 0:
      return 1.0 if actual_steps == 0 else 0.0

    if actual_steps <= optimal_steps:
      return 1.0

    # Extra steps scale the score down as optimal / actual.
    efficiency = optimal_steps / actual_steps
    return max(0.0, efficiency)
+ + Example: + evaluator = BigQueryTraceEvaluator( + project_id="my-project", + dataset_id="agent_analytics", + ) + + result = await evaluator.evaluate_session( + session_id="sess-123", + golden_trajectory=[{"tool_name": "search", "args": {"q": "test"}}], + ) + """ + + # SQL query to retrieve complete session trace + _DEFAULT_EVENT_TYPES = [ + "USER_MESSAGE_RECEIVED", + "AGENT_STARTING", + "AGENT_COMPLETED", + "TOOL_STARTING", + "TOOL_COMPLETED", + "TOOL_ERROR", + "LLM_REQUEST", + "LLM_RESPONSE", + "LLM_ERROR", + "INVOCATION_STARTING", + "INVOCATION_COMPLETED", + "STATE_DELTA", + "HITL_CONFIRMATION_REQUEST", + "HITL_CONFIRMATION_REQUEST_COMPLETED", + "HITL_CREDENTIAL_REQUEST", + "HITL_CREDENTIAL_REQUEST_COMPLETED", + "HITL_INPUT_REQUEST", + "HITL_INPUT_REQUEST_COMPLETED", + ] + + _SESSION_TRACE_QUERY = """ + SELECT + event_type, + agent, + timestamp, + content, + attributes, + span_id, + parent_span_id, + latency_ms, + status, + error_message, + user_id + FROM `{project}.{dataset}.{table}` + WHERE session_id = @session_id + AND event_type IN UNNEST(@event_types) + ORDER BY timestamp ASC + """ + + # Default LLM judge prompt for trajectory evaluation + _LLM_JUDGE_PROMPT = """You are evaluating an AI agent's task execution trajectory. + +## Task Description +{task_description} + +## Agent Trajectory +{trajectory_json} + +## Expected Trajectory (if provided) +{expected_trajectory} + +## Final Response +{final_response} + +## Evaluation Criteria +Score each criterion from 0 to 10: +1. task_completion: Did the agent successfully complete the task? +2. efficiency: Were the steps taken necessary and minimal? +3. tool_usage: Were the right tools used with correct arguments? +4. reasoning: Was the agent's reasoning sound? +5. overall: Overall score averaging the above. + +IMPORTANT: You MUST respond with ONLY a valid JSON object. No explanation before or after. +Keep justification brief (under 100 characters). 
+ +Required JSON format: +{{"task_completion": 7, "efficiency": 8, "tool_usage": 9, "reasoning": 7, "overall": 8, "justification": "Brief reason"}} +""" + + def __init__( + self, + project_id: str, + dataset_id: str, + table_id: str = "agent_events", + client: Optional[bigquery.Client] = None, + llm_judge_model: Optional[str] = None, + include_event_types: Optional[list[str]] = None, + ) -> None: + """Initializes the BigQueryTraceEvaluator. + + Args: + project_id: Google Cloud project ID. + dataset_id: BigQuery dataset ID containing trace data. + table_id: BigQuery table ID. Defaults to "agent_events". + client: Optional BigQuery client. Created if not provided. + llm_judge_model: Optional model name for LLM-as-judge evaluation. + include_event_types: Optional list of event types to include + when fetching session traces. Defaults to all standard + ADK event types including HITL and STATE_DELTA. Pass a + custom list to restrict or extend the event types + evaluated without patching SQL templates. + """ + self.project_id = project_id + self.dataset_id = dataset_id + self.table_id = table_id + self.table_ref = f"{project_id}.{dataset_id}.{table_id}" + self._client = client + self._warned_unlabeled_client = False + self.llm_judge_model = llm_judge_model or "gemini-2.5-flash" + self.include_event_types = include_event_types or self._DEFAULT_EVENT_TYPES + + @property + def client(self) -> bigquery.Client: + """Lazily initializes and returns the BigQuery client.""" + if self._client is None: + self._client = make_bq_client(self.project_id) + elif isinstance(self._client, bigquery.Client) and not isinstance( + self._client, LabeledBigQueryClient + ): + if not self._warned_unlabeled_client: + logger.warning( + "User-provided bigquery.Client is not a " + "LabeledBigQueryClient; SDK telemetry labels will not be " + "applied to jobs from this client. 
async def get_session_trace(self, session_id: str) -> SessionTrace:
  """Retrieves the complete trace for a session.

  Runs the session-trace query with ``session_id`` and
  ``self.include_event_types`` bound as query parameters, converts each
  row to a ``TraceEvent``, then derives the tool trajectory, the final
  response and the total latency from those events.

  Args:
    session_id: The session ID to retrieve.

  Returns:
    SessionTrace containing all events for the session. ``user_id`` is
    read from the first row (None when no rows match).
    ``total_latency_ms`` is the span between the earliest and latest
    event timestamps and stays None when there are no events.
  """
  query = self._SESSION_TRACE_QUERY.format(
      project=self.project_id,
      dataset=self.dataset_id,
      table=self.table_id,
  )

  job_config = bigquery.QueryJobConfig(
      query_parameters=[
          bigquery.ScalarQueryParameter(
              "session_id",
              "STRING",
              session_id,
          ),
          bigquery.ArrayQueryParameter(
              "event_types",
              "STRING",
              self.include_event_types,
          ),
      ]
  )
  # Apply labels BEFORE executor dispatch so they materialize on the
  # QueryJobConfig in the caller's thread.
  job_config = with_sdk_labels(job_config, feature="trace-read")

  # Run the blocking BigQuery calls in the default executor. Inside a
  # coroutine the running loop is the one to use; get_running_loop() is
  # the documented replacement for get_event_loop() here.
  loop = asyncio.get_running_loop()
  query_job = await loop.run_in_executor(
      None,
      lambda: self.client.query(query, job_config=job_config),
  )

  results = await loop.run_in_executor(None, lambda: list(query_job.result()))

  events = [TraceEvent.from_bigquery_row(dict(row)) for row in results]

  user_id = None
  if results:
    user_id = results[0].get("user_id")

  trace = SessionTrace(
      session_id=session_id,
      user_id=user_id,
      events=events,
  )

  # Extract tool trajectory and final response
  trace.extract_tool_trajectory()
  trace.final_response = trace.extract_final_response()

  # Compute total latency
  if events:
    start = min(e.timestamp for e in events)
    end = max(e.timestamp for e in events)
    trace.total_latency_ms = int((end - start).total_seconds() * 1000)

  return trace
Optional[str] = None, + match_type: MatchType = MatchType.EXACT, + task_description: Optional[str] = None, + use_llm_judge: bool = False, + custom_metrics: Optional[dict[str, Callable]] = None, + thresholds: Optional[dict[str, float]] = None, + ) -> EvaluationResult: + """Evaluates a single session against golden data. + + Args: + session_id: The session ID to evaluate. + golden_trajectory: Expected tool call sequence. + golden_response: Expected final response. + match_type: Type of trajectory matching to use. + task_description: Description of the task for LLM judge. + use_llm_judge: Whether to use LLM-as-judge evaluation. + custom_metrics: Dict of custom metric functions. + thresholds: Dict of metric name to threshold for pass/fail. + + Returns: + EvaluationResult with scores and status. + """ + # Retrieve trace + trace = await self.get_session_trace(session_id) + + scores: dict[str, float] = {} + details: dict[str, Any] = { + "actual_tool_calls": len(trace.tool_calls), + "expected_tool_calls": ( + len(golden_trajectory) if golden_trajectory else 0 + ), + } + + # Compute trajectory score + if golden_trajectory is not None: + if match_type == MatchType.EXACT: + scores["trajectory_exact_match"] = ( + TrajectoryMetrics.compute_exact_match( + trace.tool_calls, golden_trajectory + ) + ) + elif match_type == MatchType.IN_ORDER: + scores["trajectory_in_order"] = ( + TrajectoryMetrics.compute_in_order_match( + trace.tool_calls, golden_trajectory + ) + ) + elif match_type == MatchType.ANY_ORDER: + scores["trajectory_any_order"] = ( + TrajectoryMetrics.compute_any_order_match( + trace.tool_calls, golden_trajectory + ) + ) + + # Step efficiency + if golden_trajectory: + scores["step_efficiency"] = TrajectoryMetrics.compute_step_efficiency( + len(trace.tool_calls), + len(golden_trajectory), + ) + + # Response matching (simple text comparison) + if golden_response is not None and trace.final_response is not None: + scores["response_match"] = self._compute_response_match( + 
trace.final_response, golden_response + ) + + # LLM-as-judge evaluation + llm_feedback = None + if use_llm_judge: + llm_scores, llm_feedback = await self._llm_judge_evaluate( + trace=trace, + task_description=task_description or "Complete the user's request.", + expected_trajectory=golden_trajectory, + ) + scores.update(llm_scores) + + # Custom metrics + if custom_metrics: + for metric_name, metric_fn in custom_metrics.items(): + try: + score = metric_fn(trace, golden_trajectory, golden_response) + scores[metric_name] = float(score) + except Exception as e: + logger.warning("Custom metric %s failed: %s", metric_name, e) + scores[metric_name] = 0.0 + + # Determine overall status + thresholds = thresholds or {} + passed = True + for metric_name, score in scores.items(): + threshold = thresholds.get(metric_name, 0.5) + if score < threshold: + passed = False + details[f"{metric_name}_threshold"] = threshold + + # Compute overall score as mean + overall_score = None + if scores: + overall_score = sum(scores.values()) / len(scores) + + return EvaluationResult( + session_id=session_id, + eval_status=EvalStatus.PASSED if passed else EvalStatus.FAILED, + scores=scores, + overall_score=overall_score, + details=details, + llm_judge_feedback=llm_feedback, + ) + + async def evaluate_batch( + self, + eval_dataset: list[dict[str, Any]], + match_type: MatchType = MatchType.EXACT, + use_llm_judge: bool = False, + concurrency: int = 5, + ) -> list[EvaluationResult]: + """Evaluates multiple sessions from an eval dataset. + + Args: + eval_dataset: List of dicts with session_id, expected_trajectory, etc. + match_type: Type of trajectory matching. + use_llm_judge: Whether to use LLM judge. + concurrency: Max concurrent evaluations. + + Returns: + List of EvaluationResult for each session. 
+ """ + semaphore = asyncio.Semaphore(concurrency) + + async def evaluate_one(item: dict[str, Any]) -> EvaluationResult: + async with semaphore: + return await self.evaluate_session( + session_id=item["session_id"], + golden_trajectory=item.get("expected_trajectory"), + golden_response=item.get("expected_response"), + match_type=match_type, + task_description=item.get("task_description"), + use_llm_judge=use_llm_judge, + thresholds=item.get("thresholds"), + ) + + tasks = [evaluate_one(item) for item in eval_dataset] + return await asyncio.gather(*tasks) + + def _compute_response_match( + self, + actual: str, + expected: str, + ) -> float: + """Computes simple response match score. + + Args: + actual: Actual response text. + expected: Expected response text. + + Returns: + Score between 0.0 and 1.0. + """ + if not actual or not expected: + return 0.0 if actual != expected else 1.0 + + # Normalize strings + actual_norm = actual.lower().strip() + expected_norm = expected.lower().strip() + + if actual_norm == expected_norm: + return 1.0 + + # Simple word overlap score + actual_words = set(actual_norm.split()) + expected_words = set(expected_norm.split()) + + if not expected_words: + return 1.0 if not actual_words else 0.0 + + intersection = actual_words & expected_words + return len(intersection) / len(expected_words) + + async def _llm_judge_evaluate( + self, + trace: SessionTrace, + task_description: str, + expected_trajectory: Optional[list[dict[str, Any]]], + ) -> tuple[dict[str, float], str]: + """Uses LLM as judge to evaluate the trace. + + Args: + trace: The session trace to evaluate. + task_description: Description of the task. + expected_trajectory: Expected tool calls if available. + + Returns: + Tuple of (scores dict, feedback string). 
+ """ + try: + from google import genai + from google.genai import types + except ImportError: + logger.warning("google-genai not installed, skipping LLM judge.") + return {}, "LLM judge unavailable - google-genai not installed" + + # Format trajectory for prompt + trajectory_data = [ + { + "tool": tc.tool_name, + "args": tc.args, + "status": tc.status, + } + for tc in trace.tool_calls + ] + + prompt = self._LLM_JUDGE_PROMPT.format( + task_description=task_description, + trajectory_json=json.dumps(trajectory_data, indent=2), + expected_trajectory=json.dumps(expected_trajectory, indent=2) + if expected_trajectory + else "Not provided", + final_response=trace.final_response or "No response captured", + ) + + try: + client = genai.Client() + response = await client.aio.models.generate_content( + model=self.llm_judge_model, + contents=prompt, + config=types.GenerateContentConfig( + temperature=0.1, + max_output_tokens=1024, + ), + ) + + response_text = response.text.strip() + + # Strip markdown fences and extract JSON + if response_text.startswith("```"): + json_str = strip_markdown_fences(response_text) + else: + json_str = None + if not json_str: + # No fences found — try to extract JSON object directly + if "{" in response_text: + try: + start = response_text.index("{") + brace_count = 0 + end = start + for i, char in enumerate(response_text[start:], start): + if char == "{": + brace_count += 1 + elif char == "}": + brace_count -= 1 + if brace_count == 0: + end = i + 1 + break + json_str = response_text[start:end] + except (ValueError, IndexError): + pass + + if not json_str: + return {}, response_text + + # Clean up the JSON string - handle common issues + json_str = json_str.strip() + # Remove control characters that break JSON parsing + json_str = "".join( + char for char in json_str if char >= " " or char in "\n\r\t" + ) + + try: + result = json.loads(json_str) + except json.JSONDecodeError: + # Try to fix common JSON issues + import re + + # Replace unescaped 
class TraceReplayRunner:
  """Replays agent sessions deterministically for debugging.

  This runner uses recorded traces to replay agent execution with
  deterministic outcomes, useful for debugging and root cause analysis.

  Example:
      replay_runner = TraceReplayRunner(evaluator)
      result = await replay_runner.replay_session(
          session_id="sess-123",
          replay_mode="step",
      )
  """

  # Event types kept when ``replay_mode="tool_only"``.
  _TOOL_EVENT_TYPES = ("TOOL_STARTING", "TOOL_COMPLETED", "TOOL_ERROR")

  def __init__(self, evaluator: BigQueryTraceEvaluator) -> None:
    """Initializes the replay runner.

    Args:
      evaluator: BigQueryTraceEvaluator for trace retrieval.
    """
    self.evaluator = evaluator

  async def replay_session(
      self,
      session_id: str,
      replay_mode: str = "full",
      step_callback: Optional[
          Callable[[TraceEvent, ReplayContext], None]
      ] = None,
  ) -> ReplayContext:
    """Replays a recorded session step by step.

    Args:
      session_id: The session ID to replay.
      replay_mode: "tool_only" restricts the replay to tool events.
        Any other value, including "full" and "step", replays every
        event; no pausing is performed here, so per-step handling has
        to be done inside ``step_callback``.
      step_callback: Optional callback invoked after each replayed
        event with the event and the replay context.

    Returns:
      ReplayContext with all injected responses.
    """
    trace = await self.evaluator.get_session_trace(session_id)

    replay_context = ReplayContext()
    tool_only = replay_mode == "tool_only"

    for event in trace.events:
      # Filter by mode
      if tool_only and event.event_type not in self._TOOL_EVENT_TYPES:
        continue

      content = event.content

      # Inject responses for replay
      if event.event_type == "LLM_RESPONSE":
        if isinstance(content, dict):
          # Same precedence as SessionTrace.extract_final_response, and
          # never None: the context stores LLM responses as text.
          response_text = (
              content.get("response") or content.get("text_summary") or ""
          )
        else:
          response_text = str(content) if content else ""
        replay_context.inject_llm_response(response_text)

      elif event.event_type == "TOOL_COMPLETED":
        # Content is not guaranteed to be a dict (the LLM branch above
        # already allows for that), so do not call .get() on it blindly.
        payload = content if isinstance(content, dict) else {}
        replay_context.inject_tool_response(
            payload.get("tool", "unknown"), payload.get("result")
        )

      # Invoke callback if provided
      if step_callback:
        step_callback(event, replay_context)

    return replay_context

  async def compare_replays(
      self,
      session_id_1: str,
      session_id_2: str,
  ) -> dict[str, Any]:
    """Compares two session replays to identify differences.

    Args:
      session_id_1: First session ID.
      session_id_2: Second session ID.

    Returns:
      Dict with ``event_count_diff`` and ``tool_count_diff`` (first
      minus second), ``tool_differences`` (one entry per position where
      the tool name or args differ, or where only one trace has a
      call) and ``response_match`` (True only when both traces have a
      final response and the stripped texts are equal).
    """
    trace1 = await self.evaluator.get_session_trace(session_id_1)
    trace2 = await self.evaluator.get_session_trace(session_id_2)

    calls1 = trace1.tool_calls
    calls2 = trace2.tool_calls
    tool_differences: list[dict[str, Any]] = []

    # Compare tool calls position by position, padding the shorter
    # trajectory with None.
    for i in range(max(len(calls1), len(calls2))):
      tc1 = calls1[i] if i < len(calls1) else None
      tc2 = calls2[i] if i < len(calls2) else None

      if tc1 is None or tc2 is None:
        tool_differences.append(
            {
                "index": i,
                "trace1": tc1.tool_name if tc1 else None,
                "trace2": tc2.tool_name if tc2 else None,
            }
        )
      elif tc1.tool_name != tc2.tool_name or tc1.args != tc2.args:
        tool_differences.append(
            {
                "index": i,
                "trace1": {"name": tc1.tool_name, "args": tc1.args},
                "trace2": {"name": tc2.tool_name, "args": tc2.args},
            }
        )

    # Compare responses
    response_match = False
    if trace1.final_response and trace2.final_response:
      response_match = (
          trace1.final_response.strip() == trace2.final_response.strip()
      )

    return {
        "event_count_diff": len(trace1.events) - len(trace2.events),
        "tool_count_diff": len(calls1) - len(calls2),
        "tool_differences": tool_differences,
        "response_match": response_match,
    }
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Evaluation engine for BigQuery Agent Analytics SDK. + +Provides ``SystemEvaluator`` for deterministic, code-based metrics and +``LLMAsJudge`` for semantic evaluation using LLM-as-a-judge. The +``evaluate()`` function orchestrates batch evaluation using BigQuery's +native AI functions for scalable, zero-ETL assessment. + +Example usage:: + + from bigquery_agent_analytics.evaluators import ( + SystemEvaluator, LLMAsJudge, + ) + + # Deterministic evaluation + evaluator = SystemEvaluator.latency(threshold_ms=5000) + + # LLM-based semantic evaluation + judge = LLMAsJudge.correctness() +""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from datetime import timezone +import json +import logging +import re +from typing import Any, Callable, Optional + +from pydantic import BaseModel +from pydantic import Field + +from bigquery_agent_analytics import udf_kernels + +logger = logging.getLogger("bigquery_agent_analytics." 
+ __name__) + +DEFAULT_ENDPOINT = "gemini-2.5-flash" + + +# ------------------------------------------------------------------ # +# Evaluation Report # +# ------------------------------------------------------------------ # + + +class SessionScore(BaseModel): + """Scores for a single evaluated session.""" + + session_id: str = Field(description="The session ID evaluated.") + scores: dict[str, float] = Field( + default_factory=dict, + description="Metric name to score (0.0 - 1.0).", + ) + passed: bool = Field( + default=True, + description="Whether the session passed all thresholds.", + ) + details: dict[str, Any] = Field( + default_factory=dict, + description="Additional per-session details.", + ) + llm_feedback: Optional[str] = Field( + default=None, + description="LLM judge feedback if applicable.", + ) + + +class EvaluationReport(BaseModel): + """Aggregate report from an evaluation run.""" + + dataset: str = Field(description="Dataset or filter description.") + evaluator_name: str = Field(description="Name of evaluator used.") + total_sessions: int = Field(default=0) + passed_sessions: int = Field(default=0) + failed_sessions: int = Field(default=0) + aggregate_scores: dict[str, float] = Field( + default_factory=dict, + description="Average scores across all sessions.", + ) + details: dict[str, Any] = Field( + default_factory=dict, + description=( + "Operational metadata (parse_errors, fallback_mode, etc.)." + " Separated from aggregate_scores so downstream consumers" + " can treat scores as purely normalized metrics." 
+ ), + ) + session_scores: list[SessionScore] = Field( + default_factory=list, + ) + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + ) + + @property + def pass_rate(self) -> float: + """Fraction of sessions that passed.""" + if self.total_sessions == 0: + return 0.0 + return self.passed_sessions / self.total_sessions + + def summary(self) -> str: + """Returns a human-readable summary.""" + lines = [ + f"Evaluation Report: {self.evaluator_name}", + f" Dataset: {self.dataset}", + f" Sessions: {self.total_sessions}", + f" Passed: {self.passed_sessions} ({self.pass_rate:.0%})", + f" Failed: {self.failed_sessions}", + ] + if self.aggregate_scores: + lines.append(" Aggregate Scores:") + for name, score in sorted(self.aggregate_scores.items()): + lines.append(f" {name}: {score:.3f}") + return "\n".join(lines) + + +# ------------------------------------------------------------------ # +# Code-Based Evaluator # +# ------------------------------------------------------------------ # + + +@dataclass +class _MetricDef: + """Internal definition of a code metric. + + ``observed_key``, ``observed_fn``, ``detail_fn``, and ``budget`` are optional + reporting metadata used by the prebuilt evaluators (latency, + error_rate, turn_count, …) to surface the raw observed value and + the user-supplied budget in ``SessionScore.details``. They don't + affect pass/fail computation — that still goes through ``fn`` + + ``threshold`` — but they let downstream consumers (CLI + ``--exit-code`` output, dashboards) emit readable failure lines + without having to re-run the scorer. + + When ``observed_fn`` is set it takes precedence over + ``observed_key``; use it for metrics whose observed value is + computed from multiple summary fields (e.g. ``tool_errors / + tool_calls`` for error rate). 
+ + When ``detail_fn`` is set, its returned key/value pairs are merged + into the metric's detail payload after the common observed/budget/ + threshold/score/passed fields are populated. + """ + + name: str + fn: Callable[[dict[str, Any]], float] + threshold: float = 0.5 + observed_key: Optional[str] = None + budget: Optional[float] = None + observed_fn: Optional[Callable[[dict[str, Any]], Any]] = None + detail_fn: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None + + +class SystemEvaluator: + """Deterministic evaluator using code-based metric functions. + + Metrics operate on a session summary dict containing:: + + { + "session_id": str, + "total_events": int, + "tool_calls": int, + "tool_errors": int, + "llm_calls": int, + "avg_latency_ms": float, + "max_latency_ms": float, + "total_latency_ms": float, + "turn_count": int, + "has_error": bool, + } + + Each metric function returns a score between 0.0 and 1.0. + """ + + def __init__( + self, + name: str = "system_evaluator", + metrics: Optional[list[_MetricDef]] = None, + ) -> None: + self.name = name + self._metrics: list[_MetricDef] = metrics or [] + + def add_metric( + self, + name: str, + fn: Callable[[dict[str, Any]], float], + threshold: float = 0.5, + observed_key: Optional[str] = None, + budget: Optional[float] = None, + observed_fn: Optional[Callable[[dict[str, Any]], Any]] = None, + detail_fn: Optional[Callable[[dict[str, Any]], dict[str, Any]]] = None, + ) -> SystemEvaluator: + """Adds a custom metric function. + + Args: + name: Metric name. + fn: Function taking session summary, returning 0-1 score. + The score is compared to ``threshold``; a session passes + the metric when ``score >= threshold``. + threshold: Pass/fail threshold applied to ``fn``'s score. + observed_key: Optional session-summary key whose value is the + raw observed metric (e.g. ``"avg_latency_ms"``). 
def evaluate_session(self, session_summary: dict[str, Any]) -> SessionScore:
  """Evaluates a single session summary.

  Each registered metric is scored, clamped to [0.0, 1.0] and compared
  with its threshold. A metric whose function raises, or whose score is
  NaN, is recorded as 0.0 and fails the session.

  Args:
    session_summary: Dict with session metrics.

  Returns:
    SessionScore with computed scores. ``details`` holds one
    ``metric_<name>`` entry per metric with the observed value, budget,
    threshold, score and pass flag, plus anything the metric's
    ``detail_fn`` returns. ``session_id`` falls back to "unknown".
  """
  scores: dict[str, float] = {}
  details: dict[str, Any] = {}
  passed = True

  for metric in self._metrics:
    try:
      score = float(metric.fn(session_summary))
      # NaN is the only float not equal to itself. Without this check
      # min(1.0, nan) evaluates to 1.0, so a NaN score would be clamped
      # to a perfect score and silently pass.
      if score != score:
        raise ValueError("metric returned NaN")
      score = max(0.0, min(1.0, score))
      scores[metric.name] = score
      metric_passed = score >= metric.threshold
      if not metric_passed:
        passed = False
    except Exception as e:
      logger.warning("Metric %s failed: %s", metric.name, e)
      scores[metric.name] = 0.0
      metric_passed = False
      passed = False

    # Stash per-metric reporting detail for *every* metric so the CLI
    # ``--exit-code`` failure output always has a threshold / score /
    # passed triple to emit, even for custom metrics that didn't
    # declare observed_key / observed_fn. Observed / budget are only
    # meaningful when the metric supplied them. Keys are prefixed with
    # ``metric_`` to avoid colliding with other details callers.
    observed_value: Optional[Any] = None
    if metric.observed_fn is not None:
      try:
        observed_value = metric.observed_fn(session_summary)
      except Exception:  # pylint: disable=broad-except
        # Reporting is best-effort; it must not change pass/fail.
        logger.debug(
            "Metric %s observed_fn failed", metric.name, exc_info=True
        )
        observed_value = None
    elif metric.observed_key is not None:
      observed_value = session_summary.get(metric.observed_key)
    metric_details = {
        "observed": observed_value,
        "budget": metric.budget,
        "threshold": metric.threshold,
        "score": scores[metric.name],
        "passed": metric_passed,
    }
    if metric.detail_fn is not None:
      try:
        metric_details.update(metric.detail_fn(session_summary))
      except Exception:  # pylint: disable=broad-except
        logger.debug("Metric %s detail_fn failed", metric.name, exc_info=True)
    details[f"metric_{metric.name}"] = metric_details

  return SessionScore(
      session_id=session_summary.get("session_id", "unknown"),
      scores=scores,
      passed=passed,
      details=details,
  )
The returned evaluator's score for + a session is ``1.0`` on pass and ``0.0`` on fail. + + Args: + threshold_ms: Maximum acceptable average latency in ms. + + Returns: + SystemEvaluator configured for latency checking. + """ + + def _score(s: dict[str, Any]) -> float: + observed = s.get("avg_latency_ms", 0) or 0 + return 1.0 if observed <= threshold_ms else 0.0 + + evaluator = SystemEvaluator(name="latency_evaluator") + evaluator.add_metric( + "latency", + _score, + threshold=1.0, + observed_key="avg_latency_ms", + budget=threshold_ms, + ) + return evaluator + + @staticmethod + def turn_count(max_turns: int = 10) -> SystemEvaluator: + """Pre-built evaluator that fails when turn count exceeds the budget. + + Pass/fail is a raw comparison: ``turn_count <= max_turns`` passes, + strictly greater fails. + + Args: + max_turns: Maximum acceptable number of turns. + + Returns: + SystemEvaluator configured for turn count checking. + """ + + def _score(s: dict[str, Any]) -> float: + observed = s.get("turn_count", 0) or 0 + return 1.0 if observed <= max_turns else 0.0 + + evaluator = SystemEvaluator(name="turn_count_evaluator") + evaluator.add_metric( + "turn_count", + _score, + threshold=1.0, + observed_key="turn_count", + budget=max_turns, + ) + return evaluator + + @staticmethod + def error_rate( + max_error_rate: float = 0.1, + ) -> SystemEvaluator: + """Pre-built evaluator that fails when tool error rate exceeds the budget. + + Pass/fail is a raw comparison: ``(tool_errors / tool_calls) <= max_error_rate`` + passes, strictly greater fails. Sessions with zero tool calls pass + trivially (nothing to fail). + + Args: + max_error_rate: Maximum acceptable tool error fraction. + + Returns: + SystemEvaluator configured for error rate checking. 
+ """ + + def _observed(s: dict[str, Any]) -> float: + calls = s.get("tool_calls", 0) or 0 + errors = s.get("tool_errors", 0) or 0 + if calls <= 0: + return 0.0 + return errors / calls + + def _score(s: dict[str, Any]) -> float: + calls = s.get("tool_calls", 0) or 0 + if calls <= 0: + return 1.0 + return 1.0 if _observed(s) <= max_error_rate else 0.0 + + evaluator = SystemEvaluator(name="error_rate_evaluator") + evaluator.add_metric( + "error_rate", + _score, + threshold=1.0, + observed_fn=_observed, + budget=max_error_rate, + ) + return evaluator + + @staticmethod + def token_efficiency( + max_tokens: int = 50000, + ) -> SystemEvaluator: + """Pre-built evaluator that fails when total tokens exceed the budget. + + Pass/fail is a raw comparison: ``total_tokens <= max_tokens`` + passes, strictly greater fails. + + Args: + max_tokens: Maximum acceptable total token count. + + Returns: + SystemEvaluator configured for token efficiency. + """ + + def _score(s: dict[str, Any]) -> float: + observed = s.get("total_tokens", 0) or 0 + return 1.0 if observed <= max_tokens else 0.0 + + evaluator = SystemEvaluator(name="token_efficiency_evaluator") + evaluator.add_metric( + "token_efficiency", + _score, + threshold=1.0, + observed_key="total_tokens", + budget=max_tokens, + ) + return evaluator + + @staticmethod + def ttft( + threshold_ms: float = 1000.0, + ) -> SystemEvaluator: + """Pre-built evaluator that fails when TTFT exceeds the budget. + + Pass/fail is a raw comparison: ``avg_ttft_ms <= threshold_ms`` + passes, strictly greater fails. + + Args: + threshold_ms: Maximum acceptable average TTFT in ms. + + Returns: + SystemEvaluator configured for TTFT checking. 
+ """ + + def _score(s: dict[str, Any]) -> float: + observed = s.get("avg_ttft_ms", 0) or 0 + return 1.0 if observed <= threshold_ms else 0.0 + + evaluator = SystemEvaluator(name="ttft_evaluator") + evaluator.add_metric( + "ttft", + _score, + threshold=1.0, + observed_key="avg_ttft_ms", + budget=threshold_ms, + ) + return evaluator + + @staticmethod + def cost_per_session( + max_cost_usd: float = 1.0, + input_cost_per_1k: float = 0.00025, + output_cost_per_1k: float = 0.00125, + ) -> SystemEvaluator: + """Pre-built evaluator that fails when per-session cost exceeds the budget. + + Pass/fail is a raw comparison: ``estimated_cost_usd <= max_cost_usd`` + passes, strictly greater fails. + + Args: + max_cost_usd: Maximum acceptable cost in USD. + input_cost_per_1k: Cost per 1K input tokens. + output_cost_per_1k: Cost per 1K output tokens. + + Returns: + SystemEvaluator configured for cost checking. + """ + + def _observed(s: dict[str, Any]) -> float: + input_tokens = s.get("input_tokens", 0) or 0 + output_tokens = s.get("output_tokens", 0) or 0 + return (input_tokens / 1000.0) * input_cost_per_1k + ( + output_tokens / 1000.0 + ) * output_cost_per_1k + + def _score(s: dict[str, Any]) -> float: + return 1.0 if _observed(s) <= max_cost_usd else 0.0 + + evaluator = SystemEvaluator(name="cost_evaluator") + evaluator.add_metric( + "cost", + _score, + threshold=1.0, + observed_fn=_observed, + budget=max_cost_usd, + ) + return evaluator + + @staticmethod + def context_cache_hit_rate( + min_hit_rate: float = 0.5, + fail_on_missing_telemetry: bool = False, + cold_start_rate: float = 0.1, + warm_rate: float = 0.9, + ) -> SystemEvaluator: + """Pre-built evaluator for Gemini context cache prefix hit rate. + + The observed rate is ``cached_tokens / input_tokens``. The session + summary should include ``input_tokens``, ``cached_tokens``, and + ideally ``cache_telemetry_events`` from ``SESSION_SUMMARY_QUERY``. 
+ Missing cache telemetry is reported separately from a true 0-token + cache hit so older plugin data does not become a false failure by + default. + + Args: + min_hit_rate: Minimum acceptable cached-token fraction. + fail_on_missing_telemetry: If ``True``, sessions with input + tokens but no cache telemetry fail. If ``False`` (default), + they pass with ``cache_state='no_cache_telemetry'``. + cold_start_rate: Rate below which detail marks the session as + ``"cold_start"``. + warm_rate: Rate at or above which detail marks the session as + ``"warm"``. + + Returns: + SystemEvaluator configured for context cache efficiency. + """ + try: + min_hit_rate = float(min_hit_rate) + except (TypeError, ValueError) as exc: + raise ValueError( + f"min_hit_rate must be a number, got {min_hit_rate!r}" + ) from exc + if not 0.0 <= min_hit_rate <= 1.0: + raise ValueError( + "min_hit_rate must satisfy 0 <= min_hit_rate <= 1, " + f"got {min_hit_rate}" + ) + if not 0.0 <= cold_start_rate < warm_rate <= 1.0: + raise ValueError( + "cold_start_rate and warm_rate must satisfy " + "0 <= cold_start_rate < warm_rate <= 1" + ) + + def _number(value: Any, default: float = 0.0) -> float: + if value is None: + return default + try: + return float(value) + except (TypeError, ValueError): + return default + + def _has_cache_telemetry(s: dict[str, Any]) -> bool: + if "cache_telemetry_events" in s: + return _number(s.get("cache_telemetry_events")) > 0 + return s.get("cached_tokens") is not None + + def _rate(s: dict[str, Any]) -> Optional[float]: + input_tokens = _number(s.get("input_tokens")) + if input_tokens <= 0: + return 1.0 + if not _has_cache_telemetry(s): + return None + cached_tokens = _number(s.get("cached_tokens")) + return max(0.0, min(1.0, cached_tokens / input_tokens)) + + def _score(s: dict[str, Any]) -> float: + rate = _rate(s) + if rate is None: + return 0.0 if fail_on_missing_telemetry else 1.0 + return rate + + def _details(s: dict[str, Any]) -> dict[str, Any]: + input_tokens = 
_number(s.get("input_tokens")) + cached_tokens = _number(s.get("cached_tokens")) + telemetry_events = int(_number(s.get("cache_telemetry_events"))) + rate = _rate(s) + if input_tokens <= 0: + cache_state = "no_llm_input" + elif rate is None: + cache_state = "no_cache_telemetry" + elif rate < cold_start_rate: + cache_state = "cold_start" + elif rate >= warm_rate: + cache_state = "warm" + else: + cache_state = "partial" + return { + "cached_tokens": int(cached_tokens), + "input_tokens": int(input_tokens), + "cache_telemetry_events": telemetry_events, + "cache_state": cache_state, + "cold_start_rate": cold_start_rate, + "warm_rate": warm_rate, + "fail_on_missing_telemetry": fail_on_missing_telemetry, + } + + evaluator = SystemEvaluator(name="context_cache_hit_rate_evaluator") + evaluator.add_metric( + "context_cache_hit_rate", + _score, + threshold=min_hit_rate, + observed_fn=_rate, + budget=min_hit_rate, + detail_fn=_details, + ) + return evaluator + + +# Keep alias for backward compatibility +CodeEvaluator = SystemEvaluator + + +# ------------------------------------------------------------------ # +# LLM-as-Judge Evaluator # +# ------------------------------------------------------------------ # + + +_CORRECTNESS_PROMPT = """\ +You are evaluating an AI agent's response for correctness. + +## Conversation Trace +{trace_text} + +## Final Agent Response +{final_response} + +## Instructions +Score the response on a scale of 1 to 10 for correctness: Did the \ +agent provide an accurate, factual response that addresses the \ +user's request? + +Respond with ONLY a valid JSON object: +{{"correctness": , "justification": ""}} +""" + +_HALLUCINATION_PROMPT = """\ +You are evaluating an AI agent's response for hallucination. + +## Conversation Trace +{trace_text} + +## Final Agent Response +{final_response} + +## Instructions +Score the response on a scale of 1 to 10 for faithfulness (where \ +10 means NO hallucination). 
class LLMAsJudge:
  """Semantic evaluator using LLM-as-a-judge.

  Holds a list of criteria (prompt template, JSON score key, pass
  threshold). ``evaluate_session`` sends one prompt per criterion to the
  Gemini API through ``google-genai`` and turns the 1-10 answer into a
  0-1 score. Pre-built judges for correctness, hallucination
  (faithfulness) and sentiment are available as static factories.
  """

  def __init__(
      self,
      name: str = "llm_judge",
      criteria: Optional[list[_JudgeCriterion]] = None,
      model: Optional[str] = None,
  ) -> None:
    """Creates a judge.

    Args:
        name: Evaluator name.
        criteria: Initial criteria. The list is copied, so later
          ``add_criterion`` calls never mutate the caller's list.
        model: Model to use for judging; defaults to
          ``"gemini-2.5-flash"`` when omitted or empty.
    """
    self.name = name
    self._criteria: list[_JudgeCriterion] = list(criteria) if criteria else []
    self.model = model or "gemini-2.5-flash"

  def add_criterion(
      self,
      name: str,
      prompt_template: str,
      score_key: str,
      threshold: float = 0.5,
  ) -> LLMAsJudge:
    """Adds a custom evaluation criterion.

    Args:
        name: Criterion name.
        prompt_template: Prompt with {trace_text} and
          {final_response} placeholders.
        score_key: JSON key in LLM response containing score.
        threshold: Pass/fail threshold (0-1 scale).

    Returns:
        Self for chaining.
    """
    self._criteria.append(
        _JudgeCriterion(
            name=name,
            prompt_template=prompt_template,
            score_key=score_key,
            threshold=threshold,
        )
    )
    return self

  async def evaluate_session(
      self,
      trace_text: str,
      final_response: str,
  ) -> SessionScore:
    """Evaluates a session using the LLM judge.

    Criteria are judged one after another. The session passes only if
    every criterion's score reaches its threshold.

    Args:
        trace_text: Formatted trace text.
        final_response: Final agent response.

    Returns:
        SessionScore with LLM-judged scores. ``session_id`` is left
        empty; ``llm_feedback`` joins the non-empty per-criterion
        feedback lines, or is ``None`` when there are none.
    """
    scores: dict[str, float] = {}
    feedback_parts: list[str] = []
    passed = True

    for criterion in self._criteria:
      score, feedback = await self._judge_criterion(
          criterion,
          trace_text,
          final_response,
      )
      scores[criterion.name] = score
      if feedback:
        feedback_parts.append(f"{criterion.name}: {feedback}")
      if score < criterion.threshold:
        passed = False

    return SessionScore(
        session_id="",
        scores=scores,
        passed=passed,
        llm_feedback="\n".join(feedback_parts) or None,
    )

  @staticmethod
  def _normalize_score(raw: Any) -> float:
    """Maps a raw 1-10 judge score onto the 0-1 scale used by thresholds.

    The result is clamped to ``[0.0, 1.0]`` so an out-of-range answer
    (e.g. ``15``) cannot exceed the scale; NaN becomes ``0.0``.

    Raises:
        TypeError, ValueError: If ``raw`` cannot be converted to float.
    """
    value = float(raw) / 10.0
    if value != value:  # NaN: ``min(1.0, nan)`` would wrongly yield 1.0.
      return 0.0
    return max(0.0, min(1.0, value))

  async def _judge_criterion(
      self,
      criterion: _JudgeCriterion,
      trace_text: str,
      final_response: str,
  ) -> tuple[float, str]:
    """Evaluates one criterion via LLM call.

    Returns:
        ``(score, feedback)``. The score is the normalized 0-1 value
        when the reply is a JSON object containing ``score_key``;
        otherwise ``0.0`` with the raw reply text as feedback. Any
        failure (missing ``google-genai``, API error, unparsable score)
        is logged and reported as ``0.0`` with the error text.
    """
    prompt = criterion.prompt_template.format(
        trace_text=trace_text,
        final_response=final_response or "No response.",
    )

    try:
      # Imported lazily so the module stays importable without the
      # optional dependency.
      from google import genai
      from google.genai import types

      client = genai.Client()
      response = await client.aio.models.generate_content(
          model=self.model,
          contents=prompt,
          config=types.GenerateContentConfig(
              temperature=0.1,
              max_output_tokens=2048,
          ),
      )

      # NOTE(review): ``response.text`` appears to be None when the
      # reply carries no text part; treat that as an empty reply
      # rather than relying on an AttributeError.
      text = (response.text or "").strip()
      if not text:
        logger.warning(
            "LLM judge returned no text for criterion %s", criterion.name
        )
      result = _parse_json_from_text(text)

      if isinstance(result, dict) and criterion.score_key in result:
        score = self._normalize_score(result[criterion.score_key])
        justification = result.get("justification", "")
        return score, "" if justification is None else str(justification)

      return 0.0, text

    except ImportError:
      logger.warning("google-genai not installed, skipping LLM judge.")
      return 0.0, "google-genai not installed"
    except Exception as e:  # pylint: disable=broad-except
      logger.warning("LLM judge failed: %s", e)
      return 0.0, str(e)

  # ---- Pre-built evaluators ---- #

  @staticmethod
  def correctness(
      threshold: float = 0.5,
      model: Optional[str] = None,
  ) -> LLMAsJudge:
    """Pre-built correctness evaluator.

    Args:
        threshold: Minimum score to pass (0-1).
        model: LLM model to use for judging.

    Returns:
        LLMAsJudge configured for correctness.
    """
    judge = LLMAsJudge(
        name="correctness_judge",
        model=model,
    )
    judge.add_criterion(
        name="correctness",
        prompt_template=_CORRECTNESS_PROMPT,
        score_key="correctness",
        threshold=threshold,
    )
    return judge

  @staticmethod
  def hallucination(
      threshold: float = 0.5,
      model: Optional[str] = None,
  ) -> LLMAsJudge:
    """Pre-built hallucination (faithfulness) evaluator.

    Args:
        threshold: Minimum faithfulness score to pass (0-1).
        model: LLM model to use for judging.

    Returns:
        LLMAsJudge configured for hallucination detection.
    """
    judge = LLMAsJudge(
        name="hallucination_judge",
        model=model,
    )
    judge.add_criterion(
        name="faithfulness",
        prompt_template=_HALLUCINATION_PROMPT,
        score_key="faithfulness",
        threshold=threshold,
    )
    return judge

  @staticmethod
  def sentiment(
      threshold: float = 0.5,
      model: Optional[str] = None,
  ) -> LLMAsJudge:
    """Pre-built sentiment evaluator.

    Args:
        threshold: Minimum sentiment score to pass (0-1).
        model: LLM model to use for judging.

    Returns:
        LLMAsJudge configured for sentiment analysis.
    """
    judge = LLMAsJudge(
        name="sentiment_judge",
        model=model,
    )
    judge.add_criterion(
        name="sentiment",
        prompt_template=_SENTIMENT_PROMPT,
        score_key="sentiment",
        threshold=threshold,
    )
    return judge
AS INT64), + SAFE_CAST(JSON_VALUE( + attributes, '$.context_cache_metadata.cached_tokens' + ) AS INT64), + SAFE_CAST(JSON_VALUE( + attributes, '$.cache_metadata.cached_content_token_count' + ) AS INT64), + SAFE_CAST(JSON_VALUE( + attributes, '$.cache_metadata.cached_tokens' + ) AS INT64), + SAFE_CAST(JSON_VALUE( + attributes, '$.cached_content_token_count' + ) AS INT64), + SAFE_CAST(JSON_VALUE( + attributes, '$.cached_tokens' + ) AS INT64), + SAFE_CAST(JSON_VALUE( + content, '$.usage_metadata.cached_content_token_count' + ) AS INT64), + SAFE_CAST(JSON_VALUE( + content, '$.context_cache_metadata.cached_content_token_count' + ) AS INT64), + SAFE_CAST(JSON_VALUE( + content, '$.context_cache_metadata.cached_tokens' + ) AS INT64), + SAFE_CAST(JSON_VALUE( + content, '$.usage.cached_tokens' + ) AS INT64), + SAFE_CAST(JSON_VALUE( + content, '$.usage.prompt_tokens_details.cached_tokens' + ) AS INT64), + 0 + )) AS cached_tokens, + COUNTIF(COALESCE( + JSON_VALUE( + attributes, '$.usage_metadata.cached_content_token_count' + ), + JSON_VALUE( + attributes, '$.context_cache_metadata.cached_content_token_count' + ), + JSON_VALUE( + attributes, '$.context_cache_metadata.cached_tokens' + ), + JSON_VALUE( + attributes, '$.cache_metadata.cached_content_token_count' + ), + JSON_VALUE( + attributes, '$.cache_metadata.cached_tokens' + ), + JSON_VALUE( + attributes, '$.cached_content_token_count' + ), + JSON_VALUE( + attributes, '$.cached_tokens' + ), + JSON_VALUE( + content, '$.usage_metadata.cached_content_token_count' + ), + JSON_VALUE( + content, '$.context_cache_metadata.cached_content_token_count' + ), + JSON_VALUE( + content, '$.context_cache_metadata.cached_tokens' + ), + JSON_VALUE( + content, '$.usage.cached_tokens' + ), + JSON_VALUE( + content, '$.usage.prompt_tokens_details.cached_tokens' + ) + ) IS NOT NULL) AS cache_telemetry_events, + SUM(COALESCE( + CAST(JSON_VALUE( + attributes, '$.usage_metadata.total_token_count' + ) AS INT64), + CAST(JSON_VALUE( + content, 
'$.usage.total' + ) AS INT64), + COALESCE( + CAST(JSON_VALUE( + attributes, '$.input_tokens' + ) AS INT64), 0 + ) + COALESCE( + CAST(JSON_VALUE( + attributes, '$.output_tokens' + ) AS INT64), 0 + ) + )) AS total_tokens +FROM `{project}.{dataset}.{table}` +WHERE {where} +GROUP BY session_id +LIMIT @trace_limit +""" + +_AI_GENERATE_JUDGE_BATCH_QUERY_TEMPLATE = """\ +WITH session_traces AS ( + SELECT + session_id, + STRING_AGG( + CONCAT( + event_type, ': ', + COALESCE( + JSON_VALUE(content, '$.text_summary'), '' + ) + ), + '\\n' ORDER BY timestamp + ) AS trace_text, + ARRAY_AGG( + JSON_VALUE(content, '$.response') + IGNORE NULLS + ORDER BY timestamp DESC + LIMIT 1 + )[SAFE_OFFSET(0)] AS final_response + FROM `{project}.{dataset}.{table}` + WHERE {where} + GROUP BY session_id + HAVING LENGTH(trace_text) > 10 + LIMIT @trace_limit +) +SELECT + session_id, + trace_text, + final_response, + gen.score AS score, + gen.justification AS justification, + gen.status AS gen_status +FROM ( + SELECT + session_id, + trace_text, + final_response, + AI.GENERATE( + -- The Python prompt template is rebuilt at SQL time: + -- prefix ++ trace_text ++ middle ++ final_response ++ suffix + -- Each segment is a separate query parameter so AI.GENERATE + -- sees the exact full Python template (including the + -- per-criterion output-format spec) the API-fallback path uses. + prompt => CONCAT( + @judge_prompt_prefix, trace_text, + @judge_prompt_middle, COALESCE(final_response, 'N/A'), + @judge_prompt_suffix + ), + endpoint => '{endpoint}',{connection_arg} + model_params => JSON '{{"generationConfig": {{"temperature": 0.1, "maxOutputTokens": 1024}}}}', + output_schema => 'score INT64, justification STRING' + ) AS gen + FROM session_traces +) +""" + + +def render_ai_generate_judge_query( + *, + project: str, + dataset: str, + table: str, + where: str, + endpoint: str, + connection_id: Optional[str] = None, +) -> str: + """Render the AI.GENERATE judge batch query for a given config. 
+ + ``AI.GENERATE`` is BigQuery's scalar generative function (it returns a + ``STRUCT`` shaped + by ``output_schema``). The function call lives inside a regular + ``SELECT`` — it is *not* a table-valued function, so the surrounding + ``FROM session_traces, AI.GENERATE(...)`` lateral-join syntax used + by older SDK versions does not parse against current BigQuery. + + ``connection_id`` is optional. When supplied (e.g. + ``"us.bqaa_ai_generate"``) the call uses that connection's service + account; when omitted, AI.GENERATE runs against the end-user + credentials of whichever account submits the job. Both shapes are + documented forms of the same function. + """ + if connection_id: + connection_arg = f"\n connection_id => '{connection_id}'," + else: + connection_arg = "" + return _AI_GENERATE_JUDGE_BATCH_QUERY_TEMPLATE.format( + project=project, + dataset=dataset, + table=table, + where=where, + endpoint=endpoint, + connection_arg=connection_arg, + ) + + +# Public alias kept for downstream code that imports the raw template +# string (e.g. for inspection / docs). Callers building queries should +# use ``render_ai_generate_judge_query`` instead so the optional +# ``connection_id`` arg is wired correctly. +AI_GENERATE_JUDGE_BATCH_QUERY = _AI_GENERATE_JUDGE_BATCH_QUERY_TEMPLATE + +# Legacy template kept for backward compatibility with pre-created +# BQ ML models. 
# Stand-ins substituted for the placeholders so the formatted template
# can be cut at the exact positions where the per-session values go.
_TRACE_SENTINEL = "\x00__BQAA_JUDGE_TRACE__\x00"
_RESPONSE_SENTINEL = "\x00__BQAA_JUDGE_RESPONSE__\x00"


def split_judge_prompt_template(prompt_template: str) -> tuple[str, str, str]:
  """Split a Python judge prompt into ``(prefix, middle, suffix)``.

  The Python prompt template uses ``{trace_text}`` and
  ``{final_response}`` placeholders (in that order). SQL-side prompt
  construction cannot use ``str.format``; it concatenates
  ``prefix ++ trace_text ++ middle ++ final_response ++ suffix``. This
  helper returns the three literal segments for that concatenation.

  The template is passed through ``str.format`` in every branch, so
  escaped braces (``{{`` / ``}}``, e.g. in a JSON output spec) come out
  as single braces exactly as they would from
  ``prompt_template.format(...)``.

  Args:
      prompt_template: The Python prompt template, expected to contain
        ``{trace_text}`` and ``{final_response}`` in that order.

  Returns:
      ``(prefix, middle, suffix)``. When both placeholders are present,
      ``prefix + trace + middle + response + suffix`` equals
      ``prompt_template.format(trace_text=trace, final_response=response)``.
      When a placeholder is missing, a labeled section is synthesized
      for the missing input with the label placed immediately *before*
      the injected value:

      * neither placeholder: ``Trace:`` and ``Response:`` blocks are
        appended after the instructions;
      * only ``{final_response}``: a ``Trace:`` block is inserted right
        before the response;
      * only ``{trace_text}``: a ``Response:`` block is appended after
        the template's tail.

      Reversed or repeated placeholders are not supported: the extra
      occurrence stays in the returned segments as an internal sentinel.

  Raises:
      KeyError, IndexError, ValueError: If a template that contains one
        of the two placeholders also contains other fields or malformed
        braces that ``str.format`` rejects.
  """
  has_trace = "{trace_text}" in prompt_template
  has_response = "{final_response}" in prompt_template

  if not has_trace and not has_response:
    # No placeholders at all. Un-escape ``{{``/``}}`` like the other
    # branches do; if the text is not a valid format string (stray
    # braces, unknown fields) keep it verbatim rather than failing.
    try:
      instructions = prompt_template.format()
    except (IndexError, KeyError, ValueError):
      instructions = prompt_template
    return (
        instructions + "\nTrace:\n",
        "\nResponse:\n",
        "",
    )

  if not has_trace:
    # final_response placeholder only. Inject a labeled trace block
    # right before the response so the label sits next to the trace.
    formatted = prompt_template.format(final_response=_RESPONSE_SENTINEL)
    before_response, _, after_response = formatted.partition(_RESPONSE_SENTINEL)
    return (
        before_response + "\nTrace:\n",
        "\n",
        after_response,
    )

  if not has_response:
    # trace_text placeholder only. Append a labeled response block
    # after the template's tail so the label precedes the response.
    formatted = prompt_template.format(trace_text=_TRACE_SENTINEL)
    prefix, _, after_trace = formatted.partition(_TRACE_SENTINEL)
    return (
        prefix,
        after_trace + "\nResponse:\n",
        "",
    )

  formatted = prompt_template.format(
      trace_text=_TRACE_SENTINEL,
      final_response=_RESPONSE_SENTINEL,
  )
  prefix, _, rest = formatted.partition(_TRACE_SENTINEL)
  middle, _, suffix = rest.partition(_RESPONSE_SENTINEL)
  return prefix, middle, suffix
def _parse_json_from_text(text: str) -> Optional[dict[str, Any]]:
  """Extracts and parses a JSON object from LLM response text.

  Markdown code fences are stripped first. If the remaining text is a
  complete JSON document, it is returned when it is an object. If the
  text is not valid JSON as a whole, it is scanned for the first ``{``
  from which a JSON object can be decoded, so objects embedded in
  surrounding prose are found even when earlier braces are not JSON or
  when string values contain ``{`` / ``}``.

  Args:
      text: Raw model output.

  Returns:
      The decoded JSON object, or ``None`` when ``text`` is empty,
      contains no decodable object, or is valid JSON of another type
      (list, number, string, ...).
  """
  if not text:
    return None

  # Strip markdown fences first
  stripped = strip_markdown_fences(text)
  try:
    parsed = json.loads(stripped)
  except (json.JSONDecodeError, TypeError):
    pass
  else:
    # Honor the declared contract: only JSON objects are returned.
    return parsed if isinstance(parsed, dict) else None

  if not stripped:
    return None

  # ``raw_decode`` parses one JSON value starting at an index and
  # ignores trailing text; unlike brace counting it understands string
  # literals, so braces inside values cannot cut the object short.
  decoder = json.JSONDecoder()
  start = stripped.find("{")
  while start != -1:
    try:
      candidate, _ = decoder.raw_decode(stripped, start)
    except json.JSONDecodeError:
      candidate = None
    if isinstance(candidate, dict):
      return candidate
    start = stripped.find("{", start + 1)

  return None