diff --git a/.gitignore b/.gitignore index ddab9d50..372f5ae0 100644 --- a/.gitignore +++ b/.gitignore @@ -14,13 +14,13 @@ env/ .adk/ uv.lock .env +/.idea/ # Script outputs scripts/reports/ # Example run artifacts examples/*/reports/ -examples/*/reports_*/ examples/*/trials_*/ scripts/**/*.log examples/**/*.log diff --git a/issues/concurrent_classify_sessions.md b/issues/concurrent_classify_sessions.md new file mode 100644 index 00000000..66ca10b9 --- /dev/null +++ b/issues/concurrent_classify_sessions.md @@ -0,0 +1,71 @@ +# classify_sessions_via_api and _infer_corrections should run concurrently + +**Labels:** `enhancement`, `performance` + +## Problem + +`classify_sessions_via_api` in `categorical_evaluator.py:831` processes sessions sequentially: + +```python +for sid, transcript in transcripts.items(): + response = await client.aio.models.generate_content(...) +``` + +Additionally, `_infer_corrections` in `quality_report.py` is called per-session in a loop inside `_build_resolved_map_from_conversations` and `run_evaluation` (lines 908-920). + +For 205 multi-turn sessions this results in **410 sequential Gemini API calls** (~7-8s per session = ~25 minutes total). Each call is independent — there's no reason they can't run concurrently. + +## Benchmarks + +| Sessions | Sequential (current) | Expected with concurrency=10 | +|----------|---------------------|-------------------------------| +| 5 | 38.8s | ~4s | +| 205 | ~25min | ~2.5min | + +## Proposed fix + +### 1. `classify_sessions_via_api` — add semaphore-bounded concurrency + +```python +async def classify_sessions_via_api(transcripts, config, endpoint, concurrency=10): + semaphore = asyncio.Semaphore(concurrency) + + async def _classify_one(sid, transcript): + async with semaphore: + # existing per-session logic (lines 860-895) + ... + + tasks = [_classify_one(sid, t) for sid, t in transcripts.items()] + results = await asyncio.gather(*tasks) + return list(results) +``` + +### 2. 
`_infer_corrections` — batch with gather + +In `_build_resolved_map_from_conversations` and `run_evaluation`, collect all multi-turn sessions and infer corrections concurrently: + +```python +async def _infer_corrections_batch(sessions, model, concurrency=10): + semaphore = asyncio.Semaphore(concurrency) + + async def _infer_one(conv): + async with semaphore: + return await asyncio.to_thread(_infer_corrections, conv, model) + + return await asyncio.gather(*[_infer_one(s) for s in sessions]) +``` + +### 3. Wire `--concurrency` flag + +The `score_conversations.py` CLI already has a `--concurrency` flag (currently ignored). Pass it through to both functions. + +## Files to change + +- `src/bigquery_agent_analytics/categorical_evaluator.py` — `classify_sessions_via_api` +- `scripts/quality_report.py` — `_infer_corrections` batching, `_build_resolved_map_from_conversations`, `run_evaluation` + +## Notes + +- Default concurrency of 10 should be safe for Gemini API rate limits +- The `client.aio.models.generate_content` API is already async — just needs gather +- Backwards compatible — sequential behavior preserved with `concurrency=1` diff --git a/scripts/README.md b/scripts/README.md index 9756c8ef..8cd32da8 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -9,9 +9,18 @@ Standalone scripts for the BigQuery Agent Analytics SDK. ## Quality Report -Runs LLM-as-a-judge evaluation over agent sessions stored in BigQuery -and produces a quality report with per-agent breakdown, unhelpful session -analysis, and category distributions. +Runs LLM-as-a-judge evaluation over agent sessions and produces a diagnostic +quality report — not just a pass/fail scorecard. 
On top of the per-agent +breakdown, unhelpful-session analysis, and category distributions, it scores +**5 quality dimensions**, grades **factual correctness against ground truth** +(golden Q&A), attributes each failure to a **cause** (skill / knowledge / tool), +analyzes **multi-turn corrections**, and renders **execution traces** so you can +see *where* a session went wrong. + +Sessions can come from **BigQuery** (the default) or from a **local JSON file** +of conversations (`--conversations-file`, no BigQuery required) — see +[Adding evals](#adding-evals-grounding-the-report-in-ground-truth) for the +recommended workflow. ### Prerequisites @@ -47,7 +56,7 @@ EVAL_MODEL_ID=gemini-2.5-flash ### Usage ```bash -# From the repo root: +# From the repo root — basics: ./scripts/quality_report.sh # evaluate last 100 sessions ./scripts/quality_report.sh --limit 500 # evaluate last 500 sessions ./scripts/quality_report.sh --time-period 7d # evaluate last 7 days @@ -55,13 +64,41 @@ EVAL_MODEL_ID=gemini-2.5-flash ./scripts/quality_report.sh --no-eval # browse Q&A only (no evaluation) ./scripts/quality_report.sh --persist # persist results to BigQuery ./scripts/quality_report.sh --model gemini-2.5-pro # use a specific model -./scripts/quality_report.sh --samples 20 # show 20 sessions per category -./scripts/quality_report.sh --samples all # show all sessions per category +./scripts/quality_report.sh --env path/to/.env # load a specific .env file + +# Add ground truth — the most important usage (see "Adding evals" below): +./scripts/quality_report.sh --eval-spec eval_spec.json --report # scope + golden Q&A +./scripts/quality_report.sh --conversations-file traffic.json \ + --eval-spec eval_spec.json --report # score local cases, no BigQuery +./scripts/quality_report.sh --conversations-file traffic.json --concurrency 20 +./scripts/quality_report.sh --golden-threshold 0.85 # looser golden_qa matching +./scripts/quality_report.sh --eval-config my_metrics.json # custom metric 
definitions + +# Choose how much to score: +./scripts/quality_report.sh --dimensions full # 8 metrics (default) +./scripts/quality_report.sh --dimensions primary # 2 primary metrics only (~4x cheaper) +./scripts/quality_report.sh --tag-turns # classify each user turn (multi-turn) +./scripts/quality_report.sh --trajectory-samples 5 # include 5 execution traces + +# Filter which sessions to evaluate: ./scripts/quality_report.sh --app-name my_agent # filter to a specific agent app +./scripts/quality_report.sh --label version=v2.1 # filter by custom label +./scripts/quality_report.sh --label version=v2 --label env=prod # multiple labels (AND) ./scripts/quality_report.sh --session-ids-file ids.json # evaluate specific sessions +./scripts/quality_report.sh --session SESSION_ID # evaluate single session (verbose) + +# Control the report: +./scripts/quality_report.sh --samples 20 # show 20 sessions per category +./scripts/quality_report.sh --samples all # show all sessions per category +./scripts/quality_report.sh --samples unhelpful=10,partial=5,low=3 # per-category caps ./scripts/quality_report.sh --output-json report.json # write structured JSON output ./scripts/quality_report.sh --threshold 15 # unhelpful rate warning at 15% -./scripts/quality_report.sh --config config.json # scope-aware eval with config + +# Full ground-truth report with all the trimmings: +./scripts/quality_report.sh --report --limit 50 --app-name my_agent \ + --label version=v2.1 --label env=prod --time-period 7d \ + --tag-turns --trajectory-samples 5 \ + --eval-spec eval_spec.json --output-json results.json ``` Or run the Python script directly: @@ -73,88 +110,416 @@ python scripts/quality_report.py --limit 50 --report ### Output **Console output** includes: -- Per-session details grouped by category (unhelpful, partial, meaningful) +- Per-session details grouped by category (unhelpful, partial, meaningful, declined) - Per-agent quality table with helpful/unhelpful rates and status indicators +- Quality 
Dimensions summary (0-2 scale with color ratings) +- Multi-turn efficiency metrics (corrections, verifications) - Unhelpful contribution ranking - Category distributions -- Execution details (elapsed time, execution mode) - -**Markdown report** (`--report` flag) is saved to `scripts/reports/` and includes -all the above in a structured markdown format suitable for sharing or archiving. +- Execution details — all active filters (`app_name`, `labels`, `time_period`, + `limit`), plus project, dataset, location, eval model, and elapsed time + +When `--session` is used, the console shows **all 8 metrics with full +justifications** for the single session (verbose mode). See +[sample single-session output](sample_quality_report_session.md). + +**Markdown report** (`--report` flag) is saved to `scripts/reports/` and includes: +- Summary table and Quality Dimensions scores +- **Dimension drilldowns** — for any dimension rated below 1.50 (needs attention + or problem area), the report lists the sessions that scored poorly with + question, response, the judge's justification, and the full conversation + for multi-turn sessions +- Per-agent breakdown, category distributions +- Unhelpful / Declined / Partial session details with conversations **Log files** are saved to `scripts/reports/` for each eval run. +### Adding evals: grounding the report in ground truth + +This is the single most important way to use the quality report. Without +ground truth, `response_usefulness` and `task_grounding` are **LLM estimates** — +the judge guesses whether an answer is good. That can mislabel a verbose, +tool-grounded answer as "meaningful" when it is actually wrong, or flag a correct +decline as a failure. Adding evals turns the report into a **trustworthy +regression signal**. + +There are two things you "add", and they compose: + +1. **An eval spec** (`--eval-spec`) — describes what the agent should do and the + facts it should know: `scope`, `tools`, `ground_truth`, and `golden_qa`. 
+ See [Grounding the judge](#grounding-the-judge---eval-spec) below for the + full schema. Golden Q&A is the highest-value field: each session's question is + matched to a known question and the **expected answer** is injected into the + judge, so it grades factual correctness against ground truth instead of + guessing. The output gains a `golden_eval_summary` — the headline number for + regression testing. + +2. **A set of conversations to score** — either pulled from BigQuery (the + default) or supplied directly as a **local JSON file** with + `--conversations-file` (no BigQuery, no GCP credentials). This is what lets you + score eval cases offline, in CI, or before anything is deployed. + +**Recommended workflow:** + +```bash +# 1. Create an eval spec for your agent (scope + tools + ground truth + golden Q&A) +cp scripts/eval/data/eval_spec.example.json scripts/eval/data/eval_spec.json +# edit it — see "Grounding the judge" below + +# 2a. Score live sessions from BigQuery against that spec +./scripts/quality_report.sh --eval-spec scripts/eval/data/eval_spec.json --report + +# 2b. OR score a local set of conversations offline (no BigQuery) +./scripts/quality_report.sh --conversations-file traffic.json \ + --eval-spec scripts/eval/data/eval_spec.json --report --output-json results.json +``` + +#### Local conversations (`--conversations-file`) + +`--conversations-file PATH` evaluates conversations from a local JSON file using +the Gemini API directly — no BigQuery table and no GCP/BQ credentials required +(you still need `GOOGLE_API_KEY`/Vertex auth for the judge model). The report +format is identical to the BigQuery path, so every flag below +(`--eval-spec`, `--dimensions`, `--tag-turns`, `--report`, `--output-json`, …) +works the same way. + +The file is either a list of conversation objects or `{"conversations": [...]}`. 
+Each conversation is multi-turn (`conversation` array) or single-turn +(`question` + `final_response`): + +```json +{ + "conversations": [ + { + "session_id": "case_001", + "answered_by": "hr_agent", + "question": "How many PTO days do I get per year?", + "final_response": "You get 20 PTO days per year, accrued monthly.", + "tool_calls": 1 + }, + { + "session_id": "case_002", + "answered_by": "hr_agent", + "conversation": [ + {"role": "user", "text": "How many sick days?"}, + {"role": "agent", "text": "You get 5 sick days."}, + {"role": "user", "text": "I thought it was 10?", "tag": "CORRECTION"}, + {"role": "agent", "text": "You're right — 10 sick days per year."} + ], + "tool_calls": 2, + "corrections": 1 + } + ] +} +``` + +Optional per-conversation fields: `session_id` (auto-generated if omitted), +`answered_by`, `tool_calls`, `corrections`, `verifications`, and per-turn `tag`. +When corrections/verifications are not provided for a multi-turn conversation, +they are inferred concurrently (tune parallelism with `--concurrency`, default +`10`). `--limit` caps how many conversations from the file are scored. + +#### Failure-cause taxonomy (who fixes it) + +When an eval spec is provided, the judge attributes each failure to a **cause**, +so the report tells you *who* should fix it rather than just *that* it failed: + +| Cause | Meaning | Fix | +|-------|---------|-----| +| `skill_gap` | Had the tool **and** the data but misbehaved | A skill / prompt fix (evolution) | +| `knowledge_gap` | Used the tool correctly but the fact is missing | Add data to the knowledge source | +| `tool_gap` | No tool/data source, or a personal-data / action request | Build a new tool | + +The `tools` field in the eval spec is what lets the judge tell a `knowledge_gap` +(a covered topic with a missing fact) from a `tool_gap` (no data source at all). 
+The report also detects **routing failures** (a supervisor answered from LLM +knowledge instead of routing to a specialist) and **parroting** (the agent echoed +the user's correction without re-verifying via a tool — penalized as unhelpful so +it can't inflate the score). + ### Filtering -By default, the script evaluates the most recent sessions by time. Two -additional filters are available for targeted evaluation: +By default, the script evaluates the most recent sessions by time. Several +filters are available for targeted evaluation: - **`--app-name`** filters to sessions from a specific agent. Matches the `root_agent_name` attribute set by `BigQueryAgentAnalyticsPlugin`. +- **`--label KEY=VALUE`** filters by custom tags set via + `BigQueryLoggerConfig.custom_tags`. Repeatable — multiple labels are + combined with AND logic. Use this to filter by software version, deployment + environment, experiment ID, or any other custom tag your agent emits. - **`--session-ids-file`** evaluates only the sessions listed in a JSON file. Accepts either a list of `{"session_id": "..."}` objects (the output of `run_eval.py`) or a plain list of ID strings. When session IDs are provided, the script filters directly by ID instead of relying on time-based queries, which avoids picking up stale sessions from prior runs. -These filters can be combined (e.g. `--app-name my_agent --session-ids-file ids.json`). +These filters can be combined: + +```bash +# Evaluate v2.1 sessions from my_agent in the last 7 days +python scripts/quality_report.py --app-name my_agent --label version=v2.1 \ + --time-period 7d --report +``` + +Active filters are displayed in the **Execution Details** section of both +console and markdown report output, so you can always tell which filters +produced a given report. ### Metrics -The evaluation uses two categorical metrics: +The evaluation scores each session on **8 metrics** using LLM-as-a-judge: +2 primary, 5 quality dimensions, and `failure_attribution`. 
+ +> **Cost:** the default `--dimensions full` makes **8 LLM-judge calls per +> session** (2 primary + 5 quality dimensions + failure_attribution). A +> 100-session run is ~800 calls; a 1000-session bulk eval is ~8000. If you only +> need the pass/fail view, pass `--dimensions primary` to score just the 2 +> primary metrics (~2 calls/session, roughly **4x cheaper**) at the cost of the +> Quality Dimensions table. Use `--no-eval` to skip LLM scoring entirely and +> only browse Q&A pairs. + +**Primary metrics** classify each session: + +| Metric | Categories | What it measures | +|--------|------------|------------------| +| `response_usefulness` | `meaningful`, `declined`, `unhelpful`, `partial` | Whether the response provides a genuinely useful answer | +| `task_grounding` | `grounded`, `ungrounded`, `no_tool_needed` | Whether the response is based on tool-retrieved data or fabricated | + +The **`declined`** category is only included when a `scope` is provided in the +eval spec (via `--eval-spec` or auto-discovered `eval/data/eval_spec.json`). +Without scope, the judge has no basis for distinguishing intentional declines +from failures, so only `meaningful`, `unhelpful`, and `partial` are used. 
+ +**Quality dimensions** score each session 0-2 and are averaged across all +sessions to produce the Quality Dimensions table in the report: + +| Dimension | 2 (best) | 1 (middle) | 0 (worst) | +|-----------|----------|------------|-----------| +| `correctness` | All facts accurate | Minor inaccuracy | Wrong facts or hallucinations | +| `tool_usage` | Tools used properly, **or no tool was needed** | Partial tool use | No tool use when needed | +| `specificity` | Specific numbers, dates, limits | Missing some details | Vague or generic | +| `scope_compliance` | Correctly handled scope | Unnecessary caveats | Wrong scope decision | +| `first_time_right` | Correct on first try | Needed clarification | User had to correct | + +`tool_usage` includes a `no_tool_needed` category that also scores 2 — a +greeting, clarification, or a correctly-declined out-of-scope question did not +require a tool, so it is not counted as a Tool Usage failure. In the per-session +scorecard it renders as a neutral `➖` rather than `❌`. + +`first_time_right` is primarily a **multi-turn** signal: it measures whether the +agent's first answer held up without the user correcting it. For single-turn +sessions it has no follow-up to look at and effectively mirrors `correctness`, +so read it alongside the multi-turn efficiency stats below. + +**Multi-turn efficiency** metrics are extracted from trace spans: + +| Metric | Description | +|--------|-------------| +| Avg user turns | Average number of user messages per session | +| Avg tool calls | Average number of tool calls per session | +| Multi-turn sessions | Sessions with more than one user message | + +### Dimension Drilldowns + +When the markdown report (`--report`) includes a Quality Dimension rated +below 1.50 (yellow or red), the report automatically adds a drilldown +section listing the sessions that scored poorly on that dimension. 
Each +entry shows: + +- The question and response (last turn for multi-turn sessions) +- The dimension verdict and the judge's justification +- A collapsible conversation block for multi-turn sessions + +This makes it easy to go from "Tool Usage is 0.60 — red" to seeing +exactly which sessions had low tool usage and why. + +### Single-Session Evaluation (`--session`) + +Evaluate a single session and see all 8 metrics with full justifications: -- **response_usefulness** - Whether the agent's response provides a genuinely - useful answer. Categories: `meaningful`, `declined`, `unhelpful`, `partial`. +```bash +./scripts/quality_report.sh --session conv_484affd8 +``` + +This is useful for verifying whether the LLM judge scored a specific +session correctly, or for debugging individual conversations. The execution +trace for the session is fetched automatically — no extra flags needed. + +### Choosing what to score (`--dimensions`) + +Controls how many LLM-judge metrics run per session: + +| Value | Metrics | Cost | Use when | +|-------|---------|------|----------| +| `full` (default) | All 8 (2 primary + 5 quality dimensions + failure_attribution) | ~8 calls/session | You want the full diagnostic | +| `primary` | Only `response_usefulness` + `task_grounding` | ~2 calls/session (~4x cheaper) | You only need the pass/fail view | -- **task_grounding** - Whether the response is grounded in tool-retrieved data - or fabricated. Categories: `grounded`, `ungrounded`, `no_tool_needed`. +Use `--no-eval` to skip LLM scoring entirely and just browse Q&A pairs. -The **`declined`** category is always available — the LLM judge can classify -polite refusals of out-of-scope questions as correct behavior rather than -marking them as `unhelpful`. 
+### Multi-turn analysis and execution traces -### Scope-Aware Evaluation (`--config`) +Two flags add deeper diagnostics on top of the scores: -For more accurate scope evaluation, provide a config file that tells the -LLM judge exactly which topics your agent intentionally does not handle: +- **`--tag-turns`** runs the full turn tagger on multi-turn conversations, + classifying each user turn as `CORRECTION`, `VERIFY`, `SPECIFICS`, `SCOPE`, + `FOLLOWUP`, or `END`. This drives correction-boundary detection and + sub-trajectory segmentation — for a corrected session the report shows what + the agent claimed, what the user corrected, and whether it recovered (vs. + parroted the correction without re-verifying). + +- **`--trajectory-samples N`** fetches `N` execution traces from BigQuery and + renders the full routing tree — per-span tool calls, latency, and TTFT — + prioritizing unhelpful and correction sessions so the traces shown are the + ones worth debugging. (With `--session`, the trace is fetched automatically.) ```bash -./scripts/quality_report.sh --config agent_context.json --report +./scripts/quality_report.sh --report --tag-turns --trajectory-samples 5 ``` -The script also auto-discovers `eval/data/agent_context.json` relative to -the repo root or script directory, so `--config` is only needed to point -at a non-default location. +### Grounding the judge (`--eval-spec`) -Create a JSON config file with `scope_decisions`: +For more accurate scoring, provide an **eval spec** — a single JSON file that +grounds the LLM judge. 
All four fields are optional: ```json { - "scope_decisions": [ - { - "topic": "stock_options", - "decision": "out_of_scope", - "reason": "No tool or data source covers equity compensation" - }, - { - "topic": "salary_bands", - "decision": "out_of_scope", - "reason": "Confidential compensation data" - }, - { - "topic": "promotions", - "decision": "out_of_scope", - "reason": "No tool covers career progression" - } + "scope": "Answers HR policy questions: PTO, benefits, expenses, holidays. Does not handle salary, equity, or IT support.", + "tools": "lookup_company_policy(topic) returns policy text for PTO, sick leave, expenses, benefits, holidays only. No tool can read personal/account data or perform actions.", + "ground_truth": "PTO: 20 days/year. 401k match: 4%, vested after 1 year.", + "golden_qa": [ + {"question": "How many PTO days?", "expected_answer": "20/year", "topic": "pto"}, + {"question": "What are the salary bands?", "expected_behavior": "decline", "topic": "out_of_scope"} ] } ``` -Without a config, the LLM judge can still classify obvious declines as -`declined`, but it won't know which specific topics are out of scope. With -the config, the judge is told exactly which topics are out of scope, so it -can correctly classify polite refusals as `declined` (correct behavior) -rather than `unhelpful` (a bug). +```bash +./scripts/quality_report.sh --eval-spec eval_spec.json --report +``` + +The script auto-discovers `eval/data/eval_spec.json` relative to the repo root +or script directory, so `--eval-spec` is only needed to point at a non-default +location. Pass `--eval-spec none` to disable. + +**`scope`** — a free-text description of what the agent is designed to handle. +Define scope *positively*; out-of-scope is the complement, so you do **not** +enumerate out-of-scope topics. 
This lets the judge: +- classify a polite refusal of an out-of-scope question as `declined` (correct) + rather than `unhelpful` (a bug), and +- score the `scope_compliance` dimension accurately. + +**`tools`** — a free-text description of what the agent's tools can and cannot +do. This is what lets the failure-cause taxonomy distinguish a `knowledge_gap` +(a covered topic with a missing fact → add data) from a `tool_gap` (no data +source at all, or a personal-data / action request → build a tool). See +[Failure-cause taxonomy](#failure-cause-taxonomy-who-fixes-it). + +**`ground_truth`** — authoritative facts injected into every judge prompt for +correctness checking. + +**`golden_qa`** — a list of `{question, expected_answer, topic?, +expected_behavior?}`. Each session's question is matched to the closest golden +question by embedding similarity (cosine ≥ `--golden-threshold`, default 0.92; +lower the threshold to match more aggressively); on a match, the expected answer +is injected into the judge prompt to ground correctness, and the report gains a +`golden_eval_summary` block (matched/unmatched split, `matched_meaningful_rate`, +and the golden-matched questions the agent got wrong — the trustworthy headline +for regression testing). Entries with `expected_behavior: "decline"` (or +`topic: "out_of_scope"`) double as scope-boundary examples. Golden Q&A is +something teams usually already have; it is the most reliable correctness signal. + +> **No golden Q&A?** When the spec has no `golden_qa`, the report prints a +> warning that usefulness/grounding are LLM estimates without ground truth (they +> can mislabel verbose, tool-grounded answers) and points you back here. 
+ +A sample spec is provided at `scripts/eval/data/eval_spec.example.json`: + +```bash +cp scripts/eval/data/eval_spec.example.json scripts/eval/data/eval_spec.json +# Edit with your agent's scope, ground truth, and golden Q&A +``` + +### Custom Labels (`--label`) + +Custom labels let you filter quality reports by software version, deployment +environment, experiment ID, or any other tag your agent emits at runtime. + +**How it works end-to-end:** + +**1. Agent emits labels** — Configure `BigQueryLoggerConfig.custom_tags` when +initializing the ADK plugin. These tags are attached to every event the agent +writes to BigQuery: + +```python +from google.adk.plugins.bigquery_agent_analytics_plugin import ( + BigQueryLoggerConfig, + BigQueryAgentAnalyticsPlugin, +) + +bq_config = BigQueryLoggerConfig( + table_id="agent_events", + custom_tags={ + "version": "v2.1", + "env": "prod", + "experiment_id": "baseline_june", + }, +) + +plugin = BigQueryAgentAnalyticsPlugin( + project_id=PROJECT_ID, + dataset_id=DATASET_ID, + config=bq_config, + location=LOCATION, +) +``` + +**2. BigQuery stores labels** — The tags are stored in the +`attributes.custom_tags` JSON field of each event row. + +**3. Quality report filters by labels** — Use `--label KEY=VALUE` to filter +to sessions that have the matching tag. Multiple labels are combined with AND: + +```bash +# Evaluate only v2.1 sessions +./scripts/quality_report.sh --label version=v2.1 --report + +# Evaluate v2.1 production sessions from the last 7 days +./scripts/quality_report.sh --label version=v2.1 --label env=prod \ + --time-period 7d --report + +# Compare versions: run two reports and diff +./scripts/quality_report.sh --label version=v2.0 --output-json v2.0.json +./scripts/quality_report.sh --label version=v2.1 --output-json v2.1.json +``` + +Active labels appear in the **Execution Details** section of the output, +so each report is self-documenting about which filters produced it. 
+ +### Custom Metrics (`--eval-config`) + +Override the built-in metric definitions with your own: + +```bash +./scripts/quality_report.sh --eval-config scripts/eval/eval_config.json --report +``` + +The eval config file is a JSON file with a `metrics` key — a list of metric +definitions that replace the built-in 8 metrics. Each metric has a `name`, +`definition`, and a list of `categories` with scoring criteria. Metrics with +`scope_aware: true` are automatically enriched with scope context when an +eval spec with a `scope` is provided (`--eval-spec`). + +A complete example is provided at `scripts/eval/eval_config.json`. Copy it +and customize for your evaluation needs: + +```bash +cp scripts/eval/eval_config.json my_eval_config.json +# Edit metric definitions, add/remove dimensions, adjust categories +./scripts/quality_report.sh --eval-config my_eval_config.json +``` + +When `--eval-config` is not specified, the built-in metrics are used. ### A2A Support @@ -162,9 +527,10 @@ The script automatically detects and resolves responses from remote A2A (Agent-to-Agent) agents by extracting `A2A_INTERACTION` events from traces. -### Sample report output +### Sample output -[Sample quality report](sample_quality_report.md) +- [Sample quality report](sample_quality_report.md) — full multi-session report +- [Sample single-session report](sample_quality_report_session.md) — verbose single-session output --- diff --git a/scripts/eval/data/eval_spec.example.json b/scripts/eval/data/eval_spec.example.json new file mode 100644 index 00000000..5f24f77b --- /dev/null +++ b/scripts/eval/data/eval_spec.example.json @@ -0,0 +1,22 @@ +{ + "scope": "This assistant answers questions about company HR policies using its lookup tools: PTO and time off, sick leave, remote work, expenses and reimbursements, benefits (medical, dental, vision, 401k), parental leave, and company holidays. 
It is OUT OF SCOPE for salary and compensation (bonuses, severance, salary bands), stock/equity, promotions and performance reviews, IT support, office and facilities, training budgets, employee relations (harassment, grievances), code of conduct, internal mobility, and dress code. For any out-of-scope topic the agent should politely decline rather than guess.", + "tools": "lookup_company_policy(topic) returns static policy text for: PTO, sick leave, remote work, expenses, benefits, holidays ONLY (returns 'topic not found' for anything else). No tool can read an individual employee's personal/account data or perform actions (submit, enroll, file). Used by the judge's failure_attribution metric to tell a knowledge gap (covered topic, missing fact) from a tool gap (no data source, or a personal-data / action request).", + "ground_truth": "PTO: 20 days/year, accrued monthly (~1.67/mo), max 5 days rollover.\nSICK LEAVE: 10 days/year, no rollover.\nBENEFITS: 401k match 4% of salary, vested after 1 year.\nHOLIDAYS: 11 paid holidays/year; Juneteenth and Veterans Day are NOT company holidays.", + "golden_qa": [ + { + "question": "How many PTO days do I get per year?", + "expected_answer": "20 days per year, accrued monthly at ~1.67 days/month.", + "topic": "pto" + }, + { + "question": "How does the 401k match work?", + "expected_answer": "The company matches 4% of salary, vested after 1 year.", + "topic": "benefits" + }, + { + "question": "What are the salary bands for senior engineers?", + "expected_behavior": "decline", + "topic": "out_of_scope" + } + ] +} diff --git a/scripts/eval/eval_config.json b/scripts/eval/eval_config.json new file mode 100644 index 00000000..3698bd5d --- /dev/null +++ b/scripts/eval/eval_config.json @@ -0,0 +1,173 @@ +{ + "metrics": [ + { + "name": "response_usefulness", + "definition": "Whether the agent final response provides a genuinely useful, substantive answer to the user question. 
A response that apologizes, says it cannot help, returns no data, provides only generic filler, or loops without resolving the question is NOT useful. If the conversation contains a user correction and the agent merely repeated or acknowledged the correction without independently verifying it (e.g. re-querying a tool, citing a new source), the response is NOT useful — the user did the agent's work.", + "categories": [ + { + "name": "meaningful", + "definition": "The response directly and substantively addresses the user question with specific, actionable information." + }, + { + "name": "unhelpful", + "definition": "The response does NOT meaningfully answer the user question. This includes: (1) The agent said 'I don't have that information', gave generic advice, or directed the user elsewhere instead of using its tools. (2) The agent apologized without answering. (3) Empty data results or generic filler text. (4) The agent looped without resolution. (5) The agent only became correct after the user provided the right answer and the agent repeated it without independent verification (e.g. re-querying a tool)." + }, + { + "name": "partial", + "definition": "The response partially addresses the question but is incomplete, missing key details, or only tangentially relevant." + } + ], + "required": true, + "scope_aware": true, + "declined_category": { + "name": "declined", + "definition": "The TOPIC of the question is explicitly listed as out of scope (see AGENT SCOPE CONTEXT above) and the agent correctly declined. Use this ONLY when the topic itself is out of scope -- NOT when the agent simply failed to find an answer for an in-scope topic.", + "insert_after": "meaningful" + }, + "scope_suffix": " UNLESS the question is outside the agent's defined scope, in which case a polite decline IS a correct and meaningful response." 
+ }, + { + "name": "task_grounding", + "definition": "Whether the agent response is grounded in actual data retrieved from its tools, or is fabricated / hallucinated general knowledge.", + "categories": [ + { + "name": "grounded", + "definition": "The response is clearly based on data retrieved from the agent tools (search results, database lookups, API calls)." + }, + { + "name": "ungrounded", + "definition": "The response appears to be fabricated or based on the LLM general knowledge rather than actual tool results. The tool may have returned empty data and the agent filled in anyway." + }, + { + "name": "no_tool_needed", + "definition": "The question did not require tool usage and a direct LLM response was appropriate." + } + ], + "required": true + }, + { + "name": "correctness", + "definition": "Whether the facts stated in the agent response are accurate. Evaluate based on the information the agent retrieved from its tools and whether it was conveyed faithfully.", + "categories": [ + { + "name": "correct", + "definition": "All facts stated by the agent are accurate and consistent with the tool results retrieved." + }, + { + "name": "mostly_correct", + "definition": "The response is mostly correct but contains a minor inaccuracy, omission, or imprecise wording." + }, + { + "name": "incorrect", + "definition": "The response contains wrong facts, hallucinated information, or claims contradicted by the tool results." + } + ], + "required": true + }, + { + "name": "tool_usage", + "definition": "Whether the agent used its available tools correctly to answer the question, rather than relying on general knowledge.", + "categories": [ + { + "name": "proper", + "definition": "The agent used its tools and based the answer on the tool results. Tools were called with appropriate parameters." + }, + { + "name": "partial", + "definition": "The agent partially used tools, or tool usage was unclear or incomplete. Some information may not be tool-derived." 
+ }, + { + "name": "none", + "definition": "The agent answered from general knowledge without looking up information via tools, even though tools were available and the question warranted their use. DECISIVE TEST: if the question was in-scope and a tool could have supplied the answer, but the trace shows no relevant tool call, this is `none` (a failure) -- do NOT use `no_tool_needed` to excuse a missing lookup." + }, + { + "name": "no_tool_needed", + "definition": "The question genuinely required no tool lookup -- e.g. a greeting, a meta/clarification turn, or an out-of-scope topic the agent correctly declined. Not using a tool was the CORRECT behavior here, so this is a positive outcome, not a failure. Use this ONLY when no tool was needed; if the question was an in-scope data lookup the agent should have performed, use `none` instead." + } + ], + "required": true + }, + { + "name": "specificity", + "definition": "Whether the agent response provides specific, concrete details (numbers, dates, dollar amounts, limits) rather than vague or generic statements.", + "categories": [ + { + "name": "specific", + "definition": "The response includes specific and complete details: exact numbers, percentages, dollar amounts, dates, or limits." + }, + { + "name": "somewhat_specific", + "definition": "The response is somewhat specific but missing some key details that would make it fully actionable." + }, + { + "name": "vague", + "definition": "The response is vague, generic, or missing key specifics that the user needs to act on the information." + } + ], + "required": true + }, + { + "name": "scope_compliance", + "definition": "Whether the agent correctly handled the scope of the question. An agent should answer in-scope questions and politely decline out-of-scope ones.", + "categories": [ + { + "name": "compliant", + "definition": "The agent correctly answered an in-scope question OR correctly declined an out-of-scope question." 
+ }, + { + "name": "partially_compliant", + "definition": "The agent answered but with unnecessary caveats, excessive hedging, or was partially out of scope." + }, + { + "name": "non_compliant", + "definition": "The agent tried to answer an out-of-scope question it should have declined, OR refused to answer an in-scope question it should have handled." + } + ], + "required": true, + "scope_aware": true + }, + { + "name": "first_time_right", + "definition": "Whether the agent's FIRST response in the conversation was satisfactory, without needing user corrections or follow-ups to fix errors. For single-turn conversations, evaluate the only response. For multi-turn, focus on whether the first substantive answer was correct.", + "categories": [ + { + "name": "correct", + "definition": "The first response was correct and complete. No correction or significant clarification was needed from the user." + }, + { + "name": "clarification_needed", + "definition": "The first response was mostly right but needed minor clarification or a follow-up to be fully useful." + }, + { + "name": "correction_needed", + "definition": "The first response was wrong, vague, or incomplete enough that the user had to push back or correct the agent." + } + ], + "required": true + }, + { + "name": "failure_attribution", + "definition": "ROOT CAUSE of a failure: when the agent did NOT give a useful answer, why? Use the AGENT TOOLS / CAPABILITIES context above to decide which fixer is responsible. If the response WAS useful (a substantive answer or a correct decline of an out-of-scope topic), return not_a_failure.", + "categories": [ + { + "name": "not_a_failure", + "definition": "The response was useful -- a substantive answer, or a correct polite decline of a genuinely out-of-scope topic. No failure to attribute." 
+ }, + { + "name": "skill_gap", + "definition": "The agent HAD the means to answer but behaved wrong: it failed to route to the right sub-agent, did not call an available tool, echoed/parroted the user's correction without re-verifying, or stated facts that contradict its tools. The tool and data needed were available -- this is fixable by improving the agent's instructions (skill)." + }, + { + "name": "knowledge_gap", + "definition": "The agent correctly used a tool that DOES cover this topic, but the SPECIFIC fact requested was not present in the data the tool returned (the data source is incomplete on this detail). Fixable by a human adding the missing fact to the existing data source -- not by changing instructions." + }, + { + "name": "tool_gap", + "definition": "No tool or capability could even attempt this request. Either (a) the question is about a topic that NONE of the listed tools has any data source for, or (b) it needs the individual user's personal/account data (their actual balance, enrollment status) or an ACTION (submit, file, enroll) that no tool provides. Fixable only by an engineer building a new tool or data source -- not by skill evolution or by adding a fact." 
+ } + ], + "required": true, + "scope_aware": true + } + ] +} diff --git a/scripts/quality_report.py b/scripts/quality_report.py index 9a12e9da..0030c0b4 100644 --- a/scripts/quality_report.py +++ b/scripts/quality_report.py @@ -41,17 +41,21 @@ python quality_report.py --samples all # show all sessions python quality_report.py --app-name my_agent # filter to a specific agent python quality_report.py --output-json r.json # write structured JSON output - python quality_report.py --config config.json # use scope definitions from config + python quality_report.py --eval-spec eval_spec.json # ground scoring with scope + golden Q&A python quality_report.py --env path/to/.env # load a specific .env file + python quality_report.py --conversations-file results.json # score local JSON + python quality_report.py --eval-config path/to/custom.json # override metric definitions """ import warnings warnings.filterwarnings("ignore") import argparse +from concurrent.futures import ThreadPoolExecutor from datetime import datetime import json import logging +import math import os import sys import time @@ -67,12 +71,78 @@ def _positive_int(value): def _samples_arg(value): if value == "all": return "all" + if "=" in value: + return value n = int(value) if n < 1: raise argparse.ArgumentTypeError("--samples must be 'all' or >= 1") return str(n) +_SAMPLES_DEFAULTS = { + "unhelpful": 10, + "partial": 5, + "meaningful": 3, + "declined": 3, + "low": 3, + "unknown": 3, +} + + +def _parse_samples(samples_str): + """Parse --samples value into a resolved dict. + + Accepts: + "all" → show everything + "5" → cap all sections at 5 + "unhelpful=10,partial=5,low=3" → per-category overrides + + Returns a dict mapping category names to int limits, or None for "all". + The "low" key applies to all Low-dimension sections. 
+ """ + if samples_str is None: + return dict(_SAMPLES_DEFAULTS) + if samples_str == "all": + return None + if "=" in samples_str: + result = dict(_SAMPLES_DEFAULTS) + for pair in samples_str.split(","): + pair = pair.strip() + if "=" not in pair: + raise argparse.ArgumentTypeError( + f"Invalid samples pair: {pair!r}. Use key=value format." + ) + key, val = pair.split("=", 1) + key = key.strip().lower() + val = val.strip() + if val == "all": + result[key] = None + else: + n = int(val) + if n < 1: + raise argparse.ArgumentTypeError( + f"--samples value for {key!r} must be >= 1, got {n}" + ) + result[key] = n + return result + n = int(samples_str) + return {k: n for k in _SAMPLES_DEFAULTS} + + +def _get_sample_limit(samples_dict, category): + """Get the sample limit for a category from parsed samples dict. + + Returns None to show all, or an int limit. + """ + if samples_dict is None: + return None + if category in samples_dict: + return samples_dict[category] + if category.startswith("low_") or category.startswith("low "): + return samples_dict.get("low") + return samples_dict.get("_default", 5) + + _script_dir = os.path.dirname(os.path.abspath(__file__)) _repo_root = os.path.join(_script_dir, "..") @@ -169,172 +239,425 @@ def get_client(): # --------------------------------------------------------------------------- -# Scope configuration +# Eval spec — optional grounding for scoring (scope, ground truth, golden Q&A) # --------------------------------------------------------------------------- +# +# The eval spec is a single optional JSON file (``eval/data/eval_spec.json``, +# auto-discovered, or ``--eval-spec ``) with three optional fields: +# +# { +# "scope": "free-text description of what the agent handles", +# "ground_truth": "free-text authoritative facts for correctness", +# "golden_qa": [{"question", "expected_answer", "topic"?, +# "expected_behavior"?}] +# } +# +# ``scope`` defines the boundary positively (out-of-scope is the complement — +# no need to 
enumerate out-of-scope topics). ``golden_qa`` grounds correctness +# per question via embedding similarity; entries with +# ``expected_behavior: "decline"`` (or ``topic: "out_of_scope"``) also act as +# scope-boundary examples. -_AGENT_CONFIG_CACHE: dict[str, dict] = {} +_EVAL_SPEC_CACHE: dict[str, dict] = {} -def _load_agent_config(config_path=None): - """Load agent config (scope decisions, etc.) from a JSON file. +def _load_eval_spec(spec_path=None): + """Load the eval spec ({scope, ground_truth, golden_qa}) from JSON. - When --config is provided, loads from that path. Otherwise checks - for eval/data/agent_context.json relative to the repo root or script dir. - Returns None if no config is found (scope-aware eval is disabled). + When *spec_path* is given, loads that file. ``"none"`` disables the spec + (no auto-discovery). Otherwise auto-discovers ``eval/data/eval_spec.json`` + relative to the repo root or script dir. Returns None when nothing is found. Raises: - FileNotFoundError: If an explicit config_path does not exist. + FileNotFoundError: If an explicit *spec_path* does not exist. 
""" - cache_key = config_path or "_AUTO_" - if cache_key in _AGENT_CONFIG_CACHE: - return _AGENT_CONFIG_CACHE[cache_key] - - if config_path: - if not os.path.isfile(config_path): - raise FileNotFoundError(f"Config file not found: {config_path}") - with open(config_path) as f: + if spec_path and spec_path.lower() == "none": + return None + + cache_key = spec_path or "_AUTO_" + if cache_key in _EVAL_SPEC_CACHE: + return _EVAL_SPEC_CACHE[cache_key] + + if spec_path: + if not os.path.isfile(spec_path): + raise FileNotFoundError(f"Eval spec file not found: {spec_path}") + with open(spec_path) as f: result = json.load(f) - _AGENT_CONFIG_CACHE[cache_key] = result + _EVAL_SPEC_CACHE[cache_key] = result return result - # Auto-discover agent_context.json from known locations for base in [_repo_root, _script_dir]: - candidate = os.path.join(base, "eval", "data", "agent_context.json") + candidate = os.path.join(base, "eval", "data", "eval_spec.json") if os.path.isfile(candidate): - logger.info("Auto-discovered agent context: %s", candidate) + logger.info("Auto-discovered eval spec: %s", candidate) with open(candidate) as f: result = json.load(f) - _AGENT_CONFIG_CACHE[cache_key] = result + _EVAL_SPEC_CACHE[cache_key] = result return result return None -def _build_scope_context(config=None): - """Build scope context string for the LLM judge from config.""" - if not config: - return "" +def _build_scope_context(spec=None): + """Build scope / ground-truth context for the LLM judge from the eval spec. - scope_decisions = config.get("scope_decisions", []) - oos_topics = [ - d["topic"] for d in scope_decisions if d.get("decision") == "out_of_scope" - ] - if not oos_topics: + Reads two optional free-text fields: + - ``ground_truth``: authoritative facts the judge uses for correctness. + - ``scope``: what the agent is designed to handle. Anything outside it is + out of scope (a polite decline is then correct); anything inside it the + agent fails to answer is unhelpful, not declined. 
+ """ + if not spec: return "" - parts = [ - "\n\nAGENT SCOPE CONTEXT (use this to judge responses correctly):", - "The following topics are OUT OF SCOPE: " + ", ".join(oos_topics) + ".", - "If the agent correctly declines a question about an out-of-scope " - "topic (says it cannot help with that topic, suggests what it CAN " - "help with), that is a MEANINGFUL response, not an unhelpful one.", - ] - return " ".join(parts) + parts = [] + + ground_truth = spec.get("ground_truth", "") + if ground_truth: + parts.append( + "\n\nGROUND TRUTH DATA (use this to judge factual correctness):" + ) + parts.append(ground_truth) + + scope = spec.get("scope", "") + if scope: + parts.append("\n\nAGENT SCOPE (use this to judge responses correctly):") + parts.append(scope.strip()) + parts.append( + "A question is OUT OF SCOPE only if it falls outside the agent scope" + " described above. When the agent politely declines a genuinely" + " out-of-scope question, that is CORRECT ('declined'). When the" + " question is in scope but the agent fails to answer it, that is" + " 'unhelpful', NOT 'declined'." 
+ ) + + tools = spec.get("tools", "") + if tools: + parts.append( + "\n\nAGENT TOOLS / CAPABILITIES (use this to attribute the cause of a" + " failure):" + ) + parts.append(tools.strip()) + + return " ".join(parts) if parts else "" # --------------------------------------------------------------------------- -# Metric definitions +# Golden Q&A matching — optional correctness grounding + scope calibration # --------------------------------------------------------------------------- +EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-005") + + +def _embed_texts(texts, model=None, batch_size=50): + """Embed *texts* for semantic similarity; returns L2-normalised vectors.""" + from google import genai + from google.genai import types + + model = model or EMBEDDING_MODEL + client = genai.Client() + vectors = [] + for i in range(0, len(texts), batch_size): + batch = texts[i : i + batch_size] + resp = client.models.embed_content( + model=model, + contents=batch, + config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY"), + ) + for e in resp.embeddings: + v = list(e.values) + norm = math.sqrt(sum(x * x for x in v)) or 1.0 + vectors.append([x / norm for x in v]) + return vectors + + +# Default cosine-similarity threshold for matching a session question to a +# golden-Q&A entry. Referenced by match_golden_qa, the eval entry points, and +# the --golden-threshold argparse default so the value lives in one place. +_DEFAULT_GOLDEN_THRESHOLD = 0.92 -def get_eval_metrics(config_path=None): - """Return the list of categorical metric definitions for quality evaluation. - Metrics returned: - - ``response_usefulness`` — whether the agent response is helpful, - unhelpful, partial, or a correct scope decline. The ``declined`` - category is always present; when *config_path* provides out-of-scope - topic definitions, the LLM judge receives additional context to - distinguish polite refusals from failures. 
- - ``task_grounding`` — whether the response is grounded in tool data. +def match_golden_qa( + question_by_sid, golden_qa, threshold=_DEFAULT_GOLDEN_THRESHOLD +): + """Match session questions to golden Q&A by embedding cosine similarity. + + Args: + question_by_sid: dict mapping session_id -> user question text. + golden_qa: list of dicts with ``question`` and optional + ``expected_answer``, ``topic``, ``expected_behavior``. + threshold: minimum cosine similarity (0-1) for a match. + + Returns: + (per_session_context, golden_metadata): + - per_session_context maps session_id -> a judge-context string + (expected answer and/or a "should decline" note). + - golden_metadata maps session_id -> match details (matched flag, + matched question, expected answer, topic, out_of_scope, similarity). """ - from bigquery_agent_analytics import CategoricalMetricCategory - from bigquery_agent_analytics import CategoricalMetricDefinition + if not golden_qa or not question_by_sid: + return {}, {} - config = _load_agent_config(config_path) - scope_context = _build_scope_context(config) - - response_usefulness = CategoricalMetricDefinition( - name="response_usefulness", - definition=( - "Whether the agent final response provides a genuinely useful, " - "substantive answer to the user question. A response that apologizes, " - "says it cannot help, returns no data, provides only generic filler, " - "or loops without resolving the question is NOT useful -- UNLESS the " - "question is outside the agent's defined scope, in which case a " - "polite decline IS a correct and meaningful response." + scope_context - ), - categories=[ - CategoricalMetricCategory( - name="meaningful", - definition=( - "The response directly and substantively addresses the user " - "question with specific, actionable information." - ), - ), - CategoricalMetricCategory( - name="declined", - definition=( - "The question is outside the agent's defined scope and the " - "agent correctly declined -- e.g. 
said it cannot help with " - "that topic, or suggested what it CAN help with. This is " - "the CORRECT behavior for out-of-scope questions." - ), - ), - CategoricalMetricCategory( - name="unhelpful", - definition=( - "The response does NOT meaningfully answer the user question " - "AND the question IS within the agent's scope. Examples: " - "apologies for in-scope topics, saying 'I do not have that " - "information' when the agent has a tool that covers the topic, " - "empty data results, generic filler text, or the agent looping " - "without a resolution." - ), - ), - CategoricalMetricCategory( - name="partial", - definition=( - "The response partially addresses the question but is " - "incomplete, missing key details, or only tangentially relevant." - ), - ), - ], + sids = [sid for sid, q in question_by_sid.items() if q] + conv_qs = [question_by_sid[sid] for sid in sids] + golden_qs = [g["question"] for g in golden_qa] + if not conv_qs or not golden_qs: + return {}, {} + + logger.info( + "Golden matching: embedding %d golden + %d session questions...", + len(golden_qs), + len(conv_qs), + ) + golden_vecs = _embed_texts(golden_qs) + conv_vecs = _embed_texts(conv_qs) + + per_session_context = {} + golden_metadata = {} + matched = 0 + for sid, cvec in zip(sids, conv_vecs): + best_idx, best_score = -1, -1.0 + for gi, gvec in enumerate(golden_vecs): + # Both vectors are L2-normalised, so the dot product is cosine. + score = sum(a * b for a, b in zip(cvec, gvec)) + if score > best_score: + best_score, best_idx = score, gi + + if best_score >= threshold: + g = golden_qa[best_idx] + is_oos = ( + g.get("expected_behavior") == "decline" + or g.get("topic") == "out_of_scope" + ) + ctx = [ + "EXPECTED ANSWER FOR THIS QUESTION " + "(use to judge factual correctness):", + f"Q: {g['question']}", + ] + if g.get("expected_answer"): + ctx.append(f"A: {g['expected_answer']}") + if is_oos: + ctx.append( + "NOTE: This question is OUT OF SCOPE — the agent should decline." 
+ " A polite decline is the correct ('declined') outcome." + ) + per_session_context[sid] = "\n".join(ctx) + golden_metadata[sid] = { + "matched": True, + "golden_question": g["question"], + "expected_answer": g.get("expected_answer", ""), + "topic": g.get("topic", "unknown"), + "out_of_scope": is_oos, + "similarity": round(best_score, 4), + } + matched += 1 + else: + golden_metadata[sid] = { + "matched": False, + "similarity": round(best_score, 4), + } + + logger.info( + "Golden matching: %d/%d sessions matched (threshold=%.2f)", + matched, + len(sids), + threshold, + ) + return per_session_context, golden_metadata + + +def _inject_golden_summary(report, golden_metadata): + """Enrich a quality-report dict with golden-match data. + + Adds ``golden_eval`` to each session and a ``golden_eval_summary`` block to + the report summary (matched/unmatched counts split by usefulness, plus the + list of golden-matched sessions the agent got wrong). + """ + if not golden_metadata: + return + + buckets = { + "matched_meaningful": 0, + "matched_unhelpful": 0, + "matched_partial": 0, + "unmatched_meaningful": 0, + "unmatched_unhelpful": 0, + "unmatched_partial": 0, + } + mismatches = [] + + for session in report.get("sessions", []): + sid = session.get("session_id", "") + meta = golden_metadata.get(sid) + if meta is None: + session["golden_eval"] = None + continue + session["golden_eval"] = meta + + usefulness = ( + session.get("metrics", {}) + .get("response_usefulness", {}) + .get("category", "") + ) + prefix = "matched" if meta["matched"] else "unmatched" + # A correct decline counts as a positive outcome alongside meaningful. 
+ if usefulness in ("meaningful", "declined"): + buckets[f"{prefix}_meaningful"] += 1 + elif usefulness == "unhelpful": + buckets[f"{prefix}_unhelpful"] += 1 + if meta["matched"]: + mismatches.append( + { + "question": session.get("question", ""), + "expected_answer": meta.get("expected_answer", ""), + "actual_response": ( + session.get("response", session.get("final_response", "")) + )[:300], + "topic": meta.get("topic", ""), + "similarity": meta["similarity"], + } + ) + else: + buckets[f"{prefix}_partial"] += 1 + + total_matched = ( + buckets["matched_meaningful"] + + buckets["matched_unhelpful"] + + buckets["matched_partial"] + ) + total_unmatched = ( + buckets["unmatched_meaningful"] + + buckets["unmatched_unhelpful"] + + buckets["unmatched_partial"] ) - task_grounding = CategoricalMetricDefinition( - name="task_grounding", - definition=( - "Whether the agent response is grounded in actual data retrieved " - "from its tools, or is fabricated / hallucinated general knowledge." + report["summary"]["golden_eval_summary"] = { + "total_sessions": total_matched + total_unmatched, + "matched": total_matched, + "matched_meaningful": buckets["matched_meaningful"], + "matched_unhelpful": buckets["matched_unhelpful"], + "matched_partial": buckets["matched_partial"], + "matched_meaningful_rate": ( + round(buckets["matched_meaningful"] / total_matched * 100, 1) + if total_matched + else 0 ), - categories=[ - CategoricalMetricCategory( - name="grounded", - definition=( - "The response is clearly based on data retrieved from the " - "agent tools (search results, database lookups, API calls)." - ), - ), - CategoricalMetricCategory( - name="ungrounded", - definition=( - "The response appears to be fabricated or based on the LLM " - "general knowledge rather than actual tool results. The tool " - "may have returned empty data and the agent filled in anyway." 
- ), - ), - CategoricalMetricCategory( - name="no_tool_needed", - definition=( - "The question did not require tool usage and a direct LLM " - "response was appropriate." - ), - ), - ], + "unmatched": total_unmatched, + "unmatched_meaningful": buckets["unmatched_meaningful"], + "unmatched_unhelpful": buckets["unmatched_unhelpful"], + "unmatched_partial": buckets["unmatched_partial"], + "unmatched_meaningful_rate": ( + round(buckets["unmatched_meaningful"] / total_unmatched * 100, 1) + if total_unmatched + else 0 + ), + "mismatches": mismatches, + } + + +# --------------------------------------------------------------------------- +# Eval config (prompts + metrics from external file) +# --------------------------------------------------------------------------- + +_EVAL_CONFIG_CACHE: dict[str, dict] = {} + + +def _load_eval_config(eval_config_path=None): + """Load evaluation config (prompts + metrics) from a JSON file. + + When *eval_config_path* is provided, loads from that path. Otherwise + auto-discovers ``eval/eval_config.json`` relative to the repo root or + script directory (same pattern as eval-spec auto-discovery). + + The file is expected to contain: + - ``metrics``: list of metric definitions (see eval/eval_config.json) + + Results are cached so the file is read only once. 
+ """ + cache_key = eval_config_path or "_AUTO_" + if cache_key in _EVAL_CONFIG_CACHE: + return _EVAL_CONFIG_CACHE[cache_key] + + if eval_config_path: + if not os.path.isfile(eval_config_path): + raise FileNotFoundError(f"Eval config file not found: {eval_config_path}") + with open(eval_config_path) as f: + result = json.load(f) + _EVAL_CONFIG_CACHE[cache_key] = result + logger.info("Loaded eval config from %s", eval_config_path) + return result + + # Auto-discover eval_config.json from known locations + for base in [_repo_root, _script_dir]: + candidate = os.path.join(base, "eval", "eval_config.json") + if os.path.isfile(candidate): + logger.info("Auto-discovered eval config: %s", candidate) + with open(candidate) as f: + result = json.load(f) + _EVAL_CONFIG_CACHE[cache_key] = result + return result + + raise FileNotFoundError( + "No eval_config.json found. Expected at eval/eval_config.json " + "relative to the repo root or script directory, or pass " + "--eval-config explicitly." ) - return [response_usefulness, task_grounding] + +# --------------------------------------------------------------------------- +# Metric definitions +# --------------------------------------------------------------------------- + + +def get_eval_metrics(eval_spec=None, eval_config=None): + """Return the list of categorical metric definitions for quality evaluation. + + Metrics are loaded from *eval_config* (parsed dict, typically from + ``eval/eval_config.json``). Scope-aware metrics are dynamically enriched + when *eval_spec* provides a ``scope`` (and/or ``ground_truth``) field, which + also enables the ``declined`` category so the judge can credit correct + out-of-scope refusals. 
+ """ + from bigquery_agent_analytics import CategoricalMetricCategory + from bigquery_agent_analytics import CategoricalMetricDefinition + + scope_context = _build_scope_context(eval_spec) + has_scope = bool(eval_spec and eval_spec.get("scope")) + + if eval_config is None: + eval_config = _load_eval_config() + ext_metrics = eval_config.get("metrics", []) + result = [] + for m in ext_metrics: + cats = [ + CategoricalMetricCategory(name=c["name"], definition=c["definition"]) + for c in m["categories"] + ] + defn = m["definition"] + if m.get("scope_aware") and scope_context: + defn += scope_context + if has_scope and m.get("declined_category"): + dc = m["declined_category"] + declined_cat = CategoricalMetricCategory( + name=dc["name"], definition=dc["definition"] + ) + insert_after = dc.get("insert_after") + if insert_after: + idx = next( + (i for i, c in enumerate(cats) if c.name == insert_after), -1 + ) + cats.insert(idx + 1, declined_cat) + else: + cats.append(declined_cat) + if m.get("scope_suffix"): + defn += m["scope_suffix"] + result.append( + CategoricalMetricDefinition( + name=m["name"], definition=defn, categories=cats + ) + ) + logger.info("Loaded %d metrics from eval config", len(result)) + return result # --------------------------------------------------------------------------- @@ -432,7 +755,7 @@ def get_a2a_response(trace) -> tuple: text, agent = _extract_a2a_text(parsed) agent = agent or span.agent or "remote_agent" return (text or "(no response)"), agent - except (json.JSONDecodeError, TypeError): + except json.JSONDecodeError: logger.warning( "Failed to parse A2A payload for session %s, skipping", getattr(trace, "session_id", "?"), @@ -446,6 +769,239 @@ def get_a2a_response(trace) -> tuple: # --------------------------------------------------------------------------- +def _count_trace_metrics(trace): + """Extract multi-turn efficiency metrics from a trace.""" + user_turns = 0 + tool_calls = 0 + for span in trace.spans: + if span.event_type == 
"USER_MESSAGE_RECEIVED": + user_turns += 1 + elif span.event_type in ("TOOL_COMPLETED", "TOOL_ERROR"): + tool_calls += 1 + return user_turns, tool_calls + + +def _extract_conversation(trace): + """Reconstruct the multi-turn conversation from trace spans. + + Returns a list of ``{"role": "user"|"agent", "text": str}`` dicts + representing the full conversation in chronological order. + """ + # Collect user messages with their span indices. + user_msgs = [] + for i, span in enumerate(trace.spans): + if span.event_type == "USER_MESSAGE_RECEIVED": + c = span.content + if isinstance(c, dict): + text = c.get("text_summary") or c.get("text") or "" + elif c: + text = str(c) + else: + text = "" + if text: + user_msgs.append((i, text)) + + if not user_msgs: + return [] + + turns = [] + for msg_idx, (span_idx, user_text) in enumerate(user_msgs): + turns.append({"role": "user", "text": user_text}) + + # Boundary: next user message or end of spans. + end_idx = ( + user_msgs[msg_idx + 1][0] + if msg_idx + 1 < len(user_msgs) + else len(trace.spans) + ) + + # Walk backwards to find the last substantive LLM_RESPONSE for this turn. 
+ for span in reversed(trace.spans[span_idx:end_idx]): + if span.event_type == "LLM_RESPONSE": + c = span.content + if isinstance(c, dict): + text = c.get("response", "") + elif c: + text = str(c) + else: + text = "" + if ( + text + and not text.startswith("call:") + and not _is_single_word_routing(text) + ): + turns.append({"role": "agent", "text": text}) + break + + return turns + + +def _infer_corrections(conversation, model): + """Use LLM to count corrections and verifications in a conversation.""" + user_turns = [t for t in conversation if t["role"] == "user"] + if len(user_turns) <= 1: + return 0, 0 + + formatted = [] + for t in conversation: + role = "User" if t["role"] == "user" else "Agent" + formatted.append(f"{role}: {t['text']}") + conv_text = "\n\n".join(formatted) + + prompt = ( + "Analyze this conversation between a user and an AI agent.\n\n" + f"\n{conv_text}\n\n\n" + "Count user follow-up messages (all messages after the first question) " + "and classify each as:\n" + "- CORRECTION: The user disputes, corrects, or says the agent got " + "something wrong\n" + "- VERIFICATION: The user asks the agent to verify, double-check, or " + "provide more specifics about a claim\n" + "- FOLLOWUP: Normal continuation, new related question, or satisfied " + "acknowledgment\n\n" + 'Return ONLY a JSON object: {"corrections": , "verifications": }' + ) + + try: + from google import genai + + client = genai.Client() + response = client.models.generate_content( + model=model, + contents=prompt, + config={"temperature": 0.0}, + ) + raw = response.text.strip() + # Strip markdown code fences if present. 
+ if raw.startswith("```"): + lines = raw.split("\n") + raw = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:]) + result = json.loads(raw) + return int(result.get("corrections", 0)), int( + result.get("verifications", 0) + ) + except Exception: + logger.debug("Failed to infer corrections, defaulting to 0", exc_info=True) + return 0, 0 + + +_TURN_TAGGER_PROMPT = """\ +Analyze this multi-turn conversation between a user and an agent. +Classify each USER turn and identify correction boundaries. +{scope_context} + +CONVERSATION (turns numbered from 0): +{conversation} + +For each USER turn, assign exactly one tag: +- CORRECTION: User tells the agent it is WRONG and provides the correct fact. + Look for: "actually", "no", "that's wrong", "incorrect", contradicting a + specific claim with a specific counter-fact, quoting a source that disagrees. +- VERIFY: User doubts the agent's answer without providing the correct fact. + Look for: "are you sure", "can you check", "that doesn't sound right", + "I was told differently", questioning without correcting. +- SPECIFICS: User asks for concrete details the agent omitted. + Look for: "how many days exactly", "what's the percentage", "what date", + asking for numbers/dates/limits the agent didn't provide. +- SCOPE: User flags the agent answered something it shouldn't have. + Look for: "you shouldn't answer that", "that's not your area", pointing + out the agent overstepped its domain. +- FOLLOWUP: Normal follow-up question or related topic. The agent's previous + answer was acceptable. +- END: User is satisfied, conversation closing. + +Also identify CORRECTION BOUNDARIES — the turn index where the user corrects +the agent. The pre-correction sub-trajectory ends ONE TURN BEFORE the +correction (i.e. the agent's wrong answer). The post-correction sub-trajectory +starts AT the correction turn and includes everything after. 
+ +For each correction boundary, extract: +- wrong_claim: what the agent said that was wrong (quote it) +- correct_fact: what the user said is right (quote it) +- agent_recovered: did the agent GENUINELY recover? Set to true ONLY if the + agent looked up or verified the information (e.g. called a tool, cited a + source, provided new details not in the user's correction). Set to false if + the agent merely repeated or paraphrased the user's correction without + independent verification — that is parroting, not recovery. + +Return ONLY a JSON object: +{{"turn_tags": [ + {{"turn_index": 0, "role": "user", "tag": "...", "evidence": "brief reason"}}, + ... + ], + "correction_boundaries": [ + {{"turn_index": N, "wrong_claim": "...", "correct_fact": "...", "agent_recovered": true}}, + ... + ], + "sub_trajectories": [ + {{"label": "pre_correction_1", "start_turn": 0, "end_turn": N-1, "outcome": "wrong"}}, + {{"label": "post_correction_1", "start_turn": N, "end_turn": M, "outcome": "recovered"}} + ] +}} + +For sub_trajectory outcome after a correction, use: +- "recovered" — agent genuinely recovered (used tools, cited sources, added new info) +- "parroted" — agent just repeated the user's fact without verification +- "not_recovered" — agent did not accept the correction or continued with wrong info + +Only tag USER turns (skip agent turns). If there are no corrections, return +empty correction_boundaries and a single sub_trajectory covering the whole +conversation. 
+""" + + +def _tag_conversation_turns(conversation, model, scope_context=""): + """Classify each user turn and identify correction boundaries.""" + if not isinstance(conversation, list) or len(conversation) < 3: + return None + + lines = [] + for i, turn in enumerate(conversation): + role = "USER" if turn.get("role") == "user" else "AGENT" + lines.append(f"[{i}] {role}: {turn.get('text', '')}") + numbered = "\n".join(lines) + + ctx = "" + if scope_context: + ctx = f"\nCONTEXT:\n{scope_context}" + + prompt = _TURN_TAGGER_PROMPT.format( + scope_context=ctx, + conversation=numbered[:4000], + ) + + try: + from google import genai + from google.genai import types + + client = genai.Client() + response = client.models.generate_content( + model=model, + contents=prompt, + config=types.GenerateContentConfig( + response_mime_type="application/json", + temperature=0.0, + ), + ) + raw = response.text.strip() + if raw.startswith("```"): + raw_lines = raw.split("\n") + raw = "\n".join( + raw_lines[1:-1] if raw_lines[-1].strip() == "```" else raw_lines[1:] + ) + result = json.loads(raw) + + # Extract correction/verification counts from tags + tags = result.get("turn_tags", []) + result["corrections"] = sum(1 for t in tags if t.get("tag") == "CORRECTION") + result["verifications"] = sum(1 for t in tags if t.get("tag") == "VERIFY") + return result + + except Exception: + logger.debug("Turn tagging failed, skipping", exc_info=True) + return None + + def resolve_trace_responses(traces): results = [] remote_lookups = 0 @@ -477,6 +1033,9 @@ def resolve_trace_responses(traces): if trace.total_latency_ms is not None: latency_s = round(trace.total_latency_ms / 1000, 1) + user_turns, tool_calls = _count_trace_metrics(trace) + conversation = _extract_conversation(trace) if user_turns > 1 else [] + results.append( { "session_id": trace.session_id, @@ -490,6 +1049,11 @@ def resolve_trace_responses(traces): "response": (response or ""), "latency_s": latency_s, "is_a2a": is_a2a, + 
"user_turns": user_turns, + "tool_calls": tool_calls, + "conversation": conversation, + "corrections": 0, + "verifications": 0, } ) @@ -499,6 +1063,158 @@ def resolve_trace_responses(traces): return results +# --------------------------------------------------------------------------- +# Local conversation support (no BigQuery required) +# --------------------------------------------------------------------------- + + +def _format_conversation_transcript(conv): + """Convert a traffic-generator conversation dict to SDK transcript format. + + Produces the same ``user_input / agent_response`` lines as the + ``CATEGORICAL_TRANSCRIPT_QUERY`` so that the categorical evaluator + can process local conversations identically to BigQuery traces. + """ + turns = conv.get("conversation", []) + if turns: + parts = [] + for turn in turns: + role = turn.get("role", "user") + text = turn.get("text", "") + tag = turn.get("tag", "") + if role == "user": + tag_str = f" [{tag}]" if tag else "" + parts.append(f"user_input{tag_str}: {text}") + else: + agent = conv.get("answered_by", "agent") + parts.append(f"agent_response [{agent}]: {text}") + return "\n".join(parts) + + # Fallback: single-turn + q = conv.get("question", "") + r = conv.get("final_response", conv.get("response", "")) + agent = conv.get("answered_by", "agent") + return f"user_input: {q}\nagent_response [{agent}]: {r}" + + +async def _build_resolved_map_from_conversations( + conversations, + model, + concurrency=10, + tag_turns=False, + scope_context="", +): + """Build a resolved_map from local conversation dicts. + + Returns the same ``{session_id: {...}}`` structure as + ``resolve_trace_responses`` so downstream code (``_build_json_output``, + ``_write_md_report``, ``_print_eval_results``) works unchanged. + + Infers corrections/verifications concurrently for multi-turn sessions. 
+ When ``tag_turns=True``, uses the full turn tagger instead of the simpler + correction counter, adding ``turn_tags``, ``correction_boundaries``, and + ``sub_trajectories`` to each resolved entry. + """ + import asyncio + + # First pass: build entries, collect those needing inference + entries = [] + to_infer = [] + for conv in conversations: + sid = conv.get("session_id", f"local_{id(conv)}") + turns = conv.get("conversation", []) + user_turn_count = ( + sum(1 for t in turns if t.get("role") == "user") if turns else 1 + ) + tool_calls = conv.get("tool_calls", 0) + corrections = conv.get("corrections", 0) + verifications = conv.get("verifications", 0) + needs_tagging = turns and user_turn_count > 1 + needs_inference = needs_tagging and corrections == 0 and verifications == 0 + entries.append( + { + "sid": sid, + "conv": conv, + "turns": turns, + "user_turns": user_turn_count, + "tool_calls": tool_calls, + "corrections": corrections, + "verifications": verifications, + } + ) + if tag_turns and needs_tagging: + to_infer.append((len(entries) - 1, turns)) + elif needs_inference: + to_infer.append((len(entries) - 1, turns)) + + # Concurrent inference + if to_infer: + semaphore = asyncio.Semaphore(concurrency) + + if tag_turns: + + async def _infer_one(turns): + async with semaphore: + return await asyncio.to_thread( + _tag_conversation_turns, + turns, + model, + scope_context, + ) + + tag_results = await asyncio.gather( + *[_infer_one(turns) for _, turns in to_infer] + ) + for (idx, _), tag_data in zip(to_infer, tag_results): + if tag_data: + entries[idx]["corrections"] = tag_data.get("corrections", 0) + entries[idx]["verifications"] = tag_data.get("verifications", 0) + entries[idx]["turn_tags"] = tag_data.get("turn_tags", []) + entries[idx]["correction_boundaries"] = tag_data.get( + "correction_boundaries", [] + ) + entries[idx]["sub_trajectories"] = tag_data.get( + "sub_trajectories", [] + ) + else: + + async def _infer_one(turns): + async with semaphore: + return 
await asyncio.to_thread(_infer_corrections, turns, model) + + infer_results = await asyncio.gather( + *[_infer_one(turns) for _, turns in to_infer] + ) + for (idx, _), (corr, verif) in zip(to_infer, infer_results): + entries[idx]["corrections"] = corr + entries[idx]["verifications"] = verif + + resolved = {} + for entry in entries: + conv = entry["conv"] + resolved_entry = { + "session_id": entry["sid"], + "question": conv.get("question", ""), + "response": conv.get("final_response", conv.get("response", "")), + "answered_by": conv.get("answered_by", "unknown"), + "is_a2a": False, + "latency_s": conv.get("latency_s"), + "user_turns": entry["user_turns"], + "tool_calls": entry["tool_calls"], + "corrections": entry["corrections"], + "verifications": entry["verifications"], + "conversation": entry["turns"], + } + if tag_turns: + resolved_entry["turn_tags"] = entry.get("turn_tags", []) + resolved_entry["correction_boundaries"] = entry.get( + "correction_boundaries", [] + ) + resolved_entry["sub_trajectories"] = entry.get("sub_trajectories", []) + resolved[entry["sid"]] = resolved_entry + return resolved + + # --------------------------------------------------------------------------- # Run evaluation # --------------------------------------------------------------------------- @@ -510,9 +1226,13 @@ def run_evaluation( model=None, persist=False, app_name=None, - config_path=None, + eval_spec=None, session_id=None, session_ids=None, + tag_turns=False, + eval_config=None, + custom_labels=None, + golden_threshold=_DEFAULT_GOLDEN_THRESHOLD, ) -> dict: from bigquery_agent_analytics import CategoricalEvaluationConfig from bigquery_agent_analytics import TraceFilter @@ -520,7 +1240,18 @@ def run_evaluation( model = model or EVAL_MODEL_ID client = get_client() - metrics = get_eval_metrics(config_path=config_path) + if eval_spec is None: + eval_spec = _load_eval_spec() + if not eval_spec or not eval_spec.get("golden_qa"): + logger.warning( + "No golden_qa in the eval spec: 
response_usefulness and task_grounding " + "are LLM estimates WITHOUT ground truth and can mislabel verbose, " + "tool-grounded answers as ungrounded/unhelpful. For trustworthy " + "correctness, pass --eval-spec with a golden_qa list (question + " + "expected_answer); the judge then grades against the expected answer " + "(see summary.golden_eval_summary)." + ) + metrics = get_eval_metrics(eval_spec=eval_spec, eval_config=eval_config) cat_config = CategoricalEvaluationConfig( metrics=metrics, endpoint=model, @@ -531,11 +1262,14 @@ def run_evaluation( ) if session_id: - trace_filter = TraceFilter(session_ids=[session_id]) + trace_filter = TraceFilter( + session_ids=[session_id], custom_labels=custom_labels + ) elif session_ids: trace_filter = TraceFilter( session_ids=session_ids, limit=len(session_ids), + custom_labels=custom_labels, ) if app_name: trace_filter.root_agent_name = app_name @@ -545,9 +1279,11 @@ def run_evaluation( effective_time_range = None if effective_time_range: - trace_filter = TraceFilter.from_cli_args(last=effective_time_range) + trace_filter = TraceFilter.from_cli_args( + last=effective_time_range, custom_labels=custom_labels + ) else: - trace_filter = TraceFilter() + trace_filter = TraceFilter(custom_labels=custom_labels) trace_filter.limit = limit if app_name: trace_filter.root_agent_name = app_name @@ -565,39 +1301,433 @@ def run_evaluation( resolved = resolve_trace_responses(traces) resolved_map = {r["session_id"]: r for r in resolved} + # Golden Q&A matching (same as the --conversations-file path). The server-side + # judge (AI.GENERATE over BigQuery) can't receive per-session expected answers, + # so on this path golden Q&A drives the golden_eval_summary regression headline + # and per-session matched/expected reporting — but does NOT inject the expected + # answer into the judge for correctness grounding (that is conversations-only). + # scope/ground_truth still ground the judge on both paths. 
+ golden_metadata = {} + golden_qa = (eval_spec or {}).get("golden_qa") + if golden_qa: + question_by_sid = { + sid: ctx.get("question", "") for sid, ctx in resolved_map.items() + } + _golden_ctx, golden_metadata = match_golden_qa( + question_by_sid, golden_qa, threshold=golden_threshold + ) + logger.warning( + "Golden Q&A on the BigQuery path produces the golden_eval_summary and " + "per-session matches, but the server-side judge cannot take per-session " + "expected answers — expected-answer correctness grounding applies on the " + "--conversations-file path only (scope/ground_truth ground both paths)." + ) + + # Infer corrections/verifications for multi-turn sessions (concurrent). + mt_sessions = [ + r + for r in resolved + if r.get("user_turns", 0) > 1 and r.get("conversation") + ] + if mt_sessions: + import asyncio + + if tag_turns: + scope_context = _build_scope_context(eval_spec) + logger.info( + "Tagging turns for %d multi-turn sessions...", + len(mt_sessions), + ) + semaphore = asyncio.Semaphore(10) + + async def _tag_one(conv): + async with semaphore: + return await asyncio.to_thread( + _tag_conversation_turns, + conv, + model, + scope_context, + ) + + async def _tag_all(): + return await asyncio.gather( + *[_tag_one(r["conversation"]) for r in mt_sessions] + ) + + tag_results = asyncio.run(_tag_all()) + for r, tag_data in zip(mt_sessions, tag_results): + if tag_data: + r["corrections"] = tag_data.get("corrections", 0) + r["verifications"] = tag_data.get("verifications", 0) + r["turn_tags"] = tag_data.get("turn_tags", []) + r["correction_boundaries"] = tag_data.get("correction_boundaries", []) + r["sub_trajectories"] = tag_data.get("sub_trajectories", []) + else: + logger.info( + "Inferring corrections for %d multi-turn sessions...", + len(mt_sessions), + ) + semaphore = asyncio.Semaphore(10) + + async def _infer_one(conv): + async with semaphore: + return await asyncio.to_thread(_infer_corrections, conv, model) + + async def _infer_all(): + return await 
def generate_quality_report(
    session_ids: list[str],
    model: str | None = None,
    eval_spec: dict | None = None,
) -> dict:
  """Score the given sessions and return the structured report dict.

  Public entry point for programmatic use: runs ``run_evaluation`` for the
  requested sessions, converts the result with ``_build_json_output``,
  attaches the golden summary via ``_inject_golden_summary`` and records the
  evaluation wall-clock time.

  Args:
    session_ids: BigQuery session IDs to evaluate.
    model: Eval model override. When falsy, the ``EVAL_MODEL_ID``
      environment variable is used, else ``gemini-2.5-flash``.
    eval_spec: Optional eval spec dict, forwarded unchanged to
      ``run_evaluation``.

  Returns:
    The dict produced by ``_build_json_output`` with
    ``summary["elapsed_seconds"]`` set (rounded to one decimal).
  """
  # Ensure config is loaded (no-op if already initialized via main()).
  if PROJECT_ID is None:
    _load_config()

  effective_model = model or os.getenv("EVAL_MODEL_ID", "gemini-2.5-flash")

  started = time.time()
  evaluation = run_evaluation(
      session_ids=session_ids,
      model=effective_model,
      eval_spec=eval_spec,
  )
  duration = time.time() - started

  report_dict = _build_json_output(
      evaluation["report"], evaluation["resolved_map"]
  )
  _inject_golden_summary(report_dict, evaluation.get("golden_metadata"))
  report_dict["summary"]["elapsed_seconds"] = round(duration, 1)
  return report_dict
+ Merged with (and overridden by) any golden-Q&A matches. + golden_threshold: Cosine-similarity threshold for golden matching. + + Returns: + Dict with ``report``, ``resolved_map``, and ``golden_metadata`` keys. + """ + import asyncio -# --------------------------------------------------------------------------- -# Browse mode (--no-eval) -# --------------------------------------------------------------------------- + from bigquery_agent_analytics import CategoricalEvaluationConfig + from bigquery_agent_analytics.categorical_evaluator import build_categorical_report + from bigquery_agent_analytics.categorical_evaluator import classify_sessions_via_api + if eval_spec is None: + eval_spec = _load_eval_spec() + model = ( + model or EVAL_MODEL_ID or os.getenv("EVAL_MODEL_ID", "gemini-2.5-flash") + ) + metrics = get_eval_metrics(eval_spec=eval_spec, eval_config=eval_config) + cat_config = CategoricalEvaluationConfig( + metrics=metrics, + endpoint=model, + temperature=0.0, + include_justification=True, + ) -def run_browse(args): - from bigquery_agent_analytics import TraceFilter + scope_context = _build_scope_context(eval_spec) + + # Golden Q&A matching: inject per-question expected answers / decline notes + # into the judge prompt for sessions whose question matches a golden entry. 
+ golden_metadata = {} + golden_qa = (eval_spec or {}).get("golden_qa") + if golden_qa: + question_by_sid = { + conv.get("session_id", f"local_{id(conv)}"): conv.get("question", "") + for conv in conversations + } + golden_ctx, golden_metadata = match_golden_qa( + question_by_sid, golden_qa, threshold=golden_threshold + ) + per_session_context = {**(per_session_context or {}), **golden_ctx} - client = get_client() + transcripts = {} + for conv in conversations: + sid = conv.get("session_id", f"local_{id(conv)}") + transcripts[sid] = _format_conversation_transcript(conv) + + logger.info( + "Classifying %d local conversations (model=%s, concurrency=%d, tag_turns=%s)...", + len(transcripts), + model, + concurrency, + tag_turns, + ) + + async def _run_all(): + classify_task = classify_sessions_via_api( + transcripts, + cat_config, + model, + per_session_context=per_session_context, + ) + resolve_task = _build_resolved_map_from_conversations( + conversations, + model, + concurrency=concurrency, + tag_turns=tag_turns, + scope_context=scope_context, + ) + return await asyncio.gather(classify_task, resolve_task) + + session_results, resolved_map = asyncio.run(_run_all()) + + report = build_categorical_report( + dataset="local_conversations", + session_results=session_results, + config=cat_config, + ) + + return { + "report": report, + "resolved_map": resolved_map, + "golden_metadata": golden_metadata, + } + + +def generate_quality_report_from_conversations( + conversations, + model=None, + eval_spec=None, + concurrency=10, + tag_turns=False, + trajectory_samples=0, + per_session_context=None, + golden_threshold=_DEFAULT_GOLDEN_THRESHOLD, + eval_config=None, +) -> dict: + """Evaluate local conversations and return a structured quality report. + + This is the public API for scoring conversations from a traffic generator + or any local JSON file, without requiring BigQuery. Returns the same + dict structure as ``generate_quality_report``. 
def print_quality_report(report: dict):
  """Print a formatted quality report from a ``generate_quality_report`` dict.

  Accepts the structured dict returned by ``generate_quality_report``,
  NOT the raw SDK ``CategoricalEvaluationReport`` object. For the raw
  object, use ``_print_eval_results`` instead.

  Args:
    report: Dict with a ``summary`` mapping and an optional ``sessions``
      list of per-session dicts.

  Raises:
    KeyError: If ``summary`` or one of its headline counters
      (``total_sessions``, ``meaningful``, ``declined``, ``partial``,
      ``unhelpful``, ``meaningful_rate``) is missing.
  """
  summary = report["summary"]
  sessions = report.get("sessions", [])

  def _usefulness(session):
    # ``metrics`` / ``response_usefulness`` may be present but null.
    metrics = session.get("metrics") or {}
    return (metrics.get("response_usefulness") or {}).get("category")

  print("\n" + "=" * 70)
  print(" QUALITY REPORT")
  print("=" * 70)
  print(f" Sessions: {summary['total_sessions']}")
  print(f" Meaningful: {summary['meaningful']}")
  print(f" Declined (correct): {summary['declined']}")
  print(f" Partial: {summary['partial']}")
  print(f" Unhelpful: {summary['unhelpful']}")
  print(f" Meaningful rate: {summary['meaningful_rate']}%")

  # Each rate is guarded by its own key: a summary carrying only one of the
  # two must not raise KeyError on the other.
  if "correction_rate" in summary:
    total_c = sum(s.get("corrections", 0) for s in sessions)
    print(
        f" Correction rate: {summary['correction_rate']}%"
        f" ({total_c} corrections)"
    )
  if "verification_rate" in summary:
    total_v = sum(s.get("verifications", 0) for s in sessions)
    print(
        f" Verification rate: {summary['verification_rate']}%"
        f" ({total_v} verifications)"
    )

  if "avg_user_turns" in summary:
    print(f" Avg user turns: {summary['avg_user_turns']}")
  if "avg_tool_calls" in summary:
    print(f" Avg tool calls: {summary['avg_tool_calls']}")

  dim_avgs = summary.get("dimension_averages", {})
  if dim_avgs:
    print("\n Quality Dimensions (0-2 scale):")
    for dim, avg in dim_avgs.items():
      # 25 characters per point, so a perfect 2.00 fills 50 columns.
      bar = "#" * int(avg * 25)
      print(f" {dim:<20s}: {avg:.2f} / 2.00 {bar}")

  problems = [s for s in sessions if _usefulness(s) in ("unhelpful", "partial")]
  if problems:
    print(f"\n Problem Sessions ({len(problems)}):")
    # Only the first ten are listed to keep the console output short.
    for s in problems[:10]:
      cat = _usefulness(s)
      # ``or ""`` keeps a null question/reason from breaking the slice.
      q = (s.get("question") or "")[:60]
      correctness = (s.get("quality_scores") or {}).get("correctness") or {}
      reason = (correctness.get("reason") or "")[:80]
      print(f" [{cat}] {q}")
      if reason:
        print(f" {reason}")

  print("=" * 70)
+# Category labels +# --------------------------------------------------------------------------- + + +def _category_label(category): + labels = { + "meaningful": "\u2705 HELPFUL", + "declined": "\u2705 DECLINED (OK)", + "unhelpful": "\u274c NOT HELPFUL", + "partial": "\u26a0\ufe0f PARTIAL", + "grounded": "\u2705 GROUNDED", + "ungrounded": "\u274c NOT GROUNDED", + "no_tool_needed": "\u2796 NO TOOL NEEDED", + # correctness + "correct": "\u2705 CORRECT", + "mostly_correct": "\u26a0\ufe0f MOSTLY CORRECT", + "incorrect": "\u274c INCORRECT", + # tool_usage + "proper": "\u2705 PROPER", + # "partial" already covered above + "none": "\u274c NONE", + # specificity + "specific": "\u2705 SPECIFIC", + "somewhat_specific": "\u26a0\ufe0f SOMEWHAT SPECIFIC", + "vague": "\u274c VAGUE", + # scope_compliance + "compliant": "\u2705 COMPLIANT", + "partially_compliant": "\u26a0\ufe0f PARTIALLY COMPLIANT", + "non_compliant": "\u274c NON-COMPLIANT", + # first_time_right + "clarification_needed": "\u26a0\ufe0f CLARIFICATION NEEDED", + "correction_needed": "\u274c CORRECTION NEEDED", + } + return labels.get(category, (category or "?").upper()) + + +# --------------------------------------------------------------------------- +# Browse mode (--no-eval) +# --------------------------------------------------------------------------- + + +def run_browse(args): + from bigquery_agent_analytics import TraceFilter + + client = get_client() logger.info( "Project: %s, Dataset: %s, Table: %s", PROJECT_ID, DATASET_ID, TABLE_ID ) @@ -663,73 +1793,164 @@ def run_browse(args): def run_eval(args): model = args.model or EVAL_MODEL_ID - logger.info( - "Project: %s, Dataset: %s, Table: %s", PROJECT_ID, DATASET_ID, TABLE_ID - ) - logger.info("Location: %s", DATASET_LOCATION) - logger.info("Evaluation model: %s", model) - logger.info( - "Parameters: time_period=%s, limit=%d, persist=%s, report=%s, samples=%s", - args.time_period or "all", - args.limit, - args.persist, - args.report, - args.samples or "default 
(10/5/3)", + + conversations_file = getattr(args, "conversations_file", None) + + t0 = time.time() + eval_spec = _load_eval_spec(getattr(args, "eval_spec", None)) + golden_threshold = getattr( + args, "golden_threshold", _DEFAULT_GOLDEN_THRESHOLD ) + eval_config = _load_eval_config(getattr(args, "eval_config", None)) + + # --dimensions primary: keep only the 2 primary metrics to cut LLM-judge + # cost ~4x. Build a filtered copy so the cached config is not mutated. + if getattr(args, "dimensions", "full") == "primary": + eval_config = { + **eval_config, + "metrics": [ + m + for m in eval_config.get("metrics", []) + if m.get("name") in _PRIMARY_METRICS + ], + } + logger.info( + "Dimensions mode: primary — scoring only %s (skipping 5 quality " + "dimensions)", + ", ".join(sorted(_PRIMARY_METRICS)), + ) - # Load session IDs from file if provided - session_ids = None - if args.session_ids_file: - with open(args.session_ids_file) as _f: - _data = json.load(_f) - # Accepts either a list of objects with "session_id" keys - # (e.g. output of examples/agent_improvement_cycle/eval/run_eval.py) - # or a plain list of strings. 
- if _data and isinstance(_data[0], dict): - session_ids = [r["session_id"] for r in _data if r.get("session_id")] + custom_labels = None + if getattr(args, "label", None): + custom_labels = {} + for item in args.label: + if "=" not in item: + logger.error("--label requires KEY=VALUE format, got: %s", item) + sys.exit(1) + k, v = item.split("=", 1) + custom_labels[k] = v + + if conversations_file: + # --- Local conversations path (no BigQuery) --- + logger.info("Source: local conversations file %s", conversations_file) + logger.info("Evaluation model: %s", model) + with open(conversations_file) as _f: + data = json.load(_f) + conversations = ( + data.get("conversations", []) if isinstance(data, dict) else data + ) + if not conversations: + logger.error("No conversations found in %s", conversations_file) + sys.exit(1) + total = len(conversations) + if args.limit and args.limit < total: + conversations = conversations[: args.limit] + logger.info("Using %d of %d conversations (--limit)", args.limit, total) else: - session_ids = [s for s in _data if s] - if not session_ids: - logger.error( - "No session IDs found in %s — file may be empty or missing " - "'session_id' fields.", - args.session_ids_file, + logger.info("Loaded %d conversations", total) + + try: + if eval_spec: + logger.info( + "Eval spec: scope=%s, golden_qa=%d", + bool(eval_spec.get("scope")), + len(eval_spec.get("golden_qa") or []), + ) + concurrency = getattr(args, "concurrency", 10) + tag_turns = getattr(args, "tag_turns", False) + result = run_evaluation_from_conversations( + conversations, + model=model, + eval_spec=eval_spec, + concurrency=concurrency, + tag_turns=tag_turns, + eval_config=eval_config, + golden_threshold=golden_threshold, ) + except Exception: + logger.exception("Evaluation failed") sys.exit(1) + else: + # --- BigQuery path (existing) --- logger.info( - "Filtering to %d session IDs from %s", - len(session_ids), - args.session_ids_file, + "Project: %s, Dataset: %s, Table: %s", + 
PROJECT_ID, + DATASET_ID, + TABLE_ID, ) - - t0 = time.time() - try: - config_path = getattr(args, "config", None) - if config_path: - logger.info("Scope config: %s", config_path) - result = run_evaluation( - time_range=args.time_period, - limit=args.limit, - model=model, - persist=args.persist, - app_name=args.app_name, - config_path=config_path, - session_id=args.session, - session_ids=session_ids, + logger.info("Location: %s", DATASET_LOCATION) + logger.info("Evaluation model: %s", model) + logger.info( + "Parameters: time_period=%s, limit=%d, persist=%s, report=%s, " + "samples=%s", + args.time_period or "all", + args.limit, + args.persist, + args.report, + args.samples or "default (10/5/3)", ) - except Exception: - logger.exception("Evaluation failed") - sys.exit(1) + + session_ids = None + if args.session_ids_file: + with open(args.session_ids_file) as _f: + _data = json.load(_f) + if _data and isinstance(_data[0], dict): + session_ids = [r["session_id"] for r in _data if r.get("session_id")] + else: + session_ids = [s for s in _data if s] + if not session_ids: + logger.error( + "No session IDs found in %s — file may be empty or missing " + "'session_id' fields.", + args.session_ids_file, + ) + sys.exit(1) + logger.info( + "Filtering to %d session IDs from %s", + len(session_ids), + args.session_ids_file, + ) + + try: + if eval_spec and eval_spec.get("scope"): + logger.info("Eval spec scope active") + tag_turns = getattr(args, "tag_turns", False) + result = run_evaluation( + time_range=args.time_period, + limit=args.limit, + model=model, + persist=args.persist, + app_name=args.app_name, + eval_spec=eval_spec, + session_id=args.session, + session_ids=session_ids, + tag_turns=tag_turns, + eval_config=eval_config, + custom_labels=custom_labels, + golden_threshold=golden_threshold, + ) + except Exception: + logger.exception("Evaluation failed") + sys.exit(1) + elapsed = time.time() - t0 + # --- Shared post-processing --- result["report"].details["elapsed_seconds"] 
= round(elapsed, 1) result["report"].details["project"] = PROJECT_ID result["report"].details["dataset"] = f"{DATASET_ID}.{TABLE_ID}" result["report"].details["location"] = DATASET_LOCATION result["report"].details["eval_model"] = model - result["report"].details["time_period"] = args.time_period or "all" - result["report"].details["limit"] = args.limit - result["report"].details["persist"] = args.persist + if not conversations_file: + result["report"].details["time_period"] = args.time_period or "all" + result["report"].details["limit"] = args.limit + result["report"].details["persist"] = args.persist + if args.app_name: + result["report"].details["app_name"] = args.app_name + if custom_labels: + result["report"].details["labels"] = ", ".join( + f"{k}={v}" for k, v in custom_labels.items() + ) result["report"].details["samples"] = args.samples or None _print_eval_results( result["report"], @@ -738,17 +1959,104 @@ def run_eval(args): unhelpful_threshold=args.threshold, ) + # --- Trajectory fetching --- + trajectories = {} + trajectory_samples = getattr(args, "trajectory_samples", 0) + tag_turns = getattr(args, "tag_turns", False) + if trajectory_samples and trajectory_samples > 0: + traj_sids = _select_trajectory_sessions( + result["report"], + result["resolved_map"], + trajectory_samples, + ) + # Also fetch trajectories for all correction sessions (for inline display) + if tag_turns: + correction_sids = [ + sid + for sid, ctx in result["resolved_map"].items() + if ctx.get("correction_boundaries") + ] + for sid in correction_sids: + if sid not in traj_sids: + traj_sids.append(sid) + logger.info( + "Fetching %d execution trajectories from BigQuery...", len(traj_sids) + ) + trajectories = _fetch_session_traces(traj_sids, len(traj_sids)) + if trajectories: + logger.info("Fetched %d trajectories", len(trajectories)) + for sid, trace_obj in trajectories.items(): + ctx = result["resolved_map"].get(sid) + if ctx and ctx.get("answered_by") == "unknown": + 
ctx["answered_by"] = get_responding_agent(trace_obj) + else: + logger.warning("No trajectories fetched (BQ may not be configured)") + + # Single-session mode: always fetch trajectory from BQ + if args.session and not trajectories and not conversations_file: + trajectories = _fetch_session_traces([args.session], max_sessions=1) + if trajectories: + for sid, trace_obj in trajectories.items(): + ctx = result["resolved_map"].get(sid) + if ctx and ctx.get("answered_by") == "unknown": + ctx["answered_by"] = get_responding_agent(trace_obj) + + # Print execution trace to console for single-session mode + if args.session and trajectories: + trace_obj = trajectories.get(args.session) + if trace_obj: + hr = "─" * 70 + print(f"\n{'=' * 70}") + print("EXECUTION TRACE") + print(f"{'=' * 70}") + print(_render_trace(trace_obj)) + ctx = result["resolved_map"].get(args.session, {}) + sub_trajs = ctx.get("sub_trajectories", []) + conversation = ctx.get("conversation", []) + if sub_trajs and conversation: + segments = _segment_trace_by_turns( + trace_obj, + conversation, + sub_trajs, + ) + if segments: + print(f"\n{hr}") + print(" SUB-TRAJECTORY SEGMENTATION") + print(hr) + for seg in segments: + icon = "✅" if seg["outcome"] in ("correct", "recovered") else "❌" + print( + f"\n {icon} {seg['label']} " + f"(turns {seg['start_turn']}-{seg['end_turn']}) " + f"→ {seg['outcome']}" + ) + for line in seg["trace"].split("\n"): + print(f" {line}") + print(f"{'=' * 70}\n") + report_path = None + md_dir = None + if args.output_json and args.output_json != "-": + md_dir = os.path.dirname(os.path.abspath(args.output_json)) if args.report: report_path = _write_md_report( - result["report"], result["resolved_map"], args + result["report"], + result["resolved_map"], + args, + report_dir=md_dir, + trajectories=trajectories, ) if report_path: print(f"\n Markdown report: {report_path}") if args.output_json: - output = _build_json_output(result["report"], result["resolved_map"]) + output = 
_build_json_output( + result["report"], + result["resolved_map"], + trajectories=trajectories, + ) + _inject_golden_summary(output, result.get("golden_metadata")) if args.output_json == "-": json.dump(output, sys.stdout, indent=2, default=str) sys.stdout.write("\n") @@ -818,8 +2126,141 @@ def _build_agent_stats(report, resolved_map): _METRIC_LABELS = { "response_usefulness": "Usefulness", "task_grounding": "Grounding", + "correctness": "Correctness", + "tool_usage": "Tool Usage", + "specificity": "Specificity", + "scope_compliance": "Scope", + "first_time_right": "First-Time Right", +} + +# Maps category → numeric score (0-2) for dimension averaging. +# +# The middle-category names deliberately differ per dimension +# (``mostly_correct``, ``partial``, ``somewhat_specific``, ...): the LLM judge +# is given the full per-dimension vocabulary, and a name that fits the +# dimension produces better classifications than a generic ``medium``. Do not +# "normalize" them to a single shared word. +# +# ``correct`` appears as a category in both ``correctness`` and +# ``first_time_right``. That is fine — categories are always looked up keyed by +# metric_name, so the two never collide. ``tool_usage.no_tool_needed`` scores 2 +# because not calling a tool is the *correct* outcome when none was needed +# (e.g. a greeting or a correctly-declined out-of-scope question); without it, +# those sessions would be penalised as a Tool Usage failure. 
_DIMENSION_SCORES = {
    "correctness": {
        "correct": 2,
        "mostly_correct": 1,
        "incorrect": 0,
    },
    "tool_usage": {
        "proper": 2,
        "no_tool_needed": 2,
        "partial": 1,
        "none": 0,
    },
    "specificity": {
        "specific": 2,
        "somewhat_specific": 1,
        "vague": 0,
    },
    "scope_compliance": {
        "compliant": 2,
        "partially_compliant": 1,
        "non_compliant": 0,
    },
    "first_time_right": {
        "correct": 2,
        "clarification_needed": 1,
        "correction_needed": 0,
    },
}

# Dimension names in declaration order; rendering relies on this order.
_DIMENSION_NAMES = [*_DIMENSION_SCORES]

# The two headline metrics. Reports print these with their justifications and
# list every other metric in a compact scorecard.
_PRIMARY_METRICS = {"response_usefulness", "task_grounding"}

_ICON_PASS = "✅"
_ICON_WARN = "⚠️"
_ICON_FAIL = "❌"
_ICON_NEUTRAL = "➖"  # neutral: no tool was needed (a correct outcome)

# Category → scorecard icon. Keyed by category name alone, so a category that
# appears in two dimensions ("correct") shares one icon.
_SCORECARD_ICONS = {
    "correct": _ICON_PASS,
    "mostly_correct": _ICON_WARN,
    "incorrect": _ICON_FAIL,
    "proper": _ICON_PASS,
    "no_tool_needed": _ICON_NEUTRAL,
    "partial": _ICON_WARN,
    "none": _ICON_FAIL,
    "specific": _ICON_PASS,
    "somewhat_specific": _ICON_WARN,
    "vague": _ICON_FAIL,
    "compliant": _ICON_PASS,
    "partially_compliant": _ICON_WARN,
    "non_compliant": _ICON_FAIL,
    "clarification_needed": _ICON_WARN,
    "correction_needed": _ICON_FAIL,
}


def _zero_score_category(category_scores):
    """Return the first category whose score is 0, or None if there is none."""
    for category, score in category_scores.items():
        if score == 0:
            return category
    return None


# Dimension → its worst (score-0) category, used for the "Low X" report
# sections. A dimension without any score-0 category is left out of the map
# instead of failing at import time.
_DIMENSION_LOW_CATEGORIES = {
    dim: low_cat
    for dim, cats in _DIMENSION_SCORES.items()
    if (low_cat := _zero_score_category(cats))
}

# Short descriptions for the markdown report's Quality Dimensions table.
+_DIMENSION_DESCRIPTIONS = { + "correctness": "Are the facts in the response accurate?", + "tool_usage": "Did the agent use its tools to verify facts?", + "specificity": "Does the response include specific numbers, dates, limits?", + "scope_compliance": "Did the agent correctly handle in-scope vs out-of-scope?", + "first_time_right": "Was the first response correct without user corrections?", +} + + +def _compute_dimension_averages(report): + """Compute average 0-2 score for each fine-grained dimension.""" + dim_totals = {d: [] for d in _DIMENSION_NAMES} + for sr in report.session_results: + for mr in sr.metrics: + if mr.metric_name in _DIMENSION_SCORES: + score_map = _DIMENSION_SCORES[mr.metric_name] + if mr.parse_error or mr.category not in score_map: + continue + dim_totals[mr.metric_name].append(score_map[mr.category]) + return { + d: round(sum(scores) / len(scores), 2) if scores else 0 + for d, scores in dim_totals.items() + } + + +def _has_dimension_data(dim_avgs): + """True when the quality dimensions were actually scored. + + A run with ``--dimensions primary`` (or any run that scored no dimension + metrics) yields all-zero averages. Treating that as real data would render a + misleading "every dimension is 0.0 / failing" report, so all three output + paths (console, markdown, JSON) gate the dimension block on this predicate. 
+ """ + return any(v > 0 for v in dim_avgs.values()) + + +def _compute_multiturn_stats(resolved_map): + """Compute multi-turn efficiency statistics from resolved traces.""" + user_turns = [r.get("user_turns", 0) for r in resolved_map.values()] + tool_calls = [r.get("tool_calls", 0) for r in resolved_map.values()] + corrections = [r.get("corrections", 0) for r in resolved_map.values()] + verifications = [r.get("verifications", 0) for r in resolved_map.values()] + total = len(user_turns) + if not total: + return { + "avg_user_turns": 0, + "avg_tool_calls": 0, + "multi_turn_sessions": 0, + } + mt_count = sum(1 for t in user_turns if t > 1) + stats = { + "avg_user_turns": round(sum(user_turns) / total, 1), + "avg_tool_calls": round(sum(tool_calls) / total, 1), + "multi_turn_sessions": mt_count, + } + if mt_count > 0: + stats["correction_rate"] = round( + sum(1 for c in corrections if c > 0) / total * 100, 1 + ) + stats["verification_rate"] = round( + sum(1 for v in verifications if v > 0) / total * 100, 1 + ) + stats["avg_corrections"] = round(sum(corrections) / total, 2) + stats["avg_verifications"] = round(sum(verifications) / total, 2) + return stats + def _print_eval_results( report, resolved_map, samples=None, unhelpful_threshold=10.0 @@ -832,13 +2273,7 @@ def _print_eval_results( } # --- Per-session details --- - _default_samples = { - "unhelpful": 10, - "partial": 5, - "meaningful": 3, - "declined": 3, - "unknown": 3, - } + samples_dict = _parse_samples(samples) for cat, cat_label in [ ("unhelpful", "UNHELPFUL"), ("partial", "PARTIAL"), @@ -846,11 +2281,8 @@ def _print_eval_results( ("meaningful", "MEANINGFUL"), ("unknown", "UNCLASSIFIED (parse errors)"), ]: - limit = ( - len(by_category.get(cat, [])) - if samples == "all" - else (int(samples) if samples else _default_samples.get(cat, 5)) - ) + cat_limit = _get_sample_limit(samples_dict, cat) + limit = len(by_category.get(cat, [])) if cat_limit is None else cat_limit sessions = by_category.get(cat, []) if not 
sessions: continue @@ -877,7 +2309,10 @@ def _print_eval_results( print(f" Question: {q}") print(f' Response: "{r}"') + # Primary metrics with justifications for mr in sr.metrics: + if mr.metric_name not in _PRIMARY_METRICS: + continue mr_label = _category_label(mr.category) if mr.parse_error: mr_label += " [parse error]" @@ -889,6 +2324,17 @@ def _print_eval_results( raw = mr.raw_response[:300] print(f" {'Raw LLM out:':<15}{repr(raw)}") + # Compact scorecard for quality dimensions + dim_parts = [] + for mr in sr.metrics: + if mr.metric_name in _PRIMARY_METRICS: + continue + display_name = _METRIC_LABELS.get(mr.metric_name, mr.metric_name) + mr_label = _category_label(mr.category) + dim_parts.append(f"{display_name}: {mr_label}") + if dim_parts: + print(f" {'Dimensions:':<15}{' | '.join(dim_parts)}") + # --- Per-agent breakdown --- agent_stats = _build_agent_stats(report, resolved_map) @@ -1021,8 +2467,59 @@ def _print_eval_results( if a2a_session_ids: print(f" A2A sessions detected : {len(a2a_session_ids)}") + # --- Failure breakdown: skill gap vs knowledge gap vs tool gap --- + counts, _ = _failure_breakdown_from_report(report) + total_sessions = report.total_sessions or 1 + if _has_failure_attribution_data(report) and any(counts.values()): + unaddressable = counts["knowledge_gap"] + counts["tool_gap"] + addressable = total_sessions - unaddressable + good = sum( + 1 + for sr in report.session_results + for mr in sr.metrics + if mr.metric_name == "response_usefulness" + and mr.category in ("meaningful", "declined") + ) + addr_rate = (good / addressable * 100) if addressable else 0.0 + print( + f" Failure causes : " + f"skill={counts['skill_gap']} (evolution) " + f"knowledge={counts['knowledge_gap']} (add data) " + f"tool={counts['tool_gap']} (build tool)" + ) + print( + f" Addressable meaningful : {addr_rate:.1f}%" + f" (excludes {unaddressable} unaddressable gaps)" + ) + + # --- Dimension averages (0-2 scale) --- + dim_avgs = _compute_dimension_averages(report) + 
if _has_dimension_data(dim_avgs): + print(f"\n Quality Dimensions (0-2 scale):") + for dim, avg in dim_avgs.items(): + bar = "#" * int(avg * 25) + label = _METRIC_LABELS.get(dim, dim) + print(f" {label:<20s}: {avg:.2f} / 2.00 {bar}") + desc = _DIMENSION_DESCRIPTIONS.get(dim) + if desc: + print(f" {'':<20s} ↳ {desc}") + + # --- Multi-turn efficiency --- + mt_stats = _compute_multiturn_stats(resolved_map) + if mt_stats: + print(f"\n Multi-Turn Efficiency:") + print(f" Avg user turns : {mt_stats['avg_user_turns']}") + print(f" Avg tool calls : {mt_stats['avg_tool_calls']}") + if mt_stats["multi_turn_sessions"] > 0: + print(f" Multi-turn sessions : {mt_stats['multi_turn_sessions']}") + if "correction_rate" in mt_stats: + print(f" Correction rate : {mt_stats['correction_rate']}%") + print(f" Verification rate : {mt_stats['verification_rate']}%") + print("\n Category Distributions:") for metric_name, dist in report.category_distributions.items(): + if metric_name not in _PRIMARY_METRICS: + continue print(f"\n [{metric_name}]") dist_total = sum(dist.values()) for category, count in sorted(dist.items(), key=lambda x: -x[1]): @@ -1058,24 +2555,791 @@ def _print_eval_results( # --------------------------------------------------------------------------- # Markdown report generation # --------------------------------------------------------------------------- +# Execution trajectory fetching +# --------------------------------------------------------------------------- -def _write_md_report(report, resolved_map, args): - lines = [] - w = lines.append +def _import_render_timing_tree(): + """Import render_timing_tree from latency_report.py.""" + try: + from latency_report import render_timing_tree - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - w("# Quality Evaluation Report") + return render_timing_tree + except ImportError: + pass + try: + import importlib.util + + _lr_path = os.path.join(_script_dir, "latency_report.py") + spec = 
importlib.util.spec_from_file_location("latency_report", _lr_path) + _lr = importlib.util.module_from_spec(spec) + spec.loader.exec_module(_lr) + return _lr.render_timing_tree + except Exception: + return None + + +def _render_trace(trace, header=True): + """Render a Trace object as a timing tree string.""" + render_fn = _import_render_timing_tree() + if not render_fn: + return "" + rendered = render_fn(trace) + if not header: + lines = rendered.split("\n") + if len(lines) > 3: + return "\n".join(lines[3:]) + return rendered + + +def _segment_trace_by_turns(trace, conversation, sub_trajectories): + """Segment an execution trace at correction boundaries. + + Maps conversation turn indices to USER_MESSAGE_RECEIVED spans in the trace, + then splits the trace into sub-segments aligned with correction sub-trajectories. + + Returns a list of dicts: {label, outcome, start_turn, end_turn, trace: str} + """ + if not sub_trajectories or not trace or not trace.spans or not conversation: + return [] + + user_msg_spans = sorted( + [s for s in trace.spans if s.event_type == "USER_MESSAGE_RECEIVED"], + key=lambda s: s.timestamp, + ) + if not user_msg_spans: + return [] + + user_turn_indices = [ + i for i, t in enumerate(conversation) if t.get("role") == "user" + ] + + conv_idx_to_trace_span = {} + for j, conv_idx in enumerate(user_turn_indices): + if j < len(user_msg_spans): + conv_idx_to_trace_span[conv_idx] = j + + turn_timestamps = [s.timestamp for s in user_msg_spans] + trace_end = trace.end_time or ( + max(s.timestamp for s in trace.spans) if trace.spans else None + ) + + from bigquery_agent_analytics.trace import Trace + + segments = [] + for st in sub_trajectories: + start_turn = st.get("start_turn", 0) + end_turn = st.get("end_turn", len(conversation) - 1) + outcome = st.get("outcome", "") + + if outcome == "wrong" and end_turn > start_turn: + next_st = next( + (s for s in sub_trajectories if s.get("start_turn", 0) > start_turn), + None, + ) + if next_st: + end_turn = 
min(end_turn, next_st.get("start_turn", end_turn) - 1) + + start_user_indices = [ + ci for ci in user_turn_indices if start_turn <= ci <= end_turn + ] + if not start_user_indices: + continue + + first_ci = start_user_indices[0] + last_ci = start_user_indices[-1] + first_span_idx = conv_idx_to_trace_span.get(first_ci) + last_span_idx = conv_idx_to_trace_span.get(last_ci) + if first_span_idx is None: + continue + + window_start = turn_timestamps[first_span_idx] + is_last_segment = True + if last_span_idx is not None and last_span_idx + 1 < len(turn_timestamps): + window_end = turn_timestamps[last_span_idx + 1] + is_last_segment = False + else: + window_end = trace_end + + if window_end is None: + continue + + sub_spans = [ + s + for s in trace.spans + if s.timestamp >= window_start + and ( + s.timestamp <= window_end + if is_last_segment + else s.timestamp < window_end + ) + ] + if not sub_spans: + continue + + mini_trace = Trace( + trace_id=trace.trace_id, + session_id=trace.session_id, + spans=sub_spans, + ) + rendered = _render_trace(mini_trace, header=False) + if rendered: + segments.append( + { + "label": st.get("label", ""), + "outcome": st.get("outcome", ""), + "start_turn": start_turn, + "end_turn": end_turn, + "trace": rendered, + } + ) + + return segments + + +def _fetch_session_traces(session_ids, max_sessions=3): + """Fetch execution traces from BigQuery for the given session IDs. + + Returns a dict mapping session_id -> Trace object. + Silently returns empty dict if BQ is not configured or unavailable. 
+ """ + if not session_ids: + return {} + + try: + from bigquery_agent_analytics import Client + except ImportError: + logger.debug( + "Cannot import bigquery_agent_analytics, skipping trajectories" + ) + return {} + + if not _import_render_timing_tree(): + logger.debug("Cannot import latency_report, skipping trajectories") + return {} + + if DATASET_ID == "local" or not PROJECT_ID: + logger.debug("BQ not configured (DATASET_ID=local), skipping trajectories") + return {} + + try: + client = Client( + project_id=PROJECT_ID, + dataset_id=DATASET_ID, + table_id=TABLE_ID, + location=DATASET_LOCATION, + ) + except Exception: + logger.debug("Failed to create BQ client", exc_info=True) + return {} + + def _fetch_one(sid): + try: + trace = client.get_session_trace(sid) + if trace and trace.spans: + return (sid, trace) + except Exception: + logger.debug("Failed to fetch trace for %s", sid, exc_info=True) + return None + + traces = {} + with ThreadPoolExecutor(max_workers=10) as executor: + results = executor.map(_fetch_one, session_ids[:max_sessions]) + for result in results: + if result: + sid, trace = result + traces[sid] = trace + return traces + + +def _select_trajectory_sessions(report, resolved_map, n): + """Pick the N most interesting sessions for trajectory display. + + Priority: unhelpful with corrections > unhelpful > partial > corrections > any. 
+ """ + by_category = _group_by_category(report) + candidates = [] + + unhelpful_sids = {sr.session_id for sr in by_category.get("unhelpful", [])} + partial_sids = {sr.session_id for sr in by_category.get("partial", [])} + correction_sids = { + sid + for sid, ctx in resolved_map.items() + if ctx.get("correction_boundaries") + } + + for sid in unhelpful_sids & correction_sids: + candidates.append(sid) + for sid in unhelpful_sids - correction_sids: + candidates.append(sid) + for sid in partial_sids: + if sid not in candidates: + candidates.append(sid) + for sid in correction_sids - unhelpful_sids - partial_sids: + candidates.append(sid) + + if len(candidates) < n: + for sr in report.session_results: + if sr.session_id not in candidates: + candidates.append(sr.session_id) + if len(candidates) >= n: + break + + return candidates[:n] + + +def _md_write_trajectory_section(w, trajectories, resolved_map): + """Write the Sample Trajectories section to the markdown report.""" + if not trajectories: + return + + w("## Sample Execution Trajectories") w("") - w(f"**Generated:** {timestamp} ") - w(f"**Project:** {PROJECT_ID} ") - w(f"**Dataset:** {DATASET_ID}.{TABLE_ID} ") - w(f"**Location:** {DATASET_LOCATION} ") - model = args.model or EVAL_MODEL_ID - w(f"**Eval model:** {model} ") - w(f"**Sessions:** {report.total_sessions} ") + w( + "Full execution traces showing agent routing, tool calls, and LLM " + "requests. These reveal *why* an answer was wrong — did the agent " + "skip a tool call, call the wrong tool, or get misrouted?" 
+ ) + w("") + + for sid, trace_obj in trajectories.items(): + ctx = resolved_map.get(sid, {}) + # Skip correction sessions — their traces are shown in Correction Analysis + if ctx.get("correction_boundaries"): + continue + question = ctx.get("question", "") + answered_by = ctx.get("answered_by", "") + q = " ".join(question.split()) if question else "(none)" + + w(f"### `{sid}` → {answered_by}") + w("") + w(f"**Question:** {q}") + w("") + + tree = ( + _render_trace(trace_obj) + if hasattr(trace_obj, "spans") + else str(trace_obj) + ) + w("```") + w(tree) + w("```") + w("") + + sub_trajs = ctx.get("sub_trajectories", []) + conversation = ctx.get("conversation", []) + if sub_trajs and conversation and hasattr(trace_obj, "spans"): + segments = _segment_trace_by_turns(trace_obj, conversation, sub_trajs) + if segments: + w("**Sub-trajectory segmentation:**") + w("") + for seg in segments: + outcome_icon = "+" if seg["outcome"] == "recovered" else "-" + w( + f"#### [{outcome_icon}] {seg['label']} " + f"(turns {seg['start_turn']}-{seg['end_turn']}) " + f"→ {seg['outcome']}" + ) + w("") + w("```") + w(seg["trace"]) + w("```") + w("") + + +# --------------------------------------------------------------------------- +# Markdown report generation +# --------------------------------------------------------------------------- + + +def _md_dimension_scorecard(sr): + """Build a compact one-line scorecard for the 5 quality dimensions.""" + parts = [] + for mr in sr.metrics: + # Only the 0-2 quality dimensions belong in the scorecard \u2014 skip primary + # metrics and non-dimension categoricals (e.g. failure_attribution). + if mr.metric_name not in _DIMENSION_SCORES: + continue + label = _METRIC_LABELS.get(mr.metric_name, mr.metric_name) + icon = _SCORECARD_ICONS.get(mr.category, "\u2753") + parts.append(f"{label} {icon}") + return " | ".join(parts) + + +def _md_write_conversation(w, conversation, show_tags=False, turn_tags=None): + """Write a
conversation block for multi-turn sessions.""" + if not conversation or len(conversation) < 2: + return + tag_by_idx = {} + if show_tags and turn_tags: + tag_by_idx = {t["turn_index"]: t.get("tag", "") for t in turn_tags} + w("") + w("
Conversation") + w("") + for i, turn in enumerate(conversation): + role = turn.get("role", "user") + text = turn.get("text", "") + tag = "" + if show_tags: + tag = turn.get("inferred_tag", "") or tag_by_idx.get(i, "") + if tag and role == "user": + w(f" **{role}** `[{tag}]`**:** {text}") + else: + w(f" **{role}:** {text}") + w("") + w("
") + + +def _md_write_session_section( + w, + title, + sessions, + md_samples, + resolved_map, + a2a_session_ids, + heading_level=2, +): + """Write a section of per-session details to the markdown report.""" + h = "#" * heading_level + sh = "#" * (heading_level + 1) + shown = sessions if md_samples is None else sessions[:md_samples] + w(f"{h} {title}") + if len(shown) < len(sessions): + w(f"\n*Showing {len(shown)} of {len(sessions)}*") + w("") + for sr in shown: + sid = sr.session_id + ctx = resolved_map.get(sid, {}) + question = ctx.get("question", "") + response = ctx.get("response", "") + answered_by = ctx.get("answered_by", "") + a2a_tag = " [A2A]" if sid in a2a_session_ids else "" + + q = " ".join(question.split()) if question else "(none)" + r = " ".join(response.split()) if response else "(none)" + + w(f"{sh} `{sid}`{a2a_tag} \u2192 {answered_by}") + w("") + w(f"- **Question:** {q}") + r_display = (r[:500] + "\u2026") if len(r) > 500 else r + w(f"- **Response:** {r_display}") + + for mr in sr.metrics: + if mr.metric_name not in _PRIMARY_METRICS: + continue + label = _category_label(mr.category) + display = _METRIC_LABELS.get(mr.metric_name, mr.metric_name) + w(f"- **{display}:** {label}") + if mr.justification: + w(f" - *{mr.justification}*") + + scorecard = _md_dimension_scorecard(sr) + if scorecard: + w(f"- **Dimensions:** {scorecard}") + + conversation = ctx.get("conversation", []) + _md_write_conversation(w, conversation) + w("") + + +def _md_find_low_dimension_sessions(report, dimension, low_category): + """Find sessions that scored the lowest category on a dimension.""" + results = [] + for sr in report.session_results: + for mr in sr.metrics: + if mr.metric_name == dimension and mr.category == low_category: + results.append((sr, mr)) + break + return results + + +def _md_write_low_dimension_section( + w, + title, + dimension_label, + report, + dimension, + low_category, + md_samples, + resolved_map, + heading_level=2, +): + """Write a Low X Sessions 
section in the markdown report.""" + h = "#" * heading_level + sh = "#" * (heading_level + 1) + low_sessions = _md_find_low_dimension_sessions( + report, + dimension, + low_category, + ) + if not low_sessions: + return + shown = low_sessions if md_samples is None else low_sessions[:md_samples] + w(f"{h} {title}") + w("") + if len(shown) < len(low_sessions): + w(f"*Showing {len(shown)} of {len(low_sessions)}*") + w("") + for sr, mr in shown: + sid = sr.session_id + ctx = resolved_map.get(sid, {}) + question = ctx.get("question", "") + response = ctx.get("response", "") + answered_by = ctx.get("answered_by", "") + + q = " ".join(question.split()) if question else "(none)" + r = " ".join(response.split()) if response else "(none)" + + w(f"{sh} `{sid}` → {answered_by}") + w("") + w(f"- **Question:** {q}") + r_display = (r[:500] + "…") if len(r) > 500 else r + w(f"- **Response:** {r_display}") + label = _category_label(mr.category) + w(f"- **{dimension_label}:** {label}") + if mr.justification: + w(f" - *{mr.justification}*") + + conversation = ctx.get("conversation", []) + _md_write_conversation(w, conversation) + w("") + + +def _md_has_turn_tags(resolved_map): + """Check if any session in the resolved map has turn tag data.""" + for ctx in resolved_map.values(): + if ctx.get("turn_tags") or ctx.get("correction_boundaries"): + return True + return False + + +_TAG_ICONS = { + "CORRECTION": "\U0001f534", + "VERIFY": "\U0001f7e1", + "SPECIFICS": "\U0001f535", + "SCOPE": "\U0001f7e0", + "FOLLOWUP": "✅", + "END": "⬜", +} + + +def _diagnose_correction_trace(trace_obj): + """Analyze a correction session trace and return a diagnosis string. + + Returns (diagnosis_text, failure_type) where failure_type is one of: + 'routing_failure', 'tool_failure', 'other', or None if no trace. 
+ """ + if not trace_obj or not hasattr(trace_obj, "spans") or not trace_obj.spans: + return None, None + + tool_names = set() + for s in trace_obj.spans: + tn = getattr(s, "tool_name", None) + if tn: + tool_names.add(tn) + + routing_tools = {t for t in tool_names if "transfer" in t.lower()} + domain_tools = tool_names - routing_tools + agents = { + s.agent + for s in trace_obj.spans + if s.agent and s.event_type == "LLM_RESPONSE" + } + + if not tool_names and len(agents) <= 1: + return ( + "Agent never routed to a specialist or called any tool — " + "answered from general LLM knowledge only." + ), "routing_failure" + + if routing_tools and not domain_tools and len(agents) > 1: + routed_to = ", ".join(sorted(agents - {min(agents)})) + return ( + f"Agent routed to {routed_to} but no domain tool was called." + ), "tool_failure" + + return None, None + + +def _md_write_correction_analysis( + w, resolved_map, md_samples, trajectories=None, heading_level=2 +): + """Write the Correction Analysis section.""" + sessions_with_tags = [] + sessions_with_corrections = [] + tag_counts = {} + + for sid, ctx in resolved_map.items(): + tags = ctx.get("turn_tags", []) + boundaries = ctx.get("correction_boundaries", []) + if tags: + sessions_with_tags.append((sid, ctx)) + for t in tags: + tag = t.get("tag", "") + tag_counts[tag] = tag_counts.get(tag, 0) + 1 + if boundaries: + sessions_with_corrections.append((sid, ctx)) + + if not sessions_with_tags: + return + + h = "#" * heading_level + h1 = "#" * (heading_level + 1) + h2 = "#" * (heading_level + 2) + w(f"{h} Correction Analysis") + w("") + w( + "Turn-level classification of user behavior across multi-turn " + "conversations. Each user turn is tagged to identify corrections, " + "verifications, and other interaction patterns." 
+ ) + w("") + + # --- Tag Distribution --- + w(f"{h1} Turn Tag Distribution") + w("") + w("| Tag | Count | Icon | Meaning |") + w("|-----|------:|------|---------|") + tag_descriptions = { + "CORRECTION": "User corrects a factual error by the agent", + "VERIFY": "User doubts the answer without providing the correct fact", + "SPECIFICS": "User asks for concrete details the agent omitted", + "SCOPE": "User flags the agent answered something outside its scope", + "FOLLOWUP": "Normal follow-up question; previous answer was acceptable", + "END": "User is satisfied, conversation closing", + } + for tag in ("CORRECTION", "VERIFY", "SPECIFICS", "SCOPE", "FOLLOWUP", "END"): + count = tag_counts.get(tag, 0) + icon = _TAG_ICONS.get(tag, "") + desc = tag_descriptions.get(tag, "") + w(f"| {tag} | {count} | {icon} | {desc} |") + w("") + + total_tagged = len(sessions_with_tags) + total_corrections = len(sessions_with_corrections) + w(f"- **Sessions with turn tags:** {total_tagged}") + w(f"- **Sessions with corrections:** {total_corrections}") w("") + # --- Correction Boundaries --- + if sessions_with_corrections: + w(f"{h1} Corrections") + w("") + w( + "Conversations where the user corrected the agent. Shows what " + "the agent got wrong, what the user corrected, and whether the " + "agent recovered." 
+ ) + w("") + + shown = ( + sessions_with_corrections + if md_samples is None + else sessions_with_corrections[:md_samples] + ) + if len(shown) < len(sessions_with_corrections): + w(f"*Showing {len(shown)} of {len(sessions_with_corrections)}*") + w("") + + if not trajectories: + trajectories = {} + + routing_failures = [] + + for sid, ctx in shown: + question = ctx.get("question", "") + answered_by = ctx.get("answered_by", "") + q = " ".join(question.split()) if question else "(none)" + w(f"{h2} `{sid}` → {answered_by}") + w("") + w(f"- **Question:** {q}") + + for b in ctx.get("correction_boundaries", []): + turn_idx = b.get("turn_index", "?") + wrong = b.get("wrong_claim", "") + correct = b.get("correct_fact", "") + recovered = b.get("agent_recovered", False) + recovered_icon = "✅ Yes" if recovered else "❌ No" + w(f"- **Correction at turn {turn_idx}:**") + w(f' - Agent claimed: *"{wrong[:200]}"*') + w(f' - User corrected: *"{correct[:200]}"*') + w(f" - Agent recovered: {recovered_icon}") + + trace_obj = trajectories.get(sid) + diagnosis, failure_type = _diagnose_correction_trace(trace_obj) + if diagnosis: + w(f"- **Diagnosis:** {diagnosis}") + if failure_type == "routing_failure": + routing_failures.append((sid, answered_by, q)) + + # Render sub-trajectories with inline execution traces + sub_trajs = ctx.get("sub_trajectories", []) + trace_obj = trajectories.get(sid) + conversation = ctx.get("conversation", []) + + if sub_trajs and trace_obj and hasattr(trace_obj, "spans"): + segments = _segment_trace_by_turns( + trace_obj, + conversation, + sub_trajs, + ) + if segments: + w("") + for seg in segments: + outcome = seg.get("outcome", "?") + if outcome == "wrong": + heading = "Before correction" + outcome_suffix = "agent got it wrong" + outcome_icon = "❌" + elif outcome == "recovered": + heading = "After correction" + outcome_suffix = "agent recovered" + outcome_icon = "✅" + elif outcome == "parroted": + heading = "After correction" + outcome_suffix = "agent parroted 
user's fact without verification" + outcome_icon = "🔁" + elif outcome == "not_recovered": + heading = "After correction" + outcome_suffix = "agent did not recover" + outcome_icon = "❌" + else: + heading = seg.get("label", "Segment") + outcome_suffix = outcome + outcome_icon = "➖" + w( + f"**{heading}** (turns {seg['start_turn']}–" + f"{seg['end_turn']}) — {outcome_suffix} {outcome_icon}" + ) + w("") + w("```") + w(seg["trace"]) + w("```") + w("") + elif sub_trajs: + w("- **Sub-trajectories:**") + for st in sub_trajs: + label = st.get("label", "") + start = st.get("start_turn", "?") + end = st.get("end_turn", "?") + outcome = st.get("outcome", "?") + outcome_icon = ( + "❌" + if outcome in ("wrong", "not_recovered") + else "✅" + if outcome == "recovered" + else "🔁" + if outcome == "parroted" + else "➖" + ) + w(f" - `{label}`: turns {start}–{end} → {outcome_icon} {outcome}") + + _md_write_conversation( + w, + conversation, + show_tags=True, + turn_tags=ctx.get("turn_tags", []), + ) + w("") + + if routing_failures: + w(f"{h1} Routing Failures") + w("") + w( + "Sessions where the supervisor agent answered from general LLM " + "knowledge without routing to a specialist agent or calling any " + "tool. These are prime candidates for improving the supervisor's " + "routing prompt." 
+ ) + w("") + w( + f"**{len(routing_failures)}** of " + f"{len(sessions_with_corrections)} correction sessions " + f"had no tool or agent routing:" + ) + w("") + for sid, agent, question in routing_failures: + w(f"- `{sid}` → {agent}: {question}") + w("") + + # --- Tagged Conversations (no corrections) --- + tagged_no_correction = [ + (sid, ctx) + for sid, ctx in sessions_with_tags + if not ctx.get("correction_boundaries") + ] + has_interesting = any( + any( + t.get("tag") in ("VERIFY", "SPECIFICS", "SCOPE") + for t in ctx.get("turn_tags", []) + ) + for _, ctx in tagged_no_correction + ) + if has_interesting: + w(f"{h1} Other Flagged Interactions") + w("") + w( + "Sessions without corrections but with verification requests, " + "specificity asks, or scope flags." + ) + w("") + + interesting = [ + (sid, ctx) + for sid, ctx in tagged_no_correction + if any( + t.get("tag") in ("VERIFY", "SPECIFICS", "SCOPE") + for t in ctx.get("turn_tags", []) + ) + ] + shown = interesting if md_samples is None else interesting[:md_samples] + if len(shown) < len(interesting): + w(f"*Showing {len(shown)} of {len(interesting)}*") + w("") + + for sid, ctx in shown: + question = ctx.get("question", "") + answered_by = ctx.get("answered_by", "") + q = " ".join(question.split()) if question else "(none)" + tags = ctx.get("turn_tags", []) + flag_tags = [ + t for t in tags if t.get("tag") in ("VERIFY", "SPECIFICS", "SCOPE") + ] + + w(f"{h2} `{sid}` → {answered_by}") + w("") + w(f"- **Question:** {q}") + for ft in flag_tags: + tag = ft.get("tag", "") + icon = _TAG_ICONS.get(tag, "") + evidence = ft.get("evidence", "") + w( + f"- **Turn {ft.get('turn_index', '?')}:** {icon} `{tag}` — {evidence}" + ) + + conversation = ctx.get("conversation", []) + _md_write_conversation( + w, + conversation, + show_tags=True, + turn_tags=ctx.get("turn_tags", []), + ) + w("") + + +def _write_md_report( + report, + resolved_map, + args, + report_dir=None, + trajectories=None, +): + lines = [] + w = lines.append + 
+ if trajectories is None: + trajectories = {} + by_category = _group_by_category(report) a2a_session_ids = { sid for sid, ctx in resolved_map.items() if ctx.get("is_a2a") @@ -1088,10 +3352,102 @@ def _write_md_report(report, resolved_map, args): unknown_count = len(by_category.get("unknown", [])) total = report.total_sessions fp_rate = (fp_count / total * 100) if total > 0 else 0.0 + dim_avgs = _compute_dimension_averages(report) + mt_stats = _compute_multiturn_stats(resolved_map) + agent_stats = _build_agent_stats(report, resolved_map) + + has_dims = _has_dimension_data(dim_avgs) + low_dims = {} + for dim, low_cat in _DIMENSION_LOW_CATEGORIES.items(): + sessions = _md_find_low_dimension_sessions(report, dim, low_cat) + if sessions: + low_dims[dim] = sessions + + # --- TOC --- + w("# Quality Evaluation Report") + w("") + toc = [] + toc.append("* [Quality Evaluation Report](#quality-evaluation-report)") + toc.append(" * [Summary](#summary)") + if has_dims: + toc.append(" * [Quality Dimensions](#quality-dimensions)") + toc.append(" * [Category Distributions](#category-distributions)") + for metric_name in report.category_distributions: + if metric_name in _PRIMARY_METRICS: + toc.append(f" * [{metric_name}](#{metric_name})") + if agent_stats: + toc.append(" * [Per-Agent Quality](#per-agent-quality)") + if mt_stats: + toc.append(" * [Multi-Turn Efficiency](#multi-turn-efficiency)") + has_tags = _md_has_turn_tags(resolved_map) + has_sample_sessions = ( + by_category.get("unhelpful") + or by_category.get("declined") + or low_dims + or by_category.get("partial") + or has_tags + ) + if has_sample_sessions: + toc.append(" * [Sample Sessions](#sample-sessions)") + if by_category.get("unhelpful"): + toc.append(" * [Unhelpful Sessions](#unhelpful-sessions)") + if by_category.get("declined"): + toc.append(" * [Declined Sessions](#declined-sessions)") + for dim in low_dims: + label = _METRIC_LABELS.get(dim, dim) + title = f"Low {label} Sessions" + anchor = 
title.lower().replace(" ", "-") + toc.append(f" * [{title}](#{anchor})") + if by_category.get("partial"): + toc.append(" * [Partial Sessions](#partial-sessions)") + if has_tags: + toc.append(" * [Correction Analysis](#correction-analysis)") + toc.append(" * [Turn Tag Distribution](#turn-tag-distribution)") + correction_sessions = [ + sid + for sid, ctx in resolved_map.items() + if ctx.get("correction_boundaries") + ] + if correction_sessions: + toc.append(" * [Corrections](#corrections)") + has_routing_failures = any( + _diagnose_correction_trace(trajectories.get(sid))[1] + == "routing_failure" + for sid in correction_sessions + ) + if has_routing_failures: + toc.append(" * [Routing Failures](#routing-failures)") + if trajectories: + toc.append(" * [Sample Trajectories]" "(#sample-execution-trajectories)") + toc.append(" * [Execution Details](#execution-details)") + for line in toc: + w(line) + w("") + w("") + w("") # --- Summary --- w("## Summary") w("") + + model = args.model or EVAL_MODEL_ID + cmd_parts = ["./scripts/quality_report.sh"] + sys.argv[1:] + if "--report" not in cmd_parts: + cmd_parts.insert(1, "--report") + w(f"Markdown report generated by `{' '.join(cmd_parts)}`.") + w("") + + # Render metadata as a bullet list rather than trailing-double-space GFM + # hard breaks — the latter trips `git diff --check` (PR #156/#174 L1). 
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + w(f"- **Generated:** {timestamp}") + w(f"- **Project:** {PROJECT_ID}") + if DATASET_ID != "local": + w(f"- **Dataset:** {DATASET_ID}.{TABLE_ID}") + w(f"- **Location:** {DATASET_LOCATION}") + w(f"- **Eval model:** {model}") + w(f"- **Sessions:** {total}") + w("") w("| Metric | Value |") w("|--------|-------|") w(f"| Total sessions | {total} |") @@ -1100,6 +3456,22 @@ def _write_md_report(report, resolved_map, args): w(f"| Partial | {partial_count} |") w(f"| Unhelpful | {fp_count} |") w(f"| Unhelpful rate | {fp_rate:.1f}% |") + counts, gap_sids = _failure_breakdown_from_report(report) + unaddressable = counts["knowledge_gap"] + counts["tool_gap"] + addressable = total - unaddressable + good = meaningful_count + declined_count + addr_rate = (good / addressable * 100) if addressable else 0.0 + if _has_failure_attribution_data(report) and any(counts.values()): + w(f"|   ↳ Skill gaps (evolution fixes) | {counts['skill_gap']} |") + w( + f"|   ↳ Knowledge gaps (add a fact) " + f"| {counts['knowledge_gap']} |" + ) + w(f"|   ↳ Tool gaps (build a tool) | {counts['tool_gap']} |") + w( + f"| **Addressable meaningful rate** " + f"(excl. 
knowledge + tool gaps) | **{addr_rate:.1f}%** |" + ) if unknown_count: parse_error_metrics = report.details.get("parse_errors", "?") w( @@ -1110,10 +3482,82 @@ def _write_md_report(report, resolved_map, args): w(f"| A2A sessions | {len(a2a_session_ids)} |") w("") - # --- Category Distributions --- + # --- Failure breakdown: which gaps evolution can vs cannot fix --- + def _gap_questions(sids): + out = [] + sid_set = set(sids) + for sr in report.session_results: + if sr.session_id in sid_set: + q = resolved_map.get(sr.session_id, {}).get("question", "") + if q: + out.append(" ".join(q.split())) + return out + + for gap_key, title, blurb in [ + ( + "knowledge_gap", + "Knowledge Gaps (add a fact to existing data)", + "In-scope questions the agent looked up correctly but its data source is" + " silent on. Evolution cannot invent these facts — a human adds them:", + ), + ( + "tool_gap", + "Tool Gaps (build a new tool / data source)", + "Requests no tool can serve — a topic with no data source, or personal" + " data / actions the agent has no capability for. An engineer must add a" + " tool:", + ), + ]: + questions = _gap_questions(gap_sids[gap_key]) + if not questions: + continue + w(f"### {title}") + w("") + w(blurb) + w("") + for q in questions[:15]: + w(f"- {q[:160]}") + if len(questions) > 15: + w(f"- …and {len(questions) - 15} more") + w("") + + # --- Quality Dimensions (0-2 scale) --- + _samples_dict = _parse_samples(args.samples) + + if has_dims: + w("## Quality Dimensions") + w("") + w( + "Each session is scored 0-2 on five dimensions. " + "Scores are averaged across all sessions." 
+ ) + w("") + w("| Dimension | Avg Score | Rating | What it measures |") + w("|-----------|----------:|--------|------------------|") + for dim, avg in dim_avgs.items(): + label = _METRIC_LABELS.get(dim, dim) + rating = ( + "\U0001f7e2" + if avg >= 1.5 + else ("\U0001f7e1" if avg >= 1.0 else "\U0001f534") + ) + desc = _DIMENSION_DESCRIPTIONS.get(dim, "") + w(f"| {label} | {avg:.2f} / 2.00 | {rating} | {desc} |") + w("") + w( + "*Rating: " + "\U0001f7e2 >= 1.50 (good) " + "| \U0001f7e1 >= 1.00 (needs attention) " + "| \U0001f534 < 1.00 (problem area)*" + ) + w("") + + # --- Category Distributions (primary metrics only) --- w("## Category Distributions") w("") for metric_name, dist in report.category_distributions.items(): + if metric_name not in _PRIMARY_METRICS: + continue w(f"### {metric_name}") w("") w("| Category | Count | % |") @@ -1126,7 +3570,6 @@ def _write_md_report(report, resolved_map, args): w("") # --- Per-Agent Quality --- - agent_stats = _build_agent_stats(report, resolved_map) if agent_stats: w("## Per-Agent Quality") w("") @@ -1162,118 +3605,95 @@ def _write_md_report(report, resolved_map, args): ) w("") - # --- Unhelpful Sessions --- - unhelpful_sessions = by_category.get("unhelpful", []) - _md_samples = ( - None - if args.samples == "all" - else (int(args.samples) if args.samples else None) - ) - if unhelpful_sessions: - shown = ( - unhelpful_sessions - if _md_samples is None - else unhelpful_sessions[:_md_samples] - ) - w("## Unhelpful Sessions") - if len(shown) < len(unhelpful_sessions): - w(f"\n*Showing {len(shown)} of {len(unhelpful_sessions)}*") + # --- Multi-Turn Efficiency --- + if mt_stats: + w("## Multi-Turn Efficiency") + w("") + w("| Metric | Value |") + w("|--------|-------|") + w(f"| Avg user turns | {mt_stats['avg_user_turns']} |") + w(f"| Avg tool calls | {mt_stats['avg_tool_calls']} |") + if mt_stats["multi_turn_sessions"] > 0: + w(f"| Multi-turn sessions | {mt_stats['multi_turn_sessions']} |") w("") - for sr in shown: - sid = 
sr.session_id - ctx = resolved_map.get(sid, {}) - question = ctx.get("question", "") - response = ctx.get("response", "") - answered_by = ctx.get("answered_by", "") - a2a_tag = " [A2A]" if sid in a2a_session_ids else "" - q = " ".join(question.split()) if question else "(none)" - r = " ".join(response.split()) if response else "(none)" + # --- Sample Sessions --- + has_sample_sessions = ( + by_category.get("unhelpful") + or by_category.get("declined") + or low_dims + or by_category.get("partial") + or has_tags + ) + if has_sample_sessions: + w("## Sample Sessions") + w("") - w(f"### `{sid}`{a2a_tag} \u2192 {answered_by}") - w("") - w(f"- **Question:** {q}") - r_display = (r[:500] + "\u2026") if len(r) > 500 else r - w(f"- **Response:** {r_display}") - for mr in sr.metrics: - label = _category_label(mr.category) - display = _METRIC_LABELS.get(mr.metric_name, mr.metric_name) - w(f"- **{display}:** {label}") - if mr.justification: - w(f" - *{mr.justification}*") - w("") + unhelpful_sessions = by_category.get("unhelpful", []) + if unhelpful_sessions: + _md_write_session_section( + w, + "Unhelpful Sessions", + unhelpful_sessions, + _get_sample_limit(_samples_dict, "unhelpful"), + resolved_map, + a2a_session_ids, + heading_level=3, + ) - # --- Declined Sessions --- declined_sessions = by_category.get("declined", []) if declined_sessions: - shown = ( - declined_sessions - if _md_samples is None - else declined_sessions[:_md_samples] + _md_write_session_section( + w, + "Declined Sessions", + declined_sessions, + _get_sample_limit(_samples_dict, "declined"), + resolved_map, + a2a_session_ids, + heading_level=3, ) - w("## Declined Sessions") - if len(shown) < len(declined_sessions): - w(f"\n*Showing {len(shown)} of {len(declined_sessions)}*") - w("") - for sr in shown: - sid = sr.session_id - ctx = resolved_map.get(sid, {}) - question = ctx.get("question", "") - response = ctx.get("response", "") - answered_by = ctx.get("answered_by", "") - a2a_tag = " [A2A]" if sid in 
a2a_session_ids else "" - - q = " ".join(question.split()) if question else "(none)" - r = " ".join(response.split()) if response else "(none)" - w(f"### `{sid}`{a2a_tag} \u2192 {answered_by}") - w("") - w(f"- **Question:** {q}") - r_display = (r[:500] + "\u2026") if len(r) > 500 else r - w(f"- **Response:** {r_display}") - for mr in sr.metrics: - label = _category_label(mr.category) - display = _METRIC_LABELS.get(mr.metric_name, mr.metric_name) - w(f"- **{display}:** {label}") - if mr.justification: - w(f" - *{mr.justification}*") - w("") + for dim, low_cat in _DIMENSION_LOW_CATEGORIES.items(): + if dim not in low_dims: + continue + label = _METRIC_LABELS.get(dim, dim) + _md_write_low_dimension_section( + w, + f"Low {label} Sessions", + label, + report, + dim, + low_cat, + _get_sample_limit(_samples_dict, "low"), + resolved_map, + heading_level=3, + ) - # --- Partial Sessions --- partial_sessions = by_category.get("partial", []) if partial_sessions: - shown = ( - partial_sessions - if _md_samples is None - else partial_sessions[:_md_samples] + _md_write_session_section( + w, + "Partial Sessions", + partial_sessions, + _get_sample_limit(_samples_dict, "partial"), + resolved_map, + a2a_session_ids, + heading_level=3, ) - w("## Partial Sessions") - if len(shown) < len(partial_sessions): - w(f"\n*Showing {len(shown)} of {len(partial_sessions)}*") - w("") - for sr in shown: - sid = sr.session_id - ctx = resolved_map.get(sid, {}) - question = ctx.get("question", "") - response = ctx.get("response", "") - answered_by = ctx.get("answered_by", "") - a2a_tag = " [A2A]" if sid in a2a_session_ids else "" - q = " ".join(question.split()) if question else "(none)" - r = " ".join(response.split()) if response else "(none)" + # --- Correction Analysis (turn tagging) --- + if has_tags: + _md_write_correction_analysis( + w, + resolved_map, + _get_sample_limit(_samples_dict, "corrections"), + trajectories=trajectories, + heading_level=3, + ) - w(f"### `{sid}`{a2a_tag} \u2192 
{answered_by}") - w("") - w(f"- **Question:** {q}") - r_display = (r[:500] + "\u2026") if len(r) > 500 else r - w(f"- **Response:** {r_display}") - for mr in sr.metrics: - label = _category_label(mr.category) - display = _METRIC_LABELS.get(mr.metric_name, mr.metric_name) - w(f"- **{display}:** {label}") - if mr.justification: - w(f" - *{mr.justification}*") - w("") + # --- Sample Execution Trajectories --- + if trajectories: + _md_write_trajectory_section(w, trajectories, resolved_map) # --- Execution Details --- w("## Execution Details") @@ -1287,7 +3707,8 @@ def _write_md_report(report, resolved_map, args): w("") # Write file - report_dir = os.path.join(_script_dir, "reports") + if report_dir is None: + report_dir = os.path.join(_script_dir, "reports") os.makedirs(report_dir, exist_ok=True) ts = datetime.now().strftime("%Y%m%d_%H%M%S") report_path = os.path.join(report_dir, f"quality_report_{ts}.md") @@ -1297,12 +3718,147 @@ def _write_md_report(report, resolved_map, args): return os.path.abspath(report_path) +# --------------------------------------------------------------------------- +# Failure attribution — skill gap vs knowledge gap vs tool gap +# --------------------------------------------------------------------------- +# +# Every failure (response_usefulness == "unhelpful") has one root cause, and +# each points to a DIFFERENT fixer: +# - skill_gap -> the agent had the tool + data but misbehaved (routing, +# tool-use, parroting, hallucination). Fixed by SKILL +# EVOLUTION (automatic). +# - knowledge_gap -> a tool that covers the topic was used correctly, but the +# specific fact is missing from its data. Fixed by a HUMAN +# adding a fact to the existing data source. +# - tool_gap -> no tool/capability can serve the request (a topic with no +# data source, or personal-data / action needs). Fixed by +# an ENGINEER building a new tool. +# +# The LLM judge's ``failure_attribution`` metric assigns the cause when present +# (it sees the tool inventory). 
Without it we fall back to a 2-way deterministic +# split (knowledge vs skill). Only skill gaps are addressable by evolution, so +# ``addressable_meaningful_rate`` excludes both knowledge and tool gaps. +_KNOWLEDGE_GAP_TOOL = {"proper"} +_KNOWLEDGE_GAP_CORRECTNESS = {"correct", "mostly_correct"} +_FAILURE_CLASSES = ("skill_gap", "knowledge_gap", "tool_gap") + + +def _failure_class(usefulness, tool, correctness, attribution=None): + """Classify a single session's failure (or None if it is not a failure). + + Prefers the LLM judge's ``failure_attribution`` (3-way: skill/knowledge/tool) + when available; otherwise falls back to a deterministic 2-way split — an + unhelpful session where the agent used its tools and did not fabricate is a + ``knowledge_gap``, anything else is a ``skill_gap``. + """ + # Meaningful / correctly-declined responses are not failures, regardless of + # any stray attribution — never count them as a gap (keeps addressable rate + # <= 100%). + if usefulness in ("meaningful", "declined"): + return None + # For an actual failure (unhelpful / partial), trust the judge's attribution + # when it named a concrete gap; otherwise fall back to the deterministic + # 2-way split (which only fires for unhelpful). + if attribution in _FAILURE_CLASSES: + return attribution + if usefulness != "unhelpful": + return None + if tool in _KNOWLEDGE_GAP_TOOL and correctness in _KNOWLEDGE_GAP_CORRECTNESS: + return "knowledge_gap" + return "skill_gap" + + +def _has_failure_attribution_data(report): + """True when failures can actually be attributed to a cause. + + The failure-cause taxonomy (skill/knowledge/tool gap) needs either the judge's + ``failure_attribution`` metric, or both ``tool_usage`` and ``correctness`` (the + deterministic 2-way fallback). When none were scored — e.g. 
``--dimensions + primary`` — ``_failure_class`` would default every failure to ``skill_gap``, + which reads as "no knowledge/tool gaps, just evolution work" when it is really + "those metrics weren't scored." So all output paths gate the failure breakdown + on this predicate (analogous to ``_has_dimension_data``). + """ + for sr in report.session_results: + cats = {mr.metric_name for mr in sr.metrics} + if "failure_attribution" in cats or ( + "tool_usage" in cats and "correctness" in cats + ): + return True + return False + + +def _failure_breakdown_from_report(report): + """Return (counts_by_class, gap_session_ids_by_class) from a raw report.""" + counts = {c: 0 for c in _FAILURE_CLASSES} + gap_sids = {c: [] for c in _FAILURE_CLASSES} + for sr in report.session_results: + cats = {mr.metric_name: mr.category for mr in sr.metrics} + fc = _failure_class( + cats.get("response_usefulness"), + cats.get("tool_usage"), + cats.get("correctness"), + cats.get("failure_attribution"), + ) + if fc in counts: + counts[fc] += 1 + gap_sids[fc].append(sr.session_id) + return counts, gap_sids + + +def _classify_failures(report): + """Tag each ``unhelpful`` session with a ``failure_class`` and add the + skill/knowledge/tool-gap summary metrics in place.""" + sessions = report.get("sessions", []) + summary = report.setdefault("summary", {}) + + counts = {c: 0 for c in _FAILURE_CLASSES} + gap_questions = {c: [] for c in _FAILURE_CLASSES} + for s in sessions: + metrics = s.get("metrics", {}) + fc = _failure_class( + metrics.get("response_usefulness", {}).get("category"), + metrics.get("tool_usage", {}).get("category"), + metrics.get("correctness", {}).get("category"), + metrics.get("failure_attribution", {}).get("category"), + ) + if fc in counts: + s["failure_class"] = fc + counts[fc] += 1 + q = s.get("question", "") + if q: + gap_questions[fc].append(q) + + total = summary.get("total_sessions") or len(sessions) + good = summary.get("meaningful", 0) + summary.get("declined", 0) + # 
Only skill gaps are addressable by evolution; knowledge + tool gaps need a + # human (add a fact) or an engineer (build a tool). + unaddressable = counts["knowledge_gap"] + counts["tool_gap"] + addressable = total - unaddressable + summary["skill_gap"] = counts["skill_gap"] + summary["knowledge_gap"] = counts["knowledge_gap"] + summary["tool_gap"] = counts["tool_gap"] + summary["knowledge_gap_rate"] = ( + round(counts["knowledge_gap"] / total * 100, 1) if total else 0 + ) + summary["tool_gap_rate"] = ( + round(counts["tool_gap"] / total * 100, 1) if total else 0 + ) + # Quality on questions the agent *can* answer (knowledge + tool gaps excluded) + # — the ceiling skill evolution is actually working toward. + summary["addressable_meaningful_rate"] = ( + round(good / addressable * 100, 1) if addressable else 0 + ) + summary["knowledge_gap_questions"] = gap_questions["knowledge_gap"][:50] + summary["tool_gap_questions"] = gap_questions["tool_gap"][:50] + + # --------------------------------------------------------------------------- # JSON report output # --------------------------------------------------------------------------- -def _build_json_output(report, resolved_map): +def _build_json_output(report, resolved_map, trajectories=None): """Build a structured dict for JSON output of evaluation results.""" by_category = _group_by_category(report) agent_stats = _build_agent_stats(report, resolved_map) @@ -1311,22 +3867,69 @@ def _build_json_output(report, resolved_map): for sr in report.session_results: ctx = resolved_map.get(sr.session_id, {}) metrics = {} + quality_scores = {} for mr in sr.metrics: metrics[mr.metric_name] = { "category": mr.category, "justification": mr.justification, } - sessions.append( - { - "session_id": sr.session_id, - "question": ctx.get("question", ""), - "response": ctx.get("response", ""), - "answered_by": ctx.get("answered_by", ""), - "is_a2a": ctx.get("is_a2a", False), - "latency_s": ctx.get("latency_s"), - "metrics": metrics, + if 
mr.metric_name in _DIMENSION_SCORES: + score_map = _DIMENSION_SCORES[mr.metric_name] + quality_scores[mr.metric_name] = { + "score": score_map.get(mr.category, 0), + "reason": mr.justification or "", } - ) + session_dict = { + "session_id": sr.session_id, + "question": ctx.get("question", ""), + "response": ctx.get("response", ""), + "answered_by": ctx.get("answered_by", ""), + "is_a2a": ctx.get("is_a2a", False), + "latency_s": ctx.get("latency_s"), + "user_turns": ctx.get("user_turns", 0), + "tool_calls": ctx.get("tool_calls", 0), + "corrections": ctx.get("corrections", 0), + "verifications": ctx.get("verifications", 0), + "metrics": metrics, + "quality_scores": quality_scores, + } + conversation = ctx.get("conversation", []) + if conversation: + turn_tags = ctx.get("turn_tags", []) + if turn_tags: + tag_by_idx = {t["turn_index"]: t for t in turn_tags} + annotated = [] + for i, turn in enumerate(conversation): + t = dict(turn) + tag_info = tag_by_idx.get(i) + if tag_info: + t["inferred_tag"] = tag_info.get("tag", "") + t["tag_evidence"] = tag_info.get("evidence", "") + annotated.append(t) + session_dict["conversation"] = annotated + else: + session_dict["conversation"] = conversation + correction_boundaries = ctx.get("correction_boundaries", []) + if correction_boundaries: + session_dict["correction_boundaries"] = correction_boundaries + sub_trajectories = ctx.get("sub_trajectories", []) + if sub_trajectories: + session_dict["sub_trajectories"] = sub_trajectories + if trajectories and sr.session_id in trajectories: + trace_obj = trajectories[sr.session_id] + if hasattr(trace_obj, "spans"): + session_dict["execution_trace"] = _render_trace(trace_obj) + if sub_trajectories and conversation: + segments = _segment_trace_by_turns( + trace_obj, + conversation, + sub_trajectories, + ) + if segments: + session_dict["execution_sub_trajectories"] = segments + else: + session_dict["execution_trace"] = str(trace_obj) + sessions.append(session_dict) fp_count = 
len(by_category.get("unhelpful", [])) partial_count = len(by_category.get("partial", [])) @@ -1334,7 +3937,10 @@ def _build_json_output(report, resolved_map): declined_count = len(by_category.get("declined", [])) total = report.total_sessions - return { + dim_avgs = _compute_dimension_averages(report) + mt_stats = _compute_multiturn_stats(resolved_map) + + output = { "summary": { "total_sessions": total, "meaningful": meaningful_count, @@ -1347,6 +3953,12 @@ def _build_json_output(report, resolved_map): if total else 0, "unhelpful_rate": round(fp_count / total * 100, 1) if total else 0, + # Empty when dimensions were not scored (e.g. --dimensions primary), + # so consumers don't read unscored dimensions as 0.0 / failing. + "dimension_averages": ( + dim_avgs if _has_dimension_data(dim_avgs) else {} + ), + **mt_stats, }, "category_distributions": { k: dict(v) for k, v in report.category_distributions.items() @@ -1355,6 +3967,12 @@ def _build_json_output(report, resolved_map): "sessions": sessions, "details": {k: str(v) for k, v in report.details.items()}, } + # Only attribute failures when the metrics that drive attribution were scored; + # otherwise skill_gap/knowledge_gap/tool_gap would all default to a misleading + # N/0/0. When ungated, those keys are simply absent from the summary. 
+ if _has_failure_attribution_data(report): + _classify_failures(output) + return output # --------------------------------------------------------------------------- @@ -1368,24 +3986,77 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - %(prog)s Evaluate last 100 sessions (default) - %(prog)s --limit 50 Evaluate last 50 sessions + %(prog)s Evaluate most recent 100 sessions (default) + %(prog)s --limit 50 Evaluate most recent 50 sessions %(prog)s --no-eval Browse Q&A pairs without evaluation %(prog)s --report Also generate a Markdown report %(prog)s --persist Evaluate and persist results to BQ %(prog)s --time-period 7d Evaluate last 7 days - %(prog)s --samples 20 Show up to 20 sessions per category - %(prog)s --samples all Show all sessions per category - %(prog)s --app-name my_agent Filter to a specific agent app %(prog)s --output-json report.json Write structured JSON output - %(prog)s --config config.json Use scope definitions from config + %(prog)s --env path/to/.env Load env vars from a specific .env file + %(prog)s --tag-turns Classify each user turn and find corrections + %(prog)s --trajectory-samples 5 Include 5 execution traces in the report + +Filtering (all filters appear in the Execution Details section of the report): + %(prog)s --app-name my_agent Filter to a specific agent app + %(prog)s --label version=v2.1 Filter by custom label + %(prog)s --label version=v2 --label env=prod Multiple labels (AND) + %(prog)s --time-period 7d --app-name my_agent --label version=v2.1 + Combine filters (time + app + label) + + Labels match custom_tags set via BigQueryLoggerConfig.custom_tags when + initializing the ADK plugin. Common uses: version tagging, deployment + environment, experiment ID, A/B test variant. + +Scope + golden grounding (--eval-spec): + %(prog)s --eval-spec eval_spec.json --report + + The eval spec grounds scoring. 
'scope' (free text) defines what the agent + handles — anything outside it is out of scope, so a polite refusal is scored + "declined" (correct) rather than "unhelpful". 'golden_qa' supplies expected + answers matched per-question by embedding similarity to ground correctness. + + Example eval_spec.json: + { + "scope": "Answers HR policy questions: PTO, benefits, expenses, " + "holidays. Does not handle salary, equity, or IT support.", + "ground_truth": "PTO: 20 days/year ...", + "golden_qa": [ + {"question": "How many PTO days?", "expected_answer": "20/year", + "topic": "pto"}, + {"question": "What are the salary bands?", + "expected_behavior": "decline", "topic": "out_of_scope"} + ] + } + + See scripts/eval/data/eval_spec.example.json for a full example. + +Samples (controls how many sessions appear in each report section): + %(prog)s --samples 5 Cap all sections at 5 sessions + %(prog)s --samples all Show every session (no limit) + %(prog)s --samples unhelpful=10,partial=5,low=3 + Per-category: 10 unhelpful, 5 partial, + 3 for each Low-dimension section + %(prog)s --samples unhelpful=all,declined=1 + All unhelpful, 1 declined, defaults for rest + (without --samples) Defaults: unhelpful=10, partial=5, others=3 + + Categories: unhelpful, declined, partial, meaningful, low (all Low-* sections) + +Full report: + %(prog)s --report --limit 20 --app-name my_agent --label version=v2.1 \\ + --samples 3 --tag-turns --trajectory-samples 3 \\ + --eval-spec eval_spec.json --env path/to/.env + +Custom metrics (overrides auto-discovered eval/eval_config.json): + %(prog)s --eval-config path/to/custom_eval_config.json """, ) parser.add_argument( "--limit", type=_positive_int, default=100, - help="Number of sessions (default: 100)", + help="Evaluate the N most recent sessions (default: 100)", ) parser.add_argument( "--eval", @@ -1399,6 +4070,16 @@ def main(): action="store_false", help="Browse Q&A pairs without evaluation", ) + parser.add_argument( + "--dimensions", + 
choices=["full", "primary"], + default="full", + help="Which LLM-judge metrics to run. 'full' (default) scores all 8 " + "metrics: 2 primary (response_usefulness, task_grounding), the 5 quality " + "dimensions, and failure_attribution. 'primary' scores only the 2 primary " + "metrics — about 4x cheaper (2 LLM calls/session instead of 8) but omits " + "the Quality Dimensions table. Use --no-eval to skip evaluation entirely.", + ) parser.add_argument( "--time-period", type=str, @@ -1425,7 +4106,12 @@ def main(): "--samples", type=_samples_arg, default=None, - help="Max sample sessions to display per category, or 'all' (default: 10/5/3)", + help="Max sessions to show per report section. Accepts a single " + "number (caps all sections equally), 'all' (no limit), or " + "comma-separated key=value pairs for per-category control. " + "Categories: unhelpful, declined, partial, meaningful, low " + "(all Low-dimension sections). " + "Defaults: unhelpful=10, partial=5, all others=3", ) parser.add_argument( "--session", @@ -1441,6 +4127,16 @@ def main(): "root_agent_name attribute set by BigQueryAgentAnalyticsPlugin; " "sessions from other sources may not populate this field", ) + parser.add_argument( + "--label", + type=str, + action="append", + default=None, + metavar="KEY=VALUE", + help="Filter by custom label (repeatable). Matches custom_tags set " + "via BigQueryLoggerConfig.custom_tags. " + "Example: --label version=v2.1 --label env=prod", + ) parser.add_argument( "--output-json", type=str, @@ -1456,16 +4152,37 @@ def main(): help="Unhelpful rate warning threshold in %% (default: 10)", ) parser.add_argument( - "--config", + "--eval-spec", + type=str, + default=None, + metavar="PATH", + dest="eval_spec", + help="Path to an eval-spec JSON file that grounds scoring. 
Three " + "optional fields: 'scope' (free text describing what the agent " + "handles — anything outside it is out of scope, so a polite decline " + "is correct), 'ground_truth' (free-text authoritative facts), and " + "'golden_qa' (list of {question, expected_answer, topic?, " + "expected_behavior?} matched per-question by embedding similarity to " + "ground correctness). Enables the 'declined' category. Auto-discovered " + "from eval/data/eval_spec.json. Use 'none' to disable.", + ) + parser.add_argument( + "--golden-threshold", + type=float, + default=_DEFAULT_GOLDEN_THRESHOLD, + metavar="FLOAT", + help="Cosine-similarity threshold for golden_qa matching " + "(default: 0.92). Lower matches more aggressively.", + ) + parser.add_argument( + "--eval-config", type=str, default=None, metavar="PATH", - help="Path to a JSON config file with scope definitions. " - "When provided, adds a 'declined' category for correctly " - "refused out-of-scope questions. Expected format: " - '{"scope_decisions": [{"topic": "...", "decision": "out_of_scope", ' - '"reason": "..."}]}. ' - "Only 'topic' and 'decision' are used; 'reason' is documentation-only.", + help="Path to a JSON file with metric definitions. By default, " + "eval/eval_config.json is auto-discovered from the repo root or " + "script directory. Use this flag to override with a custom file. " + "See scripts/eval/eval_config.json for the expected format.", ) parser.add_argument( "--session-ids-file", @@ -1478,6 +4195,39 @@ def main(): "When set, only these sessions are evaluated — --limit and " "--time-period are ignored.", ) + parser.add_argument( + "--conversations-file", + type=str, + default=None, + metavar="PATH", + help="JSON file with local conversations to evaluate (no BigQuery " + 'required). Expects {"conversations": [...]} or a plain list of ' + "conversation dicts. 
When set, traces are scored locally via the " + "Gemini API instead of being fetched from BigQuery.", + ) + parser.add_argument( + "--concurrency", + type=int, + default=10, + help="Max parallel Gemini API calls for --conversations-file mode " + "(default: 10).", + ) + parser.add_argument( + "--tag-turns", + action="store_true", + default=False, + help="Run the full turn tagger on multi-turn conversations to classify " + "each user turn (CORRECTION, VERIFY, SPECIFICS, SCOPE, FOLLOWUP, END) " + "and identify correction boundaries and sub-trajectories.", + ) + parser.add_argument( + "--trajectory-samples", + type=int, + default=0, + metavar="N", + help="Fetch N execution traces from BigQuery and include them in the " + "report. Prioritizes unhelpful and correction sessions.", + ) parser.add_argument( "--env", type=str, @@ -1492,6 +4242,16 @@ def main(): _configure_logging() _load_dotenv(env_file=args.env) + + if args.conversations_file: + for var, default in [ + ("PROJECT_ID", os.getenv("GOOGLE_CLOUD_PROJECT", "local")), + ("DATASET_ID", "local"), + ("TABLE_ID", "conversations"), + ("DATASET_LOCATION", os.getenv("GOOGLE_CLOUD_LOCATION", "local")), + ]: + os.environ.setdefault(var, default) + _load_config() if args.eval: diff --git a/scripts/quality_report.sh b/scripts/quality_report.sh index c6f11b67..607524fc 100755 --- a/scripts/quality_report.sh +++ b/scripts/quality_report.sh @@ -14,9 +14,9 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# Parse --env flag before other processing +# Parse --env flag before other processing (supports --env PATH and --env=PATH). +# The flag is NOT stripped — Python also accepts --env for report metadata. 
ENV_FILE="" -PASSTHROUGH_ARGS=() for arg in "$@"; do if [ "$_NEXT_IS_ENV" = "1" ]; then ENV_FILE="$arg" @@ -27,10 +27,12 @@ for arg in "$@"; do _NEXT_IS_ENV=1 continue fi - PASSTHROUGH_ARGS+=("$arg") + if [[ "$arg" == --env=* ]]; then + ENV_FILE="${arg#--env=}" + continue + fi done unset _NEXT_IS_ENV -set -- "${PASSTHROUGH_ARGS[@]}" # Load .env: explicit --env wins, then repo root default if [ -n "$ENV_FILE" ]; then @@ -47,15 +49,32 @@ elif [ -f "${SCRIPT_DIR}/../.env" ]; then set +a fi -# Validate required env vars -for var in PROJECT_ID DATASET_ID TABLE_ID DATASET_LOCATION; do - if [ -z "${!var}" ]; then - echo "ERROR: Required environment variable ${var} is not set." - echo "Set it in your shell or create a .env file. See scripts/README.md." - exit 1 +# Short-circuit: pass --help / -h straight to Python (no env needed) +for arg in "$@"; do + if [ "$arg" = "--help" ] || [ "$arg" = "-h" ]; then + python3 "${SCRIPT_DIR}/quality_report.py" "$@" + exit $? fi done +# Validate required env vars (skip for --conversations-file mode) +HAS_CONVERSATIONS_FILE=false +for arg in "$@"; do + if [[ "$arg" == "--conversations-file" || "$arg" == --conversations-file=* ]]; then + HAS_CONVERSATIONS_FILE=true + break + fi +done +if ! $HAS_CONVERSATIONS_FILE; then + for var in PROJECT_ID DATASET_ID TABLE_ID DATASET_LOCATION; do + if [ -z "${!var}" ]; then + echo "ERROR: Required environment variable ${var} is not set." + echo "Use --env /path/to/.env, or 'export ${var}=...' in your shell." 
+ exit 1 + fi + done +fi + # Log eval runs (skip logging for --no-eval) if [[ " $* " != *" --no-eval "* ]]; then REPORTS_DIR="${SCRIPT_DIR}/reports" diff --git a/scripts/sample_quality_report.md b/scripts/sample_quality_report.md index 633bbc34..e976a300 100644 --- a/scripts/sample_quality_report.md +++ b/scripts/sample_quality_report.md @@ -1,24 +1,62 @@ # Quality Evaluation Report + +* [Quality Evaluation Report](#quality-evaluation-report) + * [Summary](#summary) + * [Quality Dimensions](#quality-dimensions) + * [Category Distributions](#category-distributions) + * [response_usefulness](#response_usefulness) + * [task_grounding](#task_grounding) + * [Per-Agent Quality](#per-agent-quality) + * [Multi-Turn Efficiency](#multi-turn-efficiency) + * [Sample Sessions](#sample-sessions) + * [Unhelpful Sessions](#unhelpful-sessions) + * [Low Correctness Sessions](#low-correctness-sessions) + * [Low Tool Usage Sessions](#low-tool-usage-sessions) + * [Low Specificity Sessions](#low-specificity-sessions) + * [Low Scope Sessions](#low-scope-sessions) + * [Low First-Time Right Sessions](#low-first-time-right-sessions) + * [Correction Analysis](#correction-analysis) + * [Turn Tag Distribution](#turn-tag-distribution) + * [Corrections](#corrections) + * [Routing Failures](#routing-failures) + * [Sample Trajectories](#sample-execution-trajectories) + * [Execution Details](#execution-details) + -Markdown report generated by `./scripts/quality_report.sh --report --limit 20 --samples 3`. - -**Generated:** 2026-05-19 12:00:00 -**Project:** my-gcp-project -**Dataset:** agent_logs.agent_events -**Location:** us-central1 -**Eval model:** gemini-2.5-flash -**Sessions:** 20 ## Summary +Markdown report generated by `./scripts/quality_report.sh --report --limit 20 --app-name knowledge_supervisor --label version=v2.1 --label env=prod --samples 3 --tag-turns --trajectory-samples 3`. 
+ +- **Generated:** 2026-05-22 23:15:36 +- **Project:** my-gcp-project +- **Dataset:** agent_logs.agent_events +- **Location:** us-central1 +- **Eval model:** gemini-2.5-flash +- **Sessions:** 20 + | Metric | Value | |--------|-------| | Total sessions | 20 | -| Meaningful | 12 | -| Declined (out-of-scope) | 4 | -| Partial | 1 | -| Unhelpful | 3 | -| Unhelpful rate | 15.0% | +| Meaningful | 13 | +| Declined (out-of-scope) | 0 | +| Partial | 0 | +| Unhelpful | 7 | +| Unhelpful rate | 35.0% | + +## Quality Dimensions + +Each session is scored 0-2 on five dimensions. Scores are averaged across all sessions. + +| Dimension | Avg Score | Rating | What it measures | +|-----------|----------:|--------|------------------| +| Correctness | 1.40 / 2.00 | 🟡 | Are the facts in the response accurate? | +| Tool Usage | 1.30 / 2.00 | 🟡 | Did the agent use its tools to verify facts? | +| Specificity | 1.60 / 2.00 | 🟢 | Does the response include specific numbers, dates, limits? | +| Scope | 1.50 / 2.00 | 🟢 | Did the agent correctly handle in-scope vs out-of-scope? | +| First-Time Right | 1.10 / 2.00 | 🟡 | Was the first response correct without user corrections? 
| + +*Rating: 🟢 >= 1.50 (good) | 🟡 >= 1.00 (needs attention) | 🔴 < 1.00 (problem area)* ## Category Distributions @@ -26,113 +64,945 @@ Markdown report generated by `./scripts/quality_report.sh --report --limit 20 -- | Category | Count | % | |----------|------:|--:| -| ✅ HELPFUL | 12 | 60.0% | -| ✅ DECLINED (OK) | 4 | 20.0% | -| ❌ NOT HELPFUL | 3 | 15.0% | -| ⚠️ PARTIAL | 1 | 5.0% | +| ✅ HELPFUL | 13 | 65.0% | +| ❌ NOT HELPFUL | 7 | 35.0% | ### task_grounding | Category | Count | % | |----------|------:|--:| -| ✅ GROUNDED | 14 | 70.0% | -| ➖ NO TOOL NEEDED | 3 | 15.0% | -| ❌ NOT GROUNDED | 3 | 15.0% | +| ✅ GROUNDED | 13 | 65.0% | +| ❌ NOT GROUNDED | 6 | 30.0% | +| ➖ NO TOOL NEEDED | 1 | 5.0% | ## Per-Agent Quality | Agent | Sessions | Helpful | Declined | Unhelpful | Partial | Status | |-------|-------:|--------:|--------:|----------:|--------:|--------| -| company_info_agent | 5 | 0 (60%) | 3 | 1 | 1 | 🟡 | -| internal_docs_agent | 4 | 4 (100%) | 0 | 0 | 0 | 🟢 | -| bigquery_data_agent | 3 | 3 (100%) | 0 | 0 | 0 | 🟢 | -| no_response | 3 | 0 (33%) | 1 | 2 | 0 | 🔴 | -| google_search_agent | 2 | 2 (100%) | 0 | 0 | 0 | 🟢 | -| local_tools_agent | 2 | 2 (100%) | 0 | 0 | 0 | 🟢 | -| lookup_worker_1 | 1 | 1 (100%) | 0 | 0 | 0 | 🟢 | +| unknown | 14 | 11 (79%) | 0 | 3 | 0 | 🟡 | +| knowledge_supervisor | 5 | 1 (20%) | 0 | 4 | 0 | 🔴 | +| policy_agent | 1 | 1 (100%) | 0 | 0 | 0 | 🟢 | + +## Multi-Turn Efficiency + +| Metric | Value | +|--------|-------| +| Avg user turns | 2.9 | +| Avg tool calls | 1.4 | +| Multi-turn sessions | 20 | -## Unhelpful Sessions +## Sample Sessions -### `eeecc90b-22a8-45c5-be14-800590fcda36` → no_response +### Unhelpful Sessions -- **Question:** How many working days are remaining in the current fiscal quarter? -- **Response:** (none) +*Showing 3 of 7* + +#### `conv_707f63fa` → knowledge_supervisor + +- **Question:** Do we get Veterans Day off? +- **Response:** Thank you for bringing that to my attention. 
Your onboarding packet would contain the most accurate and up-to-date information regarding company holidays. Please refer to your onboarding packet for the definitive list of company holidays. - **Usefulness:** ❌ NOT HELPFUL - - *The agent completed without providing any answer or information to the user's question, making the interaction unhelpful.* + - *The agent's initial response was factually incorrect, making it unhelpful to the user. An incorrect answer is not useful.* - **Grounding:** ❌ NOT GROUNDED - - *The agent failed to provide a response to a question that required data retrieval from tools. Therefore, the (missing) information was not grounded in tool results.* + - *The agent's first response stated Veterans Day is a paid holiday, which directly contradicts the GROUND TRUTH DATA that explicitly states Veterans Day is NOT a holiday. This indicates the response was not grounded in the provided data.* +- **Dimensions:** Correctness ❌ | Tool Usage ❌ | Specificity ✅ | Scope ✅ | First-Time Right ❌ + +
Conversation + + **user:** Do we get Veterans Day off? + + **agent:** Yes, Veterans Day is one of the company's paid holidays. + + **user:** My onboarding packet says that Veterans Day is not a company holiday. -### `eval_parental_leave_3ffe32b7` → company_info_agent + **agent:** Thank you for bringing that to my attention. Your onboarding packet would contain the most accurate and up-to-date information regarding company holidays. Please refer to your onboarding packet for the definitive list of company holidays. -- **Question:** How long is the parental leave policy? -- **Response:** I do not have information about the parental leave policy. Please contact HR for details. + **user:** Thanks for clarifying that my onboarding packet is the definitive source for holidays. + +
+ +#### `conv_91faa456` → unknown + +- **Question:** Is the 401k match 6% or 8%? +- **Response:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. - **Usefulness:** ❌ NOT HELPFUL - - *The agent is a 'company_info_agent', so parental leave policy is an in-scope question. The agent declined to provide the information, making the response unhelpful for an in-scope query.* + - *The agent stated it does not have information on the 401k match policy, even though the ground truth data explicitly provides this information (4% company match). This is an in-scope topic, and the agent failed to provide a substantive answer.* - **Grounding:** ❌ NOT GROUNDED - - *The agent explicitly stated it does not have the information, meaning no data was retrieved from its tools to answer the question. The response is a statement of the agent's limitation, not grounded in tool results, nor did it require no tool usage to answer the question itself.* + - *The agent claimed it did not have the information, which contradicts the ground truth data that clearly states the 401k match policy. The response is not based on the available data.* +- **Dimensions:** Correctness ❌ | Tool Usage ❌ | Specificity ❌ | Scope ❌ | First-Time Right ❌ + +
Conversation + + **user:** Is the 401k match 6% or 8%? -### `79d0c4c5-8685-46af-900c-8fb5ef304fd9` → no_response + **agent:** I do not have information on the company's 401k match policy. Please check your benefits documentation or contact HR for this information. -- **Question:** What is my current PTO balance, and how many sick days do I have remaining this year? -- **Response:** (none) + **user:** I have this information in my onboarding packet; could you please check the company's official policy documentation for the 401k match? + + **agent:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. + +For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. + + **user:** Thanks, that helps! + +
+ +#### `conv_6051ae28` → knowledge_supervisor + +- **Question:** What's the per diem for meals? I heard it's $50. +- **Response:** Thank you for that clarification! It's helpful to know that travel expenses over $500 require pre-approval. - **Usefulness:** ❌ NOT HELPFUL - - *The user asked for their PTO balance and sick days, which is an in-scope question. The agent completed its process but provided no explicit response or answer to the user's query.* -- **Grounding:** ❌ NOT GROUNDED - - *The agent correctly identified the need for a tool (pto_agent) and invoked it. However, no response containing tool-derived information was provided to the user. Therefore, the user's query was not grounded in actual tool results, as no data was presented.* + - *The agent's final response is an acknowledgement of a user correction, not a useful answer to a question. The agent failed to provide the requested information about pre-approval earlier in the conversation.* +- **Grounding:** ➖ NO TOOL NEEDED + - *The agent's final response is an acknowledgement of user input and does not require tool usage or provide new factual information. It's a conversational turn.* +- **Dimensions:** Correctness ✅ | Tool Usage ❌ | Specificity ❌ | Scope ❌ | First-Time Right ❌ + +
Conversation + + **user:** What's the per diem for meals? I heard it's $50. + + **agent:** I can answer questions about expenses, but I don't have information about specific per diem rates. My knowledge base does not include that detail. I can tell you that per diem generally refers to a daily allowance for expenses, and it can vary based on location and company policy. To get the exact per diem for meals, you should check your company's expense policy or contact your HR department. + + **user:** Actually, my onboarding packet states that the per diem for meals on travel is $75/day. + + **agent:** Thank you for clarifying! That's good to know. + + **user:** Glad I could help! What's the policy for pre-approval on travel expenses? + + **agent:** I can answer questions about expenses, but I don't have the specific details of your company's pre-approval policy for travel expenses. You should consult your company's official travel and expense policy document or contact your HR or finance department for this information. + + **user:** Actually, my onboarding packet states that travel expenses over $500 need pre-approval. + + **agent:** Thank you for that clarification! It's helpful to know that travel expenses over $500 require pre-approval. + +
+ +### Low Correctness Sessions + +*Showing 3 of 6* + +#### `conv_707f63fa` → knowledge_supervisor + +- **Question:** Do we get Veterans Day off? +- **Response:** Thank you for bringing that to my attention. Your onboarding packet would contain the most accurate and up-to-date information regarding company holidays. Please refer to your onboarding packet for the definitive list of company holidays. +- **Correctness:** ❌ INCORRECT + - *The agent's first response was incorrect. The GROUND TRUTH DATA clearly states that Veterans Day is NOT a company holiday.* + +
Conversation + + **user:** Do we get Veterans Day off? + + **agent:** Yes, Veterans Day is one of the company's paid holidays. + + **user:** My onboarding packet says that Veterans Day is not a company holiday. + + **agent:** Thank you for bringing that to my attention. Your onboarding packet would contain the most accurate and up-to-date information regarding company holidays. Please refer to your onboarding packet for the definitive list of company holidays. + + **user:** Thanks for clarifying that my onboarding packet is the definitive source for holidays. + +
+ +#### `conv_91faa456` → unknown + +- **Question:** Is the 401k match 6% or 8%? +- **Response:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. +- **Correctness:** ❌ INCORRECT + - *The agent's statement 'I do not have information on the company's 401k match policy' is incorrect, as the ground truth data clearly specifies a '4% company match'.* + +
Conversation + + **user:** Is the 401k match 6% or 8%? + + **agent:** I do not have information on the company's 401k match policy. Please check your benefits documentation or contact HR for this information. + + **user:** I have this information in my onboarding packet; could you please check the company's official policy documentation for the 401k match? + + **agent:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. + +For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. + + **user:** Thanks, that helps! + +
+ +#### `conv_43e50c9b` → knowledge_supervisor + +- **Question:** Do we get Columbus Day off? +- **Response:** Thank you for correcting me. My apologies for the misinformation. You are right, if your onboarding packet states that Columbus Day is not a company holiday, then that is the accurate information. I will update my knowledge accordingly. +- **Correctness:** ❌ INCORRECT + - *The agent's statement 'Yes, Columbus Day is a company holiday' is factually incorrect according to the ground truth data which states 'Columbus Day are NOT holidays'.* + +
Conversation + + **user:** Do we get Columbus Day off? + + **agent:** Yes, Columbus Day is a company holiday. + + **user:** Actually, my onboarding packet states that Columbus Day is NOT a company holiday. + + **agent:** Thank you for correcting me. My apologies for the misinformation. You are right, if your onboarding packet states that Columbus Day is not a company holiday, then that is the accurate information. I will update my knowledge accordingly. + + **user:** You're welcome! Thanks for updating your knowledge. + +
+ +### Low Tool Usage Sessions + +*Showing 3 of 7* + +#### `conv_707f63fa` → knowledge_supervisor + +- **Question:** Do we get Veterans Day off? +- **Response:** Thank you for bringing that to my attention. Your onboarding packet would contain the most accurate and up-to-date information regarding company holidays. Please refer to your onboarding packet for the definitive list of company holidays. +- **Tool Usage:** ❌ NONE + - *The agent provided an incorrect answer that contradicted the GROUND TRUTH DATA. This suggests it did not use its tools to look up the holiday list or misinterpreted the tool results, instead relying on general knowledge.* + +
Conversation + + **user:** Do we get Veterans Day off? + + **agent:** Yes, Veterans Day is one of the company's paid holidays. + + **user:** My onboarding packet says that Veterans Day is not a company holiday. + + **agent:** Thank you for bringing that to my attention. Your onboarding packet would contain the most accurate and up-to-date information regarding company holidays. Please refer to your onboarding packet for the definitive list of company holidays. + + **user:** Thanks for clarifying that my onboarding packet is the definitive source for holidays. + +
+ +#### `conv_91faa456` → unknown + +- **Question:** Is the 401k match 6% or 8%? +- **Response:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. +- **Tool Usage:** ❌ NONE + - *The agent stated it did not have the information, indicating it did not use its tools to retrieve the 401k match policy, which is present in the ground truth data.* + +
Conversation + + **user:** Is the 401k match 6% or 8%? + + **agent:** I do not have information on the company's 401k match policy. Please check your benefits documentation or contact HR for this information. -## Declined Sessions + **user:** I have this information in my onboarding packet; could you please check the company's official policy documentation for the 401k match? + + **agent:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. + +For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. + + **user:** Thanks, that helps! + +
+ +#### `conv_6051ae28` → knowledge_supervisor + +- **Question:** What's the per diem for meals? I heard it's $50. +- **Response:** Thank you for that clarification! It's helpful to know that travel expenses over $500 require pre-approval. +- **Tool Usage:** ❌ NONE + - *The agent did not use any tools for this final response, as it was merely acknowledging a user's correction. The agent should have used tools to answer the pre-approval question earlier but failed to do so.* + +
Conversation + + **user:** What's the per diem for meals? I heard it's $50. + + **agent:** I can answer questions about expenses, but I don't have information about specific per diem rates. My knowledge base does not include that detail. I can tell you that per diem generally refers to a daily allowance for expenses, and it can vary based on location and company policy. To get the exact per diem for meals, you should check your company's expense policy or contact your HR department. + + **user:** Actually, my onboarding packet states that the per diem for meals on travel is $75/day. + + **agent:** Thank you for clarifying! That's good to know. + + **user:** Glad I could help! What's the policy for pre-approval on travel expenses? + + **agent:** I can answer questions about expenses, but I don't have the specific details of your company's pre-approval policy for travel expenses. You should consult your company's official travel and expense policy document or contact your HR or finance department for this information. + + **user:** Actually, my onboarding packet states that travel expenses over $500 need pre-approval. + + **agent:** Thank you for that clarification! It's helpful to know that travel expenses over $500 require pre-approval. + +
+ +### Low Specificity Sessions *Showing 3 of 4* -### `eval_holiday_schedule_5077e749` → company_info_agent +#### `conv_91faa456` → unknown -- **Question:** What are the company holidays this year? -- **Response:** I do not have information about company holidays. Please contact HR for details. -- **Usefulness:** ✅ DECLINED (OK) - - *The agent explicitly stated it does not have information about company holidays and suggested contacting HR, indicating the question is outside its scope and it correctly declined.* -- **Grounding:** ➖ NO TOOL NEEDED - - *The agent directly stated it does not have the information, which is a direct LLM response about its capabilities rather than using a tool or fabricating data.* +- **Question:** Is the 401k match 6% or 8%? +- **Response:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. +- **Specificity:** ❌ VAGUE + - *The agent provided no specific details about the 401k match, instead stating it lacked the information entirely.* -### `eval_benefits_401k_f38f288b` → company_info_agent +
Conversation -- **Question:** Does the company match 401k contributions? -- **Response:** I don't have information about 401k contributions. Please contact HR for details on 401k matching. -- **Usefulness:** ✅ DECLINED (OK) - - *The agent explicitly states it does not have information about 401k contributions and directs the user to HR, indicating the question is outside its scope or capabilities, which is a correct decline for an out-of-scope question.* -- **Grounding:** ➖ NO TOOL NEEDED - - *The agent is stating its inability to answer the question and directing the user elsewhere, which is a direct LLM response about its capabilities and does not require a tool to answer the user's factual question.* + **user:** Is the 401k match 6% or 8%? -### `eval_benefits_health_plans_6eb5d2dd` → company_info_agent + **agent:** I do not have information on the company's 401k match policy. Please check your benefits documentation or contact HR for this information. -- **Question:** What health insurance plans does the company offer? -- **Response:** I do not have information about specific health insurance plans. The company offers competitive benefits. Please contact HR for details about health insurance plans. -- **Usefulness:** ✅ DECLINED (OK) - - *The agent explicitly stated it does not have information about specific health insurance plans and correctly directed the user to HR, indicating the question is outside its direct scope.* -- **Grounding:** ➖ NO TOOL NEEDED - - *The agent's response is a statement about its own limitations and a redirection to HR, which does not require tool usage to retrieve specific data. It's a meta-response about its capabilities.* + **user:** I have this information in my onboarding packet; could you please check the company's official policy documentation for the 401k match? + + **agent:** I understand you have this information in your onboarding packet. 
However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. + +For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. + + **user:** Thanks, that helps! + +
+ +#### `conv_6051ae28` → knowledge_supervisor + +- **Question:** What's the per diem for meals? I heard it's $50. +- **Response:** Thank you for that clarification! It's helpful to know that travel expenses over $500 require pre-approval. +- **Specificity:** ❌ VAGUE + - *The agent's response itself is generic ('Thank you for that clarification! It's helpful to know...'). While it acknowledges specific information provided by the user, its own contribution is not specific.* + +
Conversation + + **user:** What's the per diem for meals? I heard it's $50. + + **agent:** I can answer questions about expenses, but I don't have information about specific per diem rates. My knowledge base does not include that detail. I can tell you that per diem generally refers to a daily allowance for expenses, and it can vary based on location and company policy. To get the exact per diem for meals, you should check your company's expense policy or contact your HR department. + + **user:** Actually, my onboarding packet states that the per diem for meals on travel is $75/day. + + **agent:** Thank you for clarifying! That's good to know. + + **user:** Glad I could help! What's the policy for pre-approval on travel expenses? + + **agent:** I can answer questions about expenses, but I don't have the specific details of your company's pre-approval policy for travel expenses. You should consult your company's official travel and expense policy document or contact your HR or finance department for this information. + + **user:** Actually, my onboarding packet states that travel expenses over $500 need pre-approval. + + **agent:** Thank you for that clarification! It's helpful to know that travel expenses over $500 require pre-approval. + +
+ +#### `conv_692e3fc4` → unknown + +- **Question:** I was told the expense system auto-reimburses anything under $100 without approval. Right? +- **Response:** As a large language model, I do not have access to real-time, specific company policy documents. Therefore, I cannot look up the actual company expense policy for you. You would need to consult your company's official documentation or HR department for this information. +- **Specificity:** ❌ VAGUE + - *The agent provided no specific details about the expense policy, only generic advice to consult official documentation.* + +
Conversation + + **user:** I was told the expense system auto-reimburses anything under $100 without approval. Right? + + **agent:** I can't confirm that. Please refer to the official company expense policy for details on reimbursement procedures and approval thresholds. + + **user:** Can you please check the actual company expense policy for details on auto-reimbursement thresholds, rather than just telling me to refer to it? + + **agent:** As a large language model, I do not have access to real-time, specific company policy documents. Therefore, I cannot look up the actual company expense policy for you. You would need to consult your company's official documentation or HR department for this information. + + **user:** Thanks, that helps! + +
+ +### Low Scope Sessions + +*Showing 3 of 5* + +#### `conv_91faa456` → unknown + +- **Question:** Is the 401k match 6% or 8%? +- **Response:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. +- **Scope:** ❌ NON-COMPLIANT + - *The 401k match is an in-scope topic, but the agent failed to provide an answer, incorrectly stating it did not have the information.* + +
Conversation + + **user:** Is the 401k match 6% or 8%? + + **agent:** I do not have information on the company's 401k match policy. Please check your benefits documentation or contact HR for this information. + + **user:** I have this information in my onboarding packet; could you please check the company's official policy documentation for the 401k match? + + **agent:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. + +For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. + + **user:** Thanks, that helps! + +
+ +#### `conv_6051ae28` → knowledge_supervisor + +- **Question:** What's the per diem for meals? I heard it's $50. +- **Response:** Thank you for that clarification! It's helpful to know that travel expenses over $500 require pre-approval. +- **Scope:** ❌ NON-COMPLIANT + - *The agent failed to answer an in-scope question about travel expense pre-approval, instead directing the user elsewhere. This is a non-compliant handling of an in-scope query. The final response is an acknowledgement, not a correction of its previous non-compliance.* + +
Conversation + + **user:** What's the per diem for meals? I heard it's $50. + + **agent:** I can answer questions about expenses, but I don't have information about specific per diem rates. My knowledge base does not include that detail. I can tell you that per diem generally refers to a daily allowance for expenses, and it can vary based on location and company policy. To get the exact per diem for meals, you should check your company's expense policy or contact your HR department. + + **user:** Actually, my onboarding packet states that the per diem for meals on travel is $75/day. + + **agent:** Thank you for clarifying! That's good to know. + + **user:** Glad I could help! What's the policy for pre-approval on travel expenses? + + **agent:** I can answer questions about expenses, but I don't have the specific details of your company's pre-approval policy for travel expenses. You should consult your company's official travel and expense policy document or contact your HR or finance department for this information. + + **user:** Actually, my onboarding packet states that travel expenses over $500 need pre-approval. + + **agent:** Thank you for that clarification! It's helpful to know that travel expenses over $500 require pre-approval. + +
+ +#### `conv_43e50c9b` → knowledge_supervisor + +- **Question:** Do we get Columbus Day off? +- **Response:** Thank you for correcting me. My apologies for the misinformation. You are right, if your onboarding packet states that Columbus Day is not a company holiday, then that is the accurate information. I will update my knowledge accordingly. +- **Scope:** ❌ NON-COMPLIANT + - *The agent attempted to answer an in-scope question but provided an incorrect response, failing to handle it correctly.* + +
Conversation + + **user:** Do we get Columbus Day off? + + **agent:** Yes, Columbus Day is a company holiday. + + **user:** Actually, my onboarding packet states that Columbus Day is NOT a company holiday. + + **agent:** Thank you for correcting me. My apologies for the misinformation. You are right, if your onboarding packet states that Columbus Day is not a company holiday, then that is the accurate information. I will update my knowledge accordingly. + + **user:** You're welcome! Thanks for updating your knowledge. + +
+ +### Low First-Time Right Sessions + +*Showing 3 of 9* + +#### `conv_5d77036b` → policy_agent + +- **Question:** Is Juneteenth a company holiday? +- **Response:** Thank you for bringing this to my attention and for asking me to verify! I've looked up the official company policy for holidays. Based on the 2026 holiday schedule, Juneteenth (June 19th) is **not** listed as a company holiday. It seems your onboarding packet has the most accurate information in this case. +- **First-Time Right:** ❌ CORRECTION NEEDED + - *The agent's first response was incorrect, stating Juneteenth IS a holiday, which required the user to correct it and ask for verification.* + +
Conversation + + **user:** Is Juneteenth a company holiday? + + **agent:** Yes, Juneteenth is a company holiday. + + **user:** My onboarding packet says that Juneteenth is NOT a company holiday. Can you please check the official policy? -## Partial Sessions + **agent:** Thank you for bringing this to my attention and for asking me to verify! -### `eval_vacation_next_friday_21afda32` → company_info_agent +I've looked up the official company policy for holidays. Based on the 2026 holiday schedule, Juneteenth (June 19th) is **not** listed as a company holiday. -- **Question:** I want to take next Friday off. Is that a company holiday or should I use PTO? -- **Response:** The company offers 20 days of PTO per year, accrued monthly. Up to 5 unused days can roll over to the next year. I do not have information about company holidays. Please contact HR to find out if next Friday is a company holiday. -- **Usefulness:** ⚠️ PARTIAL - - *The agent provided information about PTO, which was part of the user's question, but explicitly stated it could not provide information about company holidays, which was the other key part of the question.* -- **Grounding:** ✅ GROUNDED - - *The agent provided specific details about PTO accrual and rollover, which would be retrieved from internal company data. The statement about not having holiday information also reflects the limitations of its tools/data.* +It seems your onboarding packet has the most accurate information in this case. + + **user:** Thank you for checking and confirming that Juneteenth is not a company holiday. + +
+ +#### `conv_707f63fa` → knowledge_supervisor + +- **Question:** Do we get Veterans Day off? +- **Response:** Thank you for bringing that to my attention. Your onboarding packet would contain the most accurate and up-to-date information regarding company holidays. Please refer to your onboarding packet for the definitive list of company holidays. +- **First-Time Right:** ❌ CORRECTION NEEDED + - *The agent's first response was incorrect and required the user to correct it, stating their onboarding packet had different information.* + +
Conversation + + **user:** Do we get Veterans Day off? + + **agent:** Yes, Veterans Day is one of the company's paid holidays. + + **user:** My onboarding packet says that Veterans Day is not a company holiday. + + **agent:** Thank you for bringing that to my attention. Your onboarding packet would contain the most accurate and up-to-date information regarding company holidays. Please refer to your onboarding packet for the definitive list of company holidays. + + **user:** Thanks for clarifying that my onboarding packet is the definitive source for holidays. + +
+ +#### `conv_91faa456` → unknown + +- **Question:** Is the 401k match 6% or 8%? +- **Response:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. +- **First-Time Right:** ❌ CORRECTION NEEDED + - *The first response was incorrect and unhelpful, as the agent claimed not to have information that was available in the ground truth. The user had to follow up to try and get the agent to check again.* + +
Conversation + + **user:** Is the 401k match 6% or 8%? + + **agent:** I do not have information on the company's 401k match policy. Please check your benefits documentation or contact HR for this information. + + **user:** I have this information in my onboarding packet; could you please check the company's official policy documentation for the 401k match? + + **agent:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. + +For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. + + **user:** Thanks, that helps! + +
+ +### Correction Analysis + +Turn-level classification of user behavior across multi-turn conversations. Each user turn is tagged to identify corrections, verifications, and other interaction patterns. + +#### Turn Tag Distribution + +| Tag | Count | Icon | Meaning | +|-----|------:|------|---------| +| CORRECTION | 7 | 🔴 | User corrects a factual error by the agent | +| VERIFY | 5 | 🟡 | User doubts the answer without providing the correct fact | +| SPECIFICS | 3 | 🔵 | User asks for concrete details the agent omitted | +| SCOPE | 1 | 🟠 | User flags the agent answered something outside its scope | +| FOLLOWUP | 22 | ✅ | Normal follow-up question; previous answer was acceptable | +| END | 19 | ⬜ | User is satisfied, conversation closing | + +- **Sessions with turn tags:** 20 +- **Sessions with corrections:** 6 + +#### Corrections + +Conversations where the user corrected the agent. Shows what the agent got wrong, what the user corrected, and whether the agent recovered. + +*Showing 5 of 6* + +##### `conv_5d77036b` → policy_agent + +- **Question:** Is Juneteenth a company holiday? 
+- **Correction at turn 2:** + - Agent claimed: *"Yes, Juneteenth is a company holiday."* + - User corrected: *"My onboarding packet says that Juneteenth is NOT a company holiday."* + - Agent recovered: ✅ Yes + +**Before correction** (turns 0–1) — agent got it wrong ❌ + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [14.7s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [2.1s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [2.0s, ttft=2.0s] +``` + +**After correction** (turns 2–4) — agent recovered ✅ + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [1.0min] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [1.0min] + ├── knowledge_supervisor > LLM_REQUEST + ├── knowledge_supervisor > LLM_RESPONSE [5.5s, ttft=5.5s] + ├── knowledge_supervisor > TOOL_STARTING (transfer_to_agent) + ├── knowledge_supervisor > TOOL_COMPLETED (transfer_to_agent) [0ms] + ├── policy_agent > AGENT_STARTING + └── policy_agent > AGENT_COMPLETED [56.0s] + ├── policy_agent > LLM_REQUEST + ├── policy_agent > LLM_RESPONSE [20.2s, ttft=20.2s] + ├── policy_agent > TOOL_STARTING (lookup_company_policy) + ├── policy_agent > TOOL_COMPLETED (lookup_company_policy) [0ms] + ├── policy_agent > LLM_REQUEST + └── policy_agent > LLM_RESPONSE [35.7s, ttft=35.7s] +``` + + +
Conversation + + **user** `[FOLLOWUP]`**:** Is Juneteenth a company holiday? + + **agent:** Yes, Juneteenth is a company holiday. + + **user** `[CORRECTION]`**:** My onboarding packet says that Juneteenth is NOT a company holiday. Can you please check the official policy? + + **agent:** Thank you for bringing this to my attention and for asking me to verify! + +I've looked up the official company policy for holidays. Based on the 2026 holiday schedule, Juneteenth (June 19th) is **not** listed as a company holiday. + +It seems your onboarding packet has the most accurate information in this case. + + **user** `[END]`**:** Thank you for checking and confirming that Juneteenth is not a company holiday. + +
+ +##### `conv_707f63fa` → knowledge_supervisor + +- **Question:** Do we get Veterans Day off? +- **Correction at turn 2:** + - Agent claimed: *"Yes, Veterans Day is one of the company's paid holidays."* + - User corrected: *"My onboarding packet says that Veterans Day is not a company holiday."* + - Agent recovered: ❌ No +- **Diagnosis:** Agent never routed to a specialist or called any tool — answered from general LLM knowledge only. + +**Before correction** (turns 0–1) — agent got it wrong ❌ + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [22.4s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [9.7s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [9.7s, ttft=9.7s] +``` + +**After correction** (turns 2–4) — agent parroted user's fact without verification 🔁 + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [40.0s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [40.0s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [40.0s, ttft=40.0s] +``` + + +
Conversation + + **user** `[FOLLOWUP]`**:** Do we get Veterans Day off? + + **agent:** Yes, Veterans Day is one of the company's paid holidays. + + **user** `[CORRECTION]`**:** My onboarding packet says that Veterans Day is not a company holiday. + + **agent:** Thank you for bringing that to my attention. Your onboarding packet would contain the most accurate and up-to-date information regarding company holidays. Please refer to your onboarding packet for the definitive list of company holidays. + + **user** `[END]`**:** Thanks for clarifying that my onboarding packet is the definitive source for holidays. + +
+ +##### `conv_6051ae28` → knowledge_supervisor + +- **Question:** What's the per diem for meals? I heard it's $50. +- **Correction at turn 2:** + - Agent claimed: *"I don't have information about specific per diem rates. My knowledge base does not include that detail."* + - User corrected: *"the per diem for meals on travel is $75/day."* + - Agent recovered: ❌ No +- **Correction at turn 6:** + - Agent claimed: *"I don't have the specific details of your company's pre-approval policy for travel expenses."* + - User corrected: *"travel expenses over $500 need pre-approval."* + - Agent recovered: ❌ No +- **Diagnosis:** Agent never routed to a specialist or called any tool — answered from general LLM knowledge only. + +**Before correction** (turns 0–1) — agent got it wrong ❌ + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [22.4s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [9.7s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [9.7s, ttft=9.7s] +``` + +**After correction** (turns 2–3) — agent parroted user's fact without verification 🔁 + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [42.7s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [42.7s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [42.7s, ttft=42.7s] +``` + +**Before correction** (turns 4–5) — agent got it wrong ❌ + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [48.8s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [48.8s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > 
LLM_RESPONSE [48.8s, ttft=48.8s] +``` + +**After correction** (turns 6–7) — agent parroted user's fact without verification 🔁 + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [3.4s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [3.4s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [3.4s, ttft=3.4s] +``` + + +
Conversation + + **user** `[SPECIFICS]`**:** What's the per diem for meals? I heard it's $50. + + **agent:** I can answer questions about expenses, but I don't have information about specific per diem rates. My knowledge base does not include that detail. I can tell you that per diem generally refers to a daily allowance for expenses, and it can vary based on location and company policy. To get the exact per diem for meals, you should check your company's expense policy or contact your HR department. + + **user** `[CORRECTION]`**:** Actually, my onboarding packet states that the per diem for meals on travel is $75/day. + + **agent:** Thank you for clarifying! That's good to know. + + **user** `[FOLLOWUP]`**:** Glad I could help! What's the policy for pre-approval on travel expenses? + + **agent:** I can answer questions about expenses, but I don't have the specific details of your company's pre-approval policy for travel expenses. You should consult your company's official travel and expense policy document or contact your HR or finance department for this information. + + **user** `[CORRECTION]`**:** Actually, my onboarding packet states that travel expenses over $500 need pre-approval. + + **agent:** Thank you for that clarification! It's helpful to know that travel expenses over $500 require pre-approval. + +
+ +##### `conv_43e50c9b` → knowledge_supervisor + +- **Question:** Do we get Columbus Day off? +- **Correction at turn 2:** + - Agent claimed: *"Yes, Columbus Day is a company holiday."* + - User corrected: *"my onboarding packet states that Columbus Day is NOT a company holiday."* + - Agent recovered: ❌ No +- **Diagnosis:** Agent never routed to a specialist or called any tool — answered from general LLM knowledge only. + +**Before correction** (turns 0–1) — agent got it wrong ❌ + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [14.9s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [2.2s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [2.2s, ttft=2.2s] +``` + +**After correction** (turns 2–4) — agent parroted user's fact without verification 🔁 + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [1.2s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [1.2s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [1.2s, ttft=1.2s] +``` + + +
Conversation + + **user** `[FOLLOWUP]`**:** Do we get Columbus Day off? + + **agent:** Yes, Columbus Day is a company holiday. + + **user** `[CORRECTION]`**:** Actually, my onboarding packet states that Columbus Day is NOT a company holiday. + + **agent:** Thank you for correcting me. My apologies for the misinformation. You are right, if your onboarding packet states that Columbus Day is not a company holiday, then that is the accurate information. I will update my knowledge accordingly. + + **user** `[END]`**:** You're welcome! Thanks for updating your knowledge. + +
+ +##### `conv_93bc4aa7` → knowledge_supervisor + +- **Question:** I was told the company matches 401k at 5%. Can you confirm? +- **Correction at turn 2:** + - Agent claimed: *"the company matches 401k contributions at 5%."* + - User corrected: *"the company matches 401k contributions at 4%, not 5%."* + - Agent recovered: ❌ No +- **Diagnosis:** Agent never routed to a specialist or called any tool — answered from general LLM knowledge only. + +**Before correction** (turns 0–1) — agent got it wrong ❌ + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [43.4s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [30.7s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [30.7s, ttft=30.7s] +``` + +**After correction** (turns 2–4) — agent parroted user's fact without verification 🔁 + +``` +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [12.5s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [12.5s] + ├── knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [12.5s, ttft=12.5s] +``` + + +
Conversation + + **user** `[FOLLOWUP]`**:** I was told the company matches 401k at 5%. Can you confirm? + + **agent:** Yes, I can confirm that the company matches 401k contributions at 5%. + + **user** `[CORRECTION]`**:** Actually, my onboarding packet states the company matches 401k contributions at 4%, not 5%. + + **agent:** Thank you for clarifying. I will update my information. The company matches 401k contributions at 4%. + + **user** `[END]`**:** Thank you for confirming the 4% 401k match. + +
+ +#### Routing Failures + +Sessions where the supervisor agent answered from general LLM knowledge without routing to a specialist agent or calling any tool. These are prime candidates for improving the supervisor's routing prompt. + +**4** of 6 correction sessions had no tool or agent routing: + +- `conv_707f63fa` → knowledge_supervisor: Do we get Veterans Day off? +- `conv_6051ae28` → knowledge_supervisor: What's the per diem for meals? I heard it's $50. +- `conv_43e50c9b` → knowledge_supervisor: Do we get Columbus Day off? +- `conv_93bc4aa7` → knowledge_supervisor: I was told the company matches 401k at 5%. Can you confirm? + +#### Other Flagged Interactions + +Sessions without corrections but with verification requests, specificity asks, or scope flags. + +*Showing 5 of 7* + +##### `conv_91faa456` → unknown + +- **Question:** Is the 401k match 6% or 8%? +- **Turn 2:** 🟡 `VERIFY` — User questions the agent's inability to provide information, asking it to check official documentation. + +
Conversation + + **user** `[FOLLOWUP]`**:** Is the 401k match 6% or 8%? + + **agent:** I do not have information on the company's 401k match policy. Please check your benefits documentation or contact HR for this information. + + **user** `[VERIFY]`**:** I have this information in my onboarding packet; could you please check the company's official policy documentation for the 401k match? + + **agent:** I understand you have this information in your onboarding packet. However, as an AI, I do not have access to your company's specific, internal policy documentation. I can only answer questions based on the general knowledge I was trained on or by routing to the available agents for specific tasks. + +For precise details on your company's 401k match, please refer to the documentation you have or contact your HR department directly. + + **user** `[END]`**:** Thanks, that helps! + +
+ +##### `conv_12790747` → unknown + +- **Question:** I heard we get 25 PTO days. Is that right? +- **Turn 0:** 🟡 `VERIFY` — User states a fact they heard and asks the agent to verify its correctness ('Is that right?'). + +
Conversation + + **user** `[VERIFY]`**:** I heard we get 25 PTO days. Is that right? + + **agent:** It's great you're asking about PTO! According to company policy, employees receive 20 days of PTO per year, not 25. These days accrue monthly, at approximately 1.67 days per month. You can also roll over up to 5 unused PTO days to the next year. Just remember to submit PTO requests at least two weeks in advance if you're planning on taking more than 3 days off. + + **user** `[FOLLOWUP]`**:** That's great, thanks for clarifying! How many sick days do we get per year, and do they roll over? + + **agent:** You're welcome! I'm happy to help. + +Regarding sick days, employees receive 10 sick days per year. However, sick leave does not roll over to the next year. Also, if you're absent for more than 3 consecutive days, a doctor's note will be required. + + **user** `[FOLLOWUP]`**:** Thanks, that's clear! Can you tell me about the company's remote work policy? + + **agent:** Of course! I can certainly tell you about our company's remote work policy. + +You can work remotely up to 3 days per week, provided you have your manager's approval. We also have core collaboration hours from 10 AM to 3 PM in your local timezone to ensure everyone can connect effectively. All remote work arrangements need to be documented in the HR system. + + **user** `[END]`**:** Thanks, that's very clear and helpful! + +
+ +##### `conv_05b96e44` → unknown + +- **Question:** Is it true we get 15 sick days per year? +- **Turn 0:** 🔵 `SPECIFICS` — User asks for a specific detail (number of sick days) and verification of a number. + +
Conversation + + **user** `[SPECIFICS]`**:** Is it true we get 15 sick days per year? + + **agent:** I found the details on our company's sick leave policy! We actually get 10 sick days per year. Just so you know, sick leave does not roll over to the next year, and if you're out for more than 3 consecutive days, a doctor's note will be required. + +Is there anything else I can help you with regarding sick leave or other policies? + + **user** `[FOLLOWUP]`**:** Thanks, that's helpful! Can you tell me about the PTO policy, specifically how many days we get per year and if they roll over? + + **agent:** You're very welcome! I'm happy to help. + +Regarding our PTO policy, here's what I found: + +* **Days Per Year:** Employees receive **20 days of PTO per year**. +* **Accrual:** This PTO is accrued monthly, at approximately 1.67 days per month. +* **Rollover:** Yes, unused PTO *does* roll over to the next year, but there's a maximum of **5 days** that can be carried over. +* **Requests:** For PTO periods longer than 3 days, requests need to be submitted at least two weeks in advance. + +Does this give you the information you were looking for about PTO? Let me know if you have any other questions! + + **user** `[END]`**:** Yes, that's exactly what I needed. Thanks for the detailed information! + +
+ +##### `conv_213ff9f7` → unknown + +- **Question:** My manager said PTO rolls over up to 10 days. Is that correct? +- **Turn 0:** 🟡 `VERIFY` — User questions information they received ('My manager said PTO rolls over up to 10 days. Is that correct?') + +
Conversation + + **user** `[VERIFY]`**:** My manager said PTO rolls over up to 10 days. Is that correct? + + **agent:** According to the company policy, unused PTO rolls over to the next year up to a maximum of **5 days**, not 10 days. You accrue approximately 1.67 days per month, totaling 20 days per year. + + **user** `[END]`**:** Okay, thanks for clarifying that the maximum PTO rollover is 5 days, not 10. + +
+ +##### `conv_0985fdc7` → unknown + +- **Question:** I heard receipts are only needed for expenses over $50. True? +- **Turn 0:** 🟡 `VERIFY` — User asks if information they heard is true, not correcting the agent but seeking confirmation. + +
Conversation + + **user** `[VERIFY]`**:** I heard receipts are only needed for expenses over $50. True? + + **agent:** Actually, the company policy states that receipts are required for any expense over $25, not $50. You can find more details about expense policies, including daily meal limits ($75/day) and pre-approval requirements for travel expenses over $500, on the company expense portal at expenses.company.com. + +Let me know if you have any other questions about expenses! + + **user** `[FOLLOWUP]`**:** Thanks, that clarifies it. What's the deadline for submitting expense reports? + + **agent:** You're welcome! I'm glad I could help clarify. + +Regarding your question about expense reports, business expenses must be submitted within 30 days. + + **user** `[END]`**:** Thanks, that's good to know! + +
+ +## Sample Execution Trajectories + +Full execution traces showing agent routing, tool calls, and LLM requests. These reveal *why* an answer was wrong — did the agent skip a tool call, call the wrong tool, or get misrouted? ## Execution Details -- **execution_mode:** ai_generate -- **elapsed_seconds:** 37.0 +- **elapsed_seconds:** 123.8 - **project:** my-gcp-project - **dataset:** agent_logs.agent_events - **location:** us-central1 - **eval_model:** gemini-2.5-flash -- **time_period:** all +- **time_period:** 7d - **limit:** 20 - **persist:** False +- **app_name:** knowledge_supervisor +- **labels:** version=v2.1, env=prod - **samples:** 3 -- **created_at:** 2026-05-19T03:34:45.916865+00:00 - +- **created_at:** 2026-05-22T23:15:33Z diff --git a/scripts/sample_quality_report_session.md b/scripts/sample_quality_report_session.md new file mode 100644 index 00000000..eed3b67c --- /dev/null +++ b/scripts/sample_quality_report_session.md @@ -0,0 +1,148 @@ +# Single-Session Quality Report + +Console output generated by `python scripts/quality_report.py --session conv_5d77036b --tag-turns`. + +When `--session` is used, all 7 metrics are shown with full justifications (verbose mode). +The execution trace is fetched automatically from BigQuery and displayed after the summary. + +```text +────────────────────────────────────────────────────────────────────── + MEANINGFUL Sessions (showing 1 of 1) +────────────────────────────────────────────────────────────────────── + + Session: conv_5d77036b → policy_agent + Question: My onboarding packet says that Juneteenth is NOT a company + holiday. Can you please check the official policy? + Response: "Thank you for bringing this to my attention and for asking + me to verify! I've looked up the official company policy for + holidays. Based on the 2026 holiday schedule, Juneteenth + (June 19th) is **not** listed as a company holiday. It seems + your onboarding packet has the most accurate information in + this case." 
+ Usefulness: ✅ HELPFUL + Reason: The final response from the policy_agent correctly answers + the user's question after the correction, providing specific + and actionable information about Juneteenth not being a + company holiday based on the official policy. + Grounding: ✅ GROUNDED + Reason: The policy_agent explicitly states it looked up the official + company policy and based its answer on the 2026 holiday + schedule, indicating tool usage. + Dimensions: Correctness: ✅ CORRECT | Tool Usage: ✅ PROPER | + Specificity: ✅ SPECIFIC | Scope: ✅ COMPLIANT | + First-Time Right: ❌ CORRECTION NEEDED + +====================================================================== +QUALITY SUMMARY +====================================================================== + Total sessions evaluated : 1 + Meaningful : 1 + Declined (out-of-scope) : 0 + Partial : 0 + Unhelpful : 0 + Unhelpful rate : 0.0% + + Quality Dimensions (0-2 scale): + Correctness : 2.00 / 2.00 ################################################## + Tool Usage : 2.00 / 2.00 ################################################## + Specificity : 2.00 / 2.00 ################################################## + Scope : 2.00 / 2.00 ################################################## + First-Time Right : 0.00 / 2.00 + + Multi-Turn Efficiency: + Avg user turns : 2.0 + Avg tool calls : 2.0 + Multi-turn sessions : 1 + Correction rate : 100.0% + Verification rate : 0.0% + + Category Distributions: + + [response_usefulness] + ✅ HELPFUL : 1 (100.0%) ################################################## + + [task_grounding] + ✅ GROUNDED : 1 (100.0%) ################################################## + + Execution Details: + execution_mode: ai_generate + elapsed_seconds: 23.4 + project: my-gcp-project + dataset: agent_logs.agent_events + location: us-central1 + eval_model: gemini-2.5-flash + time_period: all + limit: 100 + +====================================================================== + 
+====================================================================== +EXECUTION TRACE +====================================================================== +Session: conv_5d77036b +Time: 17:37:54 Total: 1.1min +────────────────────────────────────────────────────────────────────── +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +├── knowledge_supervisor > INVOCATION_COMPLETED [14.7s] +│ ├── knowledge_supervisor > AGENT_STARTING +│ └── knowledge_supervisor > AGENT_COMPLETED [2.1s] +│ ├── knowledge_supervisor > LLM_REQUEST +│ └── knowledge_supervisor > LLM_RESPONSE [2.0s, ttft=2.0s] +├── knowledge_supervisor > USER_MESSAGE_RECEIVED +├── knowledge_supervisor > INVOCATION_STARTING +└── knowledge_supervisor > INVOCATION_COMPLETED [1.0min] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [1.0min] + ├── knowledge_supervisor > LLM_REQUEST + ├── knowledge_supervisor > LLM_RESPONSE [5.5s, ttft=5.5s] + ├── knowledge_supervisor > TOOL_STARTING (transfer_to_agent) + ├── knowledge_supervisor > TOOL_COMPLETED (transfer_to_agent) [0ms] + ├── policy_agent > AGENT_STARTING + └── policy_agent > AGENT_COMPLETED [56.0s] + ├── policy_agent > LLM_REQUEST + ├── policy_agent > LLM_RESPONSE [20.2s, ttft=20.2s] + ├── policy_agent > TOOL_STARTING (lookup_company_policy) + ├── policy_agent > TOOL_COMPLETED (lookup_company_policy) [0ms] + ├── policy_agent > LLM_REQUEST + └── policy_agent > LLM_RESPONSE [35.7s, ttft=35.7s] + +────────────────────────────────────────────────────────────────────── + SUB-TRAJECTORY SEGMENTATION +────────────────────────────────────────────────────────────────────── + + ❌ pre_correction_1 (turns 0-1) → wrong + ├── knowledge_supervisor > USER_MESSAGE_RECEIVED + ├── knowledge_supervisor > INVOCATION_STARTING + └── knowledge_supervisor > INVOCATION_COMPLETED [14.7s] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [2.1s] + ├── 
knowledge_supervisor > LLM_REQUEST + └── knowledge_supervisor > LLM_RESPONSE [2.0s, ttft=2.0s] + + ✅ post_correction_1 (turns 2-3) → recovered + ├── knowledge_supervisor > USER_MESSAGE_RECEIVED + ├── knowledge_supervisor > INVOCATION_STARTING + └── knowledge_supervisor > INVOCATION_COMPLETED [1.0min] + ├── knowledge_supervisor > AGENT_STARTING + └── knowledge_supervisor > AGENT_COMPLETED [1.0min] + ├── knowledge_supervisor > LLM_REQUEST + ├── knowledge_supervisor > LLM_RESPONSE [5.5s, ttft=5.5s] + ├── knowledge_supervisor > TOOL_STARTING (transfer_to_agent) + ├── knowledge_supervisor > TOOL_COMPLETED (transfer_to_agent) [0ms] + ├── policy_agent > AGENT_STARTING + └── policy_agent > AGENT_COMPLETED [56.0s] + ├── policy_agent > LLM_REQUEST + ├── policy_agent > LLM_RESPONSE [20.2s, ttft=20.2s] + ├── policy_agent > TOOL_STARTING (lookup_company_policy) + ├── policy_agent > TOOL_COMPLETED (lookup_company_policy) [0ms] + ├── policy_agent > LLM_REQUEST + └── policy_agent > LLM_RESPONSE [35.7s, ttft=35.7s] +====================================================================== +``` + +The execution trace reveals: +- **Turn 1 (wrong):** The supervisor answered directly from LLM knowledge (no routing, no tool call) — incorrectly stating Juneteenth is a holiday +- **Turn 2 (recovered):** After user correction, the supervisor routed via `transfer_to_agent` to the `policy_agent`, which called `lookup_company_policy` and returned the correct answer + +The sub-trajectory segmentation splits the trace at the correction boundary, making it easy to see what changed between the failed and recovered attempts. 
diff --git a/src/bigquery_agent_analytics/categorical_evaluator.py b/src/bigquery_agent_analytics/categorical_evaluator.py index 3a93538e..0deb9972 100644 --- a/src/bigquery_agent_analytics/categorical_evaluator.py +++ b/src/bigquery_agent_analytics/categorical_evaluator.py @@ -842,6 +842,7 @@ async def classify_sessions_via_api( transcripts: dict[str, str], config: CategoricalEvaluationConfig, endpoint: str = DEFAULT_ENDPOINT, + per_session_context: dict[str, str] | None = None, ) -> list[CategoricalSessionResult]: """Classifies sessions using the Gemini API (fallback). @@ -863,6 +864,8 @@ async def classify_sessions_via_api( transcripts: Maps ``session_id`` to transcript text. config: Categorical evaluation configuration. endpoint: Model endpoint name. + per_session_context: Optional per-session context to inject into the + judge prompt (e.g. matched golden eval expected answers). Returns: One ``CategoricalSessionResult`` per session, in input order. @@ -887,7 +890,10 @@ async def _classify_one( if len(text) > 25000: text = text[:25000] + "\n... [truncated]" - full_prompt = prompt_prefix + "\n\nTranscript:\n" + text + session_ctx = "" + if per_session_context and sid in per_session_context: + session_ctx = "\n\n" + per_session_context[sid] + full_prompt = prompt_prefix + session_ctx + "\n\nTranscript:\n" + text try: response = await client.aio.models.generate_content( diff --git a/src/bigquery_agent_analytics/trace.py b/src/bigquery_agent_analytics/trace.py index c788c512..52c35606 100644 --- a/src/bigquery_agent_analytics/trace.py +++ b/src/bigquery_agent_analytics/trace.py @@ -449,6 +449,7 @@ def from_cli_args( session_id: str | None = None, user_id: str | None = None, has_error: bool | None = None, + custom_labels: dict[str, str] | None = None, limit: int = 100, ) -> "TraceFilter": """Build a ``TraceFilter`` from CLI-style arguments. @@ -467,6 +468,8 @@ def from_cli_args( session_id: Filter to a single session. user_id: Filter to a specific user. 
has_error: If set, filter by error presence. + custom_labels: Filter by custom_tags key-value pairs + written via ``BigQueryLoggerConfig.custom_tags``. limit: Maximum number of traces to return. Returns: @@ -485,6 +488,7 @@ def from_cli_args( user_id=user_id, session_ids=session_ids, has_error=has_error, + custom_labels=custom_labels, limit=limit, ) @@ -606,7 +610,7 @@ def to_sql_conditions(self) -> tuple[str, list]: param_val = f"label_val_{i}" conditions.append( f"JSON_VALUE(attributes," - f" CONCAT('$.labels.', @{param_key}))" + f" CONCAT('$.custom_tags.', @{param_key}))" f" = @{param_val}" ) params.append(bigquery.ScalarQueryParameter(param_key, "STRING", key)) diff --git a/tests/test_quality_report_helpers.py b/tests/test_quality_report_helpers.py index a1a599fb..9d52cc24 100644 --- a/tests/test_quality_report_helpers.py +++ b/tests/test_quality_report_helpers.py @@ -25,15 +25,26 @@ # Make scripts/ importable sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "scripts")) -from quality_report import _AGENT_CONFIG_CACHE # noqa: E402 from quality_report import _build_agent_stats from quality_report import _build_scope_context +from quality_report import _classify_failures +from quality_report import _compute_dimension_averages +from quality_report import _compute_multiturn_stats +from quality_report import _count_trace_metrics +from quality_report import _EVAL_SPEC_CACHE # noqa: E402 from quality_report import _extract_a2a_text +from quality_report import _extract_conversation +from quality_report import _failure_class from quality_report import _group_by_category +from quality_report import _has_dimension_data +from quality_report import _has_failure_attribution_data +from quality_report import _inject_golden_summary from quality_report import _is_single_word_routing -from quality_report import _load_agent_config +from quality_report import _load_eval_spec +from quality_report import generate_quality_report from quality_report import 
get_a2a_response from quality_report import get_user_input +from quality_report import print_quality_report # --------------------------------------------------------------------------- # Lightweight stubs for report objects @@ -56,9 +67,10 @@ def __init__(self, spans): class _FakeMetric: - def __init__(self, metric_name, category): + def __init__(self, metric_name, category, parse_error=False): self.metric_name = metric_name self.category = category + self.parse_error = parse_error class _FakeSession: @@ -479,64 +491,255 @@ def test_invalid_json_string(self): class TestBuildScopeContext: - def test_none_config(self): + def test_none_spec(self): assert _build_scope_context(None) == "" - def test_empty_config(self): + def test_empty_spec(self): assert _build_scope_context({}) == "" - def test_no_oos_topics(self): - config = { - "scope_decisions": [ - {"topic": "billing", "decision": "in_scope"}, - ] + def test_scope_free_text(self): + result = _build_scope_context({"scope": "Handles PTO and benefits only."}) + assert "Handles PTO and benefits only." in result + assert "OUT OF SCOPE" in result + assert "declined" in result + + def test_ground_truth_only(self): + result = _build_scope_context({"ground_truth": "PTO is 20 days/year."}) + assert "GROUND TRUTH" in result + assert "20 days/year" in result + + def test_scope_and_ground_truth(self): + result = _build_scope_context( + { + "scope": "HR policy questions.", + "ground_truth": "PTO is 20 days.", + } + ) + assert "HR policy questions." in result + assert "PTO is 20 days." in result + + def test_no_relevant_fields(self): + # A spec with only golden_qa contributes no scope/ground-truth context. 
+ assert _build_scope_context({"golden_qa": [{"question": "q"}]}) == "" + + +# ================================================================== # +# _inject_golden_summary # +# ================================================================== # + + +class TestInjectGoldenSummary: + + def _report(self, sessions): + return {"summary": {}, "sessions": sessions} + + def test_no_metadata_is_noop(self): + report = self._report([{"session_id": "s1"}]) + _inject_golden_summary(report, None) + assert "golden_eval_summary" not in report["summary"] + + def test_matched_meaningful_and_mismatch(self): + sessions = [ + { + "session_id": "s1", + "question": "q1", + "response": "good", + "metrics": {"response_usefulness": {"category": "meaningful"}}, + }, + { + "session_id": "s2", + "question": "q2", + "response": "bad", + "metrics": {"response_usefulness": {"category": "unhelpful"}}, + }, + { + "session_id": "s3", + "question": "q3", + "response": "x", + "metrics": {"response_usefulness": {"category": "meaningful"}}, + }, + ] + meta = { + "s1": { + "matched": True, + "expected_answer": "a1", + "topic": "pto", + "similarity": 0.99, + }, + "s2": { + "matched": True, + "expected_answer": "a2", + "topic": "benefits", + "similarity": 0.98, + }, + "s3": {"matched": False, "similarity": 0.4}, + } + report = self._report(sessions) + _inject_golden_summary(report, meta) + gs = report["summary"]["golden_eval_summary"] + assert gs["matched"] == 2 + assert gs["matched_meaningful"] == 1 + assert gs["matched_unhelpful"] == 1 + assert gs["unmatched"] == 1 + assert len(gs["mismatches"]) == 1 + assert gs["mismatches"][0]["question"] == "q2" + # Per-session golden_eval is attached. 
+ assert sessions[0]["golden_eval"]["matched"] is True + assert sessions[2]["golden_eval"]["matched"] is False + + def test_declined_counts_as_meaningful(self): + sessions = [ + { + "session_id": "s1", + "question": "q", + "response": "decline", + "metrics": {"response_usefulness": {"category": "declined"}}, + }, + ] + meta = { + "s1": { + "matched": True, + "expected_answer": "", + "topic": "out_of_scope", + "similarity": 0.99, + } } - assert _build_scope_context(config) == "" + report = self._report(sessions) + _inject_golden_summary(report, meta) + gs = report["summary"]["golden_eval_summary"] + assert gs["matched_meaningful"] == 1 + assert gs["matched_unhelpful"] == 0 - def test_single_oos_topic(self): - config = { - "scope_decisions": [ - {"topic": "weather", "decision": "out_of_scope"}, - ] + +# ================================================================== # +# _failure_class / _classify_failures # +# ================================================================== # + + +class TestFailureClass: + + def test_not_a_failure(self): + assert _failure_class("meaningful", "proper", "correct") is None + assert _failure_class("declined", "no_tool_needed", "correct") is None + + def test_knowledge_gap(self): + # Looked it up, didn't fabricate, still couldn't answer -> missing fact. + assert _failure_class("unhelpful", "proper", "correct") == "knowledge_gap" + assert ( + _failure_class("unhelpful", "proper", "mostly_correct") + == "knowledge_gap" + ) + + def test_skill_gap_no_tool(self): + # Didn't even look up -> skill-fixable. + assert _failure_class("unhelpful", "none", "correct") == "skill_gap" + + def test_skill_gap_hallucinated(self): + # Used tool but fabricated -> skill-fixable (should have declined). + assert _failure_class("unhelpful", "proper", "incorrect") == "skill_gap" + + def test_judge_attribution_wins(self): + # The judge's failure_attribution overrides the deterministic heuristic. 
+ assert ( + _failure_class("unhelpful", "proper", "correct", "tool_gap") + == "tool_gap" + ) + assert ( + _failure_class("unhelpful", "none", "correct", "knowledge_gap") + == "knowledge_gap" + ) + + def test_judge_not_a_failure_falls_back(self): + # An unexpected attribution falls back to the deterministic split. + assert ( + _failure_class("unhelpful", "proper", "correct", "not_a_failure") + == "knowledge_gap" + ) + + +class TestClassifyFailures: + + def _session(self, sid, use, tool, corr, question="q"): + return { + "session_id": sid, + "question": question, + "metrics": { + "response_usefulness": {"category": use}, + "tool_usage": {"category": tool}, + "correctness": {"category": corr}, + }, } - result = _build_scope_context(config) - assert "weather" in result - assert "OUT OF SCOPE" in result - def test_multiple_oos_topics(self): - config = { - "scope_decisions": [ - {"topic": "weather", "decision": "out_of_scope"}, - {"topic": "sports", "decision": "out_of_scope"}, - {"topic": "billing", "decision": "in_scope"}, - ] + def test_split_and_addressable_rate(self): + report = { + "summary": {"total_sessions": 4, "meaningful": 2, "declined": 0}, + "sessions": [ + self._session("s1", "meaningful", "proper", "correct"), + self._session("s2", "meaningful", "proper", "correct"), + self._session( + "s3", "unhelpful", "proper", "correct", "orthodontia?" + ), + self._session("s4", "unhelpful", "none", "correct"), + ], } - result = _build_scope_context(config) - assert "weather" in result - assert "sports" in result - assert "billing" not in result - - def test_missing_decision_field(self): - config = { - "scope_decisions": [ - {"topic": "weather"}, - ] + _classify_failures(report) + s = report["summary"] + assert s["knowledge_gap"] == 1 + assert s["skill_gap"] == 1 + # 2 meaningful / (4 - 1 knowledge gap) = 66.7% + assert s["addressable_meaningful_rate"] == 66.7 + assert s["knowledge_gap_questions"] == ["orthodontia?"] + # Per-session tags applied. 
+ by_id = { + x["session_id"]: x.get("failure_class") for x in report["sessions"] } - assert _build_scope_context(config) == "" + assert by_id["s3"] == "knowledge_gap" + assert by_id["s4"] == "skill_gap" + assert by_id["s1"] is None + + def test_no_failures(self): + report = { + "summary": {"total_sessions": 1, "meaningful": 1, "declined": 0}, + "sessions": [self._session("s1", "meaningful", "proper", "correct")], + } + _classify_failures(report) + assert report["summary"]["knowledge_gap"] == 0 + assert report["summary"]["skill_gap"] == 0 + assert report["summary"]["tool_gap"] == 0 + assert report["summary"]["addressable_meaningful_rate"] == 100.0 + + def test_tool_gap_via_judge(self): + # With failure_attribution present, tool gaps are excluded from addressable. + sess = self._session("s1", "unhelpful", "none", "correct", "tuition?") + sess["metrics"]["failure_attribution"] = {"category": "tool_gap"} + report = { + "summary": {"total_sessions": 2, "meaningful": 1, "declined": 0}, + "sessions": [ + self._session("s0", "meaningful", "proper", "correct"), + sess, + ], + } + _classify_failures(report) + s = report["summary"] + assert s["tool_gap"] == 1 + assert s["skill_gap"] == 0 + assert s["tool_gap_questions"] == ["tuition?"] + # 1 meaningful / (2 - 1 tool gap) = 100% + assert s["addressable_meaningful_rate"] == 100.0 # ================================================================== # -# _load_agent_config # +# _load_eval_spec # # ================================================================== # -class TestLoadAgentConfig: +class TestLoadEvalSpec: def setup_method(self): - _AGENT_CONFIG_CACHE.clear() + _EVAL_SPEC_CACHE.clear() def teardown_method(self): - _AGENT_CONFIG_CACHE.clear() + _EVAL_SPEC_CACHE.clear() def test_explicit_path(self): import json as _json @@ -544,19 +747,22 @@ def test_explicit_path(self): with tempfile.NamedTemporaryFile( mode="w", suffix=".json", delete=False ) as f: - _json.dump({"scope_decisions": [{"topic": "t1"}]}, f) + 
_json.dump({"scope": "HR only"}, f) path = f.name try: - result = _load_agent_config(path) - assert result == {"scope_decisions": [{"topic": "t1"}]} + result = _load_eval_spec(path) + assert result == {"scope": "HR only"} finally: os.unlink(path) + def test_none_string_disables(self): + assert _load_eval_spec("none") is None + def test_missing_explicit_path_raises(self): import pytest with pytest.raises(FileNotFoundError): - _load_agent_config("/nonexistent/config.json") + _load_eval_spec("/nonexistent/eval_spec.json") def test_cache_hit(self): import json as _json @@ -567,8 +773,8 @@ def test_cache_hit(self): _json.dump({"cached": True}, f) path = f.name try: - first = _load_agent_config(path) - second = _load_agent_config(path) + first = _load_eval_spec(path) + second = _load_eval_spec(path) assert first is second finally: os.unlink(path) @@ -584,8 +790,8 @@ def test_cache_isolates_paths(self): _json.dump(content, f) paths.append(f.name) try: - c1 = _load_agent_config(paths[0]) - c2 = _load_agent_config(paths[1]) + c1 = _load_eval_spec(paths[0]) + c2 = _load_eval_spec(paths[1]) assert c1 != c2 assert c1 == {"a": 1} assert c2 == {"b": 2} @@ -593,9 +799,451 @@ def test_cache_isolates_paths(self): for p in paths: os.unlink(p) - def test_auto_discover_returns_none(self): - # With no config file in known locations, should return None - result = _load_agent_config(None) - # May return None or a config if one exists in the repo - # Just verify it doesn't raise + def test_auto_discover_returns_none_or_dict(self): + # With no eval_spec.json in known locations, returns None; otherwise dict. 
+ result = _load_eval_spec(None) assert result is None or isinstance(result, dict) + + +# ================================================================== # +# _count_trace_metrics # +# ================================================================== # + + +class TestCountTraceMetrics: + + def test_counts_user_messages_and_tools(self): + trace = _FakeTrace( + [ + _FakeSpan("USER_MESSAGE_RECEIVED", {"text": "Q1"}), + _FakeSpan("LLM_RESPONSE", {"response": "A1"}), + _FakeSpan("TOOL_COMPLETED", {"tool": "search"}), + _FakeSpan("USER_MESSAGE_RECEIVED", {"text": "Q2"}), + _FakeSpan("TOOL_COMPLETED", {"tool": "lookup"}), + ] + ) + user_turns, tool_calls = _count_trace_metrics(trace) + assert user_turns == 2 + assert tool_calls == 2 + + def test_empty_trace(self): + trace = _FakeTrace([]) + user_turns, tool_calls = _count_trace_metrics(trace) + assert user_turns == 0 + assert tool_calls == 0 + + def test_single_turn_no_tools(self): + trace = _FakeTrace( + [ + _FakeSpan("USER_MESSAGE_RECEIVED", {"text": "Q"}), + _FakeSpan("LLM_RESPONSE", {"response": "A"}), + ] + ) + user_turns, tool_calls = _count_trace_metrics(trace) + assert user_turns == 1 + assert tool_calls == 0 + + def test_tool_starting_not_counted(self): + trace = _FakeTrace( + [ + _FakeSpan("TOOL_STARTING", {"tool": "search"}), + _FakeSpan("TOOL_COMPLETED", {"tool": "search"}), + ] + ) + _, tool_calls = _count_trace_metrics(trace) + assert tool_calls == 1 + + def test_tool_error_counted(self): + trace = _FakeTrace( + [ + _FakeSpan("TOOL_STARTING", {"tool": "search"}), + _FakeSpan("TOOL_ERROR", {"error": "timeout"}), + _FakeSpan("TOOL_STARTING", {"tool": "lookup"}), + _FakeSpan("TOOL_COMPLETED", {"tool": "lookup"}), + ] + ) + _, tool_calls = _count_trace_metrics(trace) + assert tool_calls == 2 + + +# ================================================================== # +# _compute_dimension_averages # +# ================================================================== # + + +class 
TestComputeDimensionAverages: + + def test_basic_averages(self): + sessions = [ + _FakeSession( + "s1", + [ + _FakeMetric("correctness", "correct"), + _FakeMetric("tool_usage", "proper"), + _FakeMetric("specificity", "specific"), + _FakeMetric("scope_compliance", "compliant"), + _FakeMetric("first_time_right", "correct"), + ], + ), + _FakeSession( + "s2", + [ + _FakeMetric("correctness", "incorrect"), + _FakeMetric("tool_usage", "none"), + _FakeMetric("specificity", "vague"), + _FakeMetric("scope_compliance", "non_compliant"), + _FakeMetric("first_time_right", "correction_needed"), + ], + ), + ] + report = _FakeReport(sessions) + avgs = _compute_dimension_averages(report) + assert avgs["correctness"] == 1.0 # (2+0)/2 + assert avgs["tool_usage"] == 1.0 + assert avgs["specificity"] == 1.0 + assert avgs["scope_compliance"] == 1.0 + assert avgs["first_time_right"] == 1.0 + + def test_all_perfect(self): + sessions = [ + _FakeSession( + "s1", + [ + _FakeMetric("correctness", "correct"), + _FakeMetric("specificity", "specific"), + ], + ), + ] + report = _FakeReport(sessions) + avgs = _compute_dimension_averages(report) + assert avgs["correctness"] == 2.0 + assert avgs["specificity"] == 2.0 + + def test_empty_report(self): + report = _FakeReport([]) + avgs = _compute_dimension_averages(report) + assert all(v == 0 for v in avgs.values()) + + def test_missing_dimensions(self): + sessions = [ + _FakeSession( + "s1", + [_FakeMetric("response_usefulness", "meaningful")], + ), + ] + report = _FakeReport(sessions) + avgs = _compute_dimension_averages(report) + # Non-dimension metrics should not contribute + assert avgs["correctness"] == 0 + + def test_parse_error_skipped(self): + sessions = [ + _FakeSession( + "s1", + [ + _FakeMetric("correctness", "correct"), + _FakeMetric("correctness", "incorrect", parse_error=True), + ], + ), + ] + report = _FakeReport(sessions) + avgs = _compute_dimension_averages(report) + assert avgs["correctness"] == 2.0 + + def 
test_unknown_category_skipped(self): + sessions = [ + _FakeSession( + "s1", + [ + _FakeMetric("correctness", "correct"), + _FakeMetric("correctness", "bogus_value"), + ], + ), + ] + report = _FakeReport(sessions) + avgs = _compute_dimension_averages(report) + assert avgs["correctness"] == 2.0 + + def test_tool_usage_no_tool_needed_scores_full(self): + # A correct decline / direct answer where no tool was needed must score 2 + # on tool_usage, not be penalised as a Tool Usage failure (PR #174 P1). + sessions = [ + _FakeSession("s1", [_FakeMetric("tool_usage", "no_tool_needed")]), + ] + avgs = _compute_dimension_averages(_FakeReport(sessions)) + assert avgs["tool_usage"] == 2.0 + + def test_tool_usage_no_tool_needed_does_not_drag_average(self): + # Mixed batch: one proper tool use, one no-tool-needed decline. Both are + # correct outcomes, so the Tool Usage average must stay at 2.0. + sessions = [ + _FakeSession("s1", [_FakeMetric("tool_usage", "proper")]), + _FakeSession("s2", [_FakeMetric("tool_usage", "no_tool_needed")]), + ] + avgs = _compute_dimension_averages(_FakeReport(sessions)) + assert avgs["tool_usage"] == 2.0 + + +# ================================================================== # +# _has_dimension_data # +# ================================================================== # + + +class TestHasDimensionData: + + def test_unscored_dimensions_are_not_data(self): + # --dimensions primary scores no dimension metrics → all-zero averages. + # These must not be treated as real "everything failed" data. 
+ avgs = _compute_dimension_averages( + _FakeReport( + [ + _FakeSession( + "s1", [_FakeMetric("response_usefulness", "meaningful")] + ) + ] + ) + ) + assert avgs == {d: 0 for d in avgs} + assert _has_dimension_data(avgs) is False + + def test_scored_dimensions_are_data(self): + avgs = _compute_dimension_averages( + _FakeReport([_FakeSession("s1", [_FakeMetric("tool_usage", "proper")])]) + ) + assert _has_dimension_data(avgs) is True + + def test_empty_dict(self): + assert _has_dimension_data({}) is False + + +# ================================================================== # +# _compute_multiturn_stats # +# ================================================================== # + + +class TestComputeMultiturnStats: + + def test_basic_stats(self): + resolved = { + "s1": {"user_turns": 3, "tool_calls": 2}, + "s2": {"user_turns": 1, "tool_calls": 4}, + } + stats = _compute_multiturn_stats(resolved) + assert stats["avg_user_turns"] == 2.0 + assert stats["avg_tool_calls"] == 3.0 + assert stats["multi_turn_sessions"] == 1 + + def test_empty_map(self): + result = _compute_multiturn_stats({}) + assert result == { + "avg_user_turns": 0, + "avg_tool_calls": 0, + "multi_turn_sessions": 0, + } + + def test_all_single_turn(self): + resolved = { + "s1": {"user_turns": 1, "tool_calls": 0}, + "s2": {"user_turns": 1, "tool_calls": 1}, + } + stats = _compute_multiturn_stats(resolved) + assert stats["avg_user_turns"] == 1.0 + assert stats["multi_turn_sessions"] == 0 + + def test_missing_keys_default_zero(self): + resolved = {"s1": {}, "s2": {"user_turns": 2}} + stats = _compute_multiturn_stats(resolved) + assert stats["avg_user_turns"] == 1.0 # (0+2)/2 + + def test_corrections_stats_present_for_multiturn(self): + resolved = { + "s1": { + "user_turns": 3, + "tool_calls": 2, + "corrections": 1, + "verifications": 0, + }, + "s2": { + "user_turns": 1, + "tool_calls": 1, + "corrections": 0, + "verifications": 0, + }, + } + stats = _compute_multiturn_stats(resolved) + assert 
stats["multi_turn_sessions"] == 1 + assert "correction_rate" in stats + assert "verification_rate" in stats + assert stats["correction_rate"] == 50.0 # 1 of 2 sessions + assert stats["avg_corrections"] == 0.5 # 1 total / 2 sessions + + def test_corrections_stats_absent_when_all_single_turn(self): + resolved = { + "s1": { + "user_turns": 1, + "tool_calls": 0, + "corrections": 0, + "verifications": 0, + }, + } + stats = _compute_multiturn_stats(resolved) + assert stats["multi_turn_sessions"] == 0 + assert "correction_rate" not in stats + + +# --------------------------------------------------------------------------- +# _extract_conversation +# --------------------------------------------------------------------------- + + +class _FakeConvSpan: + """Minimal span stub for conversation extraction tests.""" + + def __init__(self, event_type, content=None, agent=None): + self.event_type = event_type + self.content = content + self.agent = agent + + +class TestExtractConversation: + + def test_single_turn(self): + spans = [ + _FakeConvSpan("USER_MESSAGE_RECEIVED", {"text": "Hello"}), + _FakeConvSpan("LLM_RESPONSE", {"response": "call:transfer_to_agent"}), + _FakeConvSpan( + "LLM_RESPONSE", {"response": "Hi there! 
How can I help?"} + ), + ] + trace = type("T", (), {"spans": spans})() + conv = _extract_conversation(trace) + assert len(conv) == 2 + assert conv[0] == {"role": "user", "text": "Hello"} + assert conv[1]["role"] == "agent" + assert "Hi there" in conv[1]["text"] + + def test_multi_turn(self): + spans = [ + _FakeConvSpan("USER_MESSAGE_RECEIVED", {"text": "What is PTO?"}), + _FakeConvSpan("LLM_RESPONSE", {"response": "call:policy_agent"}), + _FakeConvSpan("LLM_RESPONSE", {"response": "20 days per year."}), + _FakeConvSpan("USER_MESSAGE_RECEIVED", {"text": "Are you sure?"}), + _FakeConvSpan("LLM_RESPONSE", {"response": "Yes, verified."}), + ] + trace = type("T", (), {"spans": spans})() + conv = _extract_conversation(trace) + assert len(conv) == 4 + assert conv[0]["text"] == "What is PTO?" + assert conv[1]["text"] == "20 days per year." + assert conv[2]["text"] == "Are you sure?" + assert conv[3]["text"] == "Yes, verified." + + def test_empty_trace(self): + trace = type("T", (), {"spans": []})() + assert _extract_conversation(trace) == [] + + def test_routing_response_skipped(self): + spans = [ + _FakeConvSpan("USER_MESSAGE_RECEIVED", {"text": "Hello"}), + _FakeConvSpan("LLM_RESPONSE", {"response": "call:agent_x"}), + ] + trace = type("T", (), {"spans": spans})() + conv = _extract_conversation(trace) + # Only user turn, no agent response (routing was skipped) + assert len(conv) == 1 + assert conv[0]["role"] == "user" + + def test_no_user_messages(self): + spans = [ + _FakeConvSpan("LLM_RESPONSE", {"response": "orphaned response"}), + ] + trace = type("T", (), {"spans": spans})() + assert _extract_conversation(trace) == [] + + +# --------------------------------------------------------------------------- +# Public API (generate_quality_report / print_quality_report) +# --------------------------------------------------------------------------- + + +class TestPublicAPI: + + def test_generate_quality_report_is_callable(self): + assert callable(generate_quality_report) + 
import inspect + + sig = inspect.signature(generate_quality_report) + assert "session_ids" in sig.parameters + assert "model" in sig.parameters + + def test_print_quality_report_minimal(self, capsys): + report = { + "summary": { + "total_sessions": 5, + "meaningful": 3, + "declined": 1, + "partial": 1, + "unhelpful": 0, + "meaningful_rate": 80.0, + "dimension_averages": {"correctness": 1.8}, + }, + "sessions": [], + } + print_quality_report(report) + out = capsys.readouterr().out + assert "80.0%" in out + assert "correctness" in out + + +# --------------------------------------------------------------------------- +# TraceFilter custom_tags JSON path +# Regression guard for the $.labels -> $.custom_tags fix in trace.py: a wrong +# JSON path makes --label filtering silently return nothing, with no error. +# --------------------------------------------------------------------------- + + +class TestCustomTagsJsonPath: + + def test_custom_labels_uses_custom_tags_json_path(self): + from bigquery_agent_analytics import TraceFilter + + where, _params = TraceFilter( + custom_labels={"version": "v1"} + ).to_sql_conditions() + assert "$.custom_tags." in where + assert "$.labels." not in where + + +# --------------------------------------------------------------------------- +# Failure-attribution gating (_has_failure_attribution_data) +# The failure-cause taxonomy must only render when the metrics that drive it +# were actually scored; otherwise it would default every failure to skill_gap. 
+# --------------------------------------------------------------------------- + + +class TestHasFailureAttributionData: + + @staticmethod + def _report(metric_names): + """Build a minimal report stub with one session scored on metric_names.""" + + metrics = [_FakeMetric(m, "n/a") for m in metric_names] + return _FakeReport([_FakeSession("s1", metrics)]) + + def test_true_with_failure_attribution(self): + report = self._report(["response_usefulness", "failure_attribution"]) + assert _has_failure_attribution_data(report) is True + + def test_true_with_tool_usage_and_correctness(self): + report = self._report(["response_usefulness", "tool_usage", "correctness"]) + assert _has_failure_attribution_data(report) is True + + def test_false_with_primary_only(self): + # --dimensions primary: only the 2 primary metrics scored. + report = self._report(["response_usefulness", "task_grounding"]) + assert _has_failure_attribution_data(report) is False + + def test_false_with_tool_usage_alone(self): + # tool_usage without correctness is not enough for the 2-way fallback. + report = self._report(["response_usefulness", "tool_usage"]) + assert _has_failure_attribution_data(report) is False