Add per_session_context support for golden eval

evekhm · evekhm · commit 8313d65633ff · 2026-05-26T23:16:10.000Z
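Thread an optional per_session_context mapping (session_id -> context
string) through quality_report.py into classify_sessions_via_api() so
that each session's golden eval expected answer can be injected into
the judge prompt. When a session has an entry, its context is inserted
between the prompt prefix and the transcript; sessions without an entry
get the same prompt as before.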
diff --git a/scripts/quality_report.py b/scripts/quality_report.py
@@ -1158,6 +1158,7 @@ def run_evaluation_from_conversations(
     concurrency=10,
     tag_turns=False,
     eval_config=None,
+    per_session_context=None,
 ):
   """Evaluate local conversations without BigQuery.
 
@@ -1173,6 +1174,8 @@ def run_evaluation_from_conversations(
       concurrency: Max parallel API calls (default 10).
       tag_turns: When True, run the full turn tagger to classify each user
           turn and identify correction boundaries / sub-trajectories.
+      per_session_context: Optional dict mapping session_id to additional
+          context string for the judge prompt (e.g. matched golden eval).
 
   Returns:
       Dict with ``report`` (CategoricalEvaluationReport) and
@@ -1213,6 +1216,7 @@ def run_evaluation_from_conversations(
   async def _run_all():
     classify_task = classify_sessions_via_api(
         transcripts, cat_config, model,
+        per_session_context=per_session_context,
     )
     resolve_task = _build_resolved_map_from_conversations(
         conversations, model, concurrency=concurrency,
@@ -1238,6 +1242,7 @@ def generate_quality_report_from_conversations(
     concurrency=10,
     tag_turns=False,
     trajectory_samples=0,
+    per_session_context=None,
 ) -> dict:
   """Evaluate local conversations and return a structured quality report.
 
@@ -1253,6 +1258,8 @@ def generate_quality_report_from_conversations(
       tag_turns: When True, run the full turn tagger to add per-turn tags,
           correction boundaries, and sub-trajectories to the output.
       trajectory_samples: Number of execution traces to fetch from BigQuery.
+      per_session_context: Optional dict mapping session_id to additional
+          context string for the judge prompt (e.g. matched golden eval).
 
   Returns:
       Dict with ``summary`` and ``sessions`` keys.
@@ -1263,6 +1270,7 @@ def generate_quality_report_from_conversations(
   result = run_evaluation_from_conversations(
       conversations, model=model, config_path=config_path,
       concurrency=concurrency, tag_turns=tag_turns,
+      per_session_context=per_session_context,
   )
   elapsed = time.time() - t0
 
diff --git a/src/bigquery_agent_analytics/categorical_evaluator.py b/src/bigquery_agent_analytics/categorical_evaluator.py
@@ -832,6 +832,7 @@ async def classify_sessions_via_api(
     transcripts: dict[str, str],
     config: CategoricalEvaluationConfig,
     endpoint: str = DEFAULT_ENDPOINT,
+    per_session_context: dict[str, str] | None = None,
 ) -> list[CategoricalSessionResult]:
   """Classifies sessions using the Gemini API (fallback).
 
@@ -843,6 +844,8 @@ async def classify_sessions_via_api(
       transcripts: Maps ``session_id`` to transcript text.
       config: Categorical evaluation configuration.
       endpoint: Model endpoint name.
+      per_session_context: Optional per-session context to inject into the
+          judge prompt (e.g. matched golden eval expected answers).
 
   Returns:
       One ``CategoricalSessionResult`` per session.
@@ -861,7 +864,10 @@ async def classify_sessions_via_api(
       if len(text) > 25000:
         text = text[:25000] + "\n... [truncated]"
 
-      full_prompt = prompt_prefix + "\n\nTranscript:\n" + text
+      session_ctx = ""
+      if per_session_context and sid in per_session_context:
+        session_ctx = "\n\n" + per_session_context[sid]
+      full_prompt = prompt_prefix + session_ctx + "\n\nTranscript:\n" + text
 
       try:
         response = await client.aio.models.generate_content(