-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathevaluate_agent.py
More file actions
110 lines (93 loc) · 5.86 KB
/
evaluate_agent.py
File metadata and controls
110 lines (93 loc) · 5.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import sys
import traceback
import uuid
from datetime import datetime

import pandas as pd
from vertexai.preview.evaluation import EvalTask
from vertexai.preview.evaluation.metrics import (
    PointwiseMetricPromptTemplate,
    PointwiseMetric
)

from utils import (
    save_evaluation_results,
    print_evaluation_summary,
    get_agent_response
)
# --- Log redirection setup (module import side effect) -----------------------
# Keep a reference to the real terminal stdout so log_to_both() can still
# echo messages to the console after the redirection below.
original_stdout = sys.stdout

# All output from this run is appended to a timestamped file under logs/.
LOGS_DIR = "logs"
os.makedirs(LOGS_DIR, exist_ok=True)

# Generate a unique log file name per run.
current_time = datetime.now().strftime('%Y%m%d-%H%M%S')
LOG_FILE = os.path.join(LOGS_DIR, f"evaluate_agent_{current_time}.log")

# Redirect stdout AND stderr through ONE line-buffered handle. The previous
# version opened the same file twice with default (block) buffering, so
# prints and tracebacks could interleave mid-line and buffered output could
# be lost on a crash; a single line-buffered handle fixes both problems.
_log_handle = open(LOG_FILE, 'a', buffering=1)
sys.stdout = _log_handle
sys.stderr = _log_handle
def log_to_both(message):
    """Echo *message* to both the log file and the real terminal.

    ``print`` targets ``sys.stdout``, which this module redirects to the
    run's log file at import time; ``original_stdout`` is the terminal
    stream saved before that redirection.
    """
    print(message)                         # -> log file (redirected stdout)
    original_stdout.write(f"{message}\n")  # -> terminal
# --- Metric 1: Technical Correctness & Feasibility (Adjusted Factual Accuracy) ---
# LLM-judged pointwise metric: a rater model receives the user prompt, a
# ground-truth reference answer, and the agent's response, and scores the
# response's technical validity on the 1/3/5 rubric below.
technical_correctness_metric = PointwiseMetric(
    metric="technical_correctness_metric",  # metric name shown in eval results
    metric_prompt_template=PointwiseMetricPromptTemplate(
        # Role/task framing for the rater model.
        instruction="""You are an expert BigQuery (BQ) Architect assessing an AI's answer, given a user's prompt and a 'reference' (ground truth) answer. Your task is to determine if the AI's BQ optimization advice or rewritten query is **technically correct** and **implementable** within BigQuery's framework.""",
        # What "technically correct" means for this judgment.
        criteria={
            "Technical Correctness": """The AI's response must present a technically viable and correct solution. This includes:
- **SQL Syntax:** Any provided SQL must be syntactically correct and runnable in BigQuery.
- **Feature Feasibility:** All suggested features (e.g., clustering, partitioning, Materialized Views) must be correctly applied and available in the user's specified BQ environment.
- **Semantic Accuracy:** The rewritten query or suggested fix must maintain the **original intent** of the user's prompt, but apply the optimization correctly.
- **No Hallucinated Functions:** The response must not recommend non-existent BQ functions, configurations, or best practices."""
        },
        # Discrete scoring anchors; keys are the score values as strings.
        rating_rubric={
            "5": "Excellent: The advice/query is **entirely technically correct and feasible**. The SQL is valid, the BQ features are correctly named and applied, and the original intent is perfectly preserved.",
            "3": "Good: The advice/query is mostly correct, but contains a minor, non-fatal technical flaw (e.g., a tiny syntax typo, or a missing semicolon) that a user could easily fix, but does not invalidate the optimization principle.",
            "1": "Poor: The response contains significant technical errors. The suggested query is unrunnable, the recommended BQ feature usage is fundamentally wrong, or the suggested fix completely changes the user's intent."
        },
        # Dataset columns substituted into the prompt template placeholders.
        input_variables=["prompt", "reference", "response"],
    ),
)
# --- Metric 2: Optimization Quality & Completeness (Adjusted Completeness) ---
# Second LLM-judged pointwise metric: rates whether the agent's answer covers
# all high-impact optimization techniques present in the reference answer,
# again on a 1/3/5 rubric.
optimization_quality_metric = PointwiseMetric(
    metric="optimization_quality_metric",  # metric name shown in eval results
    metric_prompt_template=PointwiseMetricPromptTemplate(
        # Role/task framing for the rater model.
        instruction="""You are an expert BigQuery Optimization Consultant assessing the **quality and completeness** of the AI's advice. Given the user's prompt and a 'reference' answer, determine if the AI provided the most impactful and comprehensive optimization strategies.""",
        # What "complete" means for this judgment.
        criteria={
            "Optimization Completeness": """The AI's answer must recommend **all** high-impact optimization techniques present in the reference answer that directly address the user's performance or cost issues.
- **Focus on Impact:** The response must prioritize techniques with the highest potential return on investment (e.g., partitioning, clustering, use of filtering/views) over low-impact changes.
- **Justification:** The response must clearly explain *why* the optimization is necessary (e.g., 'reduce data scanned') and *how* to implement it.
- **Multi-part Coverage:** If the prompt asks for multiple optimization strategies (e.g., for query AND schema), both parts must be addressed."""
        },
        # Discrete scoring anchors; keys are the score values as strings.
        rating_rubric={
            "5": "Excellent: The response provides the **most impactful and comprehensive set of optimization techniques** found in the reference. All major areas for improvement are covered and justified.",
            "3": "Good: The response is adequate. It addresses the core problem but omits one or two important, high-impact optimization strategies available in the reference, or provides a weak justification.",
            "1": "Poor: The response is critically incomplete or recommends low-impact/ineffective optimizations. Essential, high-value optimization methods are entirely missing, rendering the advice unreliable for performance improvement."
        },
        # Dataset columns substituted into the prompt template placeholders.
        input_variables=["prompt", "reference", "response"],
    ),
)
def run_eval(dataset_path="evaluation_dataset.json"):
    """Evaluate the BQ optimization agent against a JSON dataset.

    Args:
        dataset_path: Path to the evaluation dataset JSON. The records must
            supply the columns named in each metric's ``input_variables``
            ("prompt", "reference"); "response" is produced by the runnable.

    Side effects:
        Logs progress via log_to_both(), persists results with
        save_evaluation_results(), and prints a summary. Failures are
        logged (with full traceback) instead of raised so the log file
        always records the outcome.
    """
    eval_dataset = pd.read_json(dataset_path)

    # Unique run name: timestamp plus a short random suffix to avoid
    # collisions when two runs start within the same second. Uses a local
    # name so the module-level `current_time` is not shadowed confusingly.
    run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    experiment_run_id = f"{run_stamp}-{uuid.uuid4().hex[:8]}"
    log_to_both(f"--- Starting evaluation: ({experiment_run_id}) ---")

    # Define the evaluation task with the dataset and both custom metrics.
    eval_task = EvalTask(
        dataset=eval_dataset,
        metrics=[
            technical_correctness_metric,
            optimization_quality_metric
        ],
        experiment="evaluate-bq-optimization-agent"
    )
    try:
        eval_result = eval_task.evaluate(
            runnable=get_agent_response, experiment_run_name=experiment_run_id
        )
        save_evaluation_results(eval_result, experiment_run_id)
        print_evaluation_summary(eval_result)
    except Exception as e:
        # Top-level boundary handler: also log the traceback — the original
        # logged only str(e), which hides where the failure occurred.
        log_to_both(f"An error occurred during evaluation run: {e}")
        log_to_both(traceback.format_exc())


if __name__ == "__main__":
    run_eval()