Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 82 additions & 1 deletion tests/test_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,67 @@
],
}

# Fixture: OpenTelemetry-style spans exported from a single agent run (one
# shared trace_id). It contains one "execute_tool ..." span (tool call with
# gen_ai.* attributes), two "call_llm" spans, the "agent_run [...]" span, and
# the root "invocation [...]" span whose attributes carry the JSON-encoded
# input.value / output.value used to reconstruct an eval case.
# Timestamps (start_time / end_time) are epoch nanoseconds.
TRACE_SET_DATA = [
    {
        "name": "execute_tool get_city_weather",
        "span_id": 5421848634094108689,
        "trace_id": 115143748123782151752771111946932434777,
        "start_time": 1754884672226444000,
        "end_time": 1754884672226993000,
        "attributes": {
            "gen_ai.tool.name": "get_city_weather",
            "gen_ai.tool.description": "Retrieves the weather information of a given city. the args must in English",
            "gen_ai.tool.call.id": "call_6ow5622pvcouw3tpvr7rfqtl",
            "gcp.vertex.agent.tool_call_args": '{"city": "Xi\'an"}',
        },
        "parent_span_id": 7997784243558253239,
    },
    {
        "name": "call_llm",
        "span_id": 7997784243558253239,
        "trace_id": 115143748123782151752771111946932434777,
        "attributes": {
            "session.id": "veadk_example_session",
            "user.id": "veadk_default_user",
        },
        "parent_span_id": 14844888006539887900,
    },
    {
        "name": "call_llm",
        "span_id": 7789424022423491416,
        "trace_id": 115143748123782151752771111946932434777,
        "attributes": {
            "session.id": "veadk_example_session",
            "user.id": "veadk_default_user",
        },
        "parent_span_id": 14844888006539887900,
    },
    {
        "name": "agent_run [chat_robot]",
        "span_id": 14844888006539887900,
        "trace_id": 115143748123782151752771111946932434777,
        "attributes": {
            "session.id": "veadk_example_session",
            "user.id": "veadk_default_user",
        },
        "parent_span_id": 2943363177785645047,
    },
    {
        "name": "invocation [veadk_default_app]",
        "span_id": 2943363177785645047,
        "trace_id": 115143748123782151752771111946932434777,
        "start_time": 1754884660687962000,
        "end_time": 1754884676664833000,
        "attributes": {
            "input.value": '{"user_id": "veadk_default_user", "session_id": "veadk_example_session", "new_message": "{\\"parts\\": [{\\"video_metadata\\": null, \\"thought\\": null, \\"inline_data\\": null, \\"file_data\\": null, \\"thought_signature\\": null, \\"code_execution_result\\": null, \\"executable_code\\": null, \\"function_call\\": null, \\"function_response\\": null, \\"text\\": \\"How is the weather like in Xi\'an?\\"}], \\"role\\": \\"user\\"}", "run_config": "{\\"speech_config\\": null, \\"response_modalities\\": null, \\"save_input_blobs_as_artifacts\\": false, \\"support_cfc\\": false, \\"streaming_mode\\": \\"StreamingMode.NONE\\", \\"output_audio_transcription\\": null, \\"input_audio_transcription\\": null, \\"realtime_input_config\\": null, \\"enable_affective_dialog\\": null, \\"proactivity\\": null, \\"max_llm_calls\\": 500}"}',
            "user.id": "veadk_default_user",
            "session.id": "veadk_example_session",
            "output.value": '{"content":{"parts":[{"text":"The weather in Xi\'an is cool, with a temperature of 18\u00b0C."}],"role":"model"},"partial":false,"usage_metadata":{"candidates_token_count":132,"prompt_token_count":547,"total_token_count":679},"invocation_id":"e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441","author":"chat_robot","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{}},"id":"c0929124-9be0-4f75-a6ba-f7a531c9ccb6","timestamp":1754884672.227546}',
        },
        "parent_span_id": None,
    },
]


def test_evaluator():
base_evaluator = BaseEvaluator(agent=None, name="test_evaluator")
Expand All @@ -68,7 +129,7 @@ def test_evaluator():
with open(eval_set_file_path, "w") as f:
json.dump(EVAL_SET_DATA, f)

base_evaluator.generate_eval_data(eval_set_file_path=eval_set_file_path)
base_evaluator.generate_eval_data(file_path=eval_set_file_path)

assert len(base_evaluator.invocation_list) == 1
assert len(base_evaluator.invocation_list[0].invocations) == 1
Expand All @@ -78,3 +139,23 @@ def test_evaluator():
)

os.remove(eval_set_file_path)


def test_tracing_file_to_evalset():
    """generate_eval_data detects a tracing-span JSON file and builds an eval set.

    Writes the TRACE_SET_DATA span fixture to disk, asks the evaluator to
    auto-detect the format, and checks that exactly one invocation is produced
    whose invocation_id comes from the root invocation span's output.value.
    """
    base_evaluator = BaseEvaluator(agent=None, name="test_evaluator")

    # save data to file
    tracing_file_path = "./tracing_for_test_evaluator.json"
    with open(tracing_file_path, "w") as f:
        json.dump(TRACE_SET_DATA, f)

    # try/finally so the temp file is removed even when an assertion fails;
    # otherwise a failing run leaks the file and can poison later runs.
    try:
        base_evaluator.generate_eval_data(file_path=tracing_file_path)

        assert len(base_evaluator.invocation_list) == 1
        assert len(base_evaluator.invocation_list[0].invocations) == 1
        assert (
            base_evaluator.invocation_list[0].invocations[0].invocation_id
            == "e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441"
        )
    finally:
        os.remove(tracing_file_path)
133 changes: 130 additions & 3 deletions veadk/evaluation/base_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.


import json
import time
import uuid
from abc import abstractmethod
Expand All @@ -24,6 +25,8 @@
from google.genai import types
from pydantic import BaseModel

from veadk.utils.misc import formatted_timestamp


class InvocationTestData(BaseModel):
invocation_id: str = ""
Expand Down Expand Up @@ -79,15 +82,139 @@ def __init__(
self.result_list: list[EvalResultData] = []
self.agent_information_list: list[dict] = []

def load_eval_set(self, eval_set_file: str) -> list[EvalSet]:
def _load_eval_set(self, eval_set_file: str) -> EvalSet:
from .eval_set_file_loader import load_eval_set_from_file

return load_eval_set_from_file(eval_set_file)

def generate_eval_data(self, eval_set_file_path: str):
def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:
try:
with open(tracing_file, "r") as f:
tracing_data = json.load(f)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON format in file {tracing_file}: {e}")
except Exception as e:
raise ValueError(f"Error reading file {tracing_file}: {e}")

# Group spans by trace_id
trace_groups = {}
for span in tracing_data:
trace_id = span["trace_id"]
if trace_id not in trace_groups:
trace_groups[trace_id] = []
trace_groups[trace_id].append(span)

# Convert to evalset format
eval_cases, conversation = [], []
app_name, user_id = "", ""
creation_timestamp = 0
for trace_id, spans in trace_groups.items():
tool_uses = []

# Extract tool_uses from spans with name starting with "execute_tool"
for span in spans:
if span["name"].startswith("execute_tool"):
tool_uses.append(
{
"id": span["attributes"].get("gen_ai.tool.call.id", None),
"args": json.loads(
span["attributes"].get(
"gcp.vertex.agent.tool_call_args", "{}"
)
),
"name": span["attributes"].get("gen_ai.tool.name", None),
}
)

# Extract conversation data from spans with name starting with "invocation"
for span in spans:
if span["name"].startswith("invocation"):
# Parse input.value and output.value as JSON
input_value = json.loads(
span["attributes"].get("input.value", "{}")
)
output_value = json.loads(
span["attributes"].get("output.value", "{}")
)

user_content = json.loads(input_value.get("new_message", {}))
final_response = json.loads(json.dumps(user_content))
final_response["parts"][0]["text"] = (
output_value.get("content", {})
.get("parts", [{}])[0]
.get("text", None)
)
final_response["role"] = None
conversation.append(
{
"invocation_id": output_value.get(
"invocation_id", str(uuid.uuid4())
),
"user_content": user_content,
"final_response": final_response,
"intermediate_data": {
"tool_uses": tool_uses,
"intermediate_responses": [],
},
"creation_timestamp": span["start_time"] / 1e9,
}
)
user_id = input_value.get("user_id", None)
app_name = (
span["name"].replace("invocation", "").strip().strip("[]")
)
creation_timestamp = span["start_time"] / 1e9

eval_cases.append(
{
"eval_id": f"veadk_eval_{formatted_timestamp()}",
"conversation": conversation,
"session_input": {
"app_name": app_name,
"user_id": user_id,
"state": {},
},
"creation_timestamp": creation_timestamp,
}
)

evalset = EvalSet(
eval_set_id="default",
name="default",
description=None,
eval_cases=eval_cases,
creation_timestamp=creation_timestamp,
)

return evalset

def generate_eval_data(self, file_path: str):
"""Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
eval_case_data_list: list[EvalCaseData] = []

eval_cases = self.load_eval_set(eval_set_file_path).eval_cases
try:
with open(file_path, "r") as f:
file_content = json.load(f)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON format in file {file_path}: {e}")
except Exception as e:
raise ValueError(f"Error reading file {file_path}: {e}")

if isinstance(file_content, dict) and "eval_cases" in file_content:
eval_cases = self._load_eval_set(file_path).eval_cases
elif (
isinstance(file_content, list)
and len(file_content) > 0
and all(
isinstance(span, dict) and "trace_id" in span for span in file_content
)
):
eval_cases = self._load_eval_set_from_tracing(file_path).eval_cases
else:
raise ValueError(
f"Unsupported file format in {file_path}. Please provide a valid file."
)

for eval_case in eval_cases:
eval_case_data = EvalCaseData(invocations=[])
self.agent_information_list.append(
Expand Down