diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
index e66b0ec9..f48529ec 100644
--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
@@ -59,6 +59,67 @@
     ],
 }
 
+TRACE_SET_DATA = [
+    {
+        "name": "execute_tool get_city_weather",
+        "span_id": 5421848634094108689,
+        "trace_id": 115143748123782151752771111946932434777,
+        "start_time": 1754884672226444000,
+        "end_time": 1754884672226993000,
+        "attributes": {
+            "gen_ai.tool.name": "get_city_weather",
+            "gen_ai.tool.description": "Retrieves the weather information of a given city. the args must in English",
+            "gen_ai.tool.call.id": "call_6ow5622pvcouw3tpvr7rfqtl",
+            "gcp.vertex.agent.tool_call_args": '{"city": "Xi\'an"}',
+        },
+        "parent_span_id": 7997784243558253239,
+    },
+    {
+        "name": "call_llm",
+        "span_id": 7997784243558253239,
+        "trace_id": 115143748123782151752771111946932434777,
+        "attributes": {
+            "session.id": "veadk_example_session",
+            "user.id": "veadk_default_user",
+        },
+        "parent_span_id": 14844888006539887900,
+    },
+    {
+        "name": "call_llm",
+        "span_id": 7789424022423491416,
+        "trace_id": 115143748123782151752771111946932434777,
+        "attributes": {
+            "session.id": "veadk_example_session",
+            "user.id": "veadk_default_user",
+        },
+        "parent_span_id": 14844888006539887900,
+    },
+    {
+        "name": "agent_run [chat_robot]",
+        "span_id": 14844888006539887900,
+        "trace_id": 115143748123782151752771111946932434777,
+        "attributes": {
+            "session.id": "veadk_example_session",
+            "user.id": "veadk_default_user",
+        },
+        "parent_span_id": 2943363177785645047,
+    },
+    {
+        "name": "invocation [veadk_default_app]",
+        "span_id": 2943363177785645047,
+        "trace_id": 115143748123782151752771111946932434777,
+        "start_time": 1754884660687962000,
+        "end_time": 1754884676664833000,
+        "attributes": {
+            "input.value": '{"user_id": "veadk_default_user", "session_id": "veadk_example_session", "new_message": "{\\"parts\\": [{\\"video_metadata\\": null, \\"thought\\": null, \\"inline_data\\": null, \\"file_data\\": null, \\"thought_signature\\": null, \\"code_execution_result\\": null, \\"executable_code\\": null, \\"function_call\\": null, \\"function_response\\": null, \\"text\\": \\"How is the weather like in Xi\'an?\\"}], \\"role\\": \\"user\\"}", "run_config": "{\\"speech_config\\": null, \\"response_modalities\\": null, \\"save_input_blobs_as_artifacts\\": false, \\"support_cfc\\": false, \\"streaming_mode\\": \\"StreamingMode.NONE\\", \\"output_audio_transcription\\": null, \\"input_audio_transcription\\": null, \\"realtime_input_config\\": null, \\"enable_affective_dialog\\": null, \\"proactivity\\": null, \\"max_llm_calls\\": 500}"}',
+            "user.id": "veadk_default_user",
+            "session.id": "veadk_example_session",
+            "output.value": '{"content":{"parts":[{"text":"The weather in Xi\'an is cool, with a temperature of 18\u00b0C."}],"role":"model"},"partial":false,"usage_metadata":{"candidates_token_count":132,"prompt_token_count":547,"total_token_count":679},"invocation_id":"e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441","author":"chat_robot","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{}},"id":"c0929124-9be0-4f75-a6ba-f7a531c9ccb6","timestamp":1754884672.227546}',
+        },
+        "parent_span_id": None,
+    },
+]
+
 
 def test_evaluator():
     base_evaluator = BaseEvaluator(agent=None, name="test_evaluator")
@@ -68,7 +129,7 @@ def test_evaluator():
     with open(eval_set_file_path, "w") as f:
         json.dump(EVAL_SET_DATA, f)
 
-    base_evaluator.generate_eval_data(eval_set_file_path=eval_set_file_path)
+    base_evaluator.generate_eval_data(file_path=eval_set_file_path)
 
     assert len(base_evaluator.invocation_list) == 1
     assert len(base_evaluator.invocation_list[0].invocations) == 1
@@ -78,3 +139,23 @@ def test_evaluator():
     )
 
     os.remove(eval_set_file_path)
+
+
+def test_tracing_file_to_evalset():
+    base_evaluator = BaseEvaluator(agent=None, name="test_evaluator")
+
+    # save data to file
+    tracing_file_path = "./tracing_for_test_evaluator.json"
+    with open(tracing_file_path, "w") as f:
+        json.dump(TRACE_SET_DATA, f)
+
+    base_evaluator.generate_eval_data(file_path=tracing_file_path)
+
+    assert len(base_evaluator.invocation_list) == 1
+    assert len(base_evaluator.invocation_list[0].invocations) == 1
+    assert (
+        base_evaluator.invocation_list[0].invocations[0].invocation_id
+        == "e-ea6bb35b-c3f0-4c5c-b127-c71c7d6d6441"
+    )
+
+    os.remove(tracing_file_path)
diff --git a/veadk/evaluation/base_evaluator.py b/veadk/evaluation/base_evaluator.py
index a5bf250c..84f87045 100644
--- a/veadk/evaluation/base_evaluator.py
+++ b/veadk/evaluation/base_evaluator.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
+import json
 import time
 import uuid
 from abc import abstractmethod
 
@@ -24,6 +25,8 @@
 from google.genai import types
 from pydantic import BaseModel
 
+from veadk.utils.misc import formatted_timestamp
+
 
 class InvocationTestData(BaseModel):
     invocation_id: str = ""
@@ -79,15 +82,139 @@
         self.result_list: list[EvalResultData] = []
         self.agent_information_list: list[dict] = []
 
-    def load_eval_set(self, eval_set_file: str) -> list[EvalSet]:
+    def _load_eval_set(self, eval_set_file: str) -> EvalSet:
         from .eval_set_file_loader import load_eval_set_from_file
 
         return load_eval_set_from_file(eval_set_file)
 
-    def generate_eval_data(self, eval_set_file_path: str):
+    def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:
+        try:
+            with open(tracing_file, "r") as f:
+                tracing_data = json.load(f)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON format in file {tracing_file}: {e}")
+        except Exception as e:
+            raise ValueError(f"Error reading file {tracing_file}: {e}")
+
+        # Group spans by trace_id
+        trace_groups = {}
+        for span in tracing_data:
+            trace_id = span["trace_id"]
+            if trace_id not in trace_groups:
+                trace_groups[trace_id] = []
+            trace_groups[trace_id].append(span)
+
+        # Convert to evalset format
+        eval_cases, conversation = [], []
+        app_name, user_id = "", ""
+        creation_timestamp = 0
+        for trace_id, spans in trace_groups.items():
+            tool_uses = []
+
+            # Extract tool_uses from spans with name starting with "execute_tool"
+            for span in spans:
+                if span["name"].startswith("execute_tool"):
+                    tool_uses.append(
+                        {
+                            "id": span["attributes"].get("gen_ai.tool.call.id", None),
+                            "args": json.loads(
+                                span["attributes"].get(
+                                    "gcp.vertex.agent.tool_call_args", "{}"
+                                )
+                            ),
+                            "name": span["attributes"].get("gen_ai.tool.name", None),
+                        }
+                    )
+
+            # Extract conversation data from spans with name starting with "invocation"
+            for span in spans:
+                if span["name"].startswith("invocation"):
+                    # Parse input.value and output.value as JSON
+                    input_value = json.loads(
+                        span["attributes"].get("input.value", "{}")
+                    )
+                    output_value = json.loads(
+                        span["attributes"].get("output.value", "{}")
+                    )
+
+                    user_content = json.loads(input_value.get("new_message", {}))
+                    final_response = json.loads(json.dumps(user_content))
+                    final_response["parts"][0]["text"] = (
+                        output_value.get("content", {})
+                        .get("parts", [{}])[0]
+                        .get("text", None)
+                    )
+                    final_response["role"] = None
+                    conversation.append(
+                        {
+                            "invocation_id": output_value.get(
+                                "invocation_id", str(uuid.uuid4())
+                            ),
+                            "user_content": user_content,
+                            "final_response": final_response,
+                            "intermediate_data": {
+                                "tool_uses": tool_uses,
+                                "intermediate_responses": [],
+                            },
+                            "creation_timestamp": span["start_time"] / 1e9,
+                        }
+                    )
+                    user_id = input_value.get("user_id", None)
+                    app_name = (
+                        span["name"].replace("invocation", "").strip().strip("[]")
+                    )
+                    creation_timestamp = span["start_time"] / 1e9
+
+            eval_cases.append(
+                {
+                    "eval_id": f"veadk_eval_{formatted_timestamp()}",
+                    "conversation": conversation,
+                    "session_input": {
+                        "app_name": app_name,
+                        "user_id": user_id,
+                        "state": {},
+                    },
+                    "creation_timestamp": creation_timestamp,
+                }
+            )
+
+        evalset = EvalSet(
+            eval_set_id="default",
+            name="default",
+            description=None,
+            eval_cases=eval_cases,
+            creation_timestamp=creation_timestamp,
+        )
+
+        return evalset
+
+    def generate_eval_data(self, file_path: str):
+        """Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
         eval_case_data_list: list[EvalCaseData] = []
-        eval_cases = self.load_eval_set(eval_set_file_path).eval_cases
+        try:
+            with open(file_path, "r") as f:
+                file_content = json.load(f)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON format in file {file_path}: {e}")
+        except Exception as e:
+            raise ValueError(f"Error reading file {file_path}: {e}")
+
+        if isinstance(file_content, dict) and "eval_cases" in file_content:
+            eval_cases = self._load_eval_set(file_path).eval_cases
+        elif (
+            isinstance(file_content, list)
+            and len(file_content) > 0
+            and all(
+                isinstance(span, dict) and "trace_id" in span for span in file_content
+            )
+        ):
+            eval_cases = self._load_eval_set_from_tracing(file_path).eval_cases
+        else:
+            raise ValueError(
+                f"Unsupported file format in {file_path}. Please provide a valid file."
+            )
+
         for eval_case in eval_cases:
             eval_case_data = EvalCaseData(invocations=[])
             self.agent_information_list.append(