Skip to content

Commit 7af80c7

Browse files
committed
feat(toolkit): judge run agent good bad general case
1 parent 68db19e commit 7af80c7

2 files changed

Lines changed: 149 additions & 0 deletions

File tree

veadk/agent.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,8 @@ class Agent(LlmAgent):
154154

155155
enable_ghostchar: bool = False
156156

157+
enable_dataset_gen: bool = False
158+
157159
def model_post_init(self, __context: Any) -> None:
158160
super().model_post_init(None) # for sub_agents init
159161

@@ -312,6 +314,22 @@ def model_post_init(self, __context: Any) -> None:
312314

313315
self.instruction += "Please add a character `< at the beginning of you each text-based response."
314316

317+
if self.enable_dataset_gen:
318+
from veadk.toolkits.dataset_auto_gen_callback import (
319+
dataset_auto_gen_callback,
320+
)
321+
322+
if self.after_agent_callback:
323+
if isinstance(self.after_agent_callback, list):
324+
self.after_agent_callback.append(dataset_auto_gen_callback)
325+
else:
326+
self.after_agent_callback = [
327+
self.after_agent_callback,
328+
dataset_auto_gen_callback,
329+
]
330+
else:
331+
self.after_agent_callback = dataset_auto_gen_callback
332+
315333
logger.info(f"VeADK version: {VERSION}")
316334

317335
logger.info(f"{self.__class__.__name__} `{self.name}` init done.")
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import json
2+
import os
3+
import re
4+
from pathlib import Path
5+
from typing import Optional
6+
7+
from google.adk.agents.callback_context import CallbackContext
8+
from google.genai import types
9+
10+
from veadk.utils.logger import get_logger
11+
12+
# Module-level logger for this dataset-generation callback module.
logger = get_logger(__name__)

# Prompt for the LLM judge: it receives a serialized agent interaction trace
# (substituted into `{trace}` via str.format) and must answer with a single
# flat JSON object classifying the run as good (1), general (0), or bad (-1).
# The doubled braces `{{`/`}}` in the output-format line are str.format
# escapes for literal `{`/`}` — the model sees a plain JSON template.
JUDGE_PROMPT = """You are an AI quality evaluator. Analyze the agent interaction trace and classify it.

## Trace Data
{trace}

## Evaluation Dimensions

### 1. Task Completion
- Did the agent understand the user's intent correctly?
- Was the user's request fully addressed?
- Did the agent provide the expected output?

### 2. Tool Usage (if applicable)
- Were the correct tools/functions selected for the task?
- Were the function arguments accurate and complete?
- Was the function response handled properly?
- Did the agent interpret tool results correctly?

### 3. Response Quality
- Is the response accurate and factually correct?
- Is the response complete without missing information?
- Is the response clear and well-structured?
- Does it match the tool/function output when applicable?

### 4. Error Handling
- Were there any errors or exceptions in the trace?
- Did the agent handle edge cases appropriately?
- Were error messages helpful if errors occurred?

### 5. Conversation Flow
- Is the dialogue natural and coherent?
- Did the agent maintain context across turns?
- Were there any unnecessary or redundant steps?

## Classification Criteria
- **good (1)**: Task completed successfully with correct tool usage, accurate response, and smooth conversation flow
- **general (0)**: Normal interaction without notable issues or achievements, routine responses
- **bad (-1)**: Contains errors, incorrect tool usage, wrong/incomplete response, or failed to address user needs

## Output Format (JSON only, no other text)
{{"type": <-1|0|1>, "reason": "<brief explanation covering key evaluation points>"}}"""
55+
56+
57+
# Maps the judge's integer verdict to the JSONL file a case is appended to.
_CASE_FILES = {1: "good_case.jsonl", -1: "bad_case.jsonl", 0: "general_case.jsonl"}


def _build_trace_data(session) -> dict:
    """Serialize a session into a JSON-friendly trace dict (id + per-event author/content)."""
    return {
        "session_id": session.id,
        "events": [
            {
                "author": event.author,
                "content": event.content.model_dump() if event.content else None,
            }
            for event in session.events
        ],
    }


async def _judge_trace(agent, trace_json: str) -> Optional[dict]:
    """Ask the agent's LLM to classify the trace.

    Returns the parsed verdict dict ({"type": ..., "reason": ...}) or None
    when no JSON object could be extracted from the model reply. Raises on
    transport/parse errors; the caller handles them as best-effort failures.
    """
    from litellm import acompletion

    # Fall back to a default model when the agent's model object does not expose one.
    model_name = getattr(agent.model, "model", "openai/gpt-4o-mini")
    # Credentials may live on the agent itself or on its model object — try both.
    api_key = getattr(agent, "model_api_key", None) or getattr(
        agent.model, "api_key", None
    )
    api_base = getattr(agent, "model_api_base", None) or getattr(
        agent.model, "api_base", None
    )

    response = await acompletion(
        model=model_name,
        messages=[{"role": "user", "content": JUDGE_PROMPT.format(trace=trace_json)}],
        api_key=api_key,
        api_base=api_base,
    )
    # message.content may be None for some providers; treat that as "no JSON".
    raw_content = response.choices[0].message.content or ""

    # Extract the first flat JSON object containing a "type" key — the prompt
    # asks for JSON only, but models sometimes wrap it in prose.
    json_match = re.search(r'\{[^{}]*"type"[^{}]*\}', raw_content)
    if not json_match:
        logger.debug("No valid JSON found in LLM response")
        return None
    return json.loads(json_match.group())


async def dataset_auto_gen_callback(
    callback_context: CallbackContext,
) -> Optional[types.Content]:
    """After-agent callback that judges the finished run and saves it to a dataset.

    The session's events are serialized to JSON, classified by an LLM judge as
    good (1) / general (0) / bad (-1), and appended to
    ``./dataset/<agent name>/{good,general,bad}_case.jsonl``. Always returns
    ``None`` so the agent's own response is never replaced; all failures are
    logged and swallowed because dataset generation is best-effort and must
    not break the agent run.
    """
    # NOTE(review): reaches into a private ADK attribute — no public accessor
    # for the invocation context is visible here; confirm on ADK upgrades.
    ctx = callback_context._invocation_context
    agent = ctx.agent
    session = ctx.session

    # Nothing to judge without a session or recorded events.
    if not session or not session.events:
        return None

    trace_data = _build_trace_data(session)
    trace_json = json.dumps(trace_data, ensure_ascii=False)

    try:
        result = await _judge_trace(agent, trace_json)
    except Exception as e:  # best-effort: never propagate judge failures
        logger.warning("Dataset auto gen failed: %s", e)
        return None
    if result is None:
        return None

    # The judge may return "type" as a string (e.g. "1"); coerce defensively
    # and fall back to the "general" bucket for anything unparseable.
    try:
        case_type = int(result.get("type", 0))
    except (TypeError, ValueError):
        case_type = 0

    output_dir = Path.cwd() / "dataset" / agent.name
    output_dir.mkdir(parents=True, exist_ok=True)
    file_name = _CASE_FILES.get(case_type, "general_case.jsonl")

    record = {"trace": trace_data, "reason": result.get("reason", "")}
    # NOTE(review): synchronous file I/O inside an async callback — fine for
    # small appends, but it briefly blocks the event loop.
    with open(output_dir / file_name, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

    logger.info(f"Dataset case saved to {output_dir / file_name}")
    return None

0 commit comments

Comments
 (0)