Skip to content

Commit b18a889

Browse files
authored
Merge pull request #158 from beform88/test
feat: use bash scripts for batch automated testing, modified some prompts in the human_simulator.py
2 parents 95f0cfc + d8d220c commit b18a889

8 files changed

Lines changed: 352 additions & 261 deletions

File tree

evaluate/base/evaluation.py

Lines changed: 185 additions & 159 deletions
Original file line numberDiff line numberDiff line change
@@ -115,175 +115,201 @@ def multi_turn_evaluation_task(dataset_item):
115115
return result
116116

117117

118-
async def evaluation_threads_task(file_path, max_turn_count=10):
119-
"""与ADK agent进行多轮对话测试"""
120-
print('=' * 80)
121-
print('🤖 与ADK Agent多轮对话测试')
122-
print('=' * 80)
118+
import asyncio
119+
import json
120+
import re
121+
import time
122+
from typing import Dict, Any, List
123123

124-
dataset_json = json.loads(load_dataset_json(file_path))
125-
eval_results = []
126-
for index, dataset_item in enumerate(dataset_json):
127-
time.sleep(10) # 避免请求过于频繁
128-
session_service = InMemorySessionService()
129-
session = await session_service.create_session(
130-
app_name='matmaster_agent',
131-
user_id='human_simulator_test',
124+
125+
async def _run_conversation(dataset_item: Dict[str, Any], max_turn_count: int, save_mode: str = 'w') -> Dict[str, Any]:
126+
"""
127+
执行一次对话测试,并返回结果
128+
:param dataset_item: 单条测试数据
129+
:param max_turn_count: 最大对话轮次
130+
:param save_mode: 写文件模式 ("w" 覆盖 / "a" 追加)
131+
"""
132+
session_service = InMemorySessionService()
133+
session = await session_service.create_session(
134+
app_name='matmaster_agent',
135+
user_id='human_simulator_test',
136+
)
137+
138+
logger.info(f"Test Session: {session.id}")
139+
140+
runner = Runner(
141+
app_name='matmaster_agent',
142+
agent=root_agent,
143+
session_service=session_service
144+
)
145+
146+
simulator = HumanSimulator(max_turn_count=max_turn_count)
147+
148+
# 场景初始化
149+
scenario = {
150+
'name': dataset_item['initial_question'],
151+
'goal': ConversationGoal(
152+
initial_question=dataset_item['initial_question'],
153+
expected_outcomes=dataset_item['expected_outcomes'],
154+
success_criteria=dataset_item['success_criteria']
132155
)
156+
}
157+
print(f"\n{'=' * 20} 测试场景: {scenario['name']} {'=' * 20}")
133158

134-
logger.info(f"Test Session: {session.id}")
159+
simulator.set_goal(scenario['goal'])
160+
initial_question = simulator.get_initial_question()
135161

136-
runner = Runner(
137-
app_name='matmaster_agent',
138-
agent=root_agent,
139-
session_service=session_service
140-
)
162+
print(f"🎯 对话目标: {initial_question}")
163+
print(f"📋 期望结果: {', '.join(scenario['goal'].expected_outcomes)}")
164+
print(f"✅ 成功标准: {', '.join(scenario['goal'].success_criteria)}")
165+
166+
# 初始化结果
167+
eval_results = {
168+
'initial_question': initial_question,
169+
'expected_outcomes': scenario['goal'].expected_outcomes,
170+
'success_criteria': scenario['goal'].success_criteria,
171+
}
172+
for i in range(1, max_turn_count + 1):
173+
eval_results[f'agent_response_{i}'] = ''
174+
eval_results[f'user_response_{i}'] = ''
175+
176+
# 对话循环
177+
turn_count = 0
178+
while turn_count < max_turn_count:
179+
turn_count += 1
180+
print(f"\n🔄 第 {turn_count} 轮对话:")
181+
182+
# 获取用户输入
183+
user_input = initial_question if turn_count == 1 else simulator.get_last_user_response()
184+
print(f"🧑 模拟用户: {user_input}")
185+
186+
# 调用 agent
187+
try:
188+
content = types.Content(role='user', parts=[types.Part(text=user_input)])
189+
agent_response = ''
190+
191+
events = runner.run_async(
192+
user_id=session.user_id,
193+
session_id=session.id,
194+
new_message=content,
195+
run_config=RunConfig(streaming_mode=StreamingMode.SSE)
196+
)
197+
198+
async for event in events:
199+
if event.content and event.content.parts:
200+
for part in event.content.parts:
201+
if part.text:
202+
agent_response += part.text
203+
except asyncio.CancelledError:
204+
msg = '任务被取消,可能是超时或作用域取消导致'
205+
logger.error(msg)
206+
eval_results[f'agent_response_{turn_count}'] = msg
207+
break
208+
except Exception as e:
209+
logger.error(f"获取agent响应失败: {e}")
210+
eval_results[f'agent_response_{turn_count}'] = str(e)
211+
break
141212

142-
# 创建人类模拟器
143-
simulator = HumanSimulator(max_turn_count=max_turn_count)
144-
145-
# 数据预处理
146-
scenario = {
147-
'name': dataset_item['initial_question'],
148-
'goal': ConversationGoal(
149-
initial_question=dataset_item['initial_question'],
150-
expected_outcomes=dataset_item['expected_outcomes'],
151-
success_criteria=dataset_item['success_criteria']
152-
)}
153-
154-
print(f"\n{'=' * 20} 测试场景: {scenario['name']} {'=' * 20}")
155-
156-
# 设置对话目标
157-
simulator.set_goal(scenario['goal'])
158-
initial_question = simulator.get_initial_question()
159-
160-
print(f"🎯 对话目标: {initial_question}")
161-
print(f"📋 期望结果: {', '.join(scenario['goal'].expected_outcomes)}")
162-
print(f"✅ 成功标准: {', '.join(scenario['goal'].success_criteria)}")
163-
164-
# 初始化记录
165-
eval_results.append({})
166-
eval_results[index]['initial_question'] = initial_question
167-
eval_results[index]['expected_outcomes'] = scenario['goal'].expected_outcomes
168-
eval_results[index]['success_criteria'] = scenario['goal'].success_criteria
169-
for i in range(1, 6):
170-
eval_results[index][f'agent_response_{i}'] = ''
171-
eval_results[index][f'user_response_{i}'] = ''
172-
173-
# 开始对话
174-
conversation_ended = False
175-
turn_count = 0
176-
177-
while not conversation_ended and turn_count < 10:
178-
turn_count += 1
179-
print(f"\n🔄 第 {turn_count} 轮对话:")
180-
181-
# 获取用户输入(从模拟器)
182-
if turn_count == 1:
183-
user_input = initial_question
184-
else:
185-
# 从模拟器获取响应
186-
user_input = simulator.get_last_user_response()
187-
188-
print(f"🧑 模拟用户: {user_input}")
189-
190-
# 调用ADK agent
213+
eval_results[f'agent_response_{turn_count}'] = agent_response
214+
print(f"🤖 ADK Agent: {agent_response}")
215+
216+
# 提取 job_id
217+
job_jsons = re.findall(r'<bohrium-chat-msg>(.*?)</bohrium-chat-msg>', agent_response)
218+
job_ids: List[str] = []
219+
for job_json in job_jsons:
191220
try:
192-
content = types.Content(role='user', parts=[types.Part(text=user_input)])
193-
194-
agent_response = ''
195-
196-
events = runner.run_async(
197-
user_id=session.user_id,
198-
session_id=session.id,
199-
new_message=content,
200-
run_config=RunConfig(streaming_mode=StreamingMode.SSE)
201-
)
202-
203-
# 收集agent响应
204-
async for event in events:
205-
if event.content and event.content.parts:
206-
for part in event.content.parts:
207-
if part.text:
208-
agent_response += part.text
209-
except asyncio.CancelledError:
210-
logger.error('任务被取消,可能是超时或作用域取消导致')
211-
print(f"✅ 对话在第{turn_count}轮结束")
212-
eval_results[index][f'agent_response_{turn_count}'] = '任务被取消,可能是超时或作用域取消导致'
213-
break
221+
job_json = json.loads(job_json)
222+
if 'eventData' in job_json and 'content' in job_json['eventData']:
223+
content = job_json['eventData']['content']
224+
if 'job_list' in content and 'job_id' in content['job_list']:
225+
job_ids.append(content['job_list']['job_id'])
214226
except Exception as e:
215-
logger.error(f"获取agent响应失败: {e}")
216-
print(f"✅ 对话在第{turn_count}轮结束")
217-
eval_results[index][f'agent_response_{turn_count}'] = str(e)
218-
break
219-
220-
eval_results[index][f'agent_response_{turn_count}'] = agent_response
221-
print(f"🤖 ADK Agent: {agent_response}")
222-
223-
job_jsons = re.findall(r'<bohrium-chat-msg>(.*?)</bohrium-chat-msg>', agent_response)
224-
job_ids = []
225-
if job_jsons:
226-
for job_json in job_jsons:
227-
try:
228-
job_json = json.loads(job_json)
229-
if 'eventData' in job_json and 'content' in job_json['eventData']:
230-
content = job_json['eventData']['content']
231-
if 'job_list' in content and 'job_id' in content['job_list']:
232-
job_id = content['job_list']['job_id']
233-
job_ids.append(job_id)
234-
except Exception as e:
235-
logger.error(f"提取job_id失败: {e}")
236-
237-
# 查询job状态
238-
if job_ids:
239-
job_ids = list(set(job_ids))
240-
while True:
241-
time.sleep(10)
242-
all_finished = True
243-
for job_id in job_ids:
244-
bohrium_client = Bohrium()
245-
job_info = bohrium_client.job.detail(job_id)
246-
logger.info(f"查询到job状态: {job_id} - 状态: {job_info["status"]}")
247-
if job_info['status'] not in [-1, 2]:
248-
all_finished = False
249-
if all_finished:
250-
break
251-
252-
# 使用模拟器生成用户响应
253-
user_response, should_continue = simulator.get_bohr_results(agent_response, job_ids)
254-
eval_results[index][f'user_response_{turn_count}'] = user_response
255-
print(f"🧑 模拟用户: {user_response}")
256-
else:
257-
# 使用模拟器生成用户响应
258-
user_response, should_continue = simulator.generate_response(agent_response)
259-
eval_results[index][f'user_response_{turn_count}'] = user_response
260-
print(f"🧑 模拟用户: {user_response}")
261-
262-
if not should_continue:
263-
print(f"✅ 对话在第{turn_count}轮结束")
264-
break
265-
266-
# 获取对话摘要
267-
summary = simulator.get_conversation_summary()
268-
eval_results[index]['total_turns'] = summary['total_turns']
269-
eval_results[index]['final_state'] = summary['final_state']
270-
eval_results[index]['duration_minutes'] = summary['duration_minutes']
271-
print(f"\n📊 对话摘要:")
272-
print(f" - 总轮次: {summary['total_turns']}")
273-
print(f" - 最终状态: {summary['final_state']}")
274-
print(f" - 耗时: {summary['duration_minutes']:.1f} 分钟")
275-
276-
with open('evaluation_results.json', 'w') as f:
277-
json.dump(eval_results, f, indent=4, ensure_ascii=False)
278-
279-
# 简单的成功判断
280-
if summary['final_state'] == 'satisfied':
281-
print('✅ 测试通过: 对话成功完成')
227+
logger.error(f"提取job_id失败: {e}")
228+
229+
# 查询 job 状态
230+
if job_ids:
231+
job_ids = list(set(job_ids))
232+
while True:
233+
time.sleep(10)
234+
all_finished = True
235+
for job_id in job_ids:
236+
bohrium_client = Bohrium()
237+
job_info = bohrium_client.job.detail(job_id)
238+
logger.info(f"查询到job状态: {job_id} - 状态: {job_info['status']}")
239+
if job_info['status'] not in [-1, 2]:
240+
all_finished = False
241+
if all_finished:
242+
break
243+
244+
user_response, should_continue = simulator.get_bohr_results(agent_response, job_ids)
282245
else:
283-
print('❌ 测试失败: 对话未成功完成')
246+
user_response, should_continue = simulator.generate_response(agent_response)
247+
248+
eval_results[f'user_response_{turn_count}'] = user_response
249+
print(f"🧑 模拟用户: {user_response}")
284250

285-
await runner.close()
251+
if not should_continue:
252+
print(f"✅ 对话在第{turn_count}轮结束")
253+
break
254+
255+
# 对话总结
256+
summary = simulator.get_conversation_summary()
257+
eval_results.update({
258+
'total_turns': summary['total_turns'],
259+
'final_state': summary['final_state'],
260+
'duration_minutes': summary['duration_minutes'],
261+
})
262+
263+
print(f"\n📊 对话摘要:")
264+
print(f" - 总轮次: {summary['total_turns']}")
265+
print(f" - 最终状态: {summary['final_state']}")
266+
print(f" - 耗时: {summary['duration_minutes']:.1f} 分钟")
267+
268+
# 保存结果
269+
with open('evaluation_results.json', save_mode) as f:
270+
json.dump(eval_results, f, indent=4, ensure_ascii=False)
271+
272+
if summary['final_state'] == 'satisfied':
273+
print('✅ 测试通过: 对话成功完成')
274+
else:
275+
print('❌ 测试失败: 对话未成功完成')
276+
277+
await runner.close()
278+
return eval_results
279+
280+
281+
async def evaluation_threads_task(file_path: str, max_turn_count: int = 10):
282+
"""批量测试所有数据"""
283+
print('=' * 80)
284+
print('🤖 与ADK Agent多轮对话测试')
285+
print('=' * 80)
286+
287+
dataset_json = json.loads(load_dataset_json(file_path))
288+
results = []
289+
for i, dataset_item in enumerate(dataset_json):
290+
time.sleep(10) # 避免请求过于频繁
291+
result = await _run_conversation(dataset_item, max_turn_count, save_mode='w' if i == 0 else 'a')
292+
results.append(result)
286293

287294
print('\n' + '=' * 80)
288295
print('🎉 多轮对话测试完成!')
289296
print('=' * 80)
297+
return results
298+
299+
300+
async def evaluation_threads_single_task(file_path: str, item_id: int, max_turn_count: int = 10):
301+
"""测试单个数据"""
302+
print('=' * 80)
303+
print('🤖 与ADK Agent多轮对话测试')
304+
print('=' * 80)
305+
306+
dataset_json = json.loads(load_dataset_json(file_path))
307+
dataset_item = dataset_json[item_id]
308+
time.sleep(10) # 避免请求过于频繁
309+
310+
result = await _run_conversation(dataset_item, max_turn_count, save_mode='a')
311+
312+
print('\n' + '=' * 80)
313+
print('🎉 单条多轮对话测试完成!')
314+
print('=' * 80)
315+
return result

evaluate/base/human_simulator.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -142,20 +142,22 @@ def _build_response_prompt(self, agent_message: str) -> str:
142142
Agent最新回复:
143143
{agent_message}
144144
145-
请分析agent的回复是否满足任务需求,并生成合适的响应
145+
请分析agent的回复是否满足 初始问题 需求。如果回复大致符合初始任务要求,请结束对话。如果不符合,请分析不符合的点在哪儿,并生成简洁的用户回复,继续引导agent完成任务
146146
147147
重要限制:
148-
- 对话最多{self.max_turn_count}轮,当前是第{self.turn_count}
149-
- 除首轮对话外,其他轮次尽可能简短地回答agent的问题,回复内容紧扣初始问题,禁止发散
150-
- 如果agent在询问具体参数或设置,提供简洁明确的回答
151-
- 如果agent已经提供了初始任务所需的信息或完成了任务,请立刻结束对话
152-
- 禁止回复可能导致agent产生误解或偏离目标的内容
153-
154-
请以JSON格式回复:
148+
- 对话最多{self.max_turn_count}轮,当前是第{self.turn_count}轮;
149+
- 尽可能简短地回答agent的问题,回复内容紧扣初始问题,禁止发散,避免执行轮数超出限制;
150+
- 如果agent在询问具体参数或设置,提供简洁明确的回答;
151+
- 如果agent明确指出当前任务无法完成,请礼貌地结束对话;
152+
- 如果agent已经提供了初始任务所需的信息或完成了任务,请立刻结束对话;
153+
- agent仅能以文本形式回复,禁止要求agent提供可视化结果;
154+
155+
请以如下JSON格式回复:
155156
{{
156157
"response": "你的回复内容",
157158
"continue": true/false // 是否继续对话
158159
}}
160+
159161
"""
160162

161163
def get_bohr_results(self, agent_message: str, job_id: List[str]) -> Tuple[str, bool]:

0 commit comments

Comments
 (0)